* [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc
@ 2024-11-09 0:41 Vadim Fedorenko
2024-11-09 0:41 ` [PATCH bpf-next v5 2/4] bpf: add bpf_cpu_cycles_to_ns helper Vadim Fedorenko
` (5 more replies)
0 siblings, 6 replies; 23+ messages in thread
From: Vadim Fedorenko @ 2024-11-09 0:41 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Eduard Zingerman, Thomas Gleixner, Vadim Fedorenko,
Mykola Lysenko, Jakub Kicinski
Cc: x86, bpf, Vadim Fedorenko, Martin KaFai Lau
New kfunc to return the ARCH-specific timecounter. For x86, the BPF JIT
converts it into an ordered rdtsc call. Other architectures will get JIT
implementations too, if supported. The fallback is
__arch_get_hw_counter(). A minimal usage sketch is included after the
changelog below.
Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
---
v4 -> v5:
* use if instead of ifdef with IS_ENABLED
v3 -> v4:
* change name of the helper to bpf_get_cpu_cycles (Andrii)
* Hide the helper behind CONFIG_GENERIC_GETTIMEOFDAY to avoid exposing
it on architectures which do not have vDSO functions and data
* reduce the scope of check of inlined functions in verifier to only 2,
which are actually inlined.
v2 -> v3:
* change name of the helper to bpf_get_cpu_cycles_counter to explicitly
mention what counter it provides (Andrii)
* move kfunc definition to bpf.h to use it in JIT.
* introduce another kfunc to convert cycles into nanoseconds as more
meaningful time units for generic tracing use case (Andrii)
v1 -> v2:
* Fix incorrect function return value type to u64
* Introduce bpf_jit_inlines_kfunc_call() and use it in
mark_fastcall_pattern_for_call() to avoid clobbering in case of
running programs with no JIT (Eduard)
* Avoid rewriting instruction and check function pointer directly
in JIT (Alexei)
* Change includes to fix compile issues on non x86 architectures
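For reference, a minimal BPF-side usage sketch (illustrative only, not
part of the patch; the program type and bpf_printk() are just examples,
the kfunc declaration matches the one used in the selftests later in
this series):

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  extern u64 bpf_get_cpu_cycles(void) __weak __ksym;

  SEC("raw_tp/sys_enter")
  int measure_cycles(void *ctx)
  {
          u64 start, end;

          start = bpf_get_cpu_cycles();
          /* ... code under measurement ... */
          end = bpf_get_cpu_cycles();
          bpf_printk("delta in cycles: %llu", end - start);
          return 0;
  }

  char _license[] SEC("license") = "GPL";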
---
arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++++++++++++
arch/x86/net/bpf_jit_comp32.c | 14 ++++++++++++++
include/linux/bpf.h | 5 +++++
include/linux/filter.h | 1 +
kernel/bpf/core.c | 11 +++++++++++
kernel/bpf/helpers.c | 13 +++++++++++++
kernel/bpf/verifier.c | 30 +++++++++++++++++++++++++++++-
7 files changed, 101 insertions(+), 1 deletion(-)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 06b080b61aa5..4f78ed93ee7f 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2126,6 +2126,26 @@ st: if (is_imm8(insn->off))
case BPF_JMP | BPF_CALL: {
u8 *ip = image + addrs[i - 1];
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+ imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
+ /* Save RDX because RDTSC will use EDX:EAX to return u64 */
+ emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
+ if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
+ EMIT_LFENCE();
+ EMIT2(0x0F, 0x31);
+
+ /* shl RDX, 32 */
+ maybe_emit_1mod(&prog, BPF_REG_3, true);
+ EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
+ /* or RAX, RDX */
+ maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
+ EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
+ /* restore RDX from R11 */
+ emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
+
+ break;
+ }
+
func = (u8 *) __bpf_call_base + imm32;
if (tail_call_reachable) {
LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
@@ -3652,3 +3672,11 @@ u64 bpf_arch_uaddress_limit(void)
{
return 0;
}
+
+/* x86-64 JIT can inline kfunc */
+bool bpf_jit_inlines_kfunc_call(s32 imm)
+{
+ if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
+ return true;
+ return false;
+}
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index de0f9e5f9f73..e6097a371b69 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -2094,6 +2094,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
int err;
+ if (imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
+ if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
+ EMIT3(0x0F, 0xAE, 0xE8);
+ EMIT2(0x0F, 0x31);
+ break;
+ }
+
err = emit_kfunc_call(bpf_prog,
image + addrs[i],
insn, &prog);
@@ -2621,3 +2628,10 @@ bool bpf_jit_supports_kfunc_call(void)
{
return true;
}
+
+bool bpf_jit_inlines_kfunc_call(s32 imm)
+{
+ if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
+ return true;
+ return false;
+}
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1b84613b10ac..fed5f36d387a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3328,6 +3328,11 @@ void bpf_user_rnd_init_once(void);
u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
u64 bpf_get_raw_cpu_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+/* Inlined kfuncs */
+#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
+u64 bpf_get_cpu_cycles(void);
+#endif
+
#if defined(CONFIG_NET)
bool bpf_sock_common_is_valid_access(int off, int size,
enum bpf_access_type type,
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7d7578a8eac1..8bdd5e6b2a65 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1111,6 +1111,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
void bpf_jit_compile(struct bpf_prog *prog);
bool bpf_jit_needs_zext(void);
bool bpf_jit_inlines_helper_call(s32 imm);
+bool bpf_jit_inlines_kfunc_call(s32 imm);
bool bpf_jit_supports_subprog_tailcalls(void);
bool bpf_jit_supports_percpu_insn(void);
bool bpf_jit_supports_kfunc_call(void);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 233ea78f8f1b..ab6a2452ade0 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2965,6 +2965,17 @@ bool __weak bpf_jit_inlines_helper_call(s32 imm)
return false;
}
+/* Return true if the JIT inlines the call to the kfunc corresponding to
+ * the imm.
+ *
+ * The verifier will not patch the insn->imm for the call to the helper if
+ * this returns true.
+ */
+bool __weak bpf_jit_inlines_kfunc_call(s32 imm)
+{
+ return false;
+}
+
/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
bool __weak bpf_jit_supports_subprog_tailcalls(void)
{
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 395221e53832..5c6c0383ebf4 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -23,6 +23,9 @@
#include <linux/btf_ids.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/kasan.h>
+#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
+#include <vdso/datapage.h>
+#endif
#include "../../lib/kstrtox.h"
@@ -3023,6 +3026,13 @@ __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user
return ret + 1;
}
+#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
+__bpf_kfunc u64 bpf_get_cpu_cycles(void)
+{
+ return __arch_get_hw_counter(1, NULL);
+}
+#endif
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(generic_btf_ids)
@@ -3115,6 +3125,9 @@ BTF_ID_FLAGS(func, bpf_get_kmem_cache)
BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
+#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
+BTF_ID_FLAGS(func, bpf_get_cpu_cycles, KF_FASTCALL)
+#endif
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7958d6ff6b73..b5220d996231 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -16273,6 +16273,24 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm)
}
}
+/* True if fixup_kfunc_call() replaces calls to kfunc number 'imm',
+ * replacement patch is presumed to follow bpf_fastcall contract
+ * (see mark_fastcall_pattern_for_call() below).
+ */
+static bool verifier_inlines_kfunc_call(struct bpf_verifier_env *env, s32 imm)
+{
+ const struct bpf_kfunc_desc *desc = find_kfunc_desc(env->prog, imm, 0);
+
+ if (!env->prog->jit_requested)
+ return false;
+
+ if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
+ desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast])
+ return true;
+
+ return false;
+}
+
/* Same as helper_fastcall_clobber_mask() but for kfuncs, see comment above */
static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -16400,7 +16418,10 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env,
return;
clobbered_regs_mask = kfunc_fastcall_clobber_mask(&meta);
- can_be_inlined = is_fastcall_kfunc_call(&meta);
+ can_be_inlined = is_fastcall_kfunc_call(&meta) &&
+ (verifier_inlines_kfunc_call(env, call->imm) ||
+ (meta.btf == btf_vmlinux &&
+ bpf_jit_inlines_kfunc_call(call->imm)));
}
if (clobbered_regs_mask == ALL_CALLER_SAVED_REGS)
@@ -20402,6 +20423,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct bpf_insn *insn_buf, int insn_idx, int *cnt)
{
const struct bpf_kfunc_desc *desc;
+ s32 imm = insn->imm;
if (!insn->imm) {
verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
@@ -20488,6 +20510,12 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
node_offset_reg, insn, insn_buf, cnt);
} else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ if (!verifier_inlines_kfunc_call(env, imm)) {
+ verbose(env, "verifier internal error: kfunc id %d is not defined in checker\n",
+ desc->func_id);
+ return -EFAULT;
+ }
+
insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
*cnt = 1;
} else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) {
--
2.43.5
* [PATCH bpf-next v5 2/4] bpf: add bpf_cpu_cycles_to_ns helper
2024-11-09 0:41 [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc Vadim Fedorenko
@ 2024-11-09 0:41 ` Vadim Fedorenko
2024-11-12 23:03 ` Eduard Zingerman
2024-11-09 0:41 ` [PATCH bpf-next v5 3/4] selftests/bpf: add selftest to check rdtsc jit Vadim Fedorenko
` (4 subsequent siblings)
5 siblings, 1 reply; 23+ messages in thread
From: Vadim Fedorenko @ 2024-11-09 0:41 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Eduard Zingerman, Thomas Gleixner, Vadim Fedorenko,
Mykola Lysenko, Jakub Kicinski
Cc: x86, bpf, Vadim Fedorenko, Martin KaFai Lau
The new helper should be used to convert cycles obtained from
bpf_get_cpu_cycles() into nanoseconds. A minimal usage sketch is
included after the changelog below.
Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
---
v5:
* no changes
v4:
* change helper name to bpf_cpu_cycles_to_ns.
* hide it behind CONFIG_GENERIC_GETTIMEOFDAY to avoid exposing on
unsupported architectures.
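For reference, a minimal sketch of the intended measurement pattern
(illustrative only; it follows the same structure as the usage example
added in patch 4 of this series):

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  extern u64 bpf_get_cpu_cycles(void) __weak __ksym;
  extern u64 bpf_cpu_cycles_to_ns(u64 cycles) __weak __ksym;

  u64 cycles, ns;

  SEC("syscall")
  int measure_ns(void *ctx)
  {
          u64 start = bpf_get_cpu_cycles();

          /* ... code under measurement ... */
          cycles = bpf_get_cpu_cycles() - start;
          ns = bpf_cpu_cycles_to_ns(cycles);
          return 0;
  }

  char _license[] SEC("license") = "GPL";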
---
arch/x86/net/bpf_jit_comp.c | 22 ++++++++++++++++++++++
arch/x86/net/bpf_jit_comp32.c | 19 +++++++++++++++++++
include/linux/bpf.h | 1 +
kernel/bpf/helpers.c | 10 +++++++++-
4 files changed, 51 insertions(+), 1 deletion(-)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 4f78ed93ee7f..ddc73d9a90f4 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -11,6 +11,7 @@
#include <linux/bpf.h>
#include <linux/memory.h>
#include <linux/sort.h>
+#include <linux/clocksource.h>
#include <asm/extable.h>
#include <asm/ftrace.h>
#include <asm/set_memory.h>
@@ -2146,6 +2147,24 @@ st: if (is_imm8(insn->off))
break;
}
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+ imm32 == BPF_CALL_IMM(bpf_cpu_cycles_to_ns) &&
+ boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ u32 mult, shift;
+
+ clocks_calc_mult_shift(&mult, &shift, tsc_khz, USEC_PER_SEC, 0);
+ /* imul RAX, RDI, mult */
+ maybe_emit_mod(&prog, BPF_REG_1, BPF_REG_0, true);
+ EMIT2_off32(0x69, add_2reg(0xC0, BPF_REG_1, BPF_REG_0),
+ mult);
+
+ /* shr RAX, shift (which is less than 64) */
+ maybe_emit_1mod(&prog, BPF_REG_0, true);
+ EMIT3(0xC1, add_1reg(0xE8, BPF_REG_0), shift);
+
+ break;
+ }
+
func = (u8 *) __bpf_call_base + imm32;
if (tail_call_reachable) {
LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
@@ -3678,5 +3697,8 @@ bool bpf_jit_inlines_kfunc_call(s32 imm)
{
if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
return true;
+ if (imm == BPF_CALL_IMM(bpf_cpu_cycles_to_ns) &&
+ boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ return true;
return false;
}
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index e6097a371b69..34f762f28c82 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -12,6 +12,7 @@
#include <linux/netdevice.h>
#include <linux/filter.h>
#include <linux/if_vlan.h>
+#include <linux/clocksource.h>
#include <asm/cacheflush.h>
#include <asm/set_memory.h>
#include <asm/nospec-branch.h>
@@ -2100,6 +2101,24 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
EMIT2(0x0F, 0x31);
break;
}
+ if (imm32 == BPF_CALL_IMM(bpf_cpu_cycles_to_ns) &&
+ boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+ u32 mult, shift;
+
+ clocks_calc_mult_shift(&mult, &shift, tsc_khz,
+ USEC_PER_SEC, 0);
+
+ /* move parameter to BPF_REG_0 */
+ emit_ia32_mov_r64(true, bpf2ia32[BPF_REG_0],
+ bpf2ia32[BPF_REG_1], true, true,
+ &prog, bpf_prog->aux);
+ /* multiply parameter by mult */
+ emit_ia32_mul_i64(bpf2ia32[BPF_REG_0],
+ mult, true, &prog);
+ /* shift parameter by shift which is less than 64 */
+ emit_ia32_rsh_i64(bpf2ia32[BPF_REG_0],
+ shift, true, &prog);
+ }
err = emit_kfunc_call(bpf_prog,
image + addrs[i],
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index fed5f36d387a..46fa662d95e4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3331,6 +3331,7 @@ u64 bpf_get_raw_cpu_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
/* Inlined kfuncs */
#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
u64 bpf_get_cpu_cycles(void);
+u64 bpf_cpu_cycles_to_ns(u64 cycles);
#endif
#if defined(CONFIG_NET)
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 5c6c0383ebf4..72d5819e5df2 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -25,6 +25,7 @@
#include <linux/kasan.h>
#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
#include <vdso/datapage.h>
+#include <asm/vdso/vsyscall.h>
#endif
#include "../../lib/kstrtox.h"
@@ -3031,8 +3032,14 @@ __bpf_kfunc u64 bpf_get_cpu_cycles(void)
{
return __arch_get_hw_counter(1, NULL);
}
-#endif
+__bpf_kfunc u64 bpf_cpu_cycles_to_ns(u64 cycles)
+{
+ const struct vdso_data *vd = __arch_get_k_vdso_data();
+
+ return mul_u64_u32_shr(cycles, vd->mult, vd->shift);
+}
+#endif
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(generic_btf_ids)
@@ -3127,6 +3134,7 @@ BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLE
BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
BTF_ID_FLAGS(func, bpf_get_cpu_cycles, KF_FASTCALL)
+BTF_ID_FLAGS(func, bpf_cpu_cycles_to_ns, KF_FASTCALL)
#endif
BTF_KFUNCS_END(common_btf_ids)
--
2.43.5
* [PATCH bpf-next v5 3/4] selftests/bpf: add selftest to check rdtsc jit
2024-11-09 0:41 [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc Vadim Fedorenko
2024-11-09 0:41 ` [PATCH bpf-next v5 2/4] bpf: add bpf_cpu_cycles_to_ns helper Vadim Fedorenko
@ 2024-11-09 0:41 ` Vadim Fedorenko
2024-11-12 23:17 ` Eduard Zingerman
2024-11-09 0:41 ` [PATCH bpf-next v5 4/4] selftests/bpf: add usage example for cpu cycles kfuncs Vadim Fedorenko
` (3 subsequent siblings)
5 siblings, 1 reply; 23+ messages in thread
From: Vadim Fedorenko @ 2024-11-09 0:41 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Eduard Zingerman, Thomas Gleixner, Vadim Fedorenko,
Mykola Lysenko, Jakub Kicinski
Cc: x86, bpf, Vadim Fedorenko, Martin KaFai Lau
bpf_get_cpu_cycles() is replaced with the rdtsc instruction on x86_64.
Add tests to check it.
Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
---
.../selftests/bpf/prog_tests/verifier.c | 2 +
.../selftests/bpf/progs/verifier_cpu_cycles.c | 104 ++++++++++++++++++
2 files changed, 106 insertions(+)
create mode 100644 tools/testing/selftests/bpf/progs/verifier_cpu_cycles.c
diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c
index 75f7a2ce334b..d76cc90f4bfb 100644
--- a/tools/testing/selftests/bpf/prog_tests/verifier.c
+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c
@@ -97,6 +97,7 @@
#include "verifier_xdp_direct_packet_access.skel.h"
#include "verifier_bits_iter.skel.h"
#include "verifier_lsm.skel.h"
+#include "verifier_cpu_cycles.skel.h"
#define MAX_ENTRIES 11
@@ -223,6 +224,7 @@ void test_verifier_xdp(void) { RUN(verifier_xdp); }
void test_verifier_xdp_direct_packet_access(void) { RUN(verifier_xdp_direct_packet_access); }
void test_verifier_bits_iter(void) { RUN(verifier_bits_iter); }
void test_verifier_lsm(void) { RUN(verifier_lsm); }
+void test_verifier_cpu_cycles(void) { RUN(verifier_cpu_cycles); }
void test_verifier_mtu(void)
{
diff --git a/tools/testing/selftests/bpf/progs/verifier_cpu_cycles.c b/tools/testing/selftests/bpf/progs/verifier_cpu_cycles.c
new file mode 100644
index 000000000000..88bfa7211858
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/verifier_cpu_cycles.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Inc. */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_misc.h"
+
+extern u64 bpf_cpu_cycles_to_ns(u64 cycles) __weak __ksym;
+extern u64 bpf_get_cpu_cycles(void) __weak __ksym;
+
+SEC("syscall")
+__arch_x86_64
+__xlated("0: call kernel-function")
+__naked int bpf_rdtsc(void)
+{
+ asm volatile(
+ "call %[bpf_get_cpu_cycles];"
+ "exit"
+ :
+ : __imm(bpf_get_cpu_cycles)
+ : __clobber_all
+ );
+}
+
+SEC("syscall")
+__arch_x86_64
+/* program entry for bpf_rdtsc_jit_x86_64(), regular function prologue */
+__jited(" endbr64")
+__jited(" nopl (%rax,%rax)")
+__jited(" nopl (%rax)")
+__jited(" pushq %rbp")
+__jited(" movq %rsp, %rbp")
+__jited(" endbr64")
+/* save RDX in R11 as it will be overwritten */
+__jited(" movq %rdx, %r11")
+/* lfence may not be executed depending on cpu features */
+__jited(" {{(lfence|)}}")
+__jited(" rdtsc")
+/* combine EDX:EAX into RAX */
+__jited(" shlq ${{(32|0x20)}}, %rdx")
+__jited(" orq %rdx, %rax")
+/* restore RDX from R11 */
+__jited(" movq %r11, %rdx")
+__jited(" leave")
+__naked int bpf_rdtsc_jit_x86_64(void)
+{
+ asm volatile(
+ "call %[bpf_get_cpu_cycles];"
+ "exit"
+ :
+ : __imm(bpf_get_cpu_cycles)
+ : __clobber_all
+ );
+}
+
+SEC("syscall")
+__arch_x86_64
+__xlated("0: r1 = 42")
+__xlated("1: call kernel-function")
+__naked int bpf_cyc2ns(void)
+{
+ asm volatile(
+ "r1=0x2a;"
+ "call %[bpf_cpu_cycles_to_ns];"
+ "exit"
+ :
+ : __imm(bpf_cpu_cycles_to_ns)
+ : __clobber_all
+ );
+}
+
+SEC("syscall")
+__arch_x86_64
+/* program entry for bpf_cyc2ns_jit_x86(), regular function prologue */
+__jited(" endbr64")
+__jited(" nopl (%rax,%rax)")
+__jited(" nopl (%rax)")
+__jited(" pushq %rbp")
+__jited(" movq %rsp, %rbp")
+__jited(" endbr64")
+/* load the 0x2a2a2a2a2a test constant into RDI (first argument) */
+__jited(" movabsq $0x2a2a2a2a2a, %rdi")
+__jited(" imulq ${{.*}}, %rdi, %rax")
+__jited(" shrq ${{.*}}, %rax")
+__jited(" leave")
+__naked int bpf_cyc2ns_jit_x86(void)
+{
+ asm volatile(
+ "r1=0x2a2a2a2a2a ll;"
+ "call %[bpf_cpu_cycles_to_ns];"
+ "exit"
+ :
+ : __imm(bpf_cpu_cycles_to_ns)
+ : __clobber_all
+ );
+}
+
+void rdtsc(void)
+{
+ bpf_get_cpu_cycles();
+ bpf_cpu_cycles_to_ns(42);
+}
+
+char _license[] SEC("license") = "GPL";
--
2.43.5
* [PATCH bpf-next v5 4/4] selftests/bpf: add usage example for cpu cycles kfuncs
2024-11-09 0:41 [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc Vadim Fedorenko
2024-11-09 0:41 ` [PATCH bpf-next v5 2/4] bpf: add bpf_cpu_cycles_to_ns helper Vadim Fedorenko
2024-11-09 0:41 ` [PATCH bpf-next v5 3/4] selftests/bpf: add selftest to check rdtsc jit Vadim Fedorenko
@ 2024-11-09 0:41 ` Vadim Fedorenko
2024-11-12 5:50 ` [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc Andrii Nakryiko
` (2 subsequent siblings)
5 siblings, 0 replies; 23+ messages in thread
From: Vadim Fedorenko @ 2024-11-09 0:41 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Eduard Zingerman, Thomas Gleixner, Vadim Fedorenko,
Mykola Lysenko, Jakub Kicinski
Cc: x86, bpf, Vadim Fedorenko, Martin KaFai Lau
The selftest provides an example of how to measure the latency of a bpf
kfunc/helper call in both cycles and nanoseconds.
Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
---
.../bpf/prog_tests/test_cpu_cycles.c | 35 +++++++++++++++++++
.../selftests/bpf/progs/test_cpu_cycles.c | 25 +++++++++++++
2 files changed, 60 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/test_cpu_cycles.c
create mode 100644 tools/testing/selftests/bpf/progs/test_cpu_cycles.c
diff --git a/tools/testing/selftests/bpf/prog_tests/test_cpu_cycles.c b/tools/testing/selftests/bpf/prog_tests/test_cpu_cycles.c
new file mode 100644
index 000000000000..d7f3b66594b3
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_cpu_cycles.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Inc. */
+
+#include <test_progs.h>
+#include "test_cpu_cycles.skel.h"
+
+static void cpu_cycles(void)
+{
+ LIBBPF_OPTS(bpf_test_run_opts, opts);
+ struct test_cpu_cycles *skel;
+ int err, pfd;
+
+ skel = test_cpu_cycles__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "test_cpu_cycles open and load"))
+ return;
+
+ pfd = bpf_program__fd(skel->progs.bpf_cpu_cycles);
+ if (!ASSERT_GT(pfd, 0, "test_cpu_cycles fd"))
+ goto fail;
+
+ err = bpf_prog_test_run_opts(pfd, &opts);
+ if (!ASSERT_OK(err, "test_cpu_cycles test run"))
+ goto fail;
+
+ ASSERT_NEQ(skel->bss->cycles, 0, "test_cpu_cycles 0 cycles");
+ ASSERT_NEQ(skel->bss->ns, 0, "test_cpu_cycles 0 ns");
+fail:
+ test_cpu_cycles__destroy(skel);
+}
+
+void test_cpu_cycles(void)
+{
+ if (test__start_subtest("cpu_cycles"))
+ cpu_cycles();
+}
diff --git a/tools/testing/selftests/bpf/progs/test_cpu_cycles.c b/tools/testing/selftests/bpf/progs/test_cpu_cycles.c
new file mode 100644
index 000000000000..48762f07ad7f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_cpu_cycles.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Inc. */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+extern u64 bpf_cpu_cycles_to_ns(u64 cycles) __weak __ksym;
+extern u64 bpf_get_cpu_cycles(void) __weak __ksym;
+
+__u64 cycles, ns;
+
+SEC("syscall")
+int bpf_cpu_cycles(void)
+{
+ struct bpf_pidns_info pidns;
+ __u64 start;
+
+ start = bpf_get_cpu_cycles();
+ bpf_get_ns_current_pid_tgid(0, 0, &pidns, sizeof(struct bpf_pidns_info));
+ cycles = bpf_get_cpu_cycles() - start;
+ ns = bpf_cpu_cycles_to_ns(cycles);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
--
2.43.5
* Re: [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc
2024-11-09 0:41 [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc Vadim Fedorenko
` (2 preceding siblings ...)
2024-11-09 0:41 ` [PATCH bpf-next v5 4/4] selftests/bpf: add usage example for cpu cycles kfuncs Vadim Fedorenko
@ 2024-11-12 5:50 ` Andrii Nakryiko
2024-11-12 21:43 ` Vadim Fedorenko
2024-11-12 21:21 ` Eduard Zingerman
2024-11-13 17:38 ` Yonghong Song
5 siblings, 1 reply; 23+ messages in thread
From: Andrii Nakryiko @ 2024-11-12 5:50 UTC (permalink / raw)
To: Vadim Fedorenko
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Eduard Zingerman, Thomas Gleixner, Vadim Fedorenko,
Mykola Lysenko, Jakub Kicinski, x86, bpf, Martin KaFai Lau
On Fri, Nov 8, 2024 at 4:42 PM Vadim Fedorenko <vadfed@meta.com> wrote:
>
> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT converts
> it into rdtsc ordered call. Other architectures will get JIT
> implementation too if supported. The fallback is to
> __arch_get_hw_counter().
>
> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
> ---
nit: please add a cover letter for the next revision; multi-patch sets
generally should come with a cover letter, unless it's a set of
trivial and mostly independent patches. Anyways...
I haven't yet looked through the code (yet), but I was curious to
benchmark the perf benefit, so that's what I did for fun this evening.
(!!!) BTW, a complete aside, but I think it's interesting. It turned
out that using bpf_prog_test_run() scales *VERY POORLY* with a large
number of CPUs, because we start spending tons of time in
fdget()/fdput(), so I initially got pretty unscalable results,
profiled a bit, and then switched to just doing
syscall(__NR_getpgid) + SEC("raw_tp/sys_enter"). Anyways,
the point is that microbenchmarking is tricky and we need to improve
our existing bench setup for some benchmarks. Anyways, getting back to
the main topic.
I wrote two quick benchmarks testing what I see as the intended use case
for these kfuncs (batching amortizes the cost of triggering the BPF
program; batch_iters = 500 in my case):
SEC("?raw_tp/sys_enter")
int trigger_driver_ktime(void *ctx)
{
volatile __u64 total = 0;
int i;
for (i = 0; i < batch_iters; i++) {
__u64 start, end;
start = bpf_ktime_get_ns();
end = bpf_ktime_get_ns();
total += end - start;
}
inc_counter_batch(batch_iters);
return 0;
}
extern __u64 bpf_get_cpu_cycles(void) __weak __ksym;
extern __u64 bpf_cpu_cycles_to_ns(__u64 cycles) __weak __ksym;
SEC("?raw_tp/sys_enter")
int trigger_driver_cycles(void *ctx)
{
volatile __u64 total = 0;
int i;
for (i = 0; i < batch_iters; i++) {
__u64 start, end;
start = bpf_get_cpu_cycles();
end = bpf_get_cpu_cycles();
total += bpf_cpu_cycles_to_ns(end - start);
}
inc_counter_batch(batch_iters);
return 0;
}
And here's what I got across multiple numbers of parallel CPUs on our
production host.
# ./bench_timing.sh
ktime ( 1 cpus): 32.286 ± 0.309M/s ( 32.286M/s/cpu)
ktime ( 2 cpus): 63.021 ± 0.538M/s ( 31.511M/s/cpu)
ktime ( 3 cpus): 94.211 ± 0.686M/s ( 31.404M/s/cpu)
ktime ( 4 cpus): 124.757 ± 0.691M/s ( 31.189M/s/cpu)
ktime ( 5 cpus): 154.855 ± 0.693M/s ( 30.971M/s/cpu)
ktime ( 6 cpus): 185.551 ± 2.304M/s ( 30.925M/s/cpu)
ktime ( 7 cpus): 211.117 ± 4.755M/s ( 30.160M/s/cpu)
ktime ( 8 cpus): 236.454 ± 0.226M/s ( 29.557M/s/cpu)
ktime (10 cpus): 295.526 ± 0.126M/s ( 29.553M/s/cpu)
ktime (12 cpus): 322.282 ± 0.153M/s ( 26.857M/s/cpu)
ktime (14 cpus): 375.347 ± 0.087M/s ( 26.811M/s/cpu)
ktime (16 cpus): 399.813 ± 0.181M/s ( 24.988M/s/cpu)
ktime (24 cpus): 617.675 ± 7.053M/s ( 25.736M/s/cpu)
ktime (32 cpus): 819.695 ± 0.231M/s ( 25.615M/s/cpu)
ktime (40 cpus): 996.264 ± 0.290M/s ( 24.907M/s/cpu)
ktime (48 cpus): 1180.201 ± 0.160M/s ( 24.588M/s/cpu)
ktime (56 cpus): 1321.084 ± 0.099M/s ( 23.591M/s/cpu)
ktime (64 cpus): 1482.061 ± 0.121M/s ( 23.157M/s/cpu)
ktime (72 cpus): 1666.540 ± 0.460M/s ( 23.146M/s/cpu)
ktime (80 cpus): 1851.419 ± 0.439M/s ( 23.143M/s/cpu)
cycles ( 1 cpus): 45.815 ± 0.018M/s ( 45.815M/s/cpu)
cycles ( 2 cpus): 86.706 ± 0.068M/s ( 43.353M/s/cpu)
cycles ( 3 cpus): 129.899 ± 0.101M/s ( 43.300M/s/cpu)
cycles ( 4 cpus): 168.435 ± 0.073M/s ( 42.109M/s/cpu)
cycles ( 5 cpus): 210.520 ± 0.164M/s ( 42.104M/s/cpu)
cycles ( 6 cpus): 252.596 ± 0.050M/s ( 42.099M/s/cpu)
cycles ( 7 cpus): 294.356 ± 0.159M/s ( 42.051M/s/cpu)
cycles ( 8 cpus): 317.167 ± 0.163M/s ( 39.646M/s/cpu)
cycles (10 cpus): 396.141 ± 0.208M/s ( 39.614M/s/cpu)
cycles (12 cpus): 431.938 ± 0.511M/s ( 35.995M/s/cpu)
cycles (14 cpus): 503.055 ± 0.070M/s ( 35.932M/s/cpu)
cycles (16 cpus): 534.261 ± 0.107M/s ( 33.391M/s/cpu)
cycles (24 cpus): 836.838 ± 0.141M/s ( 34.868M/s/cpu)
cycles (32 cpus): 1099.689 ± 0.314M/s ( 34.365M/s/cpu)
cycles (40 cpus): 1336.573 ± 0.015M/s ( 33.414M/s/cpu)
cycles (48 cpus): 1571.734 ± 11.151M/s ( 32.744M/s/cpu)
cycles (56 cpus): 1819.242 ± 4.627M/s ( 32.486M/s/cpu)
cycles (64 cpus): 2046.285 ± 5.169M/s ( 31.973M/s/cpu)
cycles (72 cpus): 2287.683 ± 0.787M/s ( 31.773M/s/cpu)
cycles (80 cpus): 2505.414 ± 0.626M/s ( 31.318M/s/cpu)
So, from about +42% on a single CPU, to +36% at 80 CPUs. Not that bad.
Scalability-wise, we still see some drop off in performance, but
believe me, with bpf_prog_test_run() it was so much worse :) I also
verified that now we spend cycles almost exclusively inside the BPF
program (so presumably in those benchmarked kfuncs).
* Re: [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc
2024-11-09 0:41 [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc Vadim Fedorenko
` (3 preceding siblings ...)
2024-11-12 5:50 ` [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc Andrii Nakryiko
@ 2024-11-12 21:21 ` Eduard Zingerman
2024-11-12 21:39 ` Vadim Fedorenko
2024-11-13 17:38 ` Yonghong Song
5 siblings, 1 reply; 23+ messages in thread
From: Eduard Zingerman @ 2024-11-12 21:21 UTC (permalink / raw)
To: Vadim Fedorenko, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, Thomas Gleixner, Vadim Fedorenko, Mykola Lysenko,
Jakub Kicinski
Cc: x86, bpf, Martin KaFai Lau
On Fri, 2024-11-08 at 16:41 -0800, Vadim Fedorenko wrote:
> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT converts
> it into rdtsc ordered call. Other architectures will get JIT
> implementation too if supported. The fallback is to
> __arch_get_hw_counter().
>
> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
> ---
Aside from a note below, I think this patch is in good shape.
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
[...]
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 06b080b61aa5..4f78ed93ee7f 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -2126,6 +2126,26 @@ st: if (is_imm8(insn->off))
> case BPF_JMP | BPF_CALL: {
> u8 *ip = image + addrs[i - 1];
>
> + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
> + imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
> + /* Save RDX because RDTSC will use EDX:EAX to return u64 */
> + emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
> + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
> + EMIT_LFENCE();
> + EMIT2(0x0F, 0x31);
> +
> + /* shl RDX, 32 */
> + maybe_emit_1mod(&prog, BPF_REG_3, true);
> + EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
> + /* or RAX, RDX */
> + maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
> + EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
> + /* restore RDX from R11 */
> + emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
Note: The default implementation of this kfunc uses __arch_get_hw_counter(),
which is implemented as `(u64)rdtsc_ordered() & S64_MAX`.
Here we don't do `& S64_MAX`.
The masking in __arch_get_hw_counter() was added by this commit:
77750f78b0b3 ("x86/vdso: Fix gettimeofday masking").
Also, the default implementation does not issue `lfence`.
Not sure if this makes any real-world difference.
> +
> + break;
> + }
> +
> func = (u8 *) __bpf_call_base + imm32;
> if (tail_call_reachable) {
> LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
[...]
> @@ -20488,6 +20510,12 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
> node_offset_reg, insn, insn_buf, cnt);
> } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
> desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
> + if (!verifier_inlines_kfunc_call(env, imm)) {
> + verbose(env, "verifier internal error: kfunc id %d is not defined in checker\n",
> + desc->func_id);
> + return -EFAULT;
> + }
> +
Nit: still think that moving this check as the first conditional would
have been better:
if (verifier_inlines_kfunc_call(env, imm)) {
if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
// ...
} else {
// report error
}
} else if (...) {
// ... rest of the cases
}
> insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
> *cnt = 1;
> } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) {
* Re: [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc
2024-11-12 21:21 ` Eduard Zingerman
@ 2024-11-12 21:39 ` Vadim Fedorenko
2024-11-12 21:53 ` Eduard Zingerman
0 siblings, 1 reply; 23+ messages in thread
From: Vadim Fedorenko @ 2024-11-12 21:39 UTC (permalink / raw)
To: Eduard Zingerman, Vadim Fedorenko, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Thomas Gleixner, Mykola Lysenko,
Jakub Kicinski
Cc: x86, bpf, Martin KaFai Lau
On 12/11/2024 21:21, Eduard Zingerman wrote:
> On Fri, 2024-11-08 at 16:41 -0800, Vadim Fedorenko wrote:
>> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT converts
>> it into rdtsc ordered call. Other architectures will get JIT
>> implementation too if supported. The fallback is to
>> __arch_get_hw_counter().
>>
>> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>> ---
>
> Aside from a note below, I think this patch is in good shape.
>
> Acked-by: Eduard Zingerman <eddyz87@gmail.com>
>
> [...]
>
>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
>> index 06b080b61aa5..4f78ed93ee7f 100644
>> --- a/arch/x86/net/bpf_jit_comp.c
>> +++ b/arch/x86/net/bpf_jit_comp.c
>> @@ -2126,6 +2126,26 @@ st: if (is_imm8(insn->off))
>> case BPF_JMP | BPF_CALL: {
>> u8 *ip = image + addrs[i - 1];
>>
>> + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
>> + imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>> + /* Save RDX because RDTSC will use EDX:EAX to return u64 */
>> + emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
>> + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>> + EMIT_LFENCE();
>> + EMIT2(0x0F, 0x31);
>> +
>> + /* shl RDX, 32 */
>> + maybe_emit_1mod(&prog, BPF_REG_3, true);
>> + EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
>> + /* or RAX, RDX */
>> + maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
>> + EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
>> + /* restore RDX from R11 */
>> + emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
>
> Note: The default implementation of this kfunc uses __arch_get_hw_counter(),
> which is implemented as `(u64)rdtsc_ordered() & S64_MAX`.
> Here we don't do `& S64_MAX`.
> The masking in __arch_get_hw_counter() was added by this commit:
> 77750f78b0b3 ("x86/vdso: Fix gettimeofday masking").
I think we already discussed it with Alexei in v1, we don't really need
any masking here for the BPF case. We can use the values provided by the
CPU directly. It will never happen that within one BPF program we will
have inlined and non-inlined implementations of this helper, hence the
values to compare will come from the same source.
> Also, the default implementation does not issue `lfence`.
> Not sure if this makes any real-world difference.
Well, it actually does. rdtsc_ordered is translated into `lfence; rdtsc`
or `rdtscp` (which is rdtsc + lfence + a u32 cookie) depending on the CPU
features.
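For context, a rough paraphrase of how rdtsc_ordered() is defined on x86
(a sketch from memory of arch/x86/include/asm/msr.h; it may not match the
tree verbatim):

  /* Sketch only; the alternatives are patched in at boot based on CPU
   * features, which is why a disassembly of the vmlinux image can show
   * plain rdtsc followed by nops.
   */
  static __always_inline u64 rdtsc_ordered(void)
  {
          DECLARE_ARGS(val, low, high);

          asm volatile(ALTERNATIVE_2("rdtsc",
                                     "lfence; rdtsc", X86_FEATURE_LFENCE_RDTSC,
                                     "rdtscp", X86_FEATURE_RDTSCP)
                       : EAX_EDX_RET(val, low, high)
                       /* RDTSCP clobbers ECX with MSR_TSC_AUX */
                       :: "ecx");

          return EAX_EDX_VAL(val, low, high);
  }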
>> +
>> + break;
>> + }
>> +
>> func = (u8 *) __bpf_call_base + imm32;
>> if (tail_call_reachable) {
>> LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
>
> [...]
>
>> @@ -20488,6 +20510,12 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>> node_offset_reg, insn, insn_buf, cnt);
>> } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
>> desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
>> + if (!verifier_inlines_kfunc_call(env, imm)) {
>> + verbose(env, "verifier internal error: kfunc id %d is not defined in checker\n",
>> + desc->func_id);
>> + return -EFAULT;
>> + }
>> +
>
> Nit: still think that moving this check as the first conditional would
> have been better:
>
> if (verifier_inlines_kfunc_call(env, imm)) {
> if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
> desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
> // ...
> } else {
> // report error
> }
> } else if (...) {
> // ... rest of the cases
> }
I can change it in the next iteration (if it's needed) if you insist
>> insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
>> *cnt = 1;
>> } else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) {
>
>
* Re: [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc
2024-11-12 5:50 ` [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc Andrii Nakryiko
@ 2024-11-12 21:43 ` Vadim Fedorenko
2024-11-12 23:59 ` Andrii Nakryiko
0 siblings, 1 reply; 23+ messages in thread
From: Vadim Fedorenko @ 2024-11-12 21:43 UTC (permalink / raw)
To: Andrii Nakryiko, Vadim Fedorenko
Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Eduard Zingerman, Thomas Gleixner, Mykola Lysenko, Jakub Kicinski,
x86, bpf, Martin KaFai Lau
On 12/11/2024 05:50, Andrii Nakryiko wrote:
> On Fri, Nov 8, 2024 at 4:42 PM Vadim Fedorenko <vadfed@meta.com> wrote:
>>
>> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT converts
>> it into rdtsc ordered call. Other architectures will get JIT
>> implementation too if supported. The fallback is to
>> __arch_get_hw_counter().
>>
>> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>> ---
>
> nit: please add cover letter for the next revision, multi-patch sets
> generally should come with a cover letter, unless it's some set of
> trivial and mostly independent patches. Anyways...
Yeah, sure. This series has grown from the first small version...
>
> I haven't yet looked through the code (yet), but I was curious to
> benchmark the perf benefit, so that's what I did for fun this evening.
>
> (!!!) BTW, a complete aside, but I think it's interesting. It turned
> out that using bpf_test_prog_run() scales *VERY POORLY* with large
> number of CPUs, because we start spending tons of time in
> fdget()/fdput(), so I initially got pretty unscalable results,
> profiled a bit, and then switched to just doing
> syscall(syscall(__NR_getpgid); + SEC("raw_tp/sys_enter")). Anyways,
> the point is that microbenchmarking is tricky and we need to improve
> our existing bench setup for some benchmarks. Anyways, getting back to
> the main topic.
>
> I wrote a quick two benchmarks testing what I see as intended use case
> for these kfuncs (batching amortizes the cost of triggering BPF
> program, batch_iters = 500 in my case):
>
> SEC("?raw_tp/sys_enter")
> int trigger_driver_ktime(void *ctx)
> {
> volatile __u64 total = 0;
> int i;
>
> for (i = 0; i < batch_iters; i++) {
> __u64 start, end;
>
> start = bpf_ktime_get_ns();
> end = bpf_ktime_get_ns();
> total += end - start;
> }
> inc_counter_batch(batch_iters);
>
> return 0;
> }
>
> extern __u64 bpf_get_cpu_cycles(void) __weak __ksym;
> extern __u64 bpf_cpu_cycles_to_ns(__u64 cycles) __weak __ksym;
>
> SEC("?raw_tp/sys_enter")
> int trigger_driver_cycles(void *ctx)
> {
> volatile __u64 total = 0;
> int i;
>
> for (i = 0; i < batch_iters; i++) {
> __u64 start, end;
>
> start = bpf_get_cpu_cycles();
> end = bpf_get_cpu_cycles();
> total += bpf_cpu_cycles_to_ns(end - start);
> }
> inc_counter_batch(batch_iters);
>
> return 0;
> }
>
> And here's what I got across multiple numbers of parallel CPUs on our
> production host.
>
> # ./bench_timing.sh
>
> ktime ( 1 cpus): 32.286 ± 0.309M/s ( 32.286M/s/cpu)
> ktime ( 2 cpus): 63.021 ± 0.538M/s ( 31.511M/s/cpu)
> ktime ( 3 cpus): 94.211 ± 0.686M/s ( 31.404M/s/cpu)
> ktime ( 4 cpus): 124.757 ± 0.691M/s ( 31.189M/s/cpu)
> ktime ( 5 cpus): 154.855 ± 0.693M/s ( 30.971M/s/cpu)
> ktime ( 6 cpus): 185.551 ± 2.304M/s ( 30.925M/s/cpu)
> ktime ( 7 cpus): 211.117 ± 4.755M/s ( 30.160M/s/cpu)
> ktime ( 8 cpus): 236.454 ± 0.226M/s ( 29.557M/s/cpu)
> ktime (10 cpus): 295.526 ± 0.126M/s ( 29.553M/s/cpu)
> ktime (12 cpus): 322.282 ± 0.153M/s ( 26.857M/s/cpu)
> ktime (14 cpus): 375.347 ± 0.087M/s ( 26.811M/s/cpu)
> ktime (16 cpus): 399.813 ± 0.181M/s ( 24.988M/s/cpu)
> ktime (24 cpus): 617.675 ± 7.053M/s ( 25.736M/s/cpu)
> ktime (32 cpus): 819.695 ± 0.231M/s ( 25.615M/s/cpu)
> ktime (40 cpus): 996.264 ± 0.290M/s ( 24.907M/s/cpu)
> ktime (48 cpus): 1180.201 ± 0.160M/s ( 24.588M/s/cpu)
> ktime (56 cpus): 1321.084 ± 0.099M/s ( 23.591M/s/cpu)
> ktime (64 cpus): 1482.061 ± 0.121M/s ( 23.157M/s/cpu)
> ktime (72 cpus): 1666.540 ± 0.460M/s ( 23.146M/s/cpu)
> ktime (80 cpus): 1851.419 ± 0.439M/s ( 23.143M/s/cpu)
>
> cycles ( 1 cpus): 45.815 ± 0.018M/s ( 45.815M/s/cpu)
> cycles ( 2 cpus): 86.706 ± 0.068M/s ( 43.353M/s/cpu)
> cycles ( 3 cpus): 129.899 ± 0.101M/s ( 43.300M/s/cpu)
> cycles ( 4 cpus): 168.435 ± 0.073M/s ( 42.109M/s/cpu)
> cycles ( 5 cpus): 210.520 ± 0.164M/s ( 42.104M/s/cpu)
> cycles ( 6 cpus): 252.596 ± 0.050M/s ( 42.099M/s/cpu)
> cycles ( 7 cpus): 294.356 ± 0.159M/s ( 42.051M/s/cpu)
> cycles ( 8 cpus): 317.167 ± 0.163M/s ( 39.646M/s/cpu)
> cycles (10 cpus): 396.141 ± 0.208M/s ( 39.614M/s/cpu)
> cycles (12 cpus): 431.938 ± 0.511M/s ( 35.995M/s/cpu)
> cycles (14 cpus): 503.055 ± 0.070M/s ( 35.932M/s/cpu)
> cycles (16 cpus): 534.261 ± 0.107M/s ( 33.391M/s/cpu)
> cycles (24 cpus): 836.838 ± 0.141M/s ( 34.868M/s/cpu)
> cycles (32 cpus): 1099.689 ± 0.314M/s ( 34.365M/s/cpu)
> cycles (40 cpus): 1336.573 ± 0.015M/s ( 33.414M/s/cpu)
> cycles (48 cpus): 1571.734 ± 11.151M/s ( 32.744M/s/cpu)
> cycles (56 cpus): 1819.242 ± 4.627M/s ( 32.486M/s/cpu)
> cycles (64 cpus): 2046.285 ± 5.169M/s ( 31.973M/s/cpu)
> cycles (72 cpus): 2287.683 ± 0.787M/s ( 31.773M/s/cpu)
> cycles (80 cpus): 2505.414 ± 0.626M/s ( 31.318M/s/cpu)
>
> So, from about +42% on a single CPU, to +36% at 80 CPUs. Not that bad.
> Scalability-wise, we still see some drop off in performance, but
> believe me, with bpf_prog_test_run() it was so much worse :) I also
> verified that now we spend cycles almost exclusively inside the BPF
> program (so presumably in those benchmarked kfuncs).
Am I right that the numbers show how many iterations were done during
the same amount of time? It would also be great to understand whether
we get more precise measurements - just in case you have your tests
ready...
* Re: [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc
2024-11-12 21:39 ` Vadim Fedorenko
@ 2024-11-12 21:53 ` Eduard Zingerman
2024-11-12 22:19 ` Eduard Zingerman
0 siblings, 1 reply; 23+ messages in thread
From: Eduard Zingerman @ 2024-11-12 21:53 UTC (permalink / raw)
To: Vadim Fedorenko, Vadim Fedorenko, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Thomas Gleixner, Mykola Lysenko,
Jakub Kicinski
Cc: x86, bpf, Martin KaFai Lau
On Tue, 2024-11-12 at 21:39 +0000, Vadim Fedorenko wrote:
[...]
> > > + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
> > > + imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
> > > + /* Save RDX because RDTSC will use EDX:EAX to return u64 */
> > > + emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
> > > + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
> > > + EMIT_LFENCE();
> > > + EMIT2(0x0F, 0x31);
> > > +
> > > + /* shl RDX, 32 */
> > > + maybe_emit_1mod(&prog, BPF_REG_3, true);
> > > + EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
> > > + /* or RAX, RDX */
> > > + maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
> > > + EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
> > > + /* restore RDX from R11 */
> > > + emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
> >
> > Note: The default implementation of this kfunc uses __arch_get_hw_counter(),
> > which is implemented as `(u64)rdtsc_ordered() & S64_MAX`.
> > Here we don't do `& S64_MAX`.
> > The masking in __arch_get_hw_counter() was added by this commit:
> > 77750f78b0b3 ("x86/vdso: Fix gettimeofday masking").
>
> I think we already discussed it with Alexey in v1, we don't really need
> any masking here for BPF case. We can use values provided by CPU
> directly. It will never happen that within one BPF program we will have
> inlined and non-inlined implementation of this helper, hence the values
> to compare will be of the same source.
>
> > Also, the default implementation does not issue `lfence`.
> > Not sure if this makes any real-world difference.
>
> Well, it actually does. rdtsc_ordered is translated into `lfence; rdtsc`
> or `rdtscp` (which is rdtsc + lfence + u32 cookie) depending on the cpu
> features.
I see the following disassembly:
0000000000008980 <bpf_get_cpu_cycles>:
; {
8980: f3 0f 1e fa endbr64
8984: e8 00 00 00 00 callq 0x8989 <bpf_get_cpu_cycles+0x9>
0000000000008985: R_X86_64_PLT32 __fentry__-0x4
; asm volatile(ALTERNATIVE_2("rdtsc",
8989: 0f 31 rdtsc
898b: 90 nop
898c: 90 nop
898d: 90 nop
; return EAX_EDX_VAL(val, low, high);
898e: 48 c1 e2 20 shlq $0x20, %rdx
8992: 48 09 d0 orq %rdx, %rax
8995: 48 b9 ff ff ff ff ff ff ff 7f movabsq $0x7fffffffffffffff, %rcx # imm = 0x7FFFFFFFFFFFFFFF
; return (u64)rdtsc_ordered() & S64_MAX;
899f: 48 21 c8 andq %rcx, %rax
; return __arch_get_hw_counter(1, NULL);
89a2: 2e e9 00 00 00 00 jmp 0x89a8 <bpf_get_cpu_cycles+0x28>
Is it patched when the kernel is loaded to replace the nops with lfence?
By real-world difference I meant the difference between the default
implementation and the inlined assembly.
[...]
> > > @@ -20488,6 +20510,12 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
> > > node_offset_reg, insn, insn_buf, cnt);
> > > } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
> > > desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
> > > + if (!verifier_inlines_kfunc_call(env, imm)) {
> > > + verbose(env, "verifier internal error: kfunc id %d is not defined in checker\n",
> > > + desc->func_id);
> > > + return -EFAULT;
> > > + }
> > > +
> >
> > Nit: still think that moving this check as the first conditional would
> > have been better:
> >
> > if (verifier_inlines_kfunc_call(env, imm)) {
> > if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
> > desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
> > // ...
> > } else {
> > // report error
> > }
> > } else if (...) {
> > // ... rest of the cases
> > }
>
> I can change it in the next iteration (if it's needed) if you insist
No need to change if there would be no next iteration.
[...]
* Re: [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc
2024-11-12 21:53 ` Eduard Zingerman
@ 2024-11-12 22:19 ` Eduard Zingerman
2024-11-12 22:27 ` Alexei Starovoitov
0 siblings, 1 reply; 23+ messages in thread
From: Eduard Zingerman @ 2024-11-12 22:19 UTC (permalink / raw)
To: Vadim Fedorenko, Vadim Fedorenko, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Thomas Gleixner, Mykola Lysenko,
Jakub Kicinski
Cc: x86, bpf, Martin KaFai Lau
On Tue, 2024-11-12 at 13:53 -0800, Eduard Zingerman wrote:
> On Tue, 2024-11-12 at 21:39 +0000, Vadim Fedorenko wrote:
>
> [...]
>
> > > > + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
> > > > + imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
> > > > + /* Save RDX because RDTSC will use EDX:EAX to return u64 */
> > > > + emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
> > > > + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
> > > > + EMIT_LFENCE();
> > > > + EMIT2(0x0F, 0x31);
> > > > +
> > > > + /* shl RDX, 32 */
> > > > + maybe_emit_1mod(&prog, BPF_REG_3, true);
> > > > + EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
> > > > + /* or RAX, RDX */
> > > > + maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
> > > > + EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
> > > > + /* restore RDX from R11 */
> > > > + emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
> > >
> > > Note: The default implementation of this kfunc uses __arch_get_hw_counter(),
> > > which is implemented as `(u64)rdtsc_ordered() & S64_MAX`.
> > > Here we don't do `& S64_MAX`.
> > > The masking in __arch_get_hw_counter() was added by this commit:
> > > 77750f78b0b3 ("x86/vdso: Fix gettimeofday masking").
> >
> > I think we already discussed it with Alexey in v1, we don't really need
> > any masking here for BPF case. We can use values provided by CPU
> > directly. It will never happen that within one BPF program we will have
> > inlined and non-inlined implementation of this helper, hence the values
> > to compare will be of the same source.
> >
> > > Also, the default implementation does not issue `lfence`.
> > > Not sure if this makes any real-world difference.
> >
> > Well, it actually does. rdtsc_ordered is translated into `lfence; rdtsc`
> > or `rdtscp` (which is rdtsc + lfence + u32 cookie) depending on the cpu
> > features.
>
> I see the following disassembly:
>
> 0000000000008980 <bpf_get_cpu_cycles>:
> ; {
> 8980: f3 0f 1e fa endbr64
> 8984: e8 00 00 00 00 callq 0x8989 <bpf_get_cpu_cycles+0x9>
> 0000000000008985: R_X86_64_PLT32 __fentry__-0x4
> ; asm volatile(ALTERNATIVE_2("rdtsc",
> 8989: 0f 31 rdtsc
> 898b: 90 nop
> 898c: 90 nop
> 898d: 90 nop
> ; return EAX_EDX_VAL(val, low, high);
> 898e: 48 c1 e2 20 shlq $0x20, %rdx
> 8992: 48 09 d0 orq %rdx, %rax
> 8995: 48 b9 ff ff ff ff ff ff ff 7f movabsq $0x7fffffffffffffff, %rcx # imm = 0x7FFFFFFFFFFFFFFF
> ; return (u64)rdtsc_ordered() & S64_MAX;
> 899f: 48 21 c8 andq %rcx, %rax
> ; return __arch_get_hw_counter(1, NULL);
> 89a2: 2e e9 00 00 00 00 jmp 0x89a8 <bpf_get_cpu_cycles+0x28>
>
> Is it patched when kernel is loaded to replace nops with lfence?
> By real-world difference I meant difference between default
> implementation and inlined assembly.
Talked with Vadim off-list, he explained that 'rdtsc nop nop nop' is
indeed patched at kernel load. Regarding the S64_MAX masking, we just
hope this should never be an issue for the BPF use case.
So, no more questions from my side.
[...]
* Re: [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc
2024-11-12 22:19 ` Eduard Zingerman
@ 2024-11-12 22:27 ` Alexei Starovoitov
2024-11-12 23:08 ` Vadim Fedorenko
0 siblings, 1 reply; 23+ messages in thread
From: Alexei Starovoitov @ 2024-11-12 22:27 UTC (permalink / raw)
To: Eduard Zingerman
Cc: Vadim Fedorenko, Vadim Fedorenko, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Thomas Gleixner, Mykola Lysenko,
Jakub Kicinski, X86 ML, bpf, Martin KaFai Lau
On Tue, Nov 12, 2024 at 2:20 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> On Tue, 2024-11-12 at 13:53 -0800, Eduard Zingerman wrote:
> > On Tue, 2024-11-12 at 21:39 +0000, Vadim Fedorenko wrote:
> >
> > [...]
> >
> > > > > + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
> > > > > + imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
> > > > > + /* Save RDX because RDTSC will use EDX:EAX to return u64 */
> > > > > + emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
> > > > > + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
> > > > > + EMIT_LFENCE();
> > > > > + EMIT2(0x0F, 0x31);
> > > > > +
> > > > > + /* shl RDX, 32 */
> > > > > + maybe_emit_1mod(&prog, BPF_REG_3, true);
> > > > > + EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
> > > > > + /* or RAX, RDX */
> > > > > + maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
> > > > > + EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
> > > > > + /* restore RDX from R11 */
> > > > > + emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
> > > >
> > > > Note: The default implementation of this kfunc uses __arch_get_hw_counter(),
> > > > which is implemented as `(u64)rdtsc_ordered() & S64_MAX`.
> > > > Here we don't do `& S64_MAX`.
> > > > The masking in __arch_get_hw_counter() was added by this commit:
> > > > 77750f78b0b3 ("x86/vdso: Fix gettimeofday masking").
> > >
> > > I think we already discussed it with Alexey in v1, we don't really need
> > > any masking here for BPF case. We can use values provided by CPU
> > > directly. It will never happen that within one BPF program we will have
> > > inlined and non-inlined implementation of this helper, hence the values
> > > to compare will be of the same source.
> > >
> > > > Also, the default implementation does not issue `lfence`.
> > > > Not sure if this makes any real-world difference.
> > >
> > > Well, it actually does. rdtsc_ordered is translated into `lfence; rdtsc`
> > > or `rdtscp` (which is rdtsc + lfence + u32 cookie) depending on the cpu
> > > features.
> >
> > I see the following disassembly:
> >
> > 0000000000008980 <bpf_get_cpu_cycles>:
> > ; {
> > 8980: f3 0f 1e fa endbr64
> > 8984: e8 00 00 00 00 callq 0x8989 <bpf_get_cpu_cycles+0x9>
> > 0000000000008985: R_X86_64_PLT32 __fentry__-0x4
> > ; asm volatile(ALTERNATIVE_2("rdtsc",
> > 8989: 0f 31 rdtsc
> > 898b: 90 nop
> > 898c: 90 nop
> > 898d: 90 nop
> > ; return EAX_EDX_VAL(val, low, high);
> > 898e: 48 c1 e2 20 shlq $0x20, %rdx
> > 8992: 48 09 d0 orq %rdx, %rax
> > 8995: 48 b9 ff ff ff ff ff ff ff 7f movabsq $0x7fffffffffffffff, %rcx # imm = 0x7FFFFFFFFFFFFFFF
> > ; return (u64)rdtsc_ordered() & S64_MAX;
> > 899f: 48 21 c8 andq %rcx, %rax
> > ; return __arch_get_hw_counter(1, NULL);
> > 89a2: 2e e9 00 00 00 00 jmp 0x89a8 <bpf_get_cpu_cycles+0x28>
> >
> > Is it patched when kernel is loaded to replace nops with lfence?
> > By real-world difference I meant difference between default
> > implementation and inlined assembly.
>
> Talked with Vadim off-list, he explained that 'rttsc nop nop nop' is
> indeed patched at kernel load. Regarding S64_MAX patching we just hope
> this should never be an issue for BPF use-case.
> So, no more questions from my side.
since s64 question came up twice it should be a comment.
nop nop as well.
pw-bot: cr
* Re: [PATCH bpf-next v5 2/4] bpf: add bpf_cpu_cycles_to_ns helper
2024-11-09 0:41 ` [PATCH bpf-next v5 2/4] bpf: add bpf_cpu_cycles_to_ns helper Vadim Fedorenko
@ 2024-11-12 23:03 ` Eduard Zingerman
0 siblings, 0 replies; 23+ messages in thread
From: Eduard Zingerman @ 2024-11-12 23:03 UTC (permalink / raw)
To: Vadim Fedorenko, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, Thomas Gleixner, Vadim Fedorenko, Mykola Lysenko,
Jakub Kicinski
Cc: x86, bpf, Martin KaFai Lau
On Fri, 2024-11-08 at 16:41 -0800, Vadim Fedorenko wrote:
> The new helper should be used to convert cycles received by
> bpf_get_cpu_cycle() into nanoseconds.
>
> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
> ---
Tried this with and without the invtsc flag for qemu;
the switch between the call and the inlined version occurs as expected.
In the off-list discussion Vadim explained that:
- mult and shift constants are not updated after boot if the constant_tsc
feature is present for the CPU (see the conversion sketch below).
- despite gettimeofday.h:vdso_calc_ns() doing more complex
calculations to avoid negative time motion, we again assume that
this is not needed for the BPF use case (two measurements close in time).
(If there is a v6, it would be nice to have a comment about
these differences, I think.)
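For reference, the conversion both paths implement is essentially
ns = (cycles * mult) >> shift. A minimal sketch (illustrative only,
ignoring the differences in how each path handles the intermediate
product; mult/shift come from clocks_calc_mult_shift() in the JIT and
from the vDSO data page in the generic kfunc):

  #include <linux/math64.h>

  static u64 cycles_to_ns_sketch(u64 cycles, u32 mult, u32 shift)
  {
          /* same formula as mul_u64_u32_shr(cycles, vd->mult, vd->shift)
           * in the generic implementation; the x86-64 JIT bakes mult and
           * shift in as immediates of the imul/shr pair.
           */
          return mul_u64_u32_shr(cycles, mult, shift);
  }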
Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
[...]
* Re: [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc
2024-11-12 22:27 ` Alexei Starovoitov
@ 2024-11-12 23:08 ` Vadim Fedorenko
2024-11-13 0:09 ` Alexei Starovoitov
0 siblings, 1 reply; 23+ messages in thread
From: Vadim Fedorenko @ 2024-11-12 23:08 UTC (permalink / raw)
To: Alexei Starovoitov, Eduard Zingerman
Cc: Vadim Fedorenko, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, Thomas Gleixner, Mykola Lysenko, Jakub Kicinski,
X86 ML, bpf, Martin KaFai Lau
On 12/11/2024 22:27, Alexei Starovoitov wrote:
> On Tue, Nov 12, 2024 at 2:20 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>>
>> On Tue, 2024-11-12 at 13:53 -0800, Eduard Zingerman wrote:
>>> On Tue, 2024-11-12 at 21:39 +0000, Vadim Fedorenko wrote:
>>>
>>> [...]
>>>
>>>>>> + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
>>>>>> + imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>>>>> + /* Save RDX because RDTSC will use EDX:EAX to return u64 */
>>>>>> + emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
>>>>>> + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>>>>> + EMIT_LFENCE();
>>>>>> + EMIT2(0x0F, 0x31);
>>>>>> +
>>>>>> + /* shl RDX, 32 */
>>>>>> + maybe_emit_1mod(&prog, BPF_REG_3, true);
>>>>>> + EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
>>>>>> + /* or RAX, RDX */
>>>>>> + maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
>>>>>> + EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
>>>>>> + /* restore RDX from R11 */
>>>>>> + emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
>>>>>
>>>>> Note: The default implementation of this kfunc uses __arch_get_hw_counter(),
>>>>> which is implemented as `(u64)rdtsc_ordered() & S64_MAX`.
>>>>> Here we don't do `& S64_MAX`.
>>>>> The masking in __arch_get_hw_counter() was added by this commit:
>>>>> 77750f78b0b3 ("x86/vdso: Fix gettimeofday masking").
>>>>
>>>> I think we already discussed it with Alexey in v1, we don't really need
>>>> any masking here for BPF case. We can use values provided by CPU
>>>> directly. It will never happen that within one BPF program we will have
>>>> inlined and non-inlined implementation of this helper, hence the values
>>>> to compare will be of the same source.
>>>>
>>>>> Also, the default implementation does not issue `lfence`.
>>>>> Not sure if this makes any real-world difference.
>>>>
>>>> Well, it actually does. rdtsc_ordered is translated into `lfence; rdtsc`
>>>> or `rdtscp` (which is rdtsc + lfence + u32 cookie) depending on the cpu
>>>> features.
>>>
>>> I see the following disassembly:
>>>
>>> 0000000000008980 <bpf_get_cpu_cycles>:
>>> ; {
>>> 8980: f3 0f 1e fa endbr64
>>> 8984: e8 00 00 00 00 callq 0x8989 <bpf_get_cpu_cycles+0x9>
>>> 0000000000008985: R_X86_64_PLT32 __fentry__-0x4
>>> ; asm volatile(ALTERNATIVE_2("rdtsc",
>>> 8989: 0f 31 rdtsc
>>> 898b: 90 nop
>>> 898c: 90 nop
>>> 898d: 90 nop
>>> ; return EAX_EDX_VAL(val, low, high);
>>> 898e: 48 c1 e2 20 shlq $0x20, %rdx
>>> 8992: 48 09 d0 orq %rdx, %rax
>>> 8995: 48 b9 ff ff ff ff ff ff ff 7f movabsq $0x7fffffffffffffff, %rcx # imm = 0x7FFFFFFFFFFFFFFF
>>> ; return (u64)rdtsc_ordered() & S64_MAX;
>>> 899f: 48 21 c8 andq %rcx, %rax
>>> ; return __arch_get_hw_counter(1, NULL);
>>> 89a2: 2e e9 00 00 00 00 jmp 0x89a8 <bpf_get_cpu_cycles+0x28>
>>>
>>> Is it patched when kernel is loaded to replace nops with lfence?
>>> By real-world difference I meant difference between default
>>> implementation and inlined assembly.
>>
>> Talked with Vadim off-list, he explained that 'rdtsc nop nop nop' is
>> indeed patched at kernel load. Regarding S64_MAX patching we just hope
>> this should never be an issue for BPF use-case.
>> So, no more questions from my side.
>
> since s64 question came up twice it should be a comment.
sure, will do it.
>
> nop nop as well.
do you mean why there are nop;nop instructions in the kernel's assembly?
>
> pw-bot: cr
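For reference, the ordered TSC read discussed above boils down to
roughly the following (a hedged sketch: the real rdtsc_ordered() uses
ALTERNATIVE_2 patching to pick lfence;rdtsc or rdtscp at boot rather
than an unconditional lfence, and __arch_get_hw_counter() additionally
masks the result with S64_MAX):

static inline u64 ordered_tsc_sketch(void)
{
	u32 lo, hi;

	/* lfence keeps rdtsc from executing ahead of earlier loads */
	asm volatile("lfence; rdtsc" : "=a" (lo), "=d" (hi));

	/* compose EDX:EAX, matching the shl/or pair the JIT emits */
	return ((u64)hi << 32) | lo;
}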
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next v5 3/4] selftests/bpf: add selftest to check rdtsc jit
2024-11-09 0:41 ` [PATCH bpf-next v5 3/4] selftests/bpf: add selftest to check rdtsc jit Vadim Fedorenko
@ 2024-11-12 23:17 ` Eduard Zingerman
0 siblings, 0 replies; 23+ messages in thread
From: Eduard Zingerman @ 2024-11-12 23:17 UTC (permalink / raw)
To: Vadim Fedorenko, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, Thomas Gleixner, Vadim Fedorenko, Mykola Lysenko,
Jakub Kicinski
Cc: x86, bpf, Martin KaFai Lau
On Fri, 2024-11-08 at 16:41 -0800, Vadim Fedorenko wrote:
> get_cpu_cycles() is replaced with rdtsc instruction on x86_64. Add
> tests to check it.
>
> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
> ---
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc
2024-11-12 21:43 ` Vadim Fedorenko
@ 2024-11-12 23:59 ` Andrii Nakryiko
0 siblings, 0 replies; 23+ messages in thread
From: Andrii Nakryiko @ 2024-11-12 23:59 UTC (permalink / raw)
To: Vadim Fedorenko
Cc: Vadim Fedorenko, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, Eduard Zingerman, Thomas Gleixner,
Mykola Lysenko, Jakub Kicinski, x86, bpf, Martin KaFai Lau
On Tue, Nov 12, 2024 at 1:43 PM Vadim Fedorenko
<vadim.fedorenko@linux.dev> wrote:
>
> On 12/11/2024 05:50, Andrii Nakryiko wrote:
> > On Fri, Nov 8, 2024 at 4:42 PM Vadim Fedorenko <vadfed@meta.com> wrote:
> >>
> >> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT converts
> >> it into rdtsc ordered call. Other architectures will get JIT
> >> implementation too if supported. The fallback is to
> >> __arch_get_hw_counter().
> >>
> >> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
> >> ---
> >
> > nit: please add a cover letter for the next revision; multi-patch sets
> > generally should come with a cover letter, unless it's some set of
> > trivial and mostly independent patches. Anyways...
>
> Yeah, sure. This series has grown from the first small version...
>
> >
> > I haven't yet looked through the code (yet), but I was curious to
> > benchmark the perf benefit, so that's what I did for fun this evening.
> >
> > (!!!) BTW, a complete aside, but I think it's interesting. It turned
> > out that using bpf_test_prog_run() scales *VERY POORLY* with large
> > number of CPUs, because we start spending tons of time in
> > fdget()/fdput(), so I initially got pretty unscalable results,
> > profiled a bit, and then switched to just doing
> > syscall(__NR_getpgid) + SEC("raw_tp/sys_enter"). Anyways,
> > the point is that microbenchmarking is tricky and we need to improve
> > our existing bench setup for some benchmarks. Anyways, getting back to
> > the main topic.
> >
> > I wrote a quick two benchmarks testing what I see as intended use case
> > for these kfuncs (batching amortizes the cost of triggering BPF
> > program, batch_iters = 500 in my case):
> >
> > SEC("?raw_tp/sys_enter")
> > int trigger_driver_ktime(void *ctx)
> > {
> > volatile __u64 total = 0;
> > int i;
> >
> > for (i = 0; i < batch_iters; i++) {
> > __u64 start, end;
> >
> > start = bpf_ktime_get_ns();
> > end = bpf_ktime_get_ns();
> > total += end - start;
> > }
> > inc_counter_batch(batch_iters);
> >
> > return 0;
> > }
> >
> > extern __u64 bpf_get_cpu_cycles(void) __weak __ksym;
> > extern __u64 bpf_cpu_cycles_to_ns(__u64 cycles) __weak __ksym;
> >
> > SEC("?raw_tp/sys_enter")
> > int trigger_driver_cycles(void *ctx)
> > {
> > volatile __u64 total = 0;
> > int i;
> >
> > for (i = 0; i < batch_iters; i++) {
> > __u64 start, end;
> >
> > start = bpf_get_cpu_cycles();
> > end = bpf_get_cpu_cycles();
> > total += bpf_cpu_cycles_to_ns(end - start);
> > }
> > inc_counter_batch(batch_iters);
> >
> > return 0;
> > }
> >
> > And here's what I got across multiple numbers of parallel CPUs on our
> > production host.
> >
> > # ./bench_timing.sh
> >
> > ktime ( 1 cpus): 32.286 ± 0.309M/s ( 32.286M/s/cpu)
> > ktime ( 2 cpus): 63.021 ± 0.538M/s ( 31.511M/s/cpu)
> > ktime ( 3 cpus): 94.211 ± 0.686M/s ( 31.404M/s/cpu)
> > ktime ( 4 cpus): 124.757 ± 0.691M/s ( 31.189M/s/cpu)
> > ktime ( 5 cpus): 154.855 ± 0.693M/s ( 30.971M/s/cpu)
> > ktime ( 6 cpus): 185.551 ± 2.304M/s ( 30.925M/s/cpu)
> > ktime ( 7 cpus): 211.117 ± 4.755M/s ( 30.160M/s/cpu)
> > ktime ( 8 cpus): 236.454 ± 0.226M/s ( 29.557M/s/cpu)
> > ktime (10 cpus): 295.526 ± 0.126M/s ( 29.553M/s/cpu)
> > ktime (12 cpus): 322.282 ± 0.153M/s ( 26.857M/s/cpu)
> > ktime (14 cpus): 375.347 ± 0.087M/s ( 26.811M/s/cpu)
> > ktime (16 cpus): 399.813 ± 0.181M/s ( 24.988M/s/cpu)
> > ktime (24 cpus): 617.675 ± 7.053M/s ( 25.736M/s/cpu)
> > ktime (32 cpus): 819.695 ± 0.231M/s ( 25.615M/s/cpu)
> > ktime (40 cpus): 996.264 ± 0.290M/s ( 24.907M/s/cpu)
> > ktime (48 cpus): 1180.201 ± 0.160M/s ( 24.588M/s/cpu)
> > ktime (56 cpus): 1321.084 ± 0.099M/s ( 23.591M/s/cpu)
> > ktime (64 cpus): 1482.061 ± 0.121M/s ( 23.157M/s/cpu)
> > ktime (72 cpus): 1666.540 ± 0.460M/s ( 23.146M/s/cpu)
> > ktime (80 cpus): 1851.419 ± 0.439M/s ( 23.143M/s/cpu)
> >
> > cycles ( 1 cpus): 45.815 ± 0.018M/s ( 45.815M/s/cpu)
> > cycles ( 2 cpus): 86.706 ± 0.068M/s ( 43.353M/s/cpu)
> > cycles ( 3 cpus): 129.899 ± 0.101M/s ( 43.300M/s/cpu)
> > cycles ( 4 cpus): 168.435 ± 0.073M/s ( 42.109M/s/cpu)
> > cycles ( 5 cpus): 210.520 ± 0.164M/s ( 42.104M/s/cpu)
> > cycles ( 6 cpus): 252.596 ± 0.050M/s ( 42.099M/s/cpu)
> > cycles ( 7 cpus): 294.356 ± 0.159M/s ( 42.051M/s/cpu)
> > cycles ( 8 cpus): 317.167 ± 0.163M/s ( 39.646M/s/cpu)
> > cycles (10 cpus): 396.141 ± 0.208M/s ( 39.614M/s/cpu)
> > cycles (12 cpus): 431.938 ± 0.511M/s ( 35.995M/s/cpu)
> > cycles (14 cpus): 503.055 ± 0.070M/s ( 35.932M/s/cpu)
> > cycles (16 cpus): 534.261 ± 0.107M/s ( 33.391M/s/cpu)
> > cycles (24 cpus): 836.838 ± 0.141M/s ( 34.868M/s/cpu)
> > cycles (32 cpus): 1099.689 ± 0.314M/s ( 34.365M/s/cpu)
> > cycles (40 cpus): 1336.573 ± 0.015M/s ( 33.414M/s/cpu)
> > cycles (48 cpus): 1571.734 ± 11.151M/s ( 32.744M/s/cpu)
> > cycles (56 cpus): 1819.242 ± 4.627M/s ( 32.486M/s/cpu)
> > cycles (64 cpus): 2046.285 ± 5.169M/s ( 31.973M/s/cpu)
> > cycles (72 cpus): 2287.683 ± 0.787M/s ( 31.773M/s/cpu)
> > cycles (80 cpus): 2505.414 ± 0.626M/s ( 31.318M/s/cpu)
> >
> > So, from about +42% on a single CPU, to +36% at 80 CPUs. Not that bad.
> > Scalability-wise, we still see some drop off in performance, but
> > believe me, with bpf_prog_test_run() it was so much worse :) I also
> > verified that now we spend cycles almost exclusively inside the BPF
> > program (so presumably in those benchmarked kfuncs).
>
> Am I right that the numbers show how many iterations were done during
> the very same amount of time?
yeah, it's millions of operations per second
> It would also be great to understand if
> we get more precise measurements - just in case you have your tests
> ready...
>
not sure what precision you mean here?... I didn't plan to publish
tests, but I can push what I have into a local branch if you'd like to
play with this
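A hedged sketch of the user-space trigger loop implied above (the
bench harness itself isn't shown in the thread, so the function name
and loop are assumptions):

#include <unistd.h>
#include <sys/syscall.h>

/* Each getpgid() syscall fires the raw_tp/sys_enter program once; the
 * BPF side then amortizes the trigger cost over batch_iters
 * measurements, as in the programs quoted above. */
static void trigger_loop(long n)
{
	for (long i = 0; i < n; i++)
		syscall(__NR_getpgid);
}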
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc
2024-11-12 23:08 ` Vadim Fedorenko
@ 2024-11-13 0:09 ` Alexei Starovoitov
2024-11-13 0:20 ` Vadim Fedorenko
0 siblings, 1 reply; 23+ messages in thread
From: Alexei Starovoitov @ 2024-11-13 0:09 UTC (permalink / raw)
To: Vadim Fedorenko
Cc: Eduard Zingerman, Vadim Fedorenko, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Thomas Gleixner, Mykola Lysenko,
Jakub Kicinski, X86 ML, bpf, Martin KaFai Lau
On Tue, Nov 12, 2024 at 3:08 PM Vadim Fedorenko
<vadim.fedorenko@linux.dev> wrote:
>
> On 12/11/2024 22:27, Alexei Starovoitov wrote:
> > On Tue, Nov 12, 2024 at 2:20 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
> >>
> >> On Tue, 2024-11-12 at 13:53 -0800, Eduard Zingerman wrote:
> >>> On Tue, 2024-11-12 at 21:39 +0000, Vadim Fedorenko wrote:
> >>>
> >>> [...]
> >>>
> >>>>>> + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
> >>>>>> + imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
> >>>>>> + /* Save RDX because RDTSC will use EDX:EAX to return u64 */
> >>>>>> + emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
> >>>>>> + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
> >>>>>> + EMIT_LFENCE();
> >>>>>> + EMIT2(0x0F, 0x31);
> >>>>>> +
> >>>>>> + /* shl RDX, 32 */
> >>>>>> + maybe_emit_1mod(&prog, BPF_REG_3, true);
> >>>>>> + EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
> >>>>>> + /* or RAX, RDX */
> >>>>>> + maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
> >>>>>> + EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
> >>>>>> + /* restore RDX from R11 */
> >>>>>> + emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
> >>>>>
> >>>>> Note: The default implementation of this kfunc uses __arch_get_hw_counter(),
> >>>>> which is implemented as `(u64)rdtsc_ordered() & S64_MAX`.
> >>>>> Here we don't do `& S64_MAX`.
> >>>>> The masking in __arch_get_hw_counter() was added by this commit:
> >>>>> 77750f78b0b3 ("x86/vdso: Fix gettimeofday masking").
> >>>>
> >>>> I think we already discussed it with Alexey in v1, we don't really need
> >>>> any masking here for BPF case. We can use values provided by CPU
> >>>> directly. It will never happen that within one BPF program we will have
> >>>> inlined and non-inlined implementation of this helper, hence the values
> >>>> to compare will be of the same source.
> >>>>
> >>>>> Also, the default implementation does not issue `lfence`.
> >>>>> Not sure if this makes any real-world difference.
> >>>>
> >>>> Well, it actually does. rdtsc_ordered is translated into `lfence; rdtsc`
> >>>> or `rdtscp` (which is rdtsc + lfence + u32 cookie) depending on the cpu
> >>>> features.
> >>>
> >>> I see the following disassembly:
> >>>
> >>> 0000000000008980 <bpf_get_cpu_cycles>:
> >>> ; {
> >>> 8980: f3 0f 1e fa endbr64
> >>> 8984: e8 00 00 00 00 callq 0x8989 <bpf_get_cpu_cycles+0x9>
> >>> 0000000000008985: R_X86_64_PLT32 __fentry__-0x4
> >>> ; asm volatile(ALTERNATIVE_2("rdtsc",
> >>> 8989: 0f 31 rdtsc
> >>> 898b: 90 nop
> >>> 898c: 90 nop
> >>> 898d: 90 nop
> >>> ; return EAX_EDX_VAL(val, low, high);
> >>> 898e: 48 c1 e2 20 shlq $0x20, %rdx
> >>> 8992: 48 09 d0 orq %rdx, %rax
> >>> 8995: 48 b9 ff ff ff ff ff ff ff 7f movabsq $0x7fffffffffffffff, %rcx # imm = 0x7FFFFFFFFFFFFFFF
> >>> ; return (u64)rdtsc_ordered() & S64_MAX;
> >>> 899f: 48 21 c8 andq %rcx, %rax
> >>> ; return __arch_get_hw_counter(1, NULL);
> >>> 89a2: 2e e9 00 00 00 00 jmp 0x89a8 <bpf_get_cpu_cycles+0x28>
> >>>
> >>> Is it patched when kernel is loaded to replace nops with lfence?
> >>> By real-world difference I meant difference between default
> >>> implementation and inlined assembly.
> >>
> >> Talked with Vadim off-list, he explained that 'rdtsc nop nop nop' is
> >> indeed patched at kernel load. Regarding S64_MAX patching we just hope
> >> this should never be an issue for BPF use-case.
> >> So, no more questions from my side.
> >
> > since s64 question came up twice it should be a comment.
>
> sure, will do it.
>
> >
> > nop nop as well.
>
> do you mean why there are nop;nop instructions in the kernel's assembly?
Explanation on why JITed matches __arch_get_hw_counter.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc
2024-11-13 0:09 ` Alexei Starovoitov
@ 2024-11-13 0:20 ` Vadim Fedorenko
0 siblings, 0 replies; 23+ messages in thread
From: Vadim Fedorenko @ 2024-11-13 0:20 UTC (permalink / raw)
To: Alexei Starovoitov
Cc: Eduard Zingerman, Vadim Fedorenko, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Thomas Gleixner, Mykola Lysenko,
Jakub Kicinski, X86 ML, bpf, Martin KaFai Lau
On 13/11/2024 00:09, Alexei Starovoitov wrote:
> On Tue, Nov 12, 2024 at 3:08 PM Vadim Fedorenko
> <vadim.fedorenko@linux.dev> wrote:
>>
>> On 12/11/2024 22:27, Alexei Starovoitov wrote:
>>> On Tue, Nov 12, 2024 at 2:20 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>>>>
>>>> On Tue, 2024-11-12 at 13:53 -0800, Eduard Zingerman wrote:
>>>>> On Tue, 2024-11-12 at 21:39 +0000, Vadim Fedorenko wrote:
>>>>>
>>>>> [...]
>>>>>
>>>>>>>> + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
>>>>>>>> + imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>>>>>>> + /* Save RDX because RDTSC will use EDX:EAX to return u64 */
>>>>>>>> + emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
>>>>>>>> + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>>>>>>> + EMIT_LFENCE();
>>>>>>>> + EMIT2(0x0F, 0x31);
>>>>>>>> +
>>>>>>>> + /* shl RDX, 32 */
>>>>>>>> + maybe_emit_1mod(&prog, BPF_REG_3, true);
>>>>>>>> + EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
>>>>>>>> + /* or RAX, RDX */
>>>>>>>> + maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
>>>>>>>> + EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
>>>>>>>> + /* restore RDX from R11 */
>>>>>>>> + emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
>>>>>>>
>>>>>>> Note: The default implementation of this kfunc uses __arch_get_hw_counter(),
>>>>>>> which is implemented as `(u64)rdtsc_ordered() & S64_MAX`.
>>>>>>> Here we don't do `& S64_MAX`.
>>>>>>> The masking in __arch_get_hw_counter() was added by this commit:
>>>>>>> 77750f78b0b3 ("x86/vdso: Fix gettimeofday masking").
>>>>>>
>>>>>> I think we already discussed it with Alexey in v1, we don't really need
>>>>>> any masking here for BPF case. We can use values provided by CPU
>>>>>> directly. It will never happen that within one BPF program we will have
>>>>>> inlined and non-inlined implementation of this helper, hence the values
>>>>>> to compare will be of the same source.
>>>>>>
>>>>>>> Also, the default implementation does not issue `lfence`.
>>>>>>> Not sure if this makes any real-world difference.
>>>>>>
>>>>>> Well, it actually does. rdtsc_ordered is translated into `lfence; rdtsc`
>>>>>> or `rdtscp` (which is rdtsc + lfence + u32 cookie) depending on the cpu
>>>>>> features.
>>>>>
>>>>> I see the following disassembly:
>>>>>
>>>>> 0000000000008980 <bpf_get_cpu_cycles>:
>>>>> ; {
>>>>> 8980: f3 0f 1e fa endbr64
>>>>> 8984: e8 00 00 00 00 callq 0x8989 <bpf_get_cpu_cycles+0x9>
>>>>> 0000000000008985: R_X86_64_PLT32 __fentry__-0x4
>>>>> ; asm volatile(ALTERNATIVE_2("rdtsc",
>>>>> 8989: 0f 31 rdtsc
>>>>> 898b: 90 nop
>>>>> 898c: 90 nop
>>>>> 898d: 90 nop
>>>>> ; return EAX_EDX_VAL(val, low, high);
>>>>> 898e: 48 c1 e2 20 shlq $0x20, %rdx
>>>>> 8992: 48 09 d0 orq %rdx, %rax
>>>>> 8995: 48 b9 ff ff ff ff ff ff ff 7f movabsq $0x7fffffffffffffff, %rcx # imm = 0x7FFFFFFFFFFFFFFF
>>>>> ; return (u64)rdtsc_ordered() & S64_MAX;
>>>>> 899f: 48 21 c8 andq %rcx, %rax
>>>>> ; return __arch_get_hw_counter(1, NULL);
>>>>> 89a2: 2e e9 00 00 00 00 jmp 0x89a8 <bpf_get_cpu_cycles+0x28>
>>>>>
>>>>> Is it patched when kernel is loaded to replace nops with lfence?
>>>>> By real-world difference I meant difference between default
>>>>> implementation and inlined assembly.
>>>>
>>>> Talked with Vadim off-list, he explained that 'rdtsc nop nop nop' is
>>>> indeed patched at kernel load. Regarding S64_MAX patching we just hope
>>>> this should never be an issue for BPF use-case.
>>>> So, no more questions from my side.
>>>
>>> since s64 question came up twice it should be a comment.
>>
>> sure, will do it.
>>
>>>
>>> nop nop as well.
>>
>> do you mean why there are nop;nop instructions in the kernel's assembly?
>
> Explanation on why JITed matches __arch_get_hw_counter.
Got it, will do
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc
2024-11-09 0:41 [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc Vadim Fedorenko
` (4 preceding siblings ...)
2024-11-12 21:21 ` Eduard Zingerman
@ 2024-11-13 17:38 ` Yonghong Song
2024-11-13 17:52 ` Vadim Fedorenko
5 siblings, 1 reply; 23+ messages in thread
From: Yonghong Song @ 2024-11-13 17:38 UTC (permalink / raw)
To: Vadim Fedorenko, Alexei Starovoitov, Daniel Borkmann,
Andrii Nakryiko, Eduard Zingerman, Thomas Gleixner,
Vadim Fedorenko, Mykola Lysenko, Jakub Kicinski
Cc: x86, bpf, Martin KaFai Lau
On 11/8/24 4:41 PM, Vadim Fedorenko wrote:
> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT converts
> it into rdtsc ordered call. Other architectures will get JIT
> implementation too if supported. The fallback is to
> __arch_get_hw_counter().
>
> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
> ---
> v4 -> v5:
> * use if instead of ifdef with IS_ENABLED
> v3 -> v4:
> * change name of the helper to bpf_get_cpu_cycles (Andrii)
> * Hide the helper behind CONFIG_GENERIC_GETTIMEOFDAY to avoid exposing
> it on architectures which do not have vDSO functions and data
> * reduce the scope of check of inlined functions in verifier to only 2,
> which are actually inlined.
> v2 -> v3:
> * change name of the helper to bpf_get_cpu_cycles_counter to explicitly
> mention what counter it provides (Andrii)
> * move kfunc definition to bpf.h to use it in JIT.
> * introduce another kfunc to convert cycles into nanoseconds as more
> meaningful time units for generic tracing use case (Andrii)
> v1 -> v2:
> * Fix incorrect function return value type to u64
> * Introduce bpf_jit_inlines_kfunc_call() and use it in
> mark_fastcall_pattern_for_call() to avoid clobbering in case of
> running programs with no JIT (Eduard)
> * Avoid rewriting instruction and check function pointer directly
> in JIT (Alexei)
> * Change includes to fix compile issues on non x86 architectures
> ---
> arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++++++++++++
> arch/x86/net/bpf_jit_comp32.c | 14 ++++++++++++++
> include/linux/bpf.h | 5 +++++
> include/linux/filter.h | 1 +
> kernel/bpf/core.c | 11 +++++++++++
> kernel/bpf/helpers.c | 13 +++++++++++++
> kernel/bpf/verifier.c | 30 +++++++++++++++++++++++++++++-
> 7 files changed, 101 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 06b080b61aa5..4f78ed93ee7f 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -2126,6 +2126,26 @@ st: if (is_imm8(insn->off))
> case BPF_JMP | BPF_CALL: {
> u8 *ip = image + addrs[i - 1];
>
> + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
> + imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
> + /* Save RDX because RDTSC will use EDX:EAX to return u64 */
> + emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
> + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
> + EMIT_LFENCE();
> + EMIT2(0x0F, 0x31);
> +
> + /* shl RDX, 32 */
> + maybe_emit_1mod(&prog, BPF_REG_3, true);
> + EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
> + /* or RAX, RDX */
> + maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
> + EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
> + /* restore RDX from R11 */
> + emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
> +
> + break;
> + }
> +
> func = (u8 *) __bpf_call_base + imm32;
> if (tail_call_reachable) {
> LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
> @@ -3652,3 +3672,11 @@ u64 bpf_arch_uaddress_limit(void)
> {
> return 0;
> }
> +
> +/* x86-64 JIT can inline kfunc */
> +bool bpf_jit_inlines_kfunc_call(s32 imm)
> +{
> + if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
> + return true;
> + return false;
> +}
> diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
> index de0f9e5f9f73..e6097a371b69 100644
> --- a/arch/x86/net/bpf_jit_comp32.c
> +++ b/arch/x86/net/bpf_jit_comp32.c
> @@ -2094,6 +2094,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
> if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
> int err;
>
> + if (imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
> + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
> + EMIT3(0x0F, 0xAE, 0xE8);
> + EMIT2(0x0F, 0x31);
> + break;
> + }
> +
> err = emit_kfunc_call(bpf_prog,
> image + addrs[i],
> insn, &prog);
> @@ -2621,3 +2628,10 @@ bool bpf_jit_supports_kfunc_call(void)
> {
> return true;
> }
> +
> +bool bpf_jit_inlines_kfunc_call(s32 imm)
> +{
> + if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
> + return true;
> + return false;
> +}
[...]
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 395221e53832..5c6c0383ebf4 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -23,6 +23,9 @@
> #include <linux/btf_ids.h>
> #include <linux/bpf_mem_alloc.h>
> #include <linux/kasan.h>
> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
> +#include <vdso/datapage.h>
> +#endif
>
> #include "../../lib/kstrtox.h"
>
> @@ -3023,6 +3026,13 @@ __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user
> return ret + 1;
> }
>
> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
> +__bpf_kfunc u64 bpf_get_cpu_cycles(void)
> +{
> + return __arch_get_hw_counter(1, NULL);
Some comment to explain what '1' means in the above?
> +}
> +#endif
> +
> __bpf_kfunc_end_defs();
>
> BTF_KFUNCS_START(generic_btf_ids)
> @@ -3115,6 +3125,9 @@ BTF_ID_FLAGS(func, bpf_get_kmem_cache)
> BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
> BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
> BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
> +BTF_ID_FLAGS(func, bpf_get_cpu_cycles, KF_FASTCALL)
> +#endif
> BTF_KFUNCS_END(common_btf_ids)
>
> static const struct btf_kfunc_id_set common_kfunc_set = {
[...]
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc
2024-11-13 17:38 ` Yonghong Song
@ 2024-11-13 17:52 ` Vadim Fedorenko
2024-11-13 18:42 ` Yonghong Song
0 siblings, 1 reply; 23+ messages in thread
From: Vadim Fedorenko @ 2024-11-13 17:52 UTC (permalink / raw)
To: Yonghong Song, Vadim Fedorenko, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Eduard Zingerman,
Thomas Gleixner, Mykola Lysenko, Jakub Kicinski
Cc: x86, bpf, Martin KaFai Lau
On 13/11/2024 17:38, Yonghong Song wrote:
>
>
>
> On 11/8/24 4:41 PM, Vadim Fedorenko wrote:
>> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT converts
>> it into rdtsc ordered call. Other architectures will get JIT
>> implementation too if supported. The fallback is to
>> __arch_get_hw_counter().
>>
>> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>> ---
>> v4 -> v5:
>> * use if instead of ifdef with IS_ENABLED
>> v3 -> v4:
>> * change name of the helper to bpf_get_cpu_cycles (Andrii)
>> * Hide the helper behind CONFIG_GENERIC_GETTIMEOFDAY to avoid exposing
>> it on architectures which do not have vDSO functions and data
>> * reduce the scope of check of inlined functions in verifier to only 2,
>> which are actually inlined.
>> v2 -> v3:
>> * change name of the helper to bpf_get_cpu_cycles_counter to explicitly
>> mention what counter it provides (Andrii)
>> * move kfunc definition to bpf.h to use it in JIT.
>> * introduce another kfunc to convert cycles into nanoseconds as more
>> meaningful time units for generic tracing use case (Andrii)
>> v1 -> v2:
>> * Fix incorrect function return value type to u64
>> * Introduce bpf_jit_inlines_kfunc_call() and use it in
>> mark_fastcall_pattern_for_call() to avoid clobbering in case of
>> running programs with no JIT (Eduard)
>> * Avoid rewriting instruction and check function pointer directly
>> in JIT (Alexei)
>> * Change includes to fix compile issues on non x86 architectures
>> ---
>> arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++++++++++++
>> arch/x86/net/bpf_jit_comp32.c | 14 ++++++++++++++
>> include/linux/bpf.h | 5 +++++
>> include/linux/filter.h | 1 +
>> kernel/bpf/core.c | 11 +++++++++++
>> kernel/bpf/helpers.c | 13 +++++++++++++
>> kernel/bpf/verifier.c | 30 +++++++++++++++++++++++++++++-
>> 7 files changed, 101 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
>> index 06b080b61aa5..4f78ed93ee7f 100644
>> --- a/arch/x86/net/bpf_jit_comp.c
>> +++ b/arch/x86/net/bpf_jit_comp.c
>> @@ -2126,6 +2126,26 @@ st: if (is_imm8(insn->off))
>> case BPF_JMP | BPF_CALL: {
>> u8 *ip = image + addrs[i - 1];
>> + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
>> + imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>> + /* Save RDX because RDTSC will use EDX:EAX to return
>> u64 */
>> + emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
>> + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>> + EMIT_LFENCE();
>> + EMIT2(0x0F, 0x31);
>> +
>> + /* shl RDX, 32 */
>> + maybe_emit_1mod(&prog, BPF_REG_3, true);
>> + EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
>> + /* or RAX, RDX */
>> + maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
>> + EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
>> + /* restore RDX from R11 */
>> + emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
>> +
>> + break;
>> + }
>> +
>> func = (u8 *) __bpf_call_base + imm32;
>> if (tail_call_reachable) {
>> LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
>> @@ -3652,3 +3672,11 @@ u64 bpf_arch_uaddress_limit(void)
>> {
>> return 0;
>> }
>> +
>> +/* x86-64 JIT can inline kfunc */
>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>> +{
>> + if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>> + return true;
>> + return false;
>> +}
>> diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/
>> bpf_jit_comp32.c
>> index de0f9e5f9f73..e6097a371b69 100644
>> --- a/arch/x86/net/bpf_jit_comp32.c
>> +++ b/arch/x86/net/bpf_jit_comp32.c
>> @@ -2094,6 +2094,13 @@ static int do_jit(struct bpf_prog *bpf_prog,
>> int *addrs, u8 *image,
>> if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
>> int err;
>> + if (imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>> + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>> + EMIT3(0x0F, 0xAE, 0xE8);
>> + EMIT2(0x0F, 0x31);
>> + break;
>> + }
>> +
>> err = emit_kfunc_call(bpf_prog,
>> image + addrs[i],
>> insn, &prog);
>> @@ -2621,3 +2628,10 @@ bool bpf_jit_supports_kfunc_call(void)
>> {
>> return true;
>> }
>> +
>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>> +{
>> + if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>> + return true;
>> + return false;
>> +}
>
> [...]
>
>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>> index 395221e53832..5c6c0383ebf4 100644
>> --- a/kernel/bpf/helpers.c
>> +++ b/kernel/bpf/helpers.c
>> @@ -23,6 +23,9 @@
>> #include <linux/btf_ids.h>
>> #include <linux/bpf_mem_alloc.h>
>> #include <linux/kasan.h>
>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>> +#include <vdso/datapage.h>
>> +#endif
>> #include "../../lib/kstrtox.h"
>> @@ -3023,6 +3026,13 @@ __bpf_kfunc int bpf_copy_from_user_str(void
>> *dst, u32 dst__sz, const void __user
>> return ret + 1;
>> }
>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>> +__bpf_kfunc u64 bpf_get_cpu_cycles(void)
>> +{
>> + return __arch_get_hw_counter(1, NULL);
>
> Some comment to explain what '1' mean in the above?
That's an arch-specific value which translates to the HW-implemented counter on
all architectures which have a vDSO gettimeofday() implementation.
For x86 it translates to VDSO_CLOCKMODE_TSC, while for aarch64/RISC-V
it's VDSO_CLOCKMODE_ARCHTIMER. Actually, for RISC-V the value of the
first parameter doesn't matter at all, and for aarch64 it should be 0.
The only arch which is stricter about this parameter is x86, but it
has its own special name...
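A possible shape for the suggested comment (a sketch only, not the
patch; the wording follows the explanation above):

__bpf_kfunc u64 bpf_get_cpu_cycles(void)
{
	/* 1 == VDSO_CLOCKMODE_TSC on x86; per the above, the other
	 * archs with a generic vDSO treat it as "use the HW counter",
	 * and the vdso_data argument is not needed for that mode. */
	return __arch_get_hw_counter(1, NULL);
}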
>
>> +}
>> +#endif
>> +
>> __bpf_kfunc_end_defs();
>> BTF_KFUNCS_START(generic_btf_ids)
>> @@ -3115,6 +3125,9 @@ BTF_ID_FLAGS(func, bpf_get_kmem_cache)
>> BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
>> BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT |
>> KF_RET_NULL | KF_SLEEPABLE)
>> BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY |
>> KF_SLEEPABLE)
>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>> +BTF_ID_FLAGS(func, bpf_get_cpu_cycles, KF_FASTCALL)
>> +#endif
>> BTF_KFUNCS_END(common_btf_ids)
>> static const struct btf_kfunc_id_set common_kfunc_set = {
> [...]
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc
2024-11-13 17:52 ` Vadim Fedorenko
@ 2024-11-13 18:42 ` Yonghong Song
2024-11-13 22:28 ` Vadim Fedorenko
0 siblings, 1 reply; 23+ messages in thread
From: Yonghong Song @ 2024-11-13 18:42 UTC (permalink / raw)
To: Vadim Fedorenko, Vadim Fedorenko, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Eduard Zingerman,
Thomas Gleixner, Mykola Lysenko, Jakub Kicinski
Cc: x86, bpf, Martin KaFai Lau
On 11/13/24 9:52 AM, Vadim Fedorenko wrote:
> On 13/11/2024 17:38, Yonghong Song wrote:
>>
>>
>>
>> On 11/8/24 4:41 PM, Vadim Fedorenko wrote:
>>> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT converts
>>> it into rdtsc ordered call. Other architectures will get JIT
>>> implementation too if supported. The fallback is to
>>> __arch_get_hw_counter().
>>>
>>> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>>> ---
>>> v4 -> v5:
>>> * use if instead of ifdef with IS_ENABLED
>>> v3 -> v4:
>>> * change name of the helper to bpf_get_cpu_cycles (Andrii)
>>> * Hide the helper behind CONFIG_GENERIC_GETTIMEOFDAY to avoid exposing
>>> it on architectures which do not have vDSO functions and data
>>> * reduce the scope of check of inlined functions in verifier to only 2,
>>> which are actually inlined.
>>> v2 -> v3:
>>> * change name of the helper to bpf_get_cpu_cycles_counter to explicitly
>>> mention what counter it provides (Andrii)
>>> * move kfunc definition to bpf.h to use it in JIT.
>>> * introduce another kfunc to convert cycles into nanoseconds as more
>>> meaningful time units for generic tracing use case (Andrii)
>>> v1 -> v2:
>>> * Fix incorrect function return value type to u64
>>> * Introduce bpf_jit_inlines_kfunc_call() and use it in
>>> mark_fastcall_pattern_for_call() to avoid clobbering in case of
>>> running programs with no JIT (Eduard)
>>> * Avoid rewriting instruction and check function pointer directly
>>> in JIT (Alexei)
>>> * Change includes to fix compile issues on non x86 architectures
>>> ---
>>> arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++++++++++++
>>> arch/x86/net/bpf_jit_comp32.c | 14 ++++++++++++++
>>> include/linux/bpf.h | 5 +++++
>>> include/linux/filter.h | 1 +
>>> kernel/bpf/core.c | 11 +++++++++++
>>> kernel/bpf/helpers.c | 13 +++++++++++++
>>> kernel/bpf/verifier.c | 30 +++++++++++++++++++++++++++++-
>>> 7 files changed, 101 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
>>> index 06b080b61aa5..4f78ed93ee7f 100644
>>> --- a/arch/x86/net/bpf_jit_comp.c
>>> +++ b/arch/x86/net/bpf_jit_comp.c
>>> @@ -2126,6 +2126,26 @@ st: if (is_imm8(insn->off))
>>> case BPF_JMP | BPF_CALL: {
>>> u8 *ip = image + addrs[i - 1];
>>> + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
>>> + imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>> + /* Save RDX because RDTSC will use EDX:EAX to
>>> return u64 */
>>> + emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
>>> + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>> + EMIT_LFENCE();
>>> + EMIT2(0x0F, 0x31);
>>> +
>>> + /* shl RDX, 32 */
>>> + maybe_emit_1mod(&prog, BPF_REG_3, true);
>>> + EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
>>> + /* or RAX, RDX */
>>> + maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
>>> + EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
>>> + /* restore RDX from R11 */
>>> + emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
>>> +
>>> + break;
>>> + }
>>> +
>>> func = (u8 *) __bpf_call_base + imm32;
>>> if (tail_call_reachable) {
>>> LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
>>> @@ -3652,3 +3672,11 @@ u64 bpf_arch_uaddress_limit(void)
>>> {
>>> return 0;
>>> }
>>> +
>>> +/* x86-64 JIT can inline kfunc */
>>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>>> +{
>>> + if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>>> + return true;
>>> + return false;
>>> +}
>>> diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/
>>> bpf_jit_comp32.c
>>> index de0f9e5f9f73..e6097a371b69 100644
>>> --- a/arch/x86/net/bpf_jit_comp32.c
>>> +++ b/arch/x86/net/bpf_jit_comp32.c
>>> @@ -2094,6 +2094,13 @@ static int do_jit(struct bpf_prog *bpf_prog,
>>> int *addrs, u8 *image,
>>> if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
>>> int err;
>>> + if (imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>> + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>> + EMIT3(0x0F, 0xAE, 0xE8);
>>> + EMIT2(0x0F, 0x31);
>>> + break;
>>> + }
>>> +
>>> err = emit_kfunc_call(bpf_prog,
>>> image + addrs[i],
>>> insn, &prog);
>>> @@ -2621,3 +2628,10 @@ bool bpf_jit_supports_kfunc_call(void)
>>> {
>>> return true;
>>> }
>>> +
>>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>>> +{
>>> + if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>>> + return true;
>>> + return false;
>>> +}
>>
>> [...]
>>
>>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>>> index 395221e53832..5c6c0383ebf4 100644
>>> --- a/kernel/bpf/helpers.c
>>> +++ b/kernel/bpf/helpers.c
>>> @@ -23,6 +23,9 @@
>>> #include <linux/btf_ids.h>
>>> #include <linux/bpf_mem_alloc.h>
>>> #include <linux/kasan.h>
>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>> +#include <vdso/datapage.h>
>>> +#endif
>>> #include "../../lib/kstrtox.h"
>>> @@ -3023,6 +3026,13 @@ __bpf_kfunc int bpf_copy_from_user_str(void
>>> *dst, u32 dst__sz, const void __user
>>> return ret + 1;
>>> }
>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>> +__bpf_kfunc u64 bpf_get_cpu_cycles(void)
>>> +{
>>> + return __arch_get_hw_counter(1, NULL);
>>
>> Some comment to explain what '1' mean in the above?
>
> That's arch-specific value which translates to HW implemented counter on
> all architectures which have vDSO gettimeofday() implementation.
>
> For x86 it translates to VDSO_CLOCKMODE_TSC, while for aarch64/RISC-V
> it's VDSO_CLOCKMODE_ARCHTIMER. Actually, for RISC-V the value of the
> first parameter doesn't matter at all, for aarch64 it should be 0.
> The only arch which is more strict about this parameter is x86, but it
> has it's own special name...
So in the future, if we want to add aarch64 support or another architecture,
the argument could be different, right?
I think we should avoid having arch-specific control in helpers.c.
How about we define a __weak func like bpf_arch_get_hw_counter() so we
have
__bpf_kfunc u64 bpf_get_cpu_cycles(void)
{
return bpf_arch_get_hw_counter();
}
Each arch can implement their own bpf_arch_get_hw_counter().
Do you think this will make more sense? This should not impact jit inlining
of this kfunc.
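A hedged sketch of how that __weak split could look (names come from
the suggestion above, not from the patch):

/* kernel/bpf/helpers.c */
u64 __weak bpf_arch_get_hw_counter(void)
{
	return 0;	/* arch provides no usable counter */
}

__bpf_kfunc u64 bpf_get_cpu_cycles(void)
{
	return bpf_arch_get_hw_counter();
}

/* arch/x86, for example, would then override the weak symbol: */
u64 bpf_arch_get_hw_counter(void)
{
	return __arch_get_hw_counter(1, NULL);
}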
>
>>
>>> +}
>>> +#endif
>>> +
>>> __bpf_kfunc_end_defs();
>>> BTF_KFUNCS_START(generic_btf_ids)
>>> @@ -3115,6 +3125,9 @@ BTF_ID_FLAGS(func, bpf_get_kmem_cache)
>>> BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW |
>>> KF_SLEEPABLE)
>>> BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT |
>>> KF_RET_NULL | KF_SLEEPABLE)
>>> BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY |
>>> KF_SLEEPABLE)
>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>> +BTF_ID_FLAGS(func, bpf_get_cpu_cycles, KF_FASTCALL)
>>> +#endif
>>> BTF_KFUNCS_END(common_btf_ids)
>>> static const struct btf_kfunc_id_set common_kfunc_set = {
>> [...]
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc
2024-11-13 18:42 ` Yonghong Song
@ 2024-11-13 22:28 ` Vadim Fedorenko
2024-11-13 23:02 ` Yonghong Song
0 siblings, 1 reply; 23+ messages in thread
From: Vadim Fedorenko @ 2024-11-13 22:28 UTC (permalink / raw)
To: Yonghong Song, Vadim Fedorenko, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Eduard Zingerman,
Thomas Gleixner, Mykola Lysenko, Jakub Kicinski
Cc: x86, bpf, Martin KaFai Lau
On 13/11/2024 18:42, Yonghong Song wrote:
>
>
>
> On 11/13/24 9:52 AM, Vadim Fedorenko wrote:
>> On 13/11/2024 17:38, Yonghong Song wrote:
>>>
>>>
>>>
>>> On 11/8/24 4:41 PM, Vadim Fedorenko wrote:
>>>> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT converts
>>>> it into rdtsc ordered call. Other architectures will get JIT
>>>> implementation too if supported. The fallback is to
>>>> __arch_get_hw_counter().
>>>>
>>>> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>>>> ---
>>>> v4 -> v5:
>>>> * use if instead of ifdef with IS_ENABLED
>>>> v3 -> v4:
>>>> * change name of the helper to bpf_get_cpu_cycles (Andrii)
>>>> * Hide the helper behind CONFIG_GENERIC_GETTIMEOFDAY to avoid exposing
>>>> it on architectures which do not have vDSO functions and data
>>>> * reduce the scope of check of inlined functions in verifier to only 2,
>>>> which are actually inlined.
>>>> v2 -> v3:
>>>> * change name of the helper to bpf_get_cpu_cycles_counter to explicitly
>>>> mention what counter it provides (Andrii)
>>>> * move kfunc definition to bpf.h to use it in JIT.
>>>> * introduce another kfunc to convert cycles into nanoseconds as more
>>>> meaningful time units for generic tracing use case (Andrii)
>>>> v1 -> v2:
>>>> * Fix incorrect function return value type to u64
>>>> * Introduce bpf_jit_inlines_kfunc_call() and use it in
>>>> mark_fastcall_pattern_for_call() to avoid clobbering in case of
>>>> running programs with no JIT (Eduard)
>>>> * Avoid rewriting instruction and check function pointer directly
>>>> in JIT (Alexei)
>>>> * Change includes to fix compile issues on non x86 architectures
>>>> ---
>>>> arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++++++++++++
>>>> arch/x86/net/bpf_jit_comp32.c | 14 ++++++++++++++
>>>> include/linux/bpf.h | 5 +++++
>>>> include/linux/filter.h | 1 +
>>>> kernel/bpf/core.c | 11 +++++++++++
>>>> kernel/bpf/helpers.c | 13 +++++++++++++
>>>> kernel/bpf/verifier.c | 30 +++++++++++++++++++++++++++++-
>>>> 7 files changed, 101 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
>>>> index 06b080b61aa5..4f78ed93ee7f 100644
>>>> --- a/arch/x86/net/bpf_jit_comp.c
>>>> +++ b/arch/x86/net/bpf_jit_comp.c
>>>> @@ -2126,6 +2126,26 @@ st: if (is_imm8(insn->off))
>>>> case BPF_JMP | BPF_CALL: {
>>>> u8 *ip = image + addrs[i - 1];
>>>> + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
>>>> + imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>>> + /* Save RDX because RDTSC will use EDX:EAX to
>>>> return u64 */
>>>> + emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
>>>> + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>>> + EMIT_LFENCE();
>>>> + EMIT2(0x0F, 0x31);
>>>> +
>>>> + /* shl RDX, 32 */
>>>> + maybe_emit_1mod(&prog, BPF_REG_3, true);
>>>> + EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
>>>> + /* or RAX, RDX */
>>>> + maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
>>>> + EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
>>>> + /* restore RDX from R11 */
>>>> + emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
>>>> +
>>>> + break;
>>>> + }
>>>> +
>>>> func = (u8 *) __bpf_call_base + imm32;
>>>> if (tail_call_reachable) {
>>>> LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
>>>> @@ -3652,3 +3672,11 @@ u64 bpf_arch_uaddress_limit(void)
>>>> {
>>>> return 0;
>>>> }
>>>> +
>>>> +/* x86-64 JIT can inline kfunc */
>>>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>>>> +{
>>>> + if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>>>> + return true;
>>>> + return false;
>>>> +}
>>>> diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/
>>>> bpf_jit_comp32.c
>>>> index de0f9e5f9f73..e6097a371b69 100644
>>>> --- a/arch/x86/net/bpf_jit_comp32.c
>>>> +++ b/arch/x86/net/bpf_jit_comp32.c
>>>> @@ -2094,6 +2094,13 @@ static int do_jit(struct bpf_prog *bpf_prog,
>>>> int *addrs, u8 *image,
>>>> if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
>>>> int err;
>>>> + if (imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>>> + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>>> + EMIT3(0x0F, 0xAE, 0xE8);
>>>> + EMIT2(0x0F, 0x31);
>>>> + break;
>>>> + }
>>>> +
>>>> err = emit_kfunc_call(bpf_prog,
>>>> image + addrs[i],
>>>> insn, &prog);
>>>> @@ -2621,3 +2628,10 @@ bool bpf_jit_supports_kfunc_call(void)
>>>> {
>>>> return true;
>>>> }
>>>> +
>>>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>>>> +{
>>>> + if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>>>> + return true;
>>>> + return false;
>>>> +}
>>>
>>> [...]
>>>
>>>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>>>> index 395221e53832..5c6c0383ebf4 100644
>>>> --- a/kernel/bpf/helpers.c
>>>> +++ b/kernel/bpf/helpers.c
>>>> @@ -23,6 +23,9 @@
>>>> #include <linux/btf_ids.h>
>>>> #include <linux/bpf_mem_alloc.h>
>>>> #include <linux/kasan.h>
>>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>>> +#include <vdso/datapage.h>
>>>> +#endif
>>>> #include "../../lib/kstrtox.h"
>>>> @@ -3023,6 +3026,13 @@ __bpf_kfunc int bpf_copy_from_user_str(void
>>>> *dst, u32 dst__sz, const void __user
>>>> return ret + 1;
>>>> }
>>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>>> +__bpf_kfunc u64 bpf_get_cpu_cycles(void)
>>>> +{
>>>> + return __arch_get_hw_counter(1, NULL);
>>>
>>> Some comment to explain what '1' mean in the above?
>>
>> That's arch-specific value which translates to HW implemented counter on
>> all architectures which have vDSO gettimeofday() implementation.
>>
>> For x86 it translates to VDSO_CLOCKMODE_TSC, while for aarch64/RISC-V
>> it's VDSO_CLOCKMODE_ARCHTIMER. Actually, for RISC-V the value of the
>> first parameter doesn't matter at all, for aarch64 it should be 0.
>> The only arch which is more strict about this parameter is x86, but it
>> has it's own special name...
>
> So in the future, if we want add aarch64 support or other architecture,
> the argument could be different, right?
No, that's the point. This value will be the same for all architectures.
I'll do the implementation for aarch64 once this series is in.
>
> I think we should avoid to have arch specific control in helpers.c.
> How about we define a __weak func like bpf_arch_get_hw_counter() so we
> have
>
> __bpf_kfunc u64 bpf_get_cpu_cycles(void)
> {
> return bpf_arch_get_hw_counter();
> }
>
> Each arch can implement their own bpf_arch_get_hw_counter().
> Do you think this will make more sense? This should not impact jit inlining
> of this kfunc.
>
>>
>>>
>>>> +}
>>>> +#endif
>>>> +
>>>> __bpf_kfunc_end_defs();
>>>> BTF_KFUNCS_START(generic_btf_ids)
>>>> @@ -3115,6 +3125,9 @@ BTF_ID_FLAGS(func, bpf_get_kmem_cache)
>>>> BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW |
>>>> KF_SLEEPABLE)
>>>> BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT |
>>>> KF_RET_NULL | KF_SLEEPABLE)
>>>> BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY |
>>>> KF_SLEEPABLE)
>>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>>> +BTF_ID_FLAGS(func, bpf_get_cpu_cycles, KF_FASTCALL)
>>>> +#endif
>>>> BTF_KFUNCS_END(common_btf_ids)
>>>> static const struct btf_kfunc_id_set common_kfunc_set = {
>>> [...]
>>
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc
2024-11-13 22:28 ` Vadim Fedorenko
@ 2024-11-13 23:02 ` Yonghong Song
2024-11-14 1:05 ` Vadim Fedorenko
0 siblings, 1 reply; 23+ messages in thread
From: Yonghong Song @ 2024-11-13 23:02 UTC (permalink / raw)
To: Vadim Fedorenko, Vadim Fedorenko, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Eduard Zingerman,
Thomas Gleixner, Mykola Lysenko, Jakub Kicinski
Cc: x86, bpf, Martin KaFai Lau
On 11/13/24 2:28 PM, Vadim Fedorenko wrote:
> On 13/11/2024 18:42, Yonghong Song wrote:
>>
>>
>>
>> On 11/13/24 9:52 AM, Vadim Fedorenko wrote:
>>> On 13/11/2024 17:38, Yonghong Song wrote:
>>>>
>>>>
>>>>
>>>> On 11/8/24 4:41 PM, Vadim Fedorenko wrote:
>>>>> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT
>>>>> converts
>>>>> it into rdtsc ordered call. Other architectures will get JIT
>>>>> implementation too if supported. The fallback is to
>>>>> __arch_get_hw_counter().
>>>>>
>>>>> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>>>>> ---
>>>>> v4 -> v5:
>>>>> * use if instead of ifdef with IS_ENABLED
>>>>> v3 -> v4:
>>>>> * change name of the helper to bpf_get_cpu_cycles (Andrii)
>>>>> * Hide the helper behind CONFIG_GENERIC_GETTIMEOFDAY to avoid
>>>>> exposing
>>>>> it on architectures which do not have vDSO functions and data
>>>>> * reduce the scope of check of inlined functions in verifier to
>>>>> only 2,
>>>>> which are actually inlined.
>>>>> v2 -> v3:
>>>>> * change name of the helper to bpf_get_cpu_cycles_counter to
>>>>> explicitly
>>>>> mention what counter it provides (Andrii)
>>>>> * move kfunc definition to bpf.h to use it in JIT.
>>>>> * introduce another kfunc to convert cycles into nanoseconds as more
>>>>> meaningful time units for generic tracing use case (Andrii)
>>>>> v1 -> v2:
>>>>> * Fix incorrect function return value type to u64
>>>>> * Introduce bpf_jit_inlines_kfunc_call() and use it in
>>>>> mark_fastcall_pattern_for_call() to avoid clobbering in case of
>>>>> running programs with no JIT (Eduard)
>>>>> * Avoid rewriting instruction and check function pointer directly
>>>>> in JIT (Alexei)
>>>>> * Change includes to fix compile issues on non x86 architectures
>>>>> ---
>>>>> arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++++++++++++
>>>>> arch/x86/net/bpf_jit_comp32.c | 14 ++++++++++++++
>>>>> include/linux/bpf.h | 5 +++++
>>>>> include/linux/filter.h | 1 +
>>>>> kernel/bpf/core.c | 11 +++++++++++
>>>>> kernel/bpf/helpers.c | 13 +++++++++++++
>>>>> kernel/bpf/verifier.c | 30 +++++++++++++++++++++++++++++-
>>>>> 7 files changed, 101 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/arch/x86/net/bpf_jit_comp.c
>>>>> b/arch/x86/net/bpf_jit_comp.c
>>>>> index 06b080b61aa5..4f78ed93ee7f 100644
>>>>> --- a/arch/x86/net/bpf_jit_comp.c
>>>>> +++ b/arch/x86/net/bpf_jit_comp.c
>>>>> @@ -2126,6 +2126,26 @@ st: if (is_imm8(insn->off))
>>>>> case BPF_JMP | BPF_CALL: {
>>>>> u8 *ip = image + addrs[i - 1];
>>>>> + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
>>>>> + imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>>>> + /* Save RDX because RDTSC will use EDX:EAX to
>>>>> return u64 */
>>>>> + emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
>>>>> + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>>>> + EMIT_LFENCE();
>>>>> + EMIT2(0x0F, 0x31);
>>>>> +
>>>>> + /* shl RDX, 32 */
>>>>> + maybe_emit_1mod(&prog, BPF_REG_3, true);
>>>>> + EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
>>>>> + /* or RAX, RDX */
>>>>> + maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
>>>>> + EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
>>>>> + /* restore RDX from R11 */
>>>>> + emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
>>>>> +
>>>>> + break;
>>>>> + }
>>>>> +
>>>>> func = (u8 *) __bpf_call_base + imm32;
>>>>> if (tail_call_reachable) {
>>>>> LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
>>>>> @@ -3652,3 +3672,11 @@ u64 bpf_arch_uaddress_limit(void)
>>>>> {
>>>>> return 0;
>>>>> }
>>>>> +
>>>>> +/* x86-64 JIT can inline kfunc */
>>>>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>>>>> +{
>>>>> + if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>>>>> + return true;
>>>>> + return false;
>>>>> +}
>>>>> diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/
>>>>> bpf_jit_comp32.c
>>>>> index de0f9e5f9f73..e6097a371b69 100644
>>>>> --- a/arch/x86/net/bpf_jit_comp32.c
>>>>> +++ b/arch/x86/net/bpf_jit_comp32.c
>>>>> @@ -2094,6 +2094,13 @@ static int do_jit(struct bpf_prog
>>>>> *bpf_prog, int *addrs, u8 *image,
>>>>> if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
>>>>> int err;
>>>>> + if (imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>>>> + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>>>> + EMIT3(0x0F, 0xAE, 0xE8);
>>>>> + EMIT2(0x0F, 0x31);
>>>>> + break;
>>>>> + }
>>>>> +
>>>>> err = emit_kfunc_call(bpf_prog,
>>>>> image + addrs[i],
>>>>> insn, &prog);
>>>>> @@ -2621,3 +2628,10 @@ bool bpf_jit_supports_kfunc_call(void)
>>>>> {
>>>>> return true;
>>>>> }
>>>>> +
>>>>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>>>>> +{
>>>>> + if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>>>>> + return true;
>>>>> + return false;
>>>>> +}
>>>>
>>>> [...]
>>>>
>>>>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>>>>> index 395221e53832..5c6c0383ebf4 100644
>>>>> --- a/kernel/bpf/helpers.c
>>>>> +++ b/kernel/bpf/helpers.c
>>>>> @@ -23,6 +23,9 @@
>>>>> #include <linux/btf_ids.h>
>>>>> #include <linux/bpf_mem_alloc.h>
>>>>> #include <linux/kasan.h>
>>>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>>>> +#include <vdso/datapage.h>
>>>>> +#endif
>>>>> #include "../../lib/kstrtox.h"
>>>>> @@ -3023,6 +3026,13 @@ __bpf_kfunc int bpf_copy_from_user_str(void
>>>>> *dst, u32 dst__sz, const void __user
>>>>> return ret + 1;
>>>>> }
>>>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>>>> +__bpf_kfunc u64 bpf_get_cpu_cycles(void)
>>>>> +{
>>>>> + return __arch_get_hw_counter(1, NULL);
>>>>
>>>> Some comment to explain what '1' mean in the above?
>>>
>>> That's arch-specific value which translates to HW implemented
>>> counter on
>>> all architectures which have vDSO gettimeofday() implementation.
>>>
>>> For x86 it translates to VDSO_CLOCKMODE_TSC, while for aarch64/RISC-V
>>> it's VDSO_CLOCKMODE_ARCHTIMER. Actually, for RISC-V the value of the
>>> first parameter doesn't matter at all, for aarch64 it should be 0.
>>> The only arch which is more strict about this parameter is x86, but it
>>> has it's own special name...
>>
>> So in the future, if we want add aarch64 support or other architecture,
>> the argument could be different, right?
>
> No, that's the point. This value will be the same for all architectures.
> I'll do the implementation for aarch64 once this series is in.
I did a little bit of research and the following are the two callsites for
__arch_get_hw_counter:
0 lib/vdso/gettimeofday.c do_hres_timens 96 cycles = __arch_get_hw_counter(vd->clock_mode, vd);
1 lib/vdso/gettimeofday.c do_hres 164 cycles = __arch_get_hw_counter(vd->clock_mode, vd);
Let us pick func do_hres_timens():
vd = vdns - (clk == CLOCK_MONOTONIC_RAW ? CS_RAW : CS_HRES_COARSE);
vd = __arch_get_timens_vdso_data(vd);
if (clk != CLOCK_MONOTONIC_RAW)
vd = &vd[CS_HRES_COARSE];
else
vd = &vd[CS_RAW];
..
cycles = __arch_get_hw_counter(vd->clock_mode, vd);
So 'vd' is supplied by an arch-specific func (__arch_get_timens_vdso_data()),
so theoretically vd->clock_mode in __arch_get_hw_counter(vd->clock_mode, vd)
could be different for different archs. The other case in do_hres() is similar.
But if you are sure that the first argument is the same for all architectures, please
add a comment right above __arch_get_hw_counter() to say something like
"All architectures have the same parameters as below".
>
>>
>> I think we should avoid to have arch specific control in helpers.c.
>> How about we define a __weak func like bpf_arch_get_hw_counter() so we
>> have
>>
>> __bpf_kfunc u64 bpf_get_cpu_cycles(void)
>> {
>> return bpf_arch_get_hw_counter();
>> }
>>
>> Each arch can implement their own bpf_arch_get_hw_counter().
>> Do you think this will make more sense? This should not impact jit
>> inlining
>> of this kfunc.
>>
>>>
>>>>
>>>>> +}
>>>>> +#endif
>>>>> +
>>>>> __bpf_kfunc_end_defs();
>>>>> BTF_KFUNCS_START(generic_btf_ids)
>>>>> @@ -3115,6 +3125,9 @@ BTF_ID_FLAGS(func, bpf_get_kmem_cache)
>>>>> BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW |
>>>>> KF_SLEEPABLE)
>>>>> BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT |
>>>>> KF_RET_NULL | KF_SLEEPABLE)
>>>>> BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY
>>>>> | KF_SLEEPABLE)
>>>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>>>> +BTF_ID_FLAGS(func, bpf_get_cpu_cycles, KF_FASTCALL)
>>>>> +#endif
>>>>> BTF_KFUNCS_END(common_btf_ids)
>>>>> static const struct btf_kfunc_id_set common_kfunc_set = {
>>>> [...]
>>>
>>
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc
2024-11-13 23:02 ` Yonghong Song
@ 2024-11-14 1:05 ` Vadim Fedorenko
0 siblings, 0 replies; 23+ messages in thread
From: Vadim Fedorenko @ 2024-11-14 1:05 UTC (permalink / raw)
To: Yonghong Song, Vadim Fedorenko, Alexei Starovoitov,
Daniel Borkmann, Andrii Nakryiko, Eduard Zingerman,
Thomas Gleixner, Mykola Lysenko, Jakub Kicinski
Cc: x86, bpf, Martin KaFai Lau
On 13/11/2024 23:02, Yonghong Song wrote:
>
>
>
> On 11/13/24 2:28 PM, Vadim Fedorenko wrote:
>> On 13/11/2024 18:42, Yonghong Song wrote:
>>>
>>>
>>>
>>> On 11/13/24 9:52 AM, Vadim Fedorenko wrote:
>>>> On 13/11/2024 17:38, Yonghong Song wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 11/8/24 4:41 PM, Vadim Fedorenko wrote:
>>>>>> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT
>>>>>> converts
>>>>>> it into rdtsc ordered call. Other architectures will get JIT
>>>>>> implementation too if supported. The fallback is to
>>>>>> __arch_get_hw_counter().
>>>>>>
>>>>>> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>>>>>> ---
>>>>>> v4 -> v5:
>>>>>> * use if instead of ifdef with IS_ENABLED
>>>>>> v3 -> v4:
>>>>>> * change name of the helper to bpf_get_cpu_cycles (Andrii)
>>>>>> * Hide the helper behind CONFIG_GENERIC_GETTIMEOFDAY to avoid
>>>>>> exposing
>>>>>> it on architectures which do not have vDSO functions and data
>>>>>> * reduce the scope of check of inlined functions in verifier to
>>>>>> only 2,
>>>>>> which are actually inlined.
>>>>>> v2 -> v3:
>>>>>> * change name of the helper to bpf_get_cpu_cycles_counter to
>>>>>> explicitly
>>>>>> mention what counter it provides (Andrii)
>>>>>> * move kfunc definition to bpf.h to use it in JIT.
>>>>>> * introduce another kfunc to convert cycles into nanoseconds as more
>>>>>> meaningful time units for generic tracing use case (Andrii)
>>>>>> v1 -> v2:
>>>>>> * Fix incorrect function return value type to u64
>>>>>> * Introduce bpf_jit_inlines_kfunc_call() and use it in
>>>>>> mark_fastcall_pattern_for_call() to avoid clobbering in case of
>>>>>> running programs with no JIT (Eduard)
>>>>>> * Avoid rewriting instruction and check function pointer directly
>>>>>> in JIT (Alexei)
>>>>>> * Change includes to fix compile issues on non x86 architectures
>>>>>> ---
>>>>>> arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++++++++++++
>>>>>> arch/x86/net/bpf_jit_comp32.c | 14 ++++++++++++++
>>>>>> include/linux/bpf.h | 5 +++++
>>>>>> include/linux/filter.h | 1 +
>>>>>> kernel/bpf/core.c | 11 +++++++++++
>>>>>> kernel/bpf/helpers.c | 13 +++++++++++++
>>>>>> kernel/bpf/verifier.c | 30 +++++++++++++++++++++++++++++-
>>>>>> 7 files changed, 101 insertions(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/
>>>>>> bpf_jit_comp.c
>>>>>> index 06b080b61aa5..4f78ed93ee7f 100644
>>>>>> --- a/arch/x86/net/bpf_jit_comp.c
>>>>>> +++ b/arch/x86/net/bpf_jit_comp.c
>>>>>> @@ -2126,6 +2126,26 @@ st: if (is_imm8(insn->off))
>>>>>> case BPF_JMP | BPF_CALL: {
>>>>>> u8 *ip = image + addrs[i - 1];
>>>>>> + if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
>>>>>> + imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>>>>> + /* Save RDX because RDTSC will use EDX:EAX to
>>>>>> return u64 */
>>>>>> + emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
>>>>>> + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>>>>> + EMIT_LFENCE();
>>>>>> + EMIT2(0x0F, 0x31);
>>>>>> +
>>>>>> + /* shl RDX, 32 */
>>>>>> + maybe_emit_1mod(&prog, BPF_REG_3, true);
>>>>>> + EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
>>>>>> + /* or RAX, RDX */
>>>>>> + maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
>>>>>> + EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
>>>>>> + /* restore RDX from R11 */
>>>>>> + emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
>>>>>> +
>>>>>> + break;
>>>>>> + }
>>>>>> +
>>>>>> func = (u8 *) __bpf_call_base + imm32;
>>>>>> if (tail_call_reachable) {
>>>>>> LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
>>>>>> @@ -3652,3 +3672,11 @@ u64 bpf_arch_uaddress_limit(void)
>>>>>> {
>>>>>> return 0;
>>>>>> }
>>>>>> +
>>>>>> +/* x86-64 JIT can inline kfunc */
>>>>>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>>>>>> +{
>>>>>> + if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>>>>>> + return true;
>>>>>> + return false;
>>>>>> +}
>>>>>> diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/
>>>>>> bpf_jit_comp32.c
>>>>>> index de0f9e5f9f73..e6097a371b69 100644
>>>>>> --- a/arch/x86/net/bpf_jit_comp32.c
>>>>>> +++ b/arch/x86/net/bpf_jit_comp32.c
>>>>>> @@ -2094,6 +2094,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
>>>>>> if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
>>>>>> int err;
>>>>>> + if (imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>>>>> + if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>>>>> + EMIT3(0x0F, 0xAE, 0xE8);
>>>>>> + EMIT2(0x0F, 0x31);
>>>>>> + break;
>>>>>> + }
>>>>>> +
>>>>>> err = emit_kfunc_call(bpf_prog,
>>>>>> image + addrs[i],
>>>>>> insn, &prog);
>>>>>> @@ -2621,3 +2628,10 @@ bool bpf_jit_supports_kfunc_call(void)
>>>>>> {
>>>>>> return true;
>>>>>> }
>>>>>> +
>>>>>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>>>>>> +{
>>>>>> + if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>>>>>> + return true;
>>>>>> + return false;
>>>>>> +}
>>>>>
>>>>> [...]
>>>>>
>>>>>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>>>>>> index 395221e53832..5c6c0383ebf4 100644
>>>>>> --- a/kernel/bpf/helpers.c
>>>>>> +++ b/kernel/bpf/helpers.c
>>>>>> @@ -23,6 +23,9 @@
>>>>>> #include <linux/btf_ids.h>
>>>>>> #include <linux/bpf_mem_alloc.h>
>>>>>> #include <linux/kasan.h>
>>>>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>>>>> +#include <vdso/datapage.h>
>>>>>> +#endif
>>>>>> #include "../../lib/kstrtox.h"
>>>>>> @@ -3023,6 +3026,13 @@ __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user
>>>>>> return ret + 1;
>>>>>> }
>>>>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>>>>> +__bpf_kfunc u64 bpf_get_cpu_cycles(void)
>>>>>> +{
>>>>>> + return __arch_get_hw_counter(1, NULL);
>>>>>
>>>>> Some comment to explain what '1' means in the above?
>>>>
>>>> That's an arch-specific value which translates to the HW-implemented
>>>> counter on all architectures which have a vDSO gettimeofday()
>>>> implementation.
>>>>
>>>> For x86 it translates to VDSO_CLOCKMODE_TSC, while for aarch64/RISC-V
>>>> it's VDSO_CLOCKMODE_ARCHTIMER. Actually, for RISC-V the value of the
>>>> first parameter doesn't matter at all, and for aarch64 the only
>>>> requirement is that it isn't 0.
>>>> The only arch which is more strict about this parameter is x86, but it
>>>> has its own special name...
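
(For reference, a paraphrased sketch of why the first arch-specific clock
mode ends up being 1 on these architectures; this is modelled on
include/vdso/clocksource.h and the per-arch vdso headers, not a verbatim
copy, and the exact list of modes varies between kernel versions:)

/* VDSO_CLOCKMODE_NONE is 0, so the first arch-provided mode is 1 */
enum vdso_clock_mode {
	VDSO_CLOCKMODE_NONE,
#ifdef CONFIG_GENERIC_GETTIMEOFDAY
	VDSO_ARCH_CLOCKMODES,	/* x86: VDSO_CLOCKMODE_TSC, ...
				 * arm64: VDSO_CLOCKMODE_ARCHTIMER */
#endif
	VDSO_CLOCKMODE_MAX,
	VDSO_CLOCKMODE_TIMENS = INT_MAX
};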
>>>
>>> So in the future, if we want to add aarch64 support or another
>>> architecture, the argument could be different, right?
>>
>> No, that's the point. This value will be the same for all architectures.
>> I'll do the implementation for aarch64 once this series is in.
>
> I did a little bit of research, and the following are the two callsites of
> __arch_get_hw_counter:
>
> 0 lib/vdso/gettimeofday.c do_hres_timens  96  cycles = __arch_get_hw_counter(vd->clock_mode, vd);
> 1 lib/vdso/gettimeofday.c do_hres        164  cycles = __arch_get_hw_counter(vd->clock_mode, vd);
>
> Let us pick func do_hres_timens():
>
> vd = vdns - (clk == CLOCK_MONOTONIC_RAW ? CS_RAW : CS_HRES_COARSE);
> vd = __arch_get_timens_vdso_data(vd);
> if (clk != CLOCK_MONOTONIC_RAW)
> vd = &vd[CS_HRES_COARSE];
> else
> vd = &vd[CS_RAW];
> ..
> cycles = __arch_get_hw_counter(vd->clock_mode, vd);
>
> So 'vd' is supplied by an arch-specific func (__arch_get_timens_vdso_data()),
> so theoretically vd->clock_mode in __arch_get_hw_counter(vd->clock_mode, vd)
> could be different for different archs. The other case, in do_hres(), is
> similar.
Well, ok, even though I'm pretty sure all the implementations will end
up with 1 as the value, let's be on the safe side and implement it the
same way, by reading the vDSO data structure. Now that the cycles2ns kfunc
brings in all the needed pieces, there should be no blockers for this
implementation. Thanks for bringing it up.
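
For illustration only, a minimal sketch of what reading the clock mode from
the vDSO data could look like; this assumes the kernel-side accessor
__arch_get_k_vdso_data() and the CS_RAW index from vdso/datapage.h can be
used here, and it is not the final patch:

#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
__bpf_kfunc u64 bpf_get_cpu_cycles(void)
{
	const struct vdso_data *vd = __arch_get_k_vdso_data();

	/* use the raw clock's vDSO data so the arch sees its own
	 * clock_mode instead of a hardcoded 1
	 */
	vd = &vd[CS_RAW];
	return __arch_get_hw_counter(vd->clock_mode, vd);
}
#endif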
>
> But if you are sure that the first argument is the same for all
> architectures, please add a comment right above __arch_get_hw_counter()
> saying that all architectures use the same parameters as below.
>
>>
>>>
>>> I think we should avoid having arch-specific control in helpers.c.
>>> How about we define a __weak func like bpf_arch_get_hw_counter(), so we
>>> have:
>>>
>>> __bpf_kfunc u64 bpf_get_cpu_cycles(void)
>>> {
>>>         return bpf_arch_get_hw_counter();
>>> }
>>>
>>> Each arch can then implement its own bpf_arch_get_hw_counter().
>>> Do you think this would make more sense? This should not impact JIT
>>> inlining of this kfunc.
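
(A rough sketch of that __weak-function alternative, for illustration;
bpf_arch_get_hw_counter() is the hypothetical name from the suggestion
above, not an existing kernel API, and the fallback shown is only an
assumption:)

/* default (weak) implementation; an architecture can override it with
 * its preferred cycle counter
 */
u64 __weak bpf_arch_get_hw_counter(void)
{
#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
	return __arch_get_hw_counter(1, NULL);
#else
	return 0;
#endif
}

__bpf_kfunc u64 bpf_get_cpu_cycles(void)
{
	return bpf_arch_get_hw_counter();
}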
>>>
>>>>
>>>>>
>>>>>> +}
>>>>>> +#endif
>>>>>> +
>>>>>> __bpf_kfunc_end_defs();
>>>>>> BTF_KFUNCS_START(generic_btf_ids)
>>>>>> @@ -3115,6 +3125,9 @@ BTF_ID_FLAGS(func, bpf_get_kmem_cache)
>>>>>> BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
>>>>>> BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
>>>>>> BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
>>>>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>>>>> +BTF_ID_FLAGS(func, bpf_get_cpu_cycles, KF_FASTCALL)
>>>>>> +#endif
>>>>>> BTF_KFUNCS_END(common_btf_ids)
>>>>>> static const struct btf_kfunc_id_set common_kfunc_set = {
>>>>> [...]
>>>>
>>>
>>
>
Thread overview: 23+ messages
2024-11-09 0:41 [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc Vadim Fedorenko
2024-11-09 0:41 ` [PATCH bpf-next v5 2/4] bpf: add bpf_cpu_cycles_to_ns helper Vadim Fedorenko
2024-11-12 23:03 ` Eduard Zingerman
2024-11-09 0:41 ` [PATCH bpf-next v5 3/4] selftests/bpf: add selftest to check rdtsc jit Vadim Fedorenko
2024-11-12 23:17 ` Eduard Zingerman
2024-11-09 0:41 ` [PATCH bpf-next v5 4/4] selftests/bpf: add usage example for cpu cycles kfuncs Vadim Fedorenko
2024-11-12 5:50 ` [PATCH bpf-next v5 1/4] bpf: add bpf_get_cpu_cycles kfunc Andrii Nakryiko
2024-11-12 21:43 ` Vadim Fedorenko
2024-11-12 23:59 ` Andrii Nakryiko
2024-11-12 21:21 ` Eduard Zingerman
2024-11-12 21:39 ` Vadim Fedorenko
2024-11-12 21:53 ` Eduard Zingerman
2024-11-12 22:19 ` Eduard Zingerman
2024-11-12 22:27 ` Alexei Starovoitov
2024-11-12 23:08 ` Vadim Fedorenko
2024-11-13 0:09 ` Alexei Starovoitov
2024-11-13 0:20 ` Vadim Fedorenko
2024-11-13 17:38 ` Yonghong Song
2024-11-13 17:52 ` Vadim Fedorenko
2024-11-13 18:42 ` Yonghong Song
2024-11-13 22:28 ` Vadim Fedorenko
2024-11-13 23:02 ` Yonghong Song
2024-11-14 1:05 ` Vadim Fedorenko