[RFC bpf-next v2 0/4] bpf: Fast-Path approach for BPF program

bpf.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [RFC bpf-next v2 0/4] bpf: Fast-Path approach for BPF program
@ 2025-06-14  6:40 Siddharth Chintamaneni
  2025-06-14  6:40 ` [RFC bpf-next v2 1/4] bpf: Introduce new structs and struct fields Siddharth Chintamaneni
                   ` (4 more replies)
  0 siblings, 5 replies; 19+ messages in thread
From: Siddharth Chintamaneni @ 2025-06-14  6:40 UTC (permalink / raw)
  To: bpf
  Cc: ast, daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
	john.fastabend, kpsingh, sdf, haoluo, jolsa, djwillia, miloc,
	ericts, rahult, doniaghazy, quanzhif, jinghao7, sidchintamaneni,
	memxor, egor, sairoop10

This is RFC v2 of 
	https://lore.kernel.org/bpf/20250420105524.2115690-1-rjsu26@gmail.com/

Change since v1:
- Patch generation has been moved after verification and before JIT.
	- Now patch generation handles both helpers and kfuncs.
	- Sanity check on original prog and patch prog after JIT.
- Runtime termination handler is now global termination mechanism using
  text_poke.
- Termination is triggered by watchdog timer.

TODO:
- Termination support for tailcall programs.
- Fix issue caused by warning in runtime termination handler due to 
  https://elixir.bootlin.com/linux/v6.15.1/source/kernel/smp.c#L815
- Free memory for patch progs related fields.
- Include selftests covering more cases such as BPF program nesting.

 include/linux/bpf.h                           |  19 +-
 include/linux/bpf_verifier.h                  |   4 +
 include/linux/filter.h                        |  43 ++-
 kernel/bpf/core.c                             |  64 +++++
 kernel/bpf/syscall.c                          | 225 ++++++++++++++++
 kernel/bpf/trampoline.c                       |   5 +
 kernel/bpf/verifier.c                         | 245 +++++++++++++++++-
 .../bpf/prog_tests/bpf_termination.c          |  39 +++
 .../selftests/bpf/progs/bpf_termination.c     |  38 +++
 9 files changed, 674 insertions(+), 8 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_termination.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_termination.c

-- 
2.43.0


^ permalink raw reply	[flat|nested] 19+ messages in thread

* [RFC bpf-next v2 1/4] bpf: Introduce new structs and struct fields
  2025-06-14  6:40 [RFC bpf-next v2 0/4] bpf: Fast-Path approach for BPF program Siddharth Chintamaneni
@ 2025-06-14  6:40 ` Siddharth Chintamaneni
  2025-06-14  6:40 ` [RFC bpf-next v2 2/4] bpf: Generating a stubbed version of BPF program for termination Siddharth Chintamaneni
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 19+ messages in thread
From: Siddharth Chintamaneni @ 2025-06-14  6:40 UTC (permalink / raw)
  To: bpf
  Cc: ast, daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
	john.fastabend, kpsingh, sdf, haoluo, jolsa, djwillia, miloc,
	ericts, rahult, doniaghazy, quanzhif, jinghao7, sidchintamaneni,
	memxor, egor, sairoop10, Raj Sahu

Introduced the definition of struct bpf_term_aux_states
required to support fast-path termination of BPF programs.
Added the memory allocation and free logic for newly added
term_states feild in struct bpf_prog.

Signed-off-by: Raj Sahu <rjsu26@gmail.com>
Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
---
 include/linux/bpf.h | 18 +++++++++++++++++-
 kernel/bpf/core.c   | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5dd556e89cce..1c534b3e10d8 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -31,6 +31,7 @@
 #include <linux/memcontrol.h>
 #include <linux/cfi.h>
 #include <asm/rqspinlock.h>
+#include <linux/hrtimer.h>
 
 struct bpf_verifier_env;
 struct bpf_verifier_log;
@@ -39,6 +40,7 @@ struct bpf_prog;
 struct bpf_prog_aux;
 struct bpf_map;
 struct bpf_arena;
+struct bpf_term_aux_states;
 struct sock;
 struct seq_file;
 struct btf;
@@ -1538,6 +1540,17 @@ struct btf_mod_pair {
 
 struct bpf_kfunc_desc_tab;
 
+struct cpu_aux {
+	u8 cpu_flag;
+	spinlock_t lock;
+};
+
+struct bpf_term_aux_states {
+	struct bpf_prog *patch_prog;
+	struct cpu_aux *per_cpu_state;
+	struct hrtimer hrtimer;
+};
+
 struct bpf_prog_aux {
 	atomic64_t refcnt;
 	u32 used_map_cnt;
@@ -1664,7 +1677,8 @@ struct bpf_prog {
 				call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
 				call_get_func_ip:1, /* Do we call get_func_ip() */
 				tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */
-				sleepable:1;	/* BPF program is sleepable */
+				sleepable:1,	/* BPF program is sleepable */
+				is_termination_prog:1; /* Is patch prog? */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
@@ -1676,6 +1690,7 @@ struct bpf_prog {
 					    const struct bpf_insn *insn);
 	struct bpf_prog_aux	*aux;		/* Auxiliary fields */
 	struct sock_fprog_kern	*orig_prog;	/* Original BPF program */
+	struct bpf_term_aux_states *term_states; /* Program termination aux fields */
 	/* Instructions for interpreter */
 	union {
 		DECLARE_FLEX_ARRAY(struct sock_filter, insns);
@@ -2385,6 +2400,7 @@ static inline struct btf *__btf_get_by_fd(struct fd f)
 	return fd_file(f)->private_data;
 }
 
+enum hrtimer_restart bpf_termination_wd_callback(struct hrtimer *hr);
 void bpf_map_inc(struct bpf_map *map);
 void bpf_map_inc_with_uref(struct bpf_map *map);
 struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index e536a34a32c8..756ed575741e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -99,6 +99,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
 	gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
 	struct bpf_prog_aux *aux;
 	struct bpf_prog *fp;
+	struct bpf_term_aux_states *term_states = NULL;
 
 	size = round_up(size, __PAGE_SIZE);
 	fp = __vmalloc(size, gfp_flags);
@@ -117,11 +118,28 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
 		return NULL;
 	}
 
+	term_states = kzalloc(sizeof(*term_states), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
+	if (!term_states)
+		goto free_alloc_percpu;
+
+	term_states->per_cpu_state = kzalloc(sizeof(struct cpu_aux) * NR_CPUS, 
+						bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
+	if (!term_states->per_cpu_state)
+		goto free_bpf_term_states;
+	
+	for (int i = 0; i < NR_CPUS; i++) {
+		term_states->per_cpu_state[i].cpu_flag = 0;
+		spin_lock_init(&term_states->per_cpu_state[i].lock);
+	}
+
+
 	fp->pages = size / PAGE_SIZE;
 	fp->aux = aux;
 	fp->aux->prog = fp;
 	fp->jit_requested = ebpf_jit_enabled();
 	fp->blinding_requested = bpf_jit_blinding_enabled(fp);
+	fp->term_states = term_states;
+	fp->term_states->patch_prog = NULL;
 #ifdef CONFIG_CGROUP_BPF
 	aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
 #endif
@@ -135,6 +153,15 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
 	mutex_init(&fp->aux->dst_mutex);
 
 	return fp;
+
+free_bpf_term_states:
+	kfree(term_states);
+free_alloc_percpu:
+	free_percpu(fp->active);
+	kfree(aux);
+	vfree(fp);
+
+	return NULL;
 }
 
 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
@@ -268,6 +295,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 		fp_old->aux = NULL;
 		fp_old->stats = NULL;
 		fp_old->active = NULL;
+		fp_old->term_states = NULL;
 		__bpf_prog_free(fp_old);
 	}
 
@@ -282,6 +310,15 @@ void __bpf_prog_free(struct bpf_prog *fp)
 		kfree(fp->aux->poke_tab);
 		kfree(fp->aux);
 	}
+	if (fp->term_states) {
+		if (fp->term_states->patch_prog) {
+			kfree(fp->term_states->patch_prog->aux->poke_tab);
+			kfree(fp->term_states->patch_prog->aux);
+			vfree(fp->term_states->patch_prog);
+		}
+		kfree(fp->term_states->per_cpu_state);
+		kfree(fp->term_states);
+	}
 	free_percpu(fp->stats);
 	free_percpu(fp->active);
 	vfree(fp);
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [RFC bpf-next v2 2/4] bpf: Generating a stubbed version of BPF program for termination
  2025-06-14  6:40 [RFC bpf-next v2 0/4] bpf: Fast-Path approach for BPF program Siddharth Chintamaneni
  2025-06-14  6:40 ` [RFC bpf-next v2 1/4] bpf: Introduce new structs and struct fields Siddharth Chintamaneni
@ 2025-06-14  6:40 ` Siddharth Chintamaneni
  2025-06-30 12:47   ` Kumar Kartikeya Dwivedi
  2025-06-14  6:40 ` [RFC bpf-next v2 3/4] bpf: Runtime part of fast-path termination approach Siddharth Chintamaneni
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 19+ messages in thread
From: Siddharth Chintamaneni @ 2025-06-14  6:40 UTC (permalink / raw)
  To: bpf
  Cc: ast, daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
	john.fastabend, kpsingh, sdf, haoluo, jolsa, djwillia, miloc,
	ericts, rahult, doniaghazy, quanzhif, jinghao7, sidchintamaneni,
	memxor, egor, sairoop10, Raj Sahu

Introduces patch generation which generates patch for each BPF
program, iterate through all helper calls, kfuncs and if the return
type may return NULL then stub them with a dummy function.

Additional to these helpers/kfuncs the current implementation also
supports termination of bpf_loop (both inline and non-inline case)

Signed-off-by: Raj Sahu <rjsu26@gmail.com>
Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
---
 include/linux/bpf_verifier.h |   4 +
 include/linux/filter.h       |   2 +
 kernel/bpf/core.c            |  12 ++
 kernel/bpf/syscall.c         |  19 +++
 kernel/bpf/verifier.c        | 245 ++++++++++++++++++++++++++++++++++-
 5 files changed, 276 insertions(+), 6 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7e459e839f8b..3b4f371684a9 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -844,6 +844,10 @@ struct bpf_verifier_env {
 	/* array of pointers to bpf_scc_info indexed by SCC id */
 	struct bpf_scc_info **scc_info;
 	u32 scc_cnt;
+	struct {
+		u32 call_sites_cnt;
+		int *call_idx;
+	} bpf_term_patch_call_sites;
 };
 
 static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index eca229752cbe..cd9f1c2727ec 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1119,6 +1119,8 @@ int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len);
 bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 
+int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx);
+void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 #define __bpf_call_base_args \
 	((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 756ed575741e..2a02e9cafd5a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1571,6 +1571,18 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
 }
 #endif /* CONFIG_BPF_JIT */
 
+noinline void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(bpf_termination_null_func);
+
+noinline int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx)
+{
+	return 1;
+}
+EXPORT_SYMBOL_GPL(bpf_loop_term_callback);
+
 /* Base function for offset calculation. Needs to go into .text section,
  * therefore keeping it non-static as well; will also be used by JITs
  * anyway later on, so do not let the compiler omit it. This also needs
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 56500381c28a..cd8e7c47e3fe 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2757,6 +2757,16 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
 /* last field in 'union bpf_attr' used by this command */
 #define BPF_PROG_LOAD_LAST_FIELD fd_array_cnt
 
+static int sanity_check_jit_len(struct bpf_prog *prog)
+{
+	struct bpf_prog *patch_prog = prog->term_states->patch_prog;
+
+	if (prog->jited_len != patch_prog->jited_len)
+		return -EFAULT;
+
+	return 0;
+}
+
 static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
 	enum bpf_prog_type type = attr->prog_type;
@@ -2921,6 +2931,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 
 	prog->orig_prog = NULL;
 	prog->jited = 0;
+	prog->is_termination_prog = 0;
 
 	atomic64_set(&prog->aux->refcnt, 1);
 
@@ -2977,6 +2988,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	if (err < 0)
 		goto free_used_maps;
 
+	prog->term_states->patch_prog = bpf_prog_select_runtime(prog->term_states->patch_prog, &err);
+	if (err < 0)
+		goto free_used_maps;
+
+	err = sanity_check_jit_len(prog);
+	if (err < 0)
+		goto free_used_maps;
+
 	err = bpf_prog_alloc_id(prog);
 	if (err)
 		goto free_used_maps;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c378074516cf..6920a623dde4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21397,6 +21397,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 			goto out_free;
 		func[i]->is_func = 1;
 		func[i]->sleepable = prog->sleepable;
+		if (prog->is_termination_prog)
+			func[i]->is_termination_prog = 1;
 		func[i]->aux->func_idx = i;
 		/* Below members will be freed only at prog->aux */
 		func[i]->aux->btf = prog->aux->btf;
@@ -21514,8 +21516,10 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 			goto out_free;
 	}
 
-	for (i = 1; i < env->subprog_cnt; i++)
-		bpf_prog_kallsyms_add(func[i]);
+	if (!prog->is_termination_prog) {
+		for (i = 1; i < env->subprog_cnt; i++)
+			bpf_prog_kallsyms_add(func[i]);
+	}
 
 	/* Last step: make now unused interpreter insns from main
 	 * prog consistent for later dump requests, so they can
@@ -21693,15 +21697,21 @@ static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
 }
 
 static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
-			    struct bpf_insn *insn_buf, int insn_idx, int *cnt)
+			    struct bpf_insn *insn_buf, int insn_idx, int *cnt, int *kfunc_btf_id)
 {
 	const struct bpf_kfunc_desc *desc;
+	struct bpf_kfunc_call_arg_meta meta;
+	int err;
 
 	if (!insn->imm) {
 		verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
 		return -EINVAL;
 	}
 
+	err = fetch_kfunc_meta(env, insn, &meta, NULL);
+	if (err)
+		return err;
+
 	*cnt = 0;
 
 	/* insn->imm has the btf func_id. Replace it with an offset relative to
@@ -21715,8 +21725,11 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		return -EFAULT;
 	}
 
-	if (!bpf_jit_supports_far_kfunc_call())
+	if (!bpf_jit_supports_far_kfunc_call()) {
+		if (meta.kfunc_flags & KF_RET_NULL)
+			*kfunc_btf_id = insn->imm;
 		insn->imm = BPF_CALL_IMM(desc->addr);
+	}
 	if (insn->off)
 		return 0;
 	if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
@@ -21846,7 +21859,13 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 	struct bpf_subprog_info *subprogs = env->subprog_info;
 	u16 stack_depth = subprogs[cur_subprog].stack_depth;
 	u16 stack_depth_extra = 0;
+	int call_sites_cnt = 0;
+	int *call_idx;
+	env->bpf_term_patch_call_sites.call_idx = NULL;
 
+	call_idx = vmalloc(sizeof(*call_idx) * prog->len);
+	if (!call_idx)
+		return -ENOMEM;
 	if (env->seen_exception && !env->exception_callback_subprog) {
 		struct bpf_insn patch[] = {
 			env->prog->insnsi[insn_cnt - 1],
@@ -22182,11 +22201,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 		if (insn->src_reg == BPF_PSEUDO_CALL)
 			goto next_insn;
 		if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
-			ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
+			int kfunc_btf_id = 0;
+			ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt, &kfunc_btf_id);
 			if (ret)
 				return ret;
 			if (cnt == 0)
-				goto next_insn;
+				goto store_call_indices;
 
 			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
 			if (!new_prog)
@@ -22195,6 +22215,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			delta	 += cnt - 1;
 			env->prog = prog = new_prog;
 			insn	  = new_prog->insnsi + i + delta;
+
+store_call_indices:
+			if (kfunc_btf_id != 0) {
+				call_idx[call_sites_cnt] = i + delta;
+				call_sites_cnt++;
+			}
 			goto next_insn;
 		}
 
@@ -22673,6 +22699,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 				func_id_name(insn->imm), insn->imm);
 			return -EFAULT;
 		}
+		if (fn->ret_type & PTR_MAYBE_NULL) {
+			call_idx[call_sites_cnt] = i + delta;
+			call_sites_cnt++;
+		}
 		insn->imm = fn->func - __bpf_call_base;
 next_insn:
 		if (subprogs[cur_subprog + 1].start == i + delta + 1) {
@@ -22693,6 +22723,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 		insn++;
 	}
 
+	env->bpf_term_patch_call_sites.call_sites_cnt = call_sites_cnt;
+	env->bpf_term_patch_call_sites.call_idx = call_idx;
 	env->prog->aux->stack_depth = subprogs[0].stack_depth;
 	for (i = 0; i < env->subprog_cnt; i++) {
 		int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1;
@@ -22828,6 +22860,8 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
 	call_insn_offset = position + 12;
 	callback_offset = callback_start - call_insn_offset - 1;
 	new_prog->insnsi[call_insn_offset].imm = callback_offset;
+	/* marking offset field to identify and patch the patch_prog*/
+	new_prog->insnsi[call_insn_offset].off = 0x1;
 
 	return new_prog;
 }
@@ -24394,6 +24428,194 @@ static int compute_scc(struct bpf_verifier_env *env)
 	return err;
 }
 
+static int clone_patch_prog(struct bpf_verifier_env *env)
+{
+	gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | GFP_USER);
+	unsigned int size, prog_name_len;
+	struct bpf_prog *patch_prog, *prog = env->prog;
+	struct bpf_prog_aux *aux;
+
+	size = prog->pages * PAGE_SIZE;
+	patch_prog = __vmalloc(size, gfp_flags);
+	if (!patch_prog)
+		return -ENOMEM;
+
+	aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | GFP_USER));
+	if (!aux) {
+		vfree(patch_prog);
+		return -ENOMEM;
+	}
+
+	/*
+	 * Copying prog fields
+	 */
+	patch_prog->pages = prog->pages;
+	patch_prog->jited = 0;
+	patch_prog->jit_requested = prog->jit_requested;
+	patch_prog->gpl_compatible = prog->gpl_compatible;
+	patch_prog->blinding_requested = prog->blinding_requested;
+	patch_prog->is_termination_prog = 1;
+	patch_prog->len = prog->len;
+	patch_prog->type = prog->type;
+	patch_prog->aux = aux;
+
+	/*
+	 * Copying prog aux fields
+	 */
+	patch_prog->aux->used_map_cnt = prog->aux->used_map_cnt;
+	patch_prog->aux->used_btf_cnt = prog->aux->used_btf_cnt;
+	patch_prog->aux->max_ctx_offset = prog->aux->max_ctx_offset;
+	patch_prog->aux->stack_depth = prog->aux->stack_depth;
+	patch_prog->aux->func_cnt = prog->aux->func_cnt; /* will be populated by jit_subprogs */
+	patch_prog->aux->real_func_cnt = prog->aux->real_func_cnt; /* will be populated by jit_subprogs */
+	patch_prog->aux->func_idx = prog->aux->func_idx; /* will be populated by jit_subprogs */
+	patch_prog->aux->attach_btf_id = prog->aux->attach_btf_id;
+	patch_prog->aux->attach_st_ops_member_off = prog->aux->attach_st_ops_member_off;
+	patch_prog->aux->ctx_arg_info_size = prog->aux->ctx_arg_info_size;
+	patch_prog->aux->max_rdonly_access = prog->aux->max_rdonly_access;
+	patch_prog->aux->max_rdwr_access = prog->aux->max_rdwr_access;
+	patch_prog->aux->verifier_zext = prog->aux->verifier_zext;
+	patch_prog->aux->dev_bound = prog->aux->dev_bound;
+	patch_prog->aux->offload_requested = prog->aux->offload_requested;
+	patch_prog->aux->attach_btf_trace = prog->aux->attach_btf_trace;
+	patch_prog->aux->attach_tracing_prog = prog->aux->attach_tracing_prog;
+	patch_prog->aux->func_proto_unreliable = prog->aux->func_proto_unreliable;
+	patch_prog->aux->tail_call_reachable = prog->aux->tail_call_reachable;
+	patch_prog->aux->xdp_has_frags = prog->aux->xdp_has_frags;
+	patch_prog->aux->exception_cb = prog->aux->exception_cb;
+	patch_prog->aux->exception_boundary = prog->aux->exception_boundary;
+	patch_prog->aux->is_extended = prog->aux->is_extended;
+	patch_prog->aux->jits_use_priv_stack = prog->aux->jits_use_priv_stack;
+	patch_prog->aux->priv_stack_requested = prog->aux->priv_stack_requested;
+	patch_prog->aux->changes_pkt_data = prog->aux->changes_pkt_data;
+	patch_prog->aux->might_sleep = prog->aux->might_sleep;
+	patch_prog->aux->prog_array_member_cnt = prog->aux->prog_array_member_cnt;
+	patch_prog->aux->size_poke_tab = prog->aux->size_poke_tab;
+	patch_prog->aux->cgroup_atype = prog->aux->cgroup_atype;
+	patch_prog->aux->linfo = prog->aux->linfo; 
+	patch_prog->aux->func_info_cnt = prog->aux->func_info_cnt;
+	patch_prog->aux->nr_linfo = prog->aux->nr_linfo;
+	patch_prog->aux->linfo_idx = prog->aux->linfo_idx;
+	patch_prog->aux->num_exentries = prog->aux->num_exentries;
+
+	patch_prog->aux->poke_tab = kmalloc_array(patch_prog->aux->size_poke_tab, 
+					sizeof(struct bpf_jit_poke_descriptor), GFP_KERNEL);
+	if (!patch_prog->aux->poke_tab) {
+		kfree(patch_prog->aux);
+		vfree(patch_prog);
+		return -ENOMEM;
+	}
+	
+	for (int i = 0; i < patch_prog->aux->size_poke_tab; i++) {
+		memcpy(&patch_prog->aux->poke_tab[i], &prog->aux->poke_tab[i], 
+				sizeof(struct bpf_jit_poke_descriptor));
+	}
+
+	memcpy(patch_prog->insnsi, prog->insnsi, bpf_prog_insn_size(prog));
+
+	char *patch_prefix = "patch_";
+	prog_name_len = strlen(prog->aux->name);
+	strncpy(patch_prog->aux->name, patch_prefix, strlen(patch_prefix));
+
+	if (prog_name_len + strlen(patch_prefix) + 1 > BPF_OBJ_NAME_LEN) {
+		prog_name_len = BPF_OBJ_NAME_LEN - strlen(patch_prefix) - 1;
+	}
+	strncat(patch_prog->aux->name, prog->aux->name, prog_name_len);
+	
+	prog->term_states->patch_prog = patch_prog;
+
+	return 0;
+}
+
+static int patch_call_sites(struct bpf_verifier_env *env)
+{
+	int i, subprog;
+	struct bpf_insn *insn;
+	struct bpf_prog *prog = env->prog;
+	struct bpf_prog *patch_prog = prog->term_states->patch_prog;
+	int call_sites_cnt = env->bpf_term_patch_call_sites.call_sites_cnt;
+	int *call_idx = env->bpf_term_patch_call_sites.call_idx;
+	
+	for (int i = 0; i < call_sites_cnt; i++) {
+		patch_prog->insnsi[call_idx[i]].imm = 
+			BPF_CALL_IMM(bpf_termination_null_func);
+	}
+
+	if (!env->subprog_cnt)
+		return 0;
+
+	for (i = 0, insn = patch_prog->insnsi; i < patch_prog->len; i++, insn++) {
+		if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn))
+			continue;
+
+		subprog = find_subprog(env, i + insn->imm + 1);
+		if (subprog < 0)
+			return -EFAULT;
+
+		if (insn->off == 0x1) {
+			patch_prog->insnsi[i].imm = BPF_CALL_IMM(bpf_loop_term_callback);
+			prog->insnsi[i].off = 0x0; /* Removing the marker */
+			/*
+			 * Modify callback call -> function call
+			 */
+			patch_prog->insnsi[i].off = 0x0;
+			patch_prog->insnsi[i].src_reg = 0x0;
+		}
+		
+	}
+
+	return 0;
+}
+
+
+
+static int prepare_patch_prog(struct bpf_verifier_env *env)
+{
+	int err = 0;
+
+	err = clone_patch_prog(env);
+	if (err)
+		return err;
+
+	err = patch_call_sites(env);
+	if (err)
+		return err;
+
+	return err;
+}
+
+
+static int fixup_patch_prog(struct bpf_verifier_env *env, union bpf_attr *attr)
+{
+	
+	struct bpf_verifier_env *patch_env;
+	int err = 0;
+
+	patch_env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
+	if (!patch_env) 
+		return -ENOMEM;
+
+	memcpy(patch_env, env, sizeof(*env));
+	patch_env->prog = env->prog->term_states->patch_prog;
+
+	/* do 32-bit optimization after insn patching has done so those patched
+	 * insns could be handled correctly.
+	 */
+	if (!bpf_prog_is_offloaded(patch_env->prog->aux)) {
+		err = opt_subreg_zext_lo32_rnd_hi32(patch_env, attr);
+		patch_env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !err
+								     : false;
+	}
+
+	if (err == 0)
+		err = fixup_call_args(patch_env);
+
+	kfree(patch_env);
+
+	return err;
+
+}
+
 int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
 {
 	u64 start_time = ktime_get_ns();
@@ -24568,6 +24790,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	if (ret == 0)
 		ret = do_misc_fixups(env);
 
+	/*
+	 * Preparing patch_prog for termination
+	 * - Cloning and patching call sites.
+	 */
+	if (ret == 0)
+		ret = prepare_patch_prog(env);
+
 	/* do 32-bit optimization after insn patching has done so those patched
 	 * insns could be handled correctly.
 	 */
@@ -24580,6 +24809,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	if (ret == 0)
 		ret = fixup_call_args(env);
 
+	if (ret == 0)
+		ret = fixup_patch_prog(env, attr);
+
 	env->verification_time = ktime_get_ns() - start_time;
 	print_verification_stats(env);
 	env->prog->aux->verified_insns = env->insn_processed;
@@ -24660,6 +24892,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 		mutex_unlock(&bpf_verifier_lock);
 	vfree(env->insn_aux_data);
 err_free_env:
+	vfree(env->bpf_term_patch_call_sites.call_idx);
 	kvfree(env->cfg.insn_postorder);
 	kvfree(env->scc_info);
 	kvfree(env);
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [RFC bpf-next v2 3/4] bpf: Runtime part of fast-path termination approach
  2025-06-14  6:40 [RFC bpf-next v2 0/4] bpf: Fast-Path approach for BPF program Siddharth Chintamaneni
  2025-06-14  6:40 ` [RFC bpf-next v2 1/4] bpf: Introduce new structs and struct fields Siddharth Chintamaneni
  2025-06-14  6:40 ` [RFC bpf-next v2 2/4] bpf: Generating a stubbed version of BPF program for termination Siddharth Chintamaneni
@ 2025-06-14  6:40 ` Siddharth Chintamaneni
  2025-06-30 12:15   ` Kumar Kartikeya Dwivedi
  2025-06-14  6:40 ` [RFC bpf-next v2 4/4] selftests/bpf: Adds selftests to check termination of long running nested bpf loops Siddharth Chintamaneni
  2025-06-30 13:03 ` [RFC bpf-next v2 0/4] bpf: Fast-Path approach for BPF program Kumar Kartikeya Dwivedi
  4 siblings, 1 reply; 19+ messages in thread
From: Siddharth Chintamaneni @ 2025-06-14  6:40 UTC (permalink / raw)
  To: bpf
  Cc: ast, daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
	john.fastabend, kpsingh, sdf, haoluo, jolsa, djwillia, miloc,
	ericts, rahult, doniaghazy, quanzhif, jinghao7, sidchintamaneni,
	memxor, egor, sairoop10, Raj Sahu

Introduces watchdog based runtime mechanism to terminate
a BPF program. When a BPF program is interrupted by
an watchdog, its registers are are passed onto the bpf_die.

Inside bpf_die we perform the text_poke and stack walk
to stub helpers/kfunc replace bpf_loop helper if called
inside bpf program.

Current implementation doesn't handle the termination of
tailcall programs.

There is a known issue by calling text_poke inside interrupt
context - https://elixir.bootlin.com/linux/v6.15.1/source/kernel/smp.c#L815.

Please let us know if you have any suggestions around this?

Signed-off-by: Raj Sahu <rjsu26@gmail.com>
Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
---
 include/linux/bpf.h     |   1 +
 include/linux/filter.h  |  41 +++++++-
 kernel/bpf/core.c       |  15 +++
 kernel/bpf/syscall.c    | 206 ++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/trampoline.c |   5 +
 5 files changed, 267 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1c534b3e10d8..5dd0f06bbf02 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1547,6 +1547,7 @@ struct cpu_aux {
 
 struct bpf_term_aux_states {
 	struct bpf_prog *patch_prog;
+	struct bpf_prog *prog;
 	struct cpu_aux *per_cpu_state;
 	struct hrtimer hrtimer;
 };
diff --git a/include/linux/filter.h b/include/linux/filter.h
index cd9f1c2727ec..921d2318bcf7 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -689,11 +689,40 @@ extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
 				     const struct bpf_reg_state *reg,
 				     int off, int size);
 
+void bpf_die(void *data);
+
 typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
 					  const struct bpf_insn *insnsi,
 					  unsigned int (*bpf_func)(const void *,
 								   const struct bpf_insn *));
 
+static void update_term_per_cpu_flag(const struct bpf_prog *prog, u8 cpu_flag)
+{
+	unsigned long flags;
+	u32 cpu_id = raw_smp_processor_id();
+	spin_lock_irqsave(&prog->term_states->per_cpu_state[cpu_id].lock, 
+				flags);
+	prog->term_states->per_cpu_state[cpu_id].cpu_flag = cpu_flag;
+	spin_unlock_irqrestore(&prog->term_states->per_cpu_state[cpu_id].lock,
+				flags);
+}
+
+static void bpf_terminate_timer_init(const struct bpf_prog *prog)
+{
+	ktime_t timeout = ktime_set(1, 0); // 1s, 0ns
+
+	/* Initialize timer on Monotonic clock, relative mode */
+	hrtimer_setup(&prog->term_states->hrtimer, bpf_termination_wd_callback, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+
+	/* Start watchdog */
+	hrtimer_start(&prog->term_states->hrtimer, timeout, HRTIMER_MODE_REL);
+}
+
+static void bpf_terminate_timer_cancel(const struct bpf_prog *prog)
+{
+	hrtimer_cancel(&prog->term_states->hrtimer);  
+}
+
 static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 					  const void *ctx,
 					  bpf_dispatcher_fn dfunc)
@@ -706,7 +735,11 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 		u64 duration, start = sched_clock();
 		unsigned long flags;
 
+		update_term_per_cpu_flag(prog, 1);
+		bpf_terminate_timer_init(prog);
 		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
+		bpf_terminate_timer_cancel(prog);
+		update_term_per_cpu_flag(prog, 0);
 
 		duration = sched_clock() - start;
 		stats = this_cpu_ptr(prog->stats);
@@ -715,8 +748,11 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 		u64_stats_add(&stats->nsecs, duration);
 		u64_stats_update_end_irqrestore(&stats->syncp, flags);
 	} else {
+		update_term_per_cpu_flag(prog, 1);
+		bpf_terminate_timer_init(prog);
 		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
-	}
+		bpf_terminate_timer_cancel(prog);
+		update_term_per_cpu_flag(prog, 0);}
 	return ret;
 }
 
@@ -1119,6 +1155,9 @@ int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len);
 bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 
+#ifdef CONFIG_X86_64
+int bpf_loop_termination(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags);
+#endif
 int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx);
 void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2a02e9cafd5a..735518735779 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1583,6 +1583,21 @@ noinline int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx)
 }
 EXPORT_SYMBOL_GPL(bpf_loop_term_callback);
 
+#ifdef CONFIG_X86_64
+noinline int bpf_loop_termination(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags)
+{
+	asm volatile(
+		"pop %rbx\n\t"
+		"pop %rbp\n\t"
+		"pop %r12\n\t"
+		"pop %r13\n\t"
+	);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bpf_loop_termination);
+STACK_FRAME_NON_STANDARD(bpf_loop_termination);
+#endif
+
 /* Base function for offset calculation. Needs to go into .text section,
  * therefore keeping it non-static as well; will also be used by JITs
  * anyway later on, so do not let the compiler omit it. This also needs
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cd8e7c47e3fe..065767ae1bd1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -37,6 +37,10 @@
 #include <linux/trace_events.h>
 #include <linux/tracepoint.h>
 #include <linux/overflow.h>
+#include <asm/unwind.h>
+#include <asm/insn.h>
+#include <asm/text-patching.h>
+#include <asm/irq_regs.h>
 
 #include <net/netfilter/nf_bpf_link.h>
 #include <net/netkit.h>
@@ -2767,6 +2771,207 @@ static int sanity_check_jit_len(struct bpf_prog *prog)
 	return 0;
 }
 
+static bool per_cpu_flag_is_true(struct bpf_term_aux_states *term_states, int cpu_id)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&term_states->per_cpu_state[cpu_id].lock, 
+				flags);
+	if (term_states->per_cpu_state[cpu_id].cpu_flag == 1) {
+		spin_unlock_irqrestore(&term_states->per_cpu_state[cpu_id].lock,
+					flags);
+		return true;
+	}
+	spin_unlock_irqrestore(&term_states->per_cpu_state[cpu_id].lock,
+				flags);
+	return false;
+}
+
+static int is_bpf_address(struct bpf_prog *prog, unsigned long addr) 
+{
+
+        unsigned long bpf_func_addr = (unsigned long)prog->bpf_func;
+        if ((addr > bpf_func_addr) &&
+                        (addr < bpf_func_addr + prog->jited_len)){
+                return 1;
+        }
+
+        for (int subprog = 1; subprog < prog->aux->func_cnt; subprog++) {
+                struct bpf_prog *bpf_subprog = prog->aux->func[subprog];
+                unsigned long bpf_subprog_func_addr =
+                                        (unsigned long)bpf_subprog->bpf_func;
+                if ((addr > bpf_subprog_func_addr) && (addr < bpf_subprog_func_addr +
+                                                        bpf_subprog->jited_len)) {
+                        return 1;
+                }
+        }
+
+        return 0;
+}
+
+/* 
+ * For a call instruction in a BPF program, return the stubbed insn buff.
+ * Returns new instruction buff if stubbing required,
+ *	   NULL if no change needed.
+ */
+__always_inline char* find_termination_realloc(struct insn orig_insn, unsigned char *orig_addr, 
+					       struct insn patch_insn, unsigned char *patch_addr) {
+
+	unsigned long new_target;
+	unsigned long original_call_target = (unsigned long)orig_addr + 5 + orig_insn.immediate.value;
+
+	unsigned long patch_call_target = (unsigned long)patch_addr + 5 + patch_insn.immediate.value;
+
+	/* As per patch prog, no stubbing needed. */
+	if (patch_call_target == original_call_target)
+		return NULL;
+
+	/* bpf_termination_null_func is the generic stub function unless its either of
+	* the bpf_loop helper or the associated callback
+	*/
+	new_target = (unsigned long)bpf_termination_null_func;
+	if (patch_call_target == (unsigned long)bpf_loop_term_callback)
+		new_target = (unsigned long)bpf_loop_term_callback;
+	
+
+	unsigned long new_rel = (unsigned long)(new_target - (unsigned long)(orig_addr + 5));
+
+	char *new_insn = kmalloc(5, GFP_KERNEL);
+	new_insn[0] = 0xE8;
+	new_insn[1] = (new_rel >> 0) & 0xff;
+	new_insn[2] = (new_rel >> 8) & 0xff;
+	new_insn[3] = (new_rel >> 16) & 0xff;
+	new_insn[4] = (new_rel >> 24) & 0xff;
+
+ 	return new_insn;
+}
+
+/* 
+ * Given a bpf program and a corresponding termination patch prog
+ * (generated during verification), this program will patch all
+ * call instructions in prog and decide whether to stub them
+ * based on whether the termination_prog has stubbed or not.
+ */
+static void __maybe_unused in_place_patch_bpf_prog(struct bpf_prog *prog, struct bpf_prog *patch_prog){
+
+       uint32_t size = 0;
+  
+       while (size < prog->jited_len) {
+	       unsigned char *addr = (unsigned char*)prog->bpf_func;
+	       unsigned char *addr_patch = (unsigned char*)patch_prog->bpf_func;
+
+	       struct insn insn;
+	       struct insn insn_patch;
+
+	       addr += size;
+	       /* Decode original instruction */
+               if (WARN_ON_ONCE(insn_decode_kernel(&insn, addr))) {
+	       		return;
+               }
+
+               /* Check for call instruction */
+               if (insn.opcode.bytes[0] != CALL_INSN_OPCODE) {
+                       goto next_insn;  
+               }
+  
+	       addr_patch += size;
+	       /* Decode patch_prog instruction */
+               if (WARN_ON_ONCE(insn_decode_kernel(&insn_patch, addr_patch))) {
+	       		return ;
+               }
+
+	       // Stub the call instruction if needed
+	       char *buf;
+	       if ((buf = find_termination_realloc(insn, addr, insn_patch, addr_patch)) != NULL) {
+		       smp_text_poke_batch_add(addr, buf, insn.length, NULL);
+		       kfree(buf);
+	       }
+               
+       next_insn:
+               size += insn.length;
+       }       
+}
+
+
+void bpf_die(void *data)
+{
+	struct bpf_prog *prog, *patch_prog;
+	int cpu_id = raw_smp_processor_id();
+
+	prog = (struct bpf_prog *)data;
+	patch_prog = prog->term_states->patch_prog;
+
+	if (!per_cpu_flag_is_true(prog->term_states, cpu_id))
+		return;
+
+	unsigned long jmp_offset = prog->jited_len - (4 /*First endbr is 4 bytes*/
+						+ 5 /*5 bytes of noop*/ 
+						+ 5 /*5 bytes of jmp return_thunk*/);
+	char new_insn[5];
+	new_insn[0] = 0xE9;
+	new_insn[1] = (jmp_offset >> 0) & 0xff;
+	new_insn[2] = (jmp_offset >> 8) & 0xff;
+	new_insn[3] = (jmp_offset >> 16) & 0xff;
+	new_insn[4] = (jmp_offset >> 24) & 0xff;
+	smp_text_poke_batch_add(prog->bpf_func + 4, new_insn, 5, NULL);
+
+	/* poke all progs and subprogs */
+	if (prog->aux->func_cnt) {
+		for(int i=0; i<prog->aux->func_cnt; i++){
+			in_place_patch_bpf_prog(prog->aux->func[i], patch_prog->aux->func[i]);
+		}
+	} else {
+		in_place_patch_bpf_prog(prog, patch_prog);
+	}
+	/* flush all text poke calls */
+	smp_text_poke_batch_finish();
+
+
+ #ifdef CONFIG_X86_64
+	struct unwind_state state;
+	unsigned long addr, bpf_loop_addr, bpf_loop_term_addr;
+	struct pt_regs *regs = get_irq_regs();
+	char str[KSYM_SYMBOL_LEN];
+	bpf_loop_addr = (unsigned long)bpf_loop_proto.func;
+	bpf_loop_term_addr = (unsigned long)bpf_loop_termination;
+	unwind_start(&state, current, regs, NULL);
+
+	addr = unwind_get_return_address(&state);
+
+	unsigned long stack_addr = regs->sp;
+	while (addr) {
+		if (is_bpf_address(prog, addr)) {
+			break;
+		} else {
+			const char *name = kallsyms_lookup(addr, NULL, NULL, NULL, str);
+			if (name) {
+				unsigned long lookup_addr = kallsyms_lookup_name(name);
+				if (lookup_addr && lookup_addr == bpf_loop_addr) {
+					while (*(unsigned long *)stack_addr != addr) {
+						stack_addr += 1;
+					}
+					*(unsigned long *)stack_addr = bpf_loop_term_addr;
+				}
+			}
+		}
+		unwind_next_frame(&state);
+		addr = unwind_get_return_address(&state);
+	}
+#endif
+
+	return;
+}
+
+enum hrtimer_restart bpf_termination_wd_callback(struct hrtimer *hr)
+{
+
+	struct bpf_term_aux_states *term_states = container_of(hr, struct bpf_term_aux_states, hrtimer);
+	struct bpf_prog *prog = term_states->prog;
+	bpf_die(prog);
+	return HRTIMER_NORESTART;
+
+}
+EXPORT_SYMBOL_GPL(bpf_termination_wd_callback);
+
 static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 {
 	enum bpf_prog_type type = attr->prog_type;
@@ -2995,6 +3200,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	err = sanity_check_jit_len(prog);
 	if (err < 0)
 		goto free_used_maps;
+	prog->term_states->prog = prog;
 
 	err = bpf_prog_alloc_id(prog);
 	if (err)
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index c4b1a98ff726..16f685c861a3 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -908,6 +908,9 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram
 			prog->aux->recursion_detected(prog);
 		return 0;
 	}
+
+	update_term_per_cpu_flag(prog, 1);
+	bpf_terminate_timer_init(prog);
 	return bpf_prog_start_time();
 }
 
@@ -941,6 +944,8 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
 	bpf_reset_run_ctx(run_ctx->saved_run_ctx);
 
 	update_prog_stats(prog, start);
+	bpf_terminate_timer_cancel(prog);
+	update_term_per_cpu_flag(prog, 0);
 	this_cpu_dec(*(prog->active));
 	migrate_enable();
 	rcu_read_unlock();
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [RFC bpf-next v2 4/4] selftests/bpf: Adds selftests to check termination of long running nested bpf loops
  2025-06-14  6:40 [RFC bpf-next v2 0/4] bpf: Fast-Path approach for BPF program Siddharth Chintamaneni
                   ` (2 preceding siblings ...)
  2025-06-14  6:40 ` [RFC bpf-next v2 3/4] bpf: Runtime part of fast-path termination approach Siddharth Chintamaneni
@ 2025-06-14  6:40 ` Siddharth Chintamaneni
  2025-06-30 13:03 ` [RFC bpf-next v2 0/4] bpf: Fast-Path approach for BPF program Kumar Kartikeya Dwivedi
  4 siblings, 0 replies; 19+ messages in thread
From: Siddharth Chintamaneni @ 2025-06-14  6:40 UTC (permalink / raw)
  To: bpf
  Cc: ast, daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
	john.fastabend, kpsingh, sdf, haoluo, jolsa, djwillia, miloc,
	ericts, rahult, doniaghazy, quanzhif, jinghao7, sidchintamaneni,
	memxor, egor, sairoop10, Raj Sahu

Adds tests checks for loops termination which are nested.
Additionally it ensure the termination of loops in both inlining
and non-inlining case.

Signed-off-by: Raj Sahu <rjsu26@gmail.com>
Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
---
 tools/testing/selftests/bpf/prog_tests/bpf_termination.c | 39 +++++++++++++++++++
 tools/testing/selftests/bpf/progs/bpf_termination.c	  | 38 ++++++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_termination.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_termination.c

diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_termination.c b/tools/testing/selftests/bpf/prog_tests/bpf_termination.c
new file mode 100644
index 000000000000..d060073db8f9
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_termination.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include <sys/socket.h>
+
+#include "bpf_termination.skel.h"
+
+void test_loop_termination(void)
+{
+	struct bpf_termination *skel;
+	int err;
+	
+	skel = bpf_termination__open();
+	if (!ASSERT_OK_PTR(skel, "bpf_termination__open"))
+	        return;
+	
+	err = bpf_termination__load(skel);
+	if (!ASSERT_OK(err, "bpf_termination__load"))
+	        goto out;
+	
+	skel->bss->pid = getpid();
+	err = bpf_termination__attach(skel);
+	if (!ASSERT_OK(err, "bpf_termination__attach"))
+	        goto out;
+	
+	/* Triggers long running BPF program */
+	socket(AF_UNSPEC, SOCK_DGRAM, 0);
+
+	/* If the program is not terminated, it doesn't reach this point */
+	ASSERT_TRUE(true, "Program is terminated");
+out:
+       bpf_termination__destroy(skel);
+}
+
+void test_bpf_termination(void)
+{
+	if (test__start_subtest("bpf_termination"))
+		test_loop_termination();
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_termination.c b/tools/testing/selftests/bpf/progs/bpf_termination.c
new file mode 100644
index 000000000000..bd8f499f2507
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_termination.c
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+int pid;
+
+#define LOOPS_CNT 1 << 10
+
+static int callback_fn4(void *ctx) {
+	return 0;
+}
+
+static int callback_fn3(void *ctx) {
+	bpf_loop(LOOPS_CNT, callback_fn4, NULL, 0);
+	return 0;
+}
+
+
+static int callback_fn2(void *ctx) {
+	bpf_loop(LOOPS_CNT, callback_fn3, NULL, 0);
+	return 0;
+}
+
+static int callback_fn(void *ctx) {
+	bpf_loop(LOOPS_CNT, callback_fn2, NULL, 0);
+	return 0;
+}
+
+SEC("tp/syscalls/sys_enter_socket")
+int bpf_loop_lr(void *ctx) {
+	if ((bpf_get_current_pid_tgid() >> 32) != pid)
+		return 0;
+	bpf_loop(LOOPS_CNT, callback_fn, NULL, 0);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [RFC bpf-next v2 3/4] bpf: Runtime part of fast-path termination approach
  2025-06-14  6:40 ` [RFC bpf-next v2 3/4] bpf: Runtime part of fast-path termination approach Siddharth Chintamaneni
@ 2025-06-30 12:15   ` Kumar Kartikeya Dwivedi
  2025-07-04 17:29     ` Raj Sahu
  0 siblings, 1 reply; 19+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2025-06-30 12:15 UTC (permalink / raw)
  To: Siddharth Chintamaneni
  Cc: bpf, ast, daniel, andrii, martin.lau, eddyz87, song,
	yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
	djwillia, miloc, ericts, rahult, doniaghazy, quanzhif, jinghao7,
	egor, sairoop10, Raj Sahu

On Sat, 14 Jun 2025 at 08:41, Siddharth Chintamaneni
<sidchintamaneni@gmail.com> wrote:
>
> Introduces watchdog based runtime mechanism to terminate
> a BPF program. When a BPF program is interrupted by
> an watchdog, its registers are are passed onto the bpf_die.
>
> Inside bpf_die we perform the text_poke and stack walk
> to stub helpers/kfunc replace bpf_loop helper if called
> inside bpf program.
>
> Current implementation doesn't handle the termination of
> tailcall programs.
>
> There is a known issue by calling text_poke inside interrupt
> context - https://elixir.bootlin.com/linux/v6.15.1/source/kernel/smp.c#L815.

I don't have a good idea so far, maybe by deferring work to wq context?
Each CPU would need its own context and schedule work there.
The problem is that it may not be invoked immediately.

Regardless, I think there's more things to fix before we get here. See below.

>
> Please let us know if you have any suggestions around this?
>
> Signed-off-by: Raj Sahu <rjsu26@gmail.com>
> Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
> ---
>  include/linux/bpf.h     |   1 +
>  include/linux/filter.h  |  41 +++++++-
>  kernel/bpf/core.c       |  15 +++
>  kernel/bpf/syscall.c    | 206 ++++++++++++++++++++++++++++++++++++++++
>  kernel/bpf/trampoline.c |   5 +
>  5 files changed, 267 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 1c534b3e10d8..5dd0f06bbf02 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1547,6 +1547,7 @@ struct cpu_aux {
>
>  struct bpf_term_aux_states {
>         struct bpf_prog *patch_prog;
> +       struct bpf_prog *prog;
>         struct cpu_aux *per_cpu_state;
>         struct hrtimer hrtimer;
>  };
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index cd9f1c2727ec..921d2318bcf7 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -689,11 +689,40 @@ extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
>                                      const struct bpf_reg_state *reg,
>                                      int off, int size);
>
> +void bpf_die(void *data);
> +
>  typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
>                                           const struct bpf_insn *insnsi,
>                                           unsigned int (*bpf_func)(const void *,
>                                                                    const struct bpf_insn *));
>
> +static void update_term_per_cpu_flag(const struct bpf_prog *prog, u8 cpu_flag)
> +{
> +       unsigned long flags;
> +       u32 cpu_id = raw_smp_processor_id();
> +       spin_lock_irqsave(&prog->term_states->per_cpu_state[cpu_id].lock,
> +                               flags);
> +       prog->term_states->per_cpu_state[cpu_id].cpu_flag = cpu_flag;
> +       spin_unlock_irqrestore(&prog->term_states->per_cpu_state[cpu_id].lock,
> +                               flags);
> +}
> +
> +static void bpf_terminate_timer_init(const struct bpf_prog *prog)
> +{
> +       ktime_t timeout = ktime_set(1, 0); // 1s, 0ns
> +
> +       /* Initialize timer on Monotonic clock, relative mode */
> +       hrtimer_setup(&prog->term_states->hrtimer, bpf_termination_wd_callback, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

Hmm, doesn't this need to be a per-CPU hrtimer? Otherwise all
concurrent invocations will race to set up and start it?
Doesn't even look thread safe, unless I'm missing something.

> +
> +       /* Start watchdog */
> +       hrtimer_start(&prog->term_states->hrtimer, timeout, HRTIMER_MODE_REL);
> +}
> +
> +static void bpf_terminate_timer_cancel(const struct bpf_prog *prog)
> +{
> +       hrtimer_cancel(&prog->term_states->hrtimer);
> +}
> +
>  static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
>                                           const void *ctx,
>                                           bpf_dispatcher_fn dfunc)
> @@ -706,7 +735,11 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
>                 u64 duration, start = sched_clock();
>                 unsigned long flags;
>
> +               update_term_per_cpu_flag(prog, 1);
> +               bpf_terminate_timer_init(prog);
>                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> +               bpf_terminate_timer_cancel(prog);
> +               update_term_per_cpu_flag(prog, 0);
>
>                 duration = sched_clock() - start;
>                 stats = this_cpu_ptr(prog->stats);
> @@ -715,8 +748,11 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
>                 u64_stats_add(&stats->nsecs, duration);
>                 u64_stats_update_end_irqrestore(&stats->syncp, flags);
>         } else {
> +               update_term_per_cpu_flag(prog, 1);
> +               bpf_terminate_timer_init(prog);
>                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> -       }
> +               bpf_terminate_timer_cancel(prog);
> +               update_term_per_cpu_flag(prog, 0);}
>         return ret;
>  }

Hmm, did you profile how much overhead this adds? It's not completely
free, right?
I guess the per_cpu flag's lock is uncontended, so there wouldn't be
too much overhead there (though it's still an extra atomic op on the
fast path).
hrtimer_setup() won't be that expensive either, but I think
hrtimer_start() can be.
Also, what about programs invoked from BPF trampoline? We would need
such "watchdog" protection for potentially every program, right?

I'm more concerned about the implications of using an hrtimer around
every program invocation though.
Imagine that the program gets invoked in task context, the same
program then runs in interrupt context (let's say it's a tracing
program).
Even the simple hrtimer_cancel() when returning from interrupt context
can potentially deadlock the kernel if the task context program hit
its limit and was inside the timer callback.
Let alone the fact that we can have recursion on the same CPU as above
or by repeatedly invoking the same program, which reprograms the timer
again.

I think we should piggy back on softlockup / hardlockup checks (that's
what I did long ago), but for simplicity I would just drop these time
based enforcement checks from the set for now.
They're incomplete, and potentially buggy. Instead you can invoke
bpf_die() when a program hits the loop's max count limit or something
similar, in order to test this.
We also need to account for sleepable programs, so a 1 second
hardcoded limit is probably not appropriate.
Enforcement is orthogonal to how a program is cleaned up, though as
important, but it can be revisited once we sort out the first part.

>
> @@ -1119,6 +1155,9 @@ int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len);
>  bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
>  void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
>
> +#ifdef CONFIG_X86_64
> +int bpf_loop_termination(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags);
> +#endif
>  int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx);
>  void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
>  u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 2a02e9cafd5a..735518735779 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -1583,6 +1583,21 @@ noinline int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx)
>  }
>  EXPORT_SYMBOL_GPL(bpf_loop_term_callback);
>
> +#ifdef CONFIG_X86_64
> +noinline int bpf_loop_termination(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags)
> +{
> +       asm volatile(
> +               "pop %rbx\n\t"
> +               "pop %rbp\n\t"
> +               "pop %r12\n\t"
> +               "pop %r13\n\t"
> +       );
> +       return 0;
> +}
> +EXPORT_SYMBOL_GPL(bpf_loop_termination);
> +STACK_FRAME_NON_STANDARD(bpf_loop_termination);

You can move this into an arch-specific helper, see
bpf_timed_may_goto.S in arch/x86/net.
For non-x86, the whole logic should not kick in.

Also this function needs more comments. Why is restoring these 4
registers correct?
It is not clear from the code, please point out where they are being saved.

If this is about restoring callee saved registers, then it looks broken to me.
Only those scratched are saved by the JIT (see callee_regs_used in
bpf_jit_comp.c), so it would be plain wrong.
r12 is unused except for arenas. rbx, r13, r14, r15 are used, at max.

It would make more sense to move the logic into the JIT, as I suggested above.
Even then, you either need to spill all callee regs, or figure out a
way to conditionally restore.

> +#endif
> +
>  /* Base function for offset calculation. Needs to go into .text section,
>   * therefore keeping it non-static as well; will also be used by JITs
>   * anyway later on, so do not let the compiler omit it. This also needs
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index cd8e7c47e3fe..065767ae1bd1 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -37,6 +37,10 @@
>  #include <linux/trace_events.h>
>  #include <linux/tracepoint.h>
>  #include <linux/overflow.h>
> +#include <asm/unwind.h>
> +#include <asm/insn.h>
> +#include <asm/text-patching.h>
> +#include <asm/irq_regs.h>
>
>  #include <net/netfilter/nf_bpf_link.h>
>  #include <net/netkit.h>
> @@ -2767,6 +2771,207 @@ static int sanity_check_jit_len(struct bpf_prog *prog)
>         return 0;
>  }
>
> +static bool per_cpu_flag_is_true(struct bpf_term_aux_states *term_states, int cpu_id)
> +{
> +       unsigned long flags;
> +       spin_lock_irqsave(&term_states->per_cpu_state[cpu_id].lock,
> +                               flags);
> +       if (term_states->per_cpu_state[cpu_id].cpu_flag == 1) {
> +               spin_unlock_irqrestore(&term_states->per_cpu_state[cpu_id].lock,
> +                                       flags);
> +               return true;
> +       }
> +       spin_unlock_irqrestore(&term_states->per_cpu_state[cpu_id].lock,
> +                               flags);
> +       return false;
> +}
> +
> +static int is_bpf_address(struct bpf_prog *prog, unsigned long addr)

Use prog == bpf_prog_ksym_find(addr) here.
Also, rename this function to something like is_bpf_prog_text_address
(scoped version of is_bpf_text_address).

> +{
> +
> +        unsigned long bpf_func_addr = (unsigned long)prog->bpf_func;
> +        if ((addr > bpf_func_addr) &&
> +                        (addr < bpf_func_addr + prog->jited_len)){
> +                return 1;
> +        }
> +
> +        for (int subprog = 1; subprog < prog->aux->func_cnt; subprog++) {
> +                struct bpf_prog *bpf_subprog = prog->aux->func[subprog];
> +                unsigned long bpf_subprog_func_addr =
> +                                        (unsigned long)bpf_subprog->bpf_func;
> +                if ((addr > bpf_subprog_func_addr) && (addr < bpf_subprog_func_addr +
> +                                                        bpf_subprog->jited_len)) {
> +                        return 1;
> +                }
> +        }
> +
> +        return 0;
> +}
> +
> +/*
> + * For a call instruction in a BPF program, return the stubbed insn buff.
> + * Returns new instruction buff if stubbing required,
> + *        NULL if no change needed.
> + */
> +__always_inline char* find_termination_realloc(struct insn orig_insn, unsigned char *orig_addr,
> +                                              struct insn patch_insn, unsigned char *patch_addr) {
> +
> +       unsigned long new_target;
> +       unsigned long original_call_target = (unsigned long)orig_addr + 5 + orig_insn.immediate.value;
> +
> +       unsigned long patch_call_target = (unsigned long)patch_addr + 5 + patch_insn.immediate.value;
> +
> +       /* As per patch prog, no stubbing needed. */
> +       if (patch_call_target == original_call_target)
> +               return NULL;
> +
> +       /* bpf_termination_null_func is the generic stub function unless its either of
> +       * the bpf_loop helper or the associated callback
> +       */
> +       new_target = (unsigned long)bpf_termination_null_func;
> +       if (patch_call_target == (unsigned long)bpf_loop_term_callback)
> +               new_target = (unsigned long)bpf_loop_term_callback;
> +
> +
> +       unsigned long new_rel = (unsigned long)(new_target - (unsigned long)(orig_addr + 5));
> +
> +       char *new_insn = kmalloc(5, GFP_KERNEL);

This can fail, so you'd have to return NULL even in cases where you
need to patch the target...
I'd suggest modifying the contract of the function to not depend on
returning NULL, can be some out parameter.

> +       new_insn[0] = 0xE8;
> +       new_insn[1] = (new_rel >> 0) & 0xff;
> +       new_insn[2] = (new_rel >> 8) & 0xff;
> +       new_insn[3] = (new_rel >> 16) & 0xff;
> +       new_insn[4] = (new_rel >> 24) & 0xff;
> +
> +       return new_insn;
> +}
> +
> +/*
> + * Given a bpf program and a corresponding termination patch prog
> + * (generated during verification), this program will patch all
> + * call instructions in prog and decide whether to stub them
> + * based on whether the termination_prog has stubbed or not.
> + */
> +static void __maybe_unused in_place_patch_bpf_prog(struct bpf_prog *prog, struct bpf_prog *patch_prog){
> +
> +       uint32_t size = 0;
> +
> +       while (size < prog->jited_len) {
> +              unsigned char *addr = (unsigned char*)prog->bpf_func;
> +              unsigned char *addr_patch = (unsigned char*)patch_prog->bpf_func;
> +
> +              struct insn insn;
> +              struct insn insn_patch;
> +
> +              addr += size;
> +              /* Decode original instruction */
> +               if (WARN_ON_ONCE(insn_decode_kernel(&insn, addr))) {
> +                       return;
> +               }
> +
> +               /* Check for call instruction */
> +               if (insn.opcode.bytes[0] != CALL_INSN_OPCODE) {
> +                       goto next_insn;
> +               }
> +
> +              addr_patch += size;
> +              /* Decode patch_prog instruction */
> +               if (WARN_ON_ONCE(insn_decode_kernel(&insn_patch, addr_patch))) {
> +                       return ;
> +               }
> +
> +              // Stub the call instruction if needed
> +              char *buf;
> +              if ((buf = find_termination_realloc(insn, addr, insn_patch, addr_patch)) != NULL) {
> +                      smp_text_poke_batch_add(addr, buf, insn.length, NULL);
> +                      kfree(buf);

I think we should find a way to make this work without allocations.
What if it fails? Are we going to let the program keep executing
forever.
Doesn't seem like a good option.

> +              }
> +
> +       next_insn:
> +               size += insn.length;
> +       }
> +}
> +
> +
> +void bpf_die(void *data)
> +{
> +       struct bpf_prog *prog, *patch_prog;
> +       int cpu_id = raw_smp_processor_id();

Assuming you make the hrtimer per-CPU.
So the hrtimer is not pinned to the CPU (HRTIMER_MODE_PINNED), hence
it can be fired on any other CPU when the timer expires.
This means you lose the associativity between the CPU where the
program invocation did not complete in 1 second.
Instead I think it might be better to have it in the per-cpu state,
and stash the CPU number there and use container_of to obtain it.

> +
> +       prog = (struct bpf_prog *)data;
> +       patch_prog = prog->term_states->patch_prog;
> +
> +       if (!per_cpu_flag_is_true(prog->term_states, cpu_id))
> +               return;

Unless hrtimer_cancel() provides a write barrier, this can return
early if the write to 0 in the per-CPU flag gets reordered.
It would be better to be explicit there.

> +
> +       unsigned long jmp_offset = prog->jited_len - (4 /*First endbr is 4 bytes*/
> +                                               + 5 /*5 bytes of noop*/
> +                                               + 5 /*5 bytes of jmp return_thunk*/);

This is all x86 specific, so at the very least it should be guarded.
The proper way would be to add a weak stub in core.c and provide an
implementation in the arch-specific directory.

> +       char new_insn[5];
> +       new_insn[0] = 0xE9;
> +       new_insn[1] = (jmp_offset >> 0) & 0xff;
> +       new_insn[2] = (jmp_offset >> 8) & 0xff;
> +       new_insn[3] = (jmp_offset >> 16) & 0xff;
> +       new_insn[4] = (jmp_offset >> 24) & 0xff;
> +       smp_text_poke_batch_add(prog->bpf_func + 4, new_insn, 5, NULL);
> +
> +       /* poke all progs and subprogs */
> +       if (prog->aux->func_cnt) {
> +               for(int i=0; i<prog->aux->func_cnt; i++){
> +                       in_place_patch_bpf_prog(prog->aux->func[i], patch_prog->aux->func[i]);
> +               }
> +       } else {
> +               in_place_patch_bpf_prog(prog, patch_prog);
> +       }
> +       /* flush all text poke calls */
> +       smp_text_poke_batch_finish();
> +
> +
> + #ifdef CONFIG_X86_64
> +       struct unwind_state state;
> +       unsigned long addr, bpf_loop_addr, bpf_loop_term_addr;
> +       struct pt_regs *regs = get_irq_regs();
> +       char str[KSYM_SYMBOL_LEN];
> +       bpf_loop_addr = (unsigned long)bpf_loop_proto.func;
> +       bpf_loop_term_addr = (unsigned long)bpf_loop_termination;
> +       unwind_start(&state, current, regs, NULL);
> +
> +       addr = unwind_get_return_address(&state);
> +
> +       unsigned long stack_addr = regs->sp;
> +       while (addr) {
> +               if (is_bpf_address(prog, addr)) {
> +                       break;
> +               } else {
> +                       const char *name = kallsyms_lookup(addr, NULL, NULL, NULL, str);
> +                       if (name) {
> +                               unsigned long lookup_addr = kallsyms_lookup_name(name);
> +                               if (lookup_addr && lookup_addr == bpf_loop_addr) {
> +                                       while (*(unsigned long *)stack_addr != addr) {
> +                                               stack_addr += 1;
> +                                       }
> +                                       *(unsigned long *)stack_addr = bpf_loop_term_addr;
> +                               }
> +                       }
> +               }
> +               unwind_next_frame(&state);
> +               addr = unwind_get_return_address(&state);
> +       }

Instead of doing all this munging by hand, a better way is to figure
out the frame base pointer using arch_bpf_stack_walk, then figure out
the return address using that.

> +#endif
> +
> +       return;
> +}
> +
> +enum hrtimer_restart bpf_termination_wd_callback(struct hrtimer *hr)
> +{
> +
> +       struct bpf_term_aux_states *term_states = container_of(hr, struct bpf_term_aux_states, hrtimer);
> +       struct bpf_prog *prog = term_states->prog;
> +       bpf_die(prog);
> +       return HRTIMER_NORESTART;
> +
> +}
> +EXPORT_SYMBOL_GPL(bpf_termination_wd_callback);
> +
>  static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
>  {
>         enum bpf_prog_type type = attr->prog_type;
> @@ -2995,6 +3200,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
>         err = sanity_check_jit_len(prog);
>         if (err < 0)
>                 goto free_used_maps;
> +       prog->term_states->prog = prog;
>
>         err = bpf_prog_alloc_id(prog);
>         if (err)
> diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
> index c4b1a98ff726..16f685c861a3 100644
> --- a/kernel/bpf/trampoline.c
> +++ b/kernel/bpf/trampoline.c
> @@ -908,6 +908,9 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram
>                         prog->aux->recursion_detected(prog);
>                 return 0;
>         }
> +
> +       update_term_per_cpu_flag(prog, 1);
> +       bpf_terminate_timer_init(prog);
>         return bpf_prog_start_time();
>  }
>
> @@ -941,6 +944,8 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
>         bpf_reset_run_ctx(run_ctx->saved_run_ctx);
>
>         update_prog_stats(prog, start);
> +       bpf_terminate_timer_cancel(prog);
> +       update_term_per_cpu_flag(prog, 0);
>         this_cpu_dec(*(prog->active));
>         migrate_enable();
>         rcu_read_unlock();
> --
> 2.43.0
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC bpf-next v2 2/4] bpf: Generating a stubbed version of BPF program for termination
  2025-06-14  6:40 ` [RFC bpf-next v2 2/4] bpf: Generating a stubbed version of BPF program for termination Siddharth Chintamaneni
@ 2025-06-30 12:47   ` Kumar Kartikeya Dwivedi
  2025-07-04 17:32     ` Raj Sahu
  0 siblings, 1 reply; 19+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2025-06-30 12:47 UTC (permalink / raw)
  To: Siddharth Chintamaneni
  Cc: bpf, ast, daniel, andrii, martin.lau, eddyz87, song,
	yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
	djwillia, miloc, ericts, rahult, doniaghazy, quanzhif, jinghao7,
	egor, sairoop10, Raj Sahu

On Sat, 14 Jun 2025 at 08:41, Siddharth Chintamaneni
<sidchintamaneni@gmail.com> wrote:
>
> Introduces patch generation which generates patch for each BPF
> program, iterate through all helper calls, kfuncs and if the return
> type may return NULL then stub them with a dummy function.
>
> Additional to these helpers/kfuncs the current implementation also
> supports termination of bpf_loop (both inline and non-inline case)
>
> Signed-off-by: Raj Sahu <rjsu26@gmail.com>
> Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
> ---
>  include/linux/bpf_verifier.h |   4 +
>  include/linux/filter.h       |   2 +
>  kernel/bpf/core.c            |  12 ++
>  kernel/bpf/syscall.c         |  19 +++
>  kernel/bpf/verifier.c        | 245 ++++++++++++++++++++++++++++++++++-
>  5 files changed, 276 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index 7e459e839f8b..3b4f371684a9 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -844,6 +844,10 @@ struct bpf_verifier_env {
>         /* array of pointers to bpf_scc_info indexed by SCC id */
>         struct bpf_scc_info **scc_info;
>         u32 scc_cnt;
> +       struct {
> +               u32 call_sites_cnt;
> +               int *call_idx;
> +       } bpf_term_patch_call_sites;
>  };
>
>  static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog)
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index eca229752cbe..cd9f1c2727ec 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -1119,6 +1119,8 @@ int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len);
>  bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
>  void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
>
> +int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx);
> +void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
>  u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
>  #define __bpf_call_base_args \
>         ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 756ed575741e..2a02e9cafd5a 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -1571,6 +1571,18 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
>  }
>  #endif /* CONFIG_BPF_JIT */
>
> +noinline void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
> +{
> +       return NULL;
> +}
> +EXPORT_SYMBOL_GPL(bpf_termination_null_func);
> +
> +noinline int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx)
> +{
> +       return 1;
> +}
> +EXPORT_SYMBOL_GPL(bpf_loop_term_callback);
> +
>  /* Base function for offset calculation. Needs to go into .text section,
>   * therefore keeping it non-static as well; will also be used by JITs
>   * anyway later on, so do not let the compiler omit it. This also needs
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 56500381c28a..cd8e7c47e3fe 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2757,6 +2757,16 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
>  /* last field in 'union bpf_attr' used by this command */
>  #define BPF_PROG_LOAD_LAST_FIELD fd_array_cnt
>
> +static int sanity_check_jit_len(struct bpf_prog *prog)
> +{
> +       struct bpf_prog *patch_prog = prog->term_states->patch_prog;
> +
> +       if (prog->jited_len != patch_prog->jited_len)
> +               return -EFAULT;
> +
> +       return 0;
> +}
> +
>  static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
>  {
>         enum bpf_prog_type type = attr->prog_type;
> @@ -2921,6 +2931,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
>
>         prog->orig_prog = NULL;
>         prog->jited = 0;
> +       prog->is_termination_prog = 0;
>
>         atomic64_set(&prog->aux->refcnt, 1);
>
> @@ -2977,6 +2988,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
>         if (err < 0)
>                 goto free_used_maps;
>
> +       prog->term_states->patch_prog = bpf_prog_select_runtime(prog->term_states->patch_prog, &err);
> +       if (err < 0)
> +               goto free_used_maps;
> +
> +       err = sanity_check_jit_len(prog);
> +       if (err < 0)
> +               goto free_used_maps;
> +
>         err = bpf_prog_alloc_id(prog);
>         if (err)
>                 goto free_used_maps;
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index c378074516cf..6920a623dde4 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -21397,6 +21397,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
>                         goto out_free;
>                 func[i]->is_func = 1;
>                 func[i]->sleepable = prog->sleepable;
> +               if (prog->is_termination_prog)
> +                       func[i]->is_termination_prog = 1;
>                 func[i]->aux->func_idx = i;
>                 /* Below members will be freed only at prog->aux */
>                 func[i]->aux->btf = prog->aux->btf;
> @@ -21514,8 +21516,10 @@ static int jit_subprogs(struct bpf_verifier_env *env)
>                         goto out_free;
>         }
>
> -       for (i = 1; i < env->subprog_cnt; i++)
> -               bpf_prog_kallsyms_add(func[i]);
> +       if (!prog->is_termination_prog) {
> +               for (i = 1; i < env->subprog_cnt; i++)
> +                       bpf_prog_kallsyms_add(func[i]);
> +       }
>
>         /* Last step: make now unused interpreter insns from main
>          * prog consistent for later dump requests, so they can
> @@ -21693,15 +21697,21 @@ static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
>  }
>
>  static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
> -                           struct bpf_insn *insn_buf, int insn_idx, int *cnt)
> +                           struct bpf_insn *insn_buf, int insn_idx, int *cnt, int *kfunc_btf_id)
>  {
>         const struct bpf_kfunc_desc *desc;
> +       struct bpf_kfunc_call_arg_meta meta;
> +       int err;
>
>         if (!insn->imm) {
>                 verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
>                 return -EINVAL;
>         }
>
> +       err = fetch_kfunc_meta(env, insn, &meta, NULL);
> +       if (err)
> +               return err;
> +
>         *cnt = 0;
>
>         /* insn->imm has the btf func_id. Replace it with an offset relative to
> @@ -21715,8 +21725,11 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>                 return -EFAULT;
>         }
>
> -       if (!bpf_jit_supports_far_kfunc_call())
> +       if (!bpf_jit_supports_far_kfunc_call()) {
> +               if (meta.kfunc_flags & KF_RET_NULL)
> +                       *kfunc_btf_id = insn->imm;
>                 insn->imm = BPF_CALL_IMM(desc->addr);
> +       }
>         if (insn->off)
>                 return 0;
>         if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
> @@ -21846,7 +21859,13 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>         struct bpf_subprog_info *subprogs = env->subprog_info;
>         u16 stack_depth = subprogs[cur_subprog].stack_depth;
>         u16 stack_depth_extra = 0;
> +       int call_sites_cnt = 0;
> +       int *call_idx;
> +       env->bpf_term_patch_call_sites.call_idx = NULL;
>
> +       call_idx = vmalloc(sizeof(*call_idx) * prog->len);
> +       if (!call_idx)
> +               return -ENOMEM;
>         if (env->seen_exception && !env->exception_callback_subprog) {
>                 struct bpf_insn patch[] = {
>                         env->prog->insnsi[insn_cnt - 1],
> @@ -22182,11 +22201,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>                 if (insn->src_reg == BPF_PSEUDO_CALL)
>                         goto next_insn;
>                 if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
> -                       ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
> +                       int kfunc_btf_id = 0;
> +                       ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt, &kfunc_btf_id);
>                         if (ret)
>                                 return ret;
>                         if (cnt == 0)
> -                               goto next_insn;
> +                               goto store_call_indices;
>
>                         new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
>                         if (!new_prog)
> @@ -22195,6 +22215,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>                         delta    += cnt - 1;
>                         env->prog = prog = new_prog;
>                         insn      = new_prog->insnsi + i + delta;
> +
> +store_call_indices:
> +                       if (kfunc_btf_id != 0) {
> +                               call_idx[call_sites_cnt] = i + delta;
> +                               call_sites_cnt++;
> +                       }
>                         goto next_insn;
>                 }
>
> @@ -22673,6 +22699,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>                                 func_id_name(insn->imm), insn->imm);
>                         return -EFAULT;
>                 }
> +               if (fn->ret_type & PTR_MAYBE_NULL) {
> +                       call_idx[call_sites_cnt] = i + delta;
> +                       call_sites_cnt++;
> +               }
>                 insn->imm = fn->func - __bpf_call_base;
>  next_insn:
>                 if (subprogs[cur_subprog + 1].start == i + delta + 1) {
> @@ -22693,6 +22723,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>                 insn++;
>         }
>
> +       env->bpf_term_patch_call_sites.call_sites_cnt = call_sites_cnt;
> +       env->bpf_term_patch_call_sites.call_idx = call_idx;
>         env->prog->aux->stack_depth = subprogs[0].stack_depth;
>         for (i = 0; i < env->subprog_cnt; i++) {
>                 int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1;
> @@ -22828,6 +22860,8 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
>         call_insn_offset = position + 12;
>         callback_offset = callback_start - call_insn_offset - 1;
>         new_prog->insnsi[call_insn_offset].imm = callback_offset;
> +       /* marking offset field to identify and patch the patch_prog*/
> +       new_prog->insnsi[call_insn_offset].off = 0x1;
>
>         return new_prog;
>  }
> @@ -24394,6 +24428,194 @@ static int compute_scc(struct bpf_verifier_env *env)
>         return err;
>  }
>
> +static int clone_patch_prog(struct bpf_verifier_env *env)
> +{
> +       gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | GFP_USER);
> +       unsigned int size, prog_name_len;
> +       struct bpf_prog *patch_prog, *prog = env->prog;
> +       struct bpf_prog_aux *aux;
> +
> +       size = prog->pages * PAGE_SIZE;
> +       patch_prog = __vmalloc(size, gfp_flags);
> +       if (!patch_prog)
> +               return -ENOMEM;
> +
> +       aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | GFP_USER));
> +       if (!aux) {
> +               vfree(patch_prog);
> +               return -ENOMEM;
> +       }
> +
> +       /*
> +        * Copying prog fields
> +        */
> +       patch_prog->pages = prog->pages;
> +       patch_prog->jited = 0;
> +       patch_prog->jit_requested = prog->jit_requested;
> +       patch_prog->gpl_compatible = prog->gpl_compatible;
> +       patch_prog->blinding_requested = prog->blinding_requested;
> +       patch_prog->is_termination_prog = 1;
> +       patch_prog->len = prog->len;
> +       patch_prog->type = prog->type;
> +       patch_prog->aux = aux;
> +
> +       /*
> +        * Copying prog aux fields
> +        */
> +       patch_prog->aux->used_map_cnt = prog->aux->used_map_cnt;
> +       patch_prog->aux->used_btf_cnt = prog->aux->used_btf_cnt;
> +       patch_prog->aux->max_ctx_offset = prog->aux->max_ctx_offset;
> +       patch_prog->aux->stack_depth = prog->aux->stack_depth;
> +       patch_prog->aux->func_cnt = prog->aux->func_cnt; /* will be populated by jit_subprogs */
> +       patch_prog->aux->real_func_cnt = prog->aux->real_func_cnt; /* will be populated by jit_subprogs */
> +       patch_prog->aux->func_idx = prog->aux->func_idx; /* will be populated by jit_subprogs */
> +       patch_prog->aux->attach_btf_id = prog->aux->attach_btf_id;
> +       patch_prog->aux->attach_st_ops_member_off = prog->aux->attach_st_ops_member_off;
> +       patch_prog->aux->ctx_arg_info_size = prog->aux->ctx_arg_info_size;
> +       patch_prog->aux->max_rdonly_access = prog->aux->max_rdonly_access;
> +       patch_prog->aux->max_rdwr_access = prog->aux->max_rdwr_access;
> +       patch_prog->aux->verifier_zext = prog->aux->verifier_zext;
> +       patch_prog->aux->dev_bound = prog->aux->dev_bound;
> +       patch_prog->aux->offload_requested = prog->aux->offload_requested;
> +       patch_prog->aux->attach_btf_trace = prog->aux->attach_btf_trace;
> +       patch_prog->aux->attach_tracing_prog = prog->aux->attach_tracing_prog;
> +       patch_prog->aux->func_proto_unreliable = prog->aux->func_proto_unreliable;
> +       patch_prog->aux->tail_call_reachable = prog->aux->tail_call_reachable;
> +       patch_prog->aux->xdp_has_frags = prog->aux->xdp_has_frags;
> +       patch_prog->aux->exception_cb = prog->aux->exception_cb;
> +       patch_prog->aux->exception_boundary = prog->aux->exception_boundary;
> +       patch_prog->aux->is_extended = prog->aux->is_extended;
> +       patch_prog->aux->jits_use_priv_stack = prog->aux->jits_use_priv_stack;
> +       patch_prog->aux->priv_stack_requested = prog->aux->priv_stack_requested;
> +       patch_prog->aux->changes_pkt_data = prog->aux->changes_pkt_data;
> +       patch_prog->aux->might_sleep = prog->aux->might_sleep;
> +       patch_prog->aux->prog_array_member_cnt = prog->aux->prog_array_member_cnt;
> +       patch_prog->aux->size_poke_tab = prog->aux->size_poke_tab;
> +       patch_prog->aux->cgroup_atype = prog->aux->cgroup_atype;
> +       patch_prog->aux->linfo = prog->aux->linfo;
> +       patch_prog->aux->func_info_cnt = prog->aux->func_info_cnt;
> +       patch_prog->aux->nr_linfo = prog->aux->nr_linfo;
> +       patch_prog->aux->linfo_idx = prog->aux->linfo_idx;
> +       patch_prog->aux->num_exentries = prog->aux->num_exentries;
> +
> +       patch_prog->aux->poke_tab = kmalloc_array(patch_prog->aux->size_poke_tab,
> +                                       sizeof(struct bpf_jit_poke_descriptor), GFP_KERNEL);
> +       if (!patch_prog->aux->poke_tab) {
> +               kfree(patch_prog->aux);
> +               vfree(patch_prog);
> +               return -ENOMEM;
> +       }
> +
> +       for (int i = 0; i < patch_prog->aux->size_poke_tab; i++) {
> +               memcpy(&patch_prog->aux->poke_tab[i], &prog->aux->poke_tab[i],
> +                               sizeof(struct bpf_jit_poke_descriptor));
> +       }
> +
> +       memcpy(patch_prog->insnsi, prog->insnsi, bpf_prog_insn_size(prog));
> +
> +       char *patch_prefix = "patch_";
> +       prog_name_len = strlen(prog->aux->name);
> +       strncpy(patch_prog->aux->name, patch_prefix, strlen(patch_prefix));
> +
> +       if (prog_name_len + strlen(patch_prefix) + 1 > BPF_OBJ_NAME_LEN) {
> +               prog_name_len = BPF_OBJ_NAME_LEN - strlen(patch_prefix) - 1;
> +       }
> +       strncat(patch_prog->aux->name, prog->aux->name, prog_name_len);
> +
> +       prog->term_states->patch_prog = patch_prog;
> +
> +       return 0;
> +}

The parts above this function makes sense, but I don't understand why
we need programs to be cloned separately anymore.
Instead, can't we simply have a table of patch targets for the original program?
They'd need adjustment after jit to reflect the exact correct offset,
but that shouldn't be difficult to compute.
IIUC you're comparing instructions in the patched program and original
program to find out if you need to patch out the original one.
That seems like a very expensive way of tracking which call targets
need to be modified.

> +
> +[...]

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC bpf-next v2 0/4] bpf: Fast-Path approach for BPF program
  2025-06-14  6:40 [RFC bpf-next v2 0/4] bpf: Fast-Path approach for BPF program Siddharth Chintamaneni
                   ` (3 preceding siblings ...)
  2025-06-14  6:40 ` [RFC bpf-next v2 4/4] selftests/bpf: Adds selftests to check termination of long running nested bpf loops Siddharth Chintamaneni
@ 2025-06-30 13:03 ` Kumar Kartikeya Dwivedi
  4 siblings, 0 replies; 19+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2025-06-30 13:03 UTC (permalink / raw)
  To: Siddharth Chintamaneni
  Cc: bpf, ast, daniel, andrii, martin.lau, eddyz87, song,
	yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
	djwillia, miloc, ericts, rahult, doniaghazy, quanzhif, jinghao7,
	egor, sairoop10

On Sat, 14 Jun 2025 at 08:41, Siddharth Chintamaneni
<sidchintamaneni@gmail.com> wrote:
>
> This is RFC v2 of
>         https://lore.kernel.org/bpf/20250420105524.2115690-1-rjsu26@gmail.com/
>
> Change since v1:
> - Patch generation has been moved after verification and before JIT.
>         - Now patch generation handles both helpers and kfuncs.
>         - Sanity check on original prog and patch prog after JIT.
> - Runtime termination handler is now global termination mechanism using
>   text_poke.
> - Termination is triggered by watchdog timer.
>
> TODO:
> - Termination support for tailcall programs.
> - Fix issue caused by warning in runtime termination handler due to
>   https://elixir.bootlin.com/linux/v6.15.1/source/kernel/smp.c#L815
> - Free memory for patch progs related fields.
> - Include selftests covering more cases such as BPF program nesting.

Thanks for sharing the new version. I think there's a few
implementation issues that need to be sorted out, on a high-level.
- Move arch-specific bits into arch/x86 and ensure all this is not
invoked / disabled on other architectures.
- Drop the hrtimer based termination enforcement.
- Drop extra prog cloning, since I don't see the need anymore.
- Add an early-short circuit by padding the entry with nop5 and then
changing it to return early.
- Other comments left inline.

Code patching when not in_task() looks problematic, I will think more
but I guess deferring work to some wq context is probably the best
option.
We can potentially be in an NMI context as well when we stall, so
either it needs to be made safe such that we can invoke it from any
context, or we defer work.
Right now, the invocation happens from a timer callback, but we may
synchronously invoke fast-execute termination request for cond_break
timeouts or rqspinlock deadlock.

Otherwise it might be simpler to just swap the return address with a
copy of a patched/cloned program on the local CPU, and replace
prog_func globally. But we discarded that approach.
Doesn't feel right to go back there just because we can't modify text
from any context.

In terms of design we still need to figure out how to handle tail
calls and extension programs.
Tail calls are relatively easier, since they can potentially fail.
Extension programs with the current approach become tricky.
If you had cloned the program during verification and replaced the
original with it at runtime, then any extensions attached to the
program do not get invoked.
Now, since we patch the original program directly, it's possible that
we still enter an extension program and end up stalling there again.
I guess we will end up patching the extension program next, but it
would be nice to verify that this works as expected.
Other options are removing the extension program by patching it out
and replacing with original global function target which would have
been patched.

>
>  include/linux/bpf.h                           |  19 +-
>  include/linux/bpf_verifier.h                  |   4 +
>  include/linux/filter.h                        |  43 ++-
>  kernel/bpf/core.c                             |  64 +++++
>  kernel/bpf/syscall.c                          | 225 ++++++++++++++++
>  kernel/bpf/trampoline.c                       |   5 +
>  kernel/bpf/verifier.c                         | 245 +++++++++++++++++-
>  .../bpf/prog_tests/bpf_termination.c          |  39 +++
>  .../selftests/bpf/progs/bpf_termination.c     |  38 +++
>  9 files changed, 674 insertions(+), 8 deletions(-)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_termination.c
>  create mode 100644 tools/testing/selftests/bpf/progs/bpf_termination.c
>
> --
> 2.43.0
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC bpf-next v2 3/4] bpf: Runtime part of fast-path termination approach
  2025-06-30 12:15   ` Kumar Kartikeya Dwivedi
@ 2025-07-04 17:29     ` Raj Sahu
  2025-07-04 19:10       ` Kumar Kartikeya Dwivedi
  0 siblings, 1 reply; 19+ messages in thread
From: Raj Sahu @ 2025-07-04 17:29 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi
  Cc: Siddharth Chintamaneni, bpf, ast, daniel, andrii, martin.lau,
	eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
	haoluo, jolsa, djwillia, miloc, ericts, rahult, doniaghazy,
	quanzhif, jinghao7, egor, sairoop10

> > Introduces watchdog based runtime mechanism to terminate
> > a BPF program. When a BPF program is interrupted by
> > an watchdog, its registers are are passed onto the bpf_die.
> >
> > Inside bpf_die we perform the text_poke and stack walk
> > to stub helpers/kfunc replace bpf_loop helper if called
> > inside bpf program.
> >
> > Current implementation doesn't handle the termination of
> > tailcall programs.
> >
> > There is a known issue by calling text_poke inside interrupt
> > context - https://elixir.bootlin.com/linux/v6.15.1/source/kernel/smp.c#L815.
>
> I don't have a good idea so far, maybe by deferring work to wq context?
> Each CPU would need its own context and schedule work there.
> The problem is that it may not be invoked immediately.
We will give it a try using wq. We were a bit hesitant in pursuing wq
earlier because to modify the return address on the stack we would
want to interrupt the running BPF program and access its stack since
that's a key part of the design.

Will need some suggestions here on how to achieve that.

> > +static void bpf_terminate_timer_init(const struct bpf_prog *prog)
> > +{
> > +       ktime_t timeout = ktime_set(1, 0); // 1s, 0ns
> > +
> > +       /* Initialize timer on Monotonic clock, relative mode */
> > +       hrtimer_setup(&prog->term_states->hrtimer, bpf_termination_wd_callback, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
>
> Hmm, doesn't this need to be a per-CPU hrtimer? Otherwise all
> concurrent invocations will race to set up and start it?
> Doesn't even look thread safe, unless I'm missing something.
Yes, this was an oversight. Thanks for pointing it out.
> > +       /* Start watchdog */
> > +       hrtimer_start(&prog->term_states->hrtimer, timeout, HRTIMER_MODE_REL);
> > +}
> > +
> > +static void bpf_terminate_timer_cancel(const struct bpf_prog *prog)
> > +{
> > +       hrtimer_cancel(&prog->term_states->hrtimer);
> > +}
> > +
> >  static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
> >                                           const void *ctx,
> >                                           bpf_dispatcher_fn dfunc)
> > @@ -706,7 +735,11 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
> >                 u64 duration, start = sched_clock();
> >                 unsigned long flags;
> >
> > +               update_term_per_cpu_flag(prog, 1);
> > +               bpf_terminate_timer_init(prog);
> >                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> > +               bpf_terminate_timer_cancel(prog);
> > +               update_term_per_cpu_flag(prog, 0);
> >
> >                 duration = sched_clock() - start;
> >                 stats = this_cpu_ptr(prog->stats);
> > @@ -715,8 +748,11 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
> >                 u64_stats_add(&stats->nsecs, duration);
> >                 u64_stats_update_end_irqrestore(&stats->syncp, flags);
> >         } else {
> > +               update_term_per_cpu_flag(prog, 1);
> > +               bpf_terminate_timer_init(prog);
> >                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> > -       }
> > +               bpf_terminate_timer_cancel(prog);
> > +               update_term_per_cpu_flag(prog, 0);}
> >         return ret;
> >  }
>
> Hmm, did you profile how much overhead this adds? It's not completely
> free, right?
> I guess the per_cpu flag's lock is uncontended, so there wouldn't be
> too much overhead there (though it's still an extra atomic op on the
> fast path).
> hrtimer_setup() won't be that expensive either, but I think
> hrtimer_start() can be.
> Also, what about programs invoked from BPF trampoline? We would need
> such "watchdog" protection for potentially every program, right?
>
> I'm more concerned about the implications of using an hrtimer around
> every program invocation though.
> Imagine that the program gets invoked in task context, the same
> program then runs in interrupt context (let's say it's a tracing
> program).
> Even the simple hrtimer_cancel() when returning from interrupt context
> can potentially deadlock the kernel if the task context program hit
> its limit and was inside the timer callback.
> Let alone the fact that we can have recursion on the same CPU as above
> or by repeatedly invoking the same program, which reprograms the timer
> again.
>
> I think we should piggy back on softlockup / hardlockup checks (that's
> what I did long ago), but for simplicity I would just drop these time
> based enforcement checks from the set for now.
> They're incomplete, and potentially buggy. Instead you can invoke
> bpf_die() when a program hits the loop's max count limit or something
> similar, in order to test this.
> We also need to account for sleepable programs, so a 1 second
> hardcoded limit is probably not appropriate.
> Enforcement is orthogonal to how a program is cleaned up, though as
> important, but it can be revisited once we sort out the first part.
ACK
We can do some profiling eventually then if we decide to bring it back.
The deadlock case is a good case to consider, however a program's
recursion is not possible on a given CPU right?

Earlier we were thinking of enforcing performance based runtime
policies for BPF programs. Looks like it is getting hard to implement
it. So I think we will go ahead and rely on the kernel/ bpf mechanism
to detect bad BPF programs (stalls, pf, etc).

Adding an iteration based termination for bpf_loop won't be enough
because an expensive callback won't need too many
iterations,comparatively, to exceed runtime expectations.


> > @@ -1119,6 +1155,9 @@ int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len);
> >  bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
> >  void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
> >
> > +#ifdef CONFIG_X86_64
> > +int bpf_loop_termination(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags);
> > +#endif
> >  int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx);
> >  void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
> >  u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
> > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> > index 2a02e9cafd5a..735518735779 100644
> > --- a/kernel/bpf/core.c
> > +++ b/kernel/bpf/core.c
> > @@ -1583,6 +1583,21 @@ noinline int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx)
> >  }
> >  EXPORT_SYMBOL_GPL(bpf_loop_term_callback);
> >
> > +#ifdef CONFIG_X86_64
> > +noinline int bpf_loop_termination(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags)
> > +{
> > +       asm volatile(
> > +               "pop %rbx\n\t"
> > +               "pop %rbp\n\t"
> > +               "pop %r12\n\t"
> > +               "pop %r13\n\t"
> > +       );
> > +       return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(bpf_loop_termination);
> > +STACK_FRAME_NON_STANDARD(bpf_loop_termination);
>
> You can move this into an arch-specific helper, see
> bpf_timed_may_goto.S in arch/x86/net.
> For non-x86, the whole logic should not kick in.
>
> Also this function needs more comments. Why is restoring these 4
> registers correct?
> It is not clear from the code, please point out where they are being saved.
>
> If this is about restoring callee saved registers, then it looks broken to me.
> Only those scratched are saved by the JIT (see callee_regs_used in
> bpf_jit_comp.c), so it would be plain wrong.
> r12 is unused except for arenas. rbx, r13, r14, r15 are used, at max.
>
> It would make more sense to move the logic into the JIT, as I suggested above.
> Even then, you either need to spill all callee regs, or figure out a
> way to conditionally restore.
Should have added more comments : )
bpf_loop_termination is supposed to be the replacement of bpf_loop
helper call for the cases when a BPF program didn't inline them.
The original bpf_loop helper has 4 arguments which we observed were
getting saved onto the stack.
The 4 pop instructions are to remove the 4 saved registers from the stack.
We did have our concerns about the compiler not saving all 4 of them or maybe
saving them in a different order, but didn't observe that in the testing.
But I agree it's a bit hacky although we were not sure how else to
figure this out.

If we end up pursuing the wq approach, this hack won't be applicable anyways
(handler will be running on a different stack). We'll have to figure
out another way then.

> > +#endif
> > +
> >  /* Base function for offset calculation. Needs to go into .text section,
> >   * therefore keeping it non-static as well; will also be used by JITs
> >   * anyway later on, so do not let the compiler omit it. This also needs
> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > index cd8e7c47e3fe..065767ae1bd1 100644
> > --- a/kernel/bpf/syscall.c
> > +++ b/kernel/bpf/syscall.c
> > @@ -37,6 +37,10 @@
> >  #include <linux/trace_events.h>
> >  #include <linux/tracepoint.h>
> >  #include <linux/overflow.h>
> > +#include <asm/unwind.h>
> > +#include <asm/insn.h>
> > +#include <asm/text-patching.h>
> > +#include <asm/irq_regs.h>
> >
> >  #include <net/netfilter/nf_bpf_link.h>
> >  #include <net/netkit.h>
> > @@ -2767,6 +2771,207 @@ static int sanity_check_jit_len(struct bpf_prog *prog)
> >         return 0;
> >  }
> >
> > +static bool per_cpu_flag_is_true(struct bpf_term_aux_states *term_states, int cpu_id)
> > +{
> > +       unsigned long flags;
> > +       spin_lock_irqsave(&term_states->per_cpu_state[cpu_id].lock,
> > +                               flags);
> > +       if (term_states->per_cpu_state[cpu_id].cpu_flag == 1) {
> > +               spin_unlock_irqrestore(&term_states->per_cpu_state[cpu_id].lock,
> > +                                       flags);
> > +               return true;
> > +       }
> > +       spin_unlock_irqrestore(&term_states->per_cpu_state[cpu_id].lock,
> > +                               flags);
> > +       return false;
> > +}
> > +
> > +static int is_bpf_address(struct bpf_prog *prog, unsigned long addr)
>
> Use prog == bpf_prog_ksym_find(addr) here.
> Also, rename this function to something like is_bpf_prog_text_address
> (scoped version of is_bpf_text_address).
>
> > +{
> > +
> > +        unsigned long bpf_func_addr = (unsigned long)prog->bpf_func;
> > +        if ((addr > bpf_func_addr) &&
> > +                        (addr < bpf_func_addr + prog->jited_len)){
> > +                return 1;
> > +        }
> > +
> > +        for (int subprog = 1; subprog < prog->aux->func_cnt; subprog++) {
> > +                struct bpf_prog *bpf_subprog = prog->aux->func[subprog];
> > +                unsigned long bpf_subprog_func_addr =
> > +                                        (unsigned long)bpf_subprog->bpf_func;
> > +                if ((addr > bpf_subprog_func_addr) && (addr < bpf_subprog_func_addr +
> > +                                                        bpf_subprog->jited_len)) {
> > +                        return 1;
> > +                }
> > +        }
> > +
> > +        return 0;
> > +}
> > +
> > +/*
> > + * For a call instruction in a BPF program, return the stubbed insn buff.
> > + * Returns new instruction buff if stubbing required,
> > + *        NULL if no change needed.
> > + */
> > +__always_inline char* find_termination_realloc(struct insn orig_insn, unsigned char *orig_addr,
> > +                                              struct insn patch_insn, unsigned char *patch_addr) {
> > +
> > +       unsigned long new_target;
> > +       unsigned long original_call_target = (unsigned long)orig_addr + 5 + orig_insn.immediate.value;
> > +
> > +       unsigned long patch_call_target = (unsigned long)patch_addr + 5 + patch_insn.immediate.value;
> > +
> > +       /* As per patch prog, no stubbing needed. */
> > +       if (patch_call_target == original_call_target)
> > +               return NULL;
> > +
> > +       /* bpf_termination_null_func is the generic stub function unless its either of
> > +       * the bpf_loop helper or the associated callback
> > +       */
> > +       new_target = (unsigned long)bpf_termination_null_func;
> > +       if (patch_call_target == (unsigned long)bpf_loop_term_callback)
> > +               new_target = (unsigned long)bpf_loop_term_callback;
> > +
> > +
> > +       unsigned long new_rel = (unsigned long)(new_target - (unsigned long)(orig_addr + 5));
> > +
> > +       char *new_insn = kmalloc(5, GFP_KERNEL);
>
> This can fail, so you'd have to return NULL even in cases where you
> need to patch the target...
> I'd suggest modifying the contract of the function to not depend on
> returning NULL, can be some out parameter.
>
> > +       new_insn[0] = 0xE8;
> > +       new_insn[1] = (new_rel >> 0) & 0xff;
> > +       new_insn[2] = (new_rel >> 8) & 0xff;
> > +       new_insn[3] = (new_rel >> 16) & 0xff;
> > +       new_insn[4] = (new_rel >> 24) & 0xff;
> > +
> > +       return new_insn;
> > +}
> > +
> > +/*
> > + * Given a bpf program and a corresponding termination patch prog
> > + * (generated during verification), this program will patch all
> > + * call instructions in prog and decide whether to stub them
> > + * based on whether the termination_prog has stubbed or not.
> > + */
> > +static void __maybe_unused in_place_patch_bpf_prog(struct bpf_prog *prog, struct bpf_prog *patch_prog){
> > +
> > +       uint32_t size = 0;
> > +
> > +       while (size < prog->jited_len) {
> > +              unsigned char *addr = (unsigned char*)prog->bpf_func;
> > +              unsigned char *addr_patch = (unsigned char*)patch_prog->bpf_func;
> > +
> > +              struct insn insn;
> > +              struct insn insn_patch;
> > +
> > +              addr += size;
> > +              /* Decode original instruction */
> > +               if (WARN_ON_ONCE(insn_decode_kernel(&insn, addr))) {
> > +                       return;
> > +               }
> > +
> > +               /* Check for call instruction */
> > +               if (insn.opcode.bytes[0] != CALL_INSN_OPCODE) {
> > +                       goto next_insn;
> > +               }
> > +
> > +              addr_patch += size;
> > +              /* Decode patch_prog instruction */
> > +               if (WARN_ON_ONCE(insn_decode_kernel(&insn_patch, addr_patch))) {
> > +                       return ;
> > +               }
> > +
> > +              // Stub the call instruction if needed
> > +              char *buf;
> > +              if ((buf = find_termination_realloc(insn, addr, insn_patch, addr_patch)) != NULL) {
> > +                      smp_text_poke_batch_add(addr, buf, insn.length, NULL);
> > +                      kfree(buf);
>
> I think we should find a way to make this work without allocations.
> What if it fails? Are we going to let the program keep executing
> forever.
> Doesn't seem like a good option.
>
> > +              }
> > +
> > +       next_insn:
> > +               size += insn.length;
> > +       }
> > +}
> > +
> > +
> > +void bpf_die(void *data)
> > +{
> > +       struct bpf_prog *prog, *patch_prog;
> > +       int cpu_id = raw_smp_processor_id();
>
> Assuming you make the hrtimer per-CPU.
> So the hrtimer is not pinned to the CPU (HRTIMER_MODE_PINNED), hence
> it can be fired on any other CPU when the timer expires.
> This means you lose the associativity between the CPU where the
> program invocation did not complete in 1 second.
> Instead I think it might be better to have it in the per-cpu state,
> and stash the CPU number there and use container_of to obtain it.
>
> > +
> > +       prog = (struct bpf_prog *)data;
> > +       patch_prog = prog->term_states->patch_prog;
> > +
> > +       if (!per_cpu_flag_is_true(prog->term_states, cpu_id))
> > +               return;
>
> Unless hrtimer_cancel() provides a write barrier, this can return
> early if the write to 0 in the per-CPU flag gets reordered.
> It would be better to be explicit there.
>
> > +
> > +       unsigned long jmp_offset = prog->jited_len - (4 /*First endbr is 4 bytes*/
> > +                                               + 5 /*5 bytes of noop*/
> > +                                               + 5 /*5 bytes of jmp return_thunk*/);
>
> This is all x86 specific, so at the very least it should be guarded.
> The proper way would be to add a weak stub in core.c and provide an
> implementation in the arch-specific directory.
>
> > +       char new_insn[5];
> > +       new_insn[0] = 0xE9;
> > +       new_insn[1] = (jmp_offset >> 0) & 0xff;
> > +       new_insn[2] = (jmp_offset >> 8) & 0xff;
> > +       new_insn[3] = (jmp_offset >> 16) & 0xff;
> > +       new_insn[4] = (jmp_offset >> 24) & 0xff;
> > +       smp_text_poke_batch_add(prog->bpf_func + 4, new_insn, 5, NULL);
> > +
> > +       /* poke all progs and subprogs */
> > +       if (prog->aux->func_cnt) {
> > +               for(int i=0; i<prog->aux->func_cnt; i++){
> > +                       in_place_patch_bpf_prog(prog->aux->func[i], patch_prog->aux->func[i]);
> > +               }
> > +       } else {
> > +               in_place_patch_bpf_prog(prog, patch_prog);
> > +       }
> > +       /* flush all text poke calls */
> > +       smp_text_poke_batch_finish();
> > +
> > +
> > + #ifdef CONFIG_X86_64
> > +       struct unwind_state state;
> > +       unsigned long addr, bpf_loop_addr, bpf_loop_term_addr;
> > +       struct pt_regs *regs = get_irq_regs();
> > +       char str[KSYM_SYMBOL_LEN];
> > +       bpf_loop_addr = (unsigned long)bpf_loop_proto.func;
> > +       bpf_loop_term_addr = (unsigned long)bpf_loop_termination;
> > +       unwind_start(&state, current, regs, NULL);
> > +
> > +       addr = unwind_get_return_address(&state);
> > +
> > +       unsigned long stack_addr = regs->sp;
> > +       while (addr) {
> > +               if (is_bpf_address(prog, addr)) {
> > +                       break;
> > +               } else {
> > +                       const char *name = kallsyms_lookup(addr, NULL, NULL, NULL, str);
> > +                       if (name) {
> > +                               unsigned long lookup_addr = kallsyms_lookup_name(name);
> > +                               if (lookup_addr && lookup_addr == bpf_loop_addr) {
> > +                                       while (*(unsigned long *)stack_addr != addr) {
> > +                                               stack_addr += 1;
> > +                                       }
> > +                                       *(unsigned long *)stack_addr = bpf_loop_term_addr;
> > +                               }
> > +                       }
> > +               }
> > +               unwind_next_frame(&state);
> > +               addr = unwind_get_return_address(&state);
> > +       }
>
> Instead of doing all this munging by hand, a better way is to figure
> out the frame base pointer using arch_bpf_stack_walk, then figure out
> the return address using that.
>
> > +#endif
> > +
> > +       return;
> > +}
> > +
> > +enum hrtimer_restart bpf_termination_wd_callback(struct hrtimer *hr)
> > +{
> > +
> > +       struct bpf_term_aux_states *term_states = container_of(hr, struct bpf_term_aux_states, hrtimer);
> > +       struct bpf_prog *prog = term_states->prog;
> > +       bpf_die(prog);
> > +       return HRTIMER_NORESTART;
> > +
> > +}
> > +EXPORT_SYMBOL_GPL(bpf_termination_wd_callback);
> > +
> >  static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
> >  {
> >         enum bpf_prog_type type = attr->prog_type;
> > @@ -2995,6 +3200,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
> >         err = sanity_check_jit_len(prog);
> >         if (err < 0)
> >                 goto free_used_maps;
> > +       prog->term_states->prog = prog;
> >
> >         err = bpf_prog_alloc_id(prog);
> >         if (err)
> > diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
> > index c4b1a98ff726..16f685c861a3 100644
> > --- a/kernel/bpf/trampoline.c
> > +++ b/kernel/bpf/trampoline.c
> > @@ -908,6 +908,9 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram
> >                         prog->aux->recursion_detected(prog);
> >                 return 0;
> >         }
> > +
> > +       update_term_per_cpu_flag(prog, 1);
> > +       bpf_terminate_timer_init(prog);
> >         return bpf_prog_start_time();
> >  }
> >
> > @@ -941,6 +944,8 @@ static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
> >         bpf_reset_run_ctx(run_ctx->saved_run_ctx);
> >
> >         update_prog_stats(prog, start);
> > +       bpf_terminate_timer_cancel(prog);
> > +       update_term_per_cpu_flag(prog, 0);
> >         this_cpu_dec(*(prog->active));
> >         migrate_enable();
> >         rcu_read_unlock();
> > --
> > 2.43.0
> >
ACK

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC bpf-next v2 2/4] bpf: Generating a stubbed version of BPF program for termination
  2025-06-30 12:47   ` Kumar Kartikeya Dwivedi
@ 2025-07-04 17:32     ` Raj Sahu
  2025-07-04 18:37       ` Kumar Kartikeya Dwivedi
  0 siblings, 1 reply; 19+ messages in thread
From: Raj Sahu @ 2025-07-04 17:32 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi
  Cc: Siddharth Chintamaneni, bpf, ast, daniel, andrii, martin.lau,
	eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
	haoluo, jolsa, djwillia, miloc, ericts, rahult, doniaghazy,
	quanzhif, jinghao7, egor, sairoop10

> The parts above this function makes sense, but I don't understand why
> we need programs to be cloned separately anymore.
> Instead, can't we simply have a table of patch targets for the original program?
> They'd need adjustment after jit to reflect the exact correct offset,
> but that shouldn't be difficult to compute.
If we understand correctly you advocate for storing the state of call
instruction offset of helpers/ kfuncs/ callback functions (incase of
bpf_loop inlining) and adjust the offset during JIT?

While we did think about following this approach, we are worried of
accidentally introducing bugs while implementing offset handling
either now or in future when some new JIT optimization is being added
by someone else.

We also thought of decoding JIT instructions (right after JIT) similar
to the runtime handler but there is an additional burden of figuring
out the helper/ kfunc's from the call instruction.
Currently, the cloned program is simplifying the whole task of going
through the weeds of JIT.

> IIUC you're comparing instructions in the patched program and original
> program to find out if you need to patch out the original one.
> That seems like a very expensive way of tracking which call targets
> need to be modified.
We can avoid this overhead still (by creating an offset table right
after JIT) so that the termination handler becomes faster. However,
since termination was itself a rare-case end-of-the-world situation,
we didn't consider having great performance as one of the requirements
for the handler.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC bpf-next v2 2/4] bpf: Generating a stubbed version of BPF program for termination
  2025-07-04 17:32     ` Raj Sahu
@ 2025-07-04 18:37       ` Kumar Kartikeya Dwivedi
  0 siblings, 0 replies; 19+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2025-07-04 18:37 UTC (permalink / raw)
  To: Raj Sahu
  Cc: Siddharth Chintamaneni, bpf, ast, daniel, andrii, martin.lau,
	eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
	haoluo, jolsa, djwillia, miloc, ericts, rahult, doniaghazy,
	quanzhif, jinghao7, egor, sairoop10

On Fri, 4 Jul 2025 at 19:32, Raj Sahu <rjsu26@gmail.com> wrote:
>
> > The parts above this function makes sense, but I don't understand why
> > we need programs to be cloned separately anymore.
> > Instead, can't we simply have a table of patch targets for the original program?
> > They'd need adjustment after jit to reflect the exact correct offset,
> > but that shouldn't be difficult to compute.
> If we understand correctly you advocate for storing the state of call
> instruction offset of helpers/ kfuncs/ callback functions (incase of
> bpf_loop inlining) and adjust the offset during JIT?
>
> While we did think about following this approach, we are worried of
> accidentally introducing bugs while implementing offset handling
> either now or in future when some new JIT optimization is being added
> by someone else.

That won't be as big a problem, the JIT keeps a mapping. As an
example, see how I did it here:
https://lore.kernel.org/bpf/20240201042109.1150490-10-memxor@gmail.com

You can keep the table of instruction indices until JIT, then do a
pass to convert.

>
> We also thought of decoding JIT instructions (right after JIT) similar
> to the runtime handler but there is an additional burden of figuring
> out the helper/ kfunc's from the call instruction.
> Currently, the cloned program is simplifying the whole task of going
> through the weeds of JIT.

I think the above should work, so all this won't be necessary.

>
> > IIUC you're comparing instructions in the patched program and original
> > program to find out if you need to patch out the original one.
> > That seems like a very expensive way of tracking which call targets
> > need to be modified.
> We can avoid this overhead still (by creating an offset table right
> after JIT) so that the termination handler becomes faster. However,
> since termination was itself a rare-case end-of-the-world situation,
> we didn't consider having great performance as one of the requirements
> for the handler.

It's not really about performance, but rather about extra memory
overhead per program.
These days we have 200 or so programs at any given time on production machines.
Can be more or less for other users, and the footprint is only growing
with every year.
They are loaded on every server in the fleet since bpf is "always on".
That's millions of machines.
This would mean we double the current memory footprint of program
.text (not just bpf_prog struct) if we kept the current approach, just
to match patch targets.
I think it's not a good tradeoff.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC bpf-next v2 3/4] bpf: Runtime part of fast-path termination approach
  2025-07-04 17:29     ` Raj Sahu
@ 2025-07-04 19:10       ` Kumar Kartikeya Dwivedi
  2025-07-07 17:40         ` Alexei Starovoitov
  0 siblings, 1 reply; 19+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2025-07-04 19:10 UTC (permalink / raw)
  To: Raj Sahu
  Cc: Siddharth Chintamaneni, bpf, ast, daniel, andrii, martin.lau,
	eddyz87, song, yonghong.song, john.fastabend, kpsingh, sdf,
	haoluo, jolsa, djwillia, miloc, ericts, rahult, doniaghazy,
	quanzhif, jinghao7, egor, sairoop10

On Fri, 4 Jul 2025 at 19:29, Raj Sahu <rjsu26@gmail.com> wrote:
>
> > > Introduces watchdog based runtime mechanism to terminate
> > > a BPF program. When a BPF program is interrupted by
> > > an watchdog, its registers are are passed onto the bpf_die.
> > >
> > > Inside bpf_die we perform the text_poke and stack walk
> > > to stub helpers/kfunc replace bpf_loop helper if called
> > > inside bpf program.
> > >
> > > Current implementation doesn't handle the termination of
> > > tailcall programs.
> > >
> > > There is a known issue by calling text_poke inside interrupt
> > > context - https://elixir.bootlin.com/linux/v6.15.1/source/kernel/smp.c#L815.
> >
> > I don't have a good idea so far, maybe by deferring work to wq context?
> > Each CPU would need its own context and schedule work there.
> > The problem is that it may not be invoked immediately.
> We will give it a try using wq. We were a bit hesitant in pursuing wq
> earlier because to modify the return address on the stack we would
> want to interrupt the running BPF program and access its stack since
> that's a key part of the design.
>
> Will need some suggestions here on how to achieve that.

Yeah, this is not trivial, now that I think more about it.
So keep the stack state untouched so you could synchronize with the
callback (spin until it signals us that it's done touching the stack).
I guess we can do it from another CPU, not too bad.

There's another problem though, wq execution not happening instantly
in time is not a big deal, but it getting interrupted by yet another
program that stalls can set up a cascading chain that leads to lock up
of the machine.
So let's say we have a program that stalls in NMI/IRQ. It might happen
that all CPUs that can service the wq enter this stall. The kthread is
ready to run the wq callback (or in the middle of it) but it may be
indefinitely interrupted.
It seems like this is a more fundamental problem with the non-cloning
approach. We can prevent program execution on the CPU where the wq
callback will be run, but we can also have a case where all CPUs lock
up simultaneously.

So the alternative is to do it locally, such that the program itself
enters the repair path and terminates.
Switching the return address to a patched clone seems much simpler in
comparison, even though it's added memory overhead.

>
> > > +static void bpf_terminate_timer_init(const struct bpf_prog *prog)
> > > +{
> > > +       ktime_t timeout = ktime_set(1, 0); // 1s, 0ns
> > > +
> > > +       /* Initialize timer on Monotonic clock, relative mode */
> > > +       hrtimer_setup(&prog->term_states->hrtimer, bpf_termination_wd_callback, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> >
> > Hmm, doesn't this need to be a per-CPU hrtimer? Otherwise all
> > concurrent invocations will race to set up and start it?
> > Doesn't even look thread safe, unless I'm missing something.
> Yes, this was an oversight. Thanks for pointing it out.
> > > +       /* Start watchdog */
> > > +       hrtimer_start(&prog->term_states->hrtimer, timeout, HRTIMER_MODE_REL);
> > > +}
> > > +
> > > +static void bpf_terminate_timer_cancel(const struct bpf_prog *prog)
> > > +{
> > > +       hrtimer_cancel(&prog->term_states->hrtimer);
> > > +}
> > > +
> > >  static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
> > >                                           const void *ctx,
> > >                                           bpf_dispatcher_fn dfunc)
> > > @@ -706,7 +735,11 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
> > >                 u64 duration, start = sched_clock();
> > >                 unsigned long flags;
> > >
> > > +               update_term_per_cpu_flag(prog, 1);
> > > +               bpf_terminate_timer_init(prog);
> > >                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> > > +               bpf_terminate_timer_cancel(prog);
> > > +               update_term_per_cpu_flag(prog, 0);
> > >
> > >                 duration = sched_clock() - start;
> > >                 stats = this_cpu_ptr(prog->stats);
> > > @@ -715,8 +748,11 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
> > >                 u64_stats_add(&stats->nsecs, duration);
> > >                 u64_stats_update_end_irqrestore(&stats->syncp, flags);
> > >         } else {
> > > +               update_term_per_cpu_flag(prog, 1);
> > > +               bpf_terminate_timer_init(prog);
> > >                 ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
> > > -       }
> > > +               bpf_terminate_timer_cancel(prog);
> > > +               update_term_per_cpu_flag(prog, 0);}
> > >         return ret;
> > >  }
> >
> > Hmm, did you profile how much overhead this adds? It's not completely
> > free, right?
> > I guess the per_cpu flag's lock is uncontended, so there wouldn't be
> > too much overhead there (though it's still an extra atomic op on the
> > fast path).
> > hrtimer_setup() won't be that expensive either, but I think
> > hrtimer_start() can be.
> > Also, what about programs invoked from BPF trampoline? We would need
> > such "watchdog" protection for potentially every program, right?
> >
> > I'm more concerned about the implications of using an hrtimer around
> > every program invocation though.
> > Imagine that the program gets invoked in task context, the same
> > program then runs in interrupt context (let's say it's a tracing
> > program).
> > Even the simple hrtimer_cancel() when returning from interrupt context
> > can potentially deadlock the kernel if the task context program hit
> > its limit and was inside the timer callback.
> > Let alone the fact that we can have recursion on the same CPU as above
> > or by repeatedly invoking the same program, which reprograms the timer
> > again.
> >
> > I think we should piggy back on softlockup / hardlockup checks (that's
> > what I did long ago), but for simplicity I would just drop these time
> > based enforcement checks from the set for now.
> > They're incomplete, and potentially buggy. Instead you can invoke
> > bpf_die() when a program hits the loop's max count limit or something
> > similar, in order to test this.
> > We also need to account for sleepable programs, so a 1 second
> > hardcoded limit is probably not appropriate.
> > Enforcement is orthogonal to how a program is cleaned up, though as
> > important, but it can be revisited once we sort out the first part.
> ACK
> We can do some profiling eventually then if we decide to bring it back.
> The deadlock case is a good case to consider, however a program's
> recursion is not possible on a given CPU right?

It is possible, we have certain programs which need to recurse to
function correctly.

>
> Earlier we were thinking of enforcing performance based runtime
> policies for BPF programs. Looks like it is getting hard to implement
> it. So I think we will go ahead and rely on the kernel/ bpf mechanism
> to detect bad BPF programs (stalls, pf, etc).
>
> Adding an iteration based termination for bpf_loop won't be enough
> because an expensive callback won't need too many
> iterations,comparatively, to exceed runtime expectations.
>

We can do this as a follow up. As you see from the comments above,
it's already too complicated to think about and review :).

>
> [...]

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC bpf-next v2 3/4] bpf: Runtime part of fast-path termination approach
  2025-07-04 19:10       ` Kumar Kartikeya Dwivedi
@ 2025-07-07 17:40         ` Alexei Starovoitov
  2025-07-07 19:16           ` Kumar Kartikeya Dwivedi
  2025-07-08  7:07           ` Raj Sahu
  0 siblings, 2 replies; 19+ messages in thread
From: Alexei Starovoitov @ 2025-07-07 17:40 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi
  Cc: Raj Sahu, Siddharth Chintamaneni, bpf, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Eduard,
	Song Liu, Yonghong Song, John Fastabend, KP Singh,
	Stanislav Fomichev, Hao Luo, Jiri Olsa, Dan Williams, miloc,
	ericts, rahult, doniaghazy, quanzhif, Jinghao Jia, egor,
	Sai Roop Somaraju

On Fri, Jul 4, 2025 at 12:11 PM Kumar Kartikeya Dwivedi
<memxor@gmail.com> wrote:
>
> On Fri, 4 Jul 2025 at 19:29, Raj Sahu <rjsu26@gmail.com> wrote:
> >
> > > > Introduces watchdog based runtime mechanism to terminate
> > > > a BPF program. When a BPF program is interrupted by
> > > > an watchdog, its registers are are passed onto the bpf_die.
> > > >
> > > > Inside bpf_die we perform the text_poke and stack walk
> > > > to stub helpers/kfunc replace bpf_loop helper if called
> > > > inside bpf program.
> > > >
> > > > Current implementation doesn't handle the termination of
> > > > tailcall programs.
> > > >
> > > > There is a known issue by calling text_poke inside interrupt
> > > > context - https://elixir.bootlin.com/linux/v6.15.1/source/kernel/smp.c#L815.
> > >
> > > I don't have a good idea so far, maybe by deferring work to wq context?
> > > Each CPU would need its own context and schedule work there.
> > > The problem is that it may not be invoked immediately.
> > We will give it a try using wq. We were a bit hesitant in pursuing wq
> > earlier because to modify the return address on the stack we would
> > want to interrupt the running BPF program and access its stack since
> > that's a key part of the design.
> >
> > Will need some suggestions here on how to achieve that.
>
> Yeah, this is not trivial, now that I think more about it.
> So keep the stack state untouched so you could synchronize with the
> callback (spin until it signals us that it's done touching the stack).
> I guess we can do it from another CPU, not too bad.
>
> There's another problem though, wq execution not happening instantly
> in time is not a big deal, but it getting interrupted by yet another
> program that stalls can set up a cascading chain that leads to lock up
> of the machine.
> So let's say we have a program that stalls in NMI/IRQ. It might happen
> that all CPUs that can service the wq enter this stall. The kthread is
> ready to run the wq callback (or in the middle of it) but it may be
> indefinitely interrupted.
> It seems like this is a more fundamental problem with the non-cloning
> approach. We can prevent program execution on the CPU where the wq
> callback will be run, but we can also have a case where all CPUs lock
> up simultaneously.

If we have such bugs that prog in NMI can stall CPU indefinitely
they need to be fixed independently of fast-execute.
timed may_goto, tailcalls or whatever may need to have different
limits when it detects that the prog is running in NMI or with hard irqs
disabled. Fast-execute doesn't have to be a universal kill-bpf-prog
mechanism that can work in any context. I think fast-execute
is for progs that deadlocked in res_spin_lock, faulted arena,
or were slow for wrong reasons, but not fatal for the kernel reasons.
imo we can rely on schedule_work() and bpf_arch_text_poke() from there.
The alternative of clone of all progs and memory waste for a rare case
is not appealing. Unless we can detect "dangerous" progs and
clone with fast execute only for them, so that the majority of bpf progs
stay as single copy.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC bpf-next v2 3/4] bpf: Runtime part of fast-path termination approach
  2025-07-07 17:40         ` Alexei Starovoitov
@ 2025-07-07 19:16           ` Kumar Kartikeya Dwivedi
  2025-07-07 20:09             ` Alexei Starovoitov
  2025-07-07 22:08             ` Zvi Effron
  2025-07-08  7:07           ` Raj Sahu
  1 sibling, 2 replies; 19+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2025-07-07 19:16 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Raj Sahu, Siddharth Chintamaneni, bpf, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Eduard,
	Song Liu, Yonghong Song, John Fastabend, KP Singh,
	Stanislav Fomichev, Hao Luo, Jiri Olsa, Dan Williams, miloc,
	ericts, rahult, doniaghazy, quanzhif, Jinghao Jia, egor,
	Sai Roop Somaraju

On Mon, 7 Jul 2025 at 19:41, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Fri, Jul 4, 2025 at 12:11 PM Kumar Kartikeya Dwivedi
> <memxor@gmail.com> wrote:
> >
> > On Fri, 4 Jul 2025 at 19:29, Raj Sahu <rjsu26@gmail.com> wrote:
> > >
> > > > > Introduces watchdog based runtime mechanism to terminate
> > > > > a BPF program. When a BPF program is interrupted by
> > > > > an watchdog, its registers are are passed onto the bpf_die.
> > > > >
> > > > > Inside bpf_die we perform the text_poke and stack walk
> > > > > to stub helpers/kfunc replace bpf_loop helper if called
> > > > > inside bpf program.
> > > > >
> > > > > Current implementation doesn't handle the termination of
> > > > > tailcall programs.
> > > > >
> > > > > There is a known issue by calling text_poke inside interrupt
> > > > > context - https://elixir.bootlin.com/linux/v6.15.1/source/kernel/smp.c#L815.
> > > >
> > > > I don't have a good idea so far, maybe by deferring work to wq context?
> > > > Each CPU would need its own context and schedule work there.
> > > > The problem is that it may not be invoked immediately.
> > > We will give it a try using wq. We were a bit hesitant in pursuing wq
> > > earlier because to modify the return address on the stack we would
> > > want to interrupt the running BPF program and access its stack since
> > > that's a key part of the design.
> > >
> > > Will need some suggestions here on how to achieve that.
> >
> > Yeah, this is not trivial, now that I think more about it.
> > So keep the stack state untouched so you could synchronize with the
> > callback (spin until it signals us that it's done touching the stack).
> > I guess we can do it from another CPU, not too bad.
> >
> > There's another problem though, wq execution not happening instantly
> > in time is not a big deal, but it getting interrupted by yet another
> > program that stalls can set up a cascading chain that leads to lock up
> > of the machine.
> > So let's say we have a program that stalls in NMI/IRQ. It might happen
> > that all CPUs that can service the wq enter this stall. The kthread is
> > ready to run the wq callback (or in the middle of it) but it may be
> > indefinitely interrupted.
> > It seems like this is a more fundamental problem with the non-cloning
> > approach. We can prevent program execution on the CPU where the wq
> > callback will be run, but we can also have a case where all CPUs lock
> > up simultaneously.
>
> If we have such bugs that prog in NMI can stall CPU indefinitely
> they need to be fixed independently of fast-execute.
> timed may_goto, tailcalls or whatever may need to have different
> limits when it detects that the prog is running in NMI or with hard irqs
> disabled.

I think a lot of programs end up running with hard IRQs disabled. Most
scheduler programs (which would use all these runtime checked
facilities) can fall into this category.
I don't think we can come up with appropriate limits without proper
WCET analysis.

> Fast-execute doesn't have to be a universal kill-bpf-prog
> mechanism that can work in any context. I think fast-execute
> is for progs that deadlocked in res_spin_lock, faulted arena,
> or were slow for wrong reasons, but not fatal for the kernel reasons.
> imo we can rely on schedule_work() and bpf_arch_text_poke() from there.
> The alternative of clone of all progs and memory waste for a rare case
> is not appealing. Unless we can detect "dangerous" progs and
> clone with fast execute only for them, so that the majority of bpf progs
> stay as single copy.

Right, I sympathize with the memory overhead argument. But I think
just ignoring NMI programs as broken is not sufficient.
You can have some tracing program that gets stuck while tracing parts
of the kernel such that it prevents wq from making progress in
patching it out.
I think we will discover more edge cases. All this effort to then have
something working incompletely is sort of a bummer.

Most of the syzbot reports triggering deadlocks were also not "real"
use cases, but we still decided to close the gap with rqspinlock.
When leaving the door open, it's hard to anticipate how the failure
mode in case fast-execute is not triggered will be.
I think let's try to see how bad cloning can be, if done
conditionally, before giving up completely on it.

I think most programs won't use these facilities that end up extending
the total runtime beyond acceptable bounds.
Most legacy programs are not using cond_break, rqspinlock, or arenas.
Most network function programs and tracing programs probably don't use
these facilities as well.
This can certainly change in the future, but I think unless pressing
use cases come up people would stick to how things are.
Schedulers are a different category and they will definitely make use
of all this.
So only triggering cloning when these are used sounds better to me,
even if not ideal.

We can have some simple WCET heuristic that assigns fixed weights to
certain instructions/helpers, and compute an approximate cost which
when breached triggers cloning of the prog.
We can obviously disable cloning when we see things like bpf_for(i, 0, 1000).
We can restrict it to specific subprogs where we notice specific
patterns requiring it, so we may be able to avoid the entire prog.
But for simplicity we can also just trigger cloning when one of
rqspinlock/cond_break/arenas/iterators/bpf_loop are used.

The other option of course is to do run time checks of
prog->aux->terminate bit and start failing things that way.
Most of the time, the branch will be untaken. It can be set to 1 when
a prog detects a fatal condition or from a watchdog (later).
cond_break, rqspinlock, iterators, bpf_loop can all be adjusted to check it.

When I was playing with all this, this is basically what I did
(amortized or not) and I think the overhead was negligible/in range of
noise.
If a prog is hot, the line with terminate bit is almost always in
cache, and it's not a load dependency for other accesses.
If it's not hot, the cost of every other state access dominates anyway.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC bpf-next v2 3/4] bpf: Runtime part of fast-path termination approach
  2025-07-07 19:16           ` Kumar Kartikeya Dwivedi
@ 2025-07-07 20:09             ` Alexei Starovoitov
  2025-07-07 22:08             ` Zvi Effron
  1 sibling, 0 replies; 19+ messages in thread
From: Alexei Starovoitov @ 2025-07-07 20:09 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi
  Cc: Raj Sahu, Siddharth Chintamaneni, bpf, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Martin KaFai Lau, Eduard,
	Song Liu, Yonghong Song, John Fastabend, KP Singh,
	Stanislav Fomichev, Hao Luo, Jiri Olsa, Dan Williams, miloc,
	ericts, rahult, doniaghazy, quanzhif, Jinghao Jia, egor,
	Sai Roop Somaraju

On Mon, Jul 7, 2025 at 12:16 PM Kumar Kartikeya Dwivedi
<memxor@gmail.com> wrote:
>
> The other option of course is to do run time checks of
> prog->aux->terminate bit and start failing things that way.
> Most of the time, the branch will be untaken. It can be set to 1 when
> a prog detects a fatal condition or from a watchdog (later).
> cond_break, rqspinlock, iterators, bpf_loop can all be adjusted to check it.
>
> When I was playing with all this, this is basically what I did
> (amortized or not) and I think the overhead was negligible/in range of
> noise.
> If a prog is hot, the line with terminate bit is almost always in
> cache, and it's not a load dependency for other accesses.
> If it's not hot, the cost of every other state access dominates anyway.

bpf_check_timed_may_goto() can certainly check prog->aux->must_terminate.
bpf_loop() can gain aux__prog and check the flag as well.
if we add it in more places then we won't need fast-execute.
Everything that can take a long time will be required to check that flag.
I was hoping fast-execute can avoid this run-time penalty,
but if it comes with extra clone (even if optional for some progs)
then the amortized flag check looks much simpler.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC bpf-next v2 3/4] bpf: Runtime part of fast-path termination approach
  2025-07-07 19:16           ` Kumar Kartikeya Dwivedi
  2025-07-07 20:09             ` Alexei Starovoitov
@ 2025-07-07 22:08             ` Zvi Effron
  2025-07-07 22:39               ` Kumar Kartikeya Dwivedi
  1 sibling, 1 reply; 19+ messages in thread
From: Zvi Effron @ 2025-07-07 22:08 UTC (permalink / raw)
  To: Kumar Kartikeya Dwivedi
  Cc: Alexei Starovoitov, Raj Sahu, Siddharth Chintamaneni, bpf,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Eduard, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa, Dan Williams,
	miloc, ericts, rahult, doniaghazy, quanzhif, Jinghao Jia, egor,
	Sai Roop Somaraju

On Mon, Jul 7, 2025 at 12:17 PM Kumar Kartikeya Dwivedi
<memxor@gmail.com> wrote:
>
> On Mon, 7 Jul 2025 at 19:41, Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Fri, Jul 4, 2025 at 12:11 PM Kumar Kartikeya Dwivedi
> > <memxor@gmail.com> wrote:
> > >
> > > On Fri, 4 Jul 2025 at 19:29, Raj Sahu <rjsu26@gmail.com> wrote:
> > > >
> > > > > > Introduces watchdog based runtime mechanism to terminate
> > > > > > a BPF program. When a BPF program is interrupted by
> > > > > > an watchdog, its registers are are passed onto the bpf_die.
> > > > > >
> > > > > > Inside bpf_die we perform the text_poke and stack walk
> > > > > > to stub helpers/kfunc replace bpf_loop helper if called
> > > > > > inside bpf program.
> > > > > >
> > > > > > Current implementation doesn't handle the termination of
> > > > > > tailcall programs.
> > > > > >
> > > > > > There is a known issue by calling text_poke inside interrupt
> > > > > > context - https://elixir.bootlin.com/linux/v6.15.1/source/kernel/smp.c#L815.
> > > > >
> > > > > I don't have a good idea so far, maybe by deferring work to wq context?
> > > > > Each CPU would need its own context and schedule work there.
> > > > > The problem is that it may not be invoked immediately.
> > > > We will give it a try using wq. We were a bit hesitant in pursuing wq
> > > > earlier because to modify the return address on the stack we would
> > > > want to interrupt the running BPF program and access its stack since
> > > > that's a key part of the design.
> > > >
> > > > Will need some suggestions here on how to achieve that.
> > >
> > > Yeah, this is not trivial, now that I think more about it.
> > > So keep the stack state untouched so you could synchronize with the
> > > callback (spin until it signals us that it's done touching the stack).
> > > I guess we can do it from another CPU, not too bad.
> > >
> > > There's another problem though, wq execution not happening instantly
> > > in time is not a big deal, but it getting interrupted by yet another
> > > program that stalls can set up a cascading chain that leads to lock up
> > > of the machine.
> > > So let's say we have a program that stalls in NMI/IRQ. It might happen
> > > that all CPUs that can service the wq enter this stall. The kthread is
> > > ready to run the wq callback (or in the middle of it) but it may be
> > > indefinitely interrupted.
> > > It seems like this is a more fundamental problem with the non-cloning
> > > approach. We can prevent program execution on the CPU where the wq
> > > callback will be run, but we can also have a case where all CPUs lock
> > > up simultaneously.
> >
> > If we have such bugs that prog in NMI can stall CPU indefinitely
> > they need to be fixed independently of fast-execute.
> > timed may_goto, tailcalls or whatever may need to have different
> > limits when it detects that the prog is running in NMI or with hard irqs
> > disabled.
>
> I think a lot of programs end up running with hard IRQs disabled. Most
> scheduler programs (which would use all these runtime checked
> facilities) can fall into this category.
> I don't think we can come up with appropriate limits without proper
> WCET analysis.
>
> > Fast-execute doesn't have to be a universal kill-bpf-prog
> > mechanism that can work in any context. I think fast-execute
> > is for progs that deadlocked in res_spin_lock, faulted arena,
> > or were slow for wrong reasons, but not fatal for the kernel reasons.
> > imo we can rely on schedule_work() and bpf_arch_text_poke() from there.
> > The alternative of clone of all progs and memory waste for a rare case
> > is not appealing. Unless we can detect "dangerous" progs and
> > clone with fast execute only for them, so that the majority of bpf progs
> > stay as single copy.
>
> Right, I sympathize with the memory overhead argument. But I think
> just ignoring NMI programs as broken is not sufficient.
> You can have some tracing program that gets stuck while tracing parts
> of the kernel such that it prevents wq from making progress in
> patching it out.
> I think we will discover more edge cases. All this effort to then have
> something working incompletely is sort of a bummer.
>
> Most of the syzbot reports triggering deadlocks were also not "real"
> use cases, but we still decided to close the gap with rqspinlock.
> When leaving the door open, it's hard to anticipate how the failure
> mode in case fast-execute is not triggered will be.
> I think let's try to see how bad cloning can be, if done
> conditionally, before giving up completely on it.
>
> I think most programs won't use these facilities that end up extending
> the total runtime beyond acceptable bounds.
> Most legacy programs are not using cond_break, rqspinlock, or arenas.
> Most network function programs and tracing programs probably don't use
> these facilities as well.

I can't speak to _most_, but doesn't BPF_MAP_TYPE_LPM_TRIE use rqspinlock? And
isn't that map type frequently used by network programs (especially XDP
filters) to do IP prefix lookups?

So it wouldn't be uncommon for network programs (including "legacy" ones) to
use rqspinlock because they use a BPF map type that uses rqspinlock?

> This can certainly change in the future, but I think unless pressing
> use cases come up people would stick to how things are.
> Schedulers are a different category and they will definitely make use
> of all this.
> So only triggering cloning when these are used sounds better to me,
> even if not ideal.
>
> We can have some simple WCET heuristic that assigns fixed weights to
> certain instructions/helpers, and compute an approximate cost which
> when breached triggers cloning of the prog.
> We can obviously disable cloning when we see things like bpf_for(i, 0, 1000).
> We can restrict it to specific subprogs where we notice specific
> patterns requiring it, so we may be able to avoid the entire prog.
> But for simplicity we can also just trigger cloning when one of
> rqspinlock/cond_break/arenas/iterators/bpf_loop are used.
>
> The other option of course is to do run time checks of
> prog->aux->terminate bit and start failing things that way.
> Most of the time, the branch will be untaken. It can be set to 1 when
> a prog detects a fatal condition or from a watchdog (later).
> cond_break, rqspinlock, iterators, bpf_loop can all be adjusted to check it.
>
> When I was playing with all this, this is basically what I did
> (amortized or not) and I think the overhead was negligible/in range of
> noise.
> If a prog is hot, the line with terminate bit is almost always in
> cache, and it's not a load dependency for other accesses.
> If it's not hot, the cost of every other state access dominates anyway.
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC bpf-next v2 3/4] bpf: Runtime part of fast-path termination approach
  2025-07-07 22:08             ` Zvi Effron
@ 2025-07-07 22:39               ` Kumar Kartikeya Dwivedi
  0 siblings, 0 replies; 19+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2025-07-07 22:39 UTC (permalink / raw)
  To: Zvi Effron
  Cc: Alexei Starovoitov, Raj Sahu, Siddharth Chintamaneni, bpf,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Eduard, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa, Dan Williams,
	miloc, ericts, rahult, doniaghazy, quanzhif, Jinghao Jia, egor,
	Sai Roop Somaraju

On Tue, 8 Jul 2025 at 00:09, Zvi Effron <zeffron@riotgames.com> wrote:
>
> On Mon, Jul 7, 2025 at 12:17 PM Kumar Kartikeya Dwivedi
> <memxor@gmail.com> wrote:
> >
> > On Mon, 7 Jul 2025 at 19:41, Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Fri, Jul 4, 2025 at 12:11 PM Kumar Kartikeya Dwivedi
> > > <memxor@gmail.com> wrote:
> > > >
> > > > On Fri, 4 Jul 2025 at 19:29, Raj Sahu <rjsu26@gmail.com> wrote:
> > > > >
> > > > > > > Introduces watchdog based runtime mechanism to terminate
> > > > > > > a BPF program. When a BPF program is interrupted by
> > > > > > > an watchdog, its registers are are passed onto the bpf_die.
> > > > > > >
> > > > > > > Inside bpf_die we perform the text_poke and stack walk
> > > > > > > to stub helpers/kfunc replace bpf_loop helper if called
> > > > > > > inside bpf program.
> > > > > > >
> > > > > > > Current implementation doesn't handle the termination of
> > > > > > > tailcall programs.
> > > > > > >
> > > > > > > There is a known issue by calling text_poke inside interrupt
> > > > > > > context - https://elixir.bootlin.com/linux/v6.15.1/source/kernel/smp.c#L815.
> > > > > >
> > > > > > I don't have a good idea so far, maybe by deferring work to wq context?
> > > > > > Each CPU would need its own context and schedule work there.
> > > > > > The problem is that it may not be invoked immediately.
> > > > > We will give it a try using wq. We were a bit hesitant in pursuing wq
> > > > > earlier because to modify the return address on the stack we would
> > > > > want to interrupt the running BPF program and access its stack since
> > > > > that's a key part of the design.
> > > > >
> > > > > Will need some suggestions here on how to achieve that.
> > > >
> > > > Yeah, this is not trivial, now that I think more about it.
> > > > So keep the stack state untouched so you could synchronize with the
> > > > callback (spin until it signals us that it's done touching the stack).
> > > > I guess we can do it from another CPU, not too bad.
> > > >
> > > > There's another problem though, wq execution not happening instantly
> > > > in time is not a big deal, but it getting interrupted by yet another
> > > > program that stalls can set up a cascading chain that leads to lock up
> > > > of the machine.
> > > > So let's say we have a program that stalls in NMI/IRQ. It might happen
> > > > that all CPUs that can service the wq enter this stall. The kthread is
> > > > ready to run the wq callback (or in the middle of it) but it may be
> > > > indefinitely interrupted.
> > > > It seems like this is a more fundamental problem with the non-cloning
> > > > approach. We can prevent program execution on the CPU where the wq
> > > > callback will be run, but we can also have a case where all CPUs lock
> > > > up simultaneously.
> > >
> > > If we have such bugs that prog in NMI can stall CPU indefinitely
> > > they need to be fixed independently of fast-execute.
> > > timed may_goto, tailcalls or whatever may need to have different
> > > limits when it detects that the prog is running in NMI or with hard irqs
> > > disabled.
> >
> > I think a lot of programs end up running with hard IRQs disabled. Most
> > scheduler programs (which would use all these runtime checked
> > facilities) can fall into this category.
> > I don't think we can come up with appropriate limits without proper
> > WCET analysis.
> >
> > > Fast-execute doesn't have to be a universal kill-bpf-prog
> > > mechanism that can work in any context. I think fast-execute
> > > is for progs that deadlocked in res_spin_lock, faulted arena,
> > > or were slow for wrong reasons, but not fatal for the kernel reasons.
> > > imo we can rely on schedule_work() and bpf_arch_text_poke() from there.
> > > The alternative of clone of all progs and memory waste for a rare case
> > > is not appealing. Unless we can detect "dangerous" progs and
> > > clone with fast execute only for them, so that the majority of bpf progs
> > > stay as single copy.
> >
> > Right, I sympathize with the memory overhead argument. But I think
> > just ignoring NMI programs as broken is not sufficient.
> > You can have some tracing program that gets stuck while tracing parts
> > of the kernel such that it prevents wq from making progress in
> > patching it out.
> > I think we will discover more edge cases. All this effort to then have
> > something working incompletely is sort of a bummer.
> >
> > Most of the syzbot reports triggering deadlocks were also not "real"
> > use cases, but we still decided to close the gap with rqspinlock.
> > When leaving the door open, it's hard to anticipate how the failure
> > mode in case fast-execute is not triggered will be.
> > I think let's try to see how bad cloning can be, if done
> > conditionally, before giving up completely on it.
> >
> > I think most programs won't use these facilities that end up extending
> > the total runtime beyond acceptable bounds.
> > Most legacy programs are not using cond_break, rqspinlock, or arenas.
> > Most network function programs and tracing programs probably don't use
> > these facilities as well.
>
> I can't speak to _most_, but doesn't BPF_MAP_TYPE_LPM_TRIE use rqspinlock? And
> isn't that map type frequently used by network programs (especially XDP
> filters) to do IP prefix lookups?
>
> So it wouldn't be uncommon for network programs (including "legacy" ones) to
> use rqspinlock because they use a BPF map type that uses rqspinlock?

Right, good point.
We would probably need to penalize such a program when it triggers a deadlock.
That would require cloning if we don't go the text_poke() way.
So it may indeed end up covering most programs.
hashtab.c uses it too, which probably the majority of programs use.
Oh well, I guess it would be most of them then.
So it's hard to eliminate the memory overhead in cloning.

>
> > This can certainly change in the future, but I think unless pressing
> > use cases come up people would stick to how things are.
> > Schedulers are a different category and they will definitely make use
> > of all this.
> > So only triggering cloning when these are used sounds better to me,
> > even if not ideal.
> >
> > We can have some simple WCET heuristic that assigns fixed weights to
> > certain instructions/helpers, and compute an approximate cost which
> > when breached triggers cloning of the prog.
> > We can obviously disable cloning when we see things like bpf_for(i, 0, 1000).
> > We can restrict it to specific subprogs where we notice specific
> > patterns requiring it, so we may be able to avoid the entire prog.
> > But for simplicity we can also just trigger cloning when one of
> > rqspinlock/cond_break/arenas/iterators/bpf_loop are used.
> >
> > The other option of course is to do run time checks of
> > prog->aux->terminate bit and start failing things that way.
> > Most of the time, the branch will be untaken. It can be set to 1 when
> > a prog detects a fatal condition or from a watchdog (later).
> > cond_break, rqspinlock, iterators, bpf_loop can all be adjusted to check it.
> >
> > When I was playing with all this, this is basically what I did
> > (amortized or not) and I think the overhead was negligible/in range of
> > noise.
> > If a prog is hot, the line with terminate bit is almost always in
> > cache, and it's not a load dependency for other accesses.
> > If it's not hot, the cost of every other state access dominates anyway.
> >

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC bpf-next v2 3/4] bpf: Runtime part of fast-path termination approach
  2025-07-07 17:40         ` Alexei Starovoitov
  2025-07-07 19:16           ` Kumar Kartikeya Dwivedi
@ 2025-07-08  7:07           ` Raj Sahu
  2025-07-10  0:54             ` Kumar Kartikeya Dwivedi
  1 sibling, 1 reply; 19+ messages in thread
From: Raj Sahu @ 2025-07-08  7:07 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Kumar Kartikeya Dwivedi, Siddharth Chintamaneni, bpf,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Eduard, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa, Dan Williams,
	miloc, ericts, rahult, doniaghazy, quanzhif, Jinghao Jia, egor,
	Sai Roop Somaraju

> If we have such bugs that prog in NMI can stall CPU indefinitely
> they need to be fixed independently of fast-execute.
> timed may_goto, tailcalls or whatever may need to have different
> limits when it detects that the prog is running in NMI or with hard irqs
> disabled. Fast-execute doesn't have to be a universal kill-bpf-prog
> mechanism that can work in any context. I think fast-execute
> is for progs that deadlocked in res_spin_lock, faulted arena,
> or were slow for wrong reasons, but not fatal for the kernel reasons.
> imo we can rely on schedule_work() and bpf_arch_text_poke() from there.
> The alternative of clone of all progs and memory waste for a rare case
> is not appealing. Unless we can detect "dangerous" progs and
> clone with fast execute only for them, so that the majority of bpf progs
> stay as single copy.

I just want to confirm that we are on the same page here:
While the RFC we sent was using prog cloning, Kumar's earlier
suggestion of implementing offset tables can avoid the complete
cloning process and the associated memory footprint. Is there
something else which is concerning here in terms of memory overhead?

Regarding the NMI issue, the fast-execute design was meant to take
care of stalling in tracing and other task-context based programs
running slow for some reason. While I do agree with your point that
deadlocks in NMIs should be solved independently, kumar's point of
having several BPF programs needing termination, running in hardIRQ,
puts us in a fix. What should be the way forward here?

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [RFC bpf-next v2 3/4] bpf: Runtime part of fast-path termination approach
  2025-07-08  7:07           ` Raj Sahu
@ 2025-07-10  0:54             ` Kumar Kartikeya Dwivedi
  0 siblings, 0 replies; 19+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2025-07-10  0:54 UTC (permalink / raw)
  To: Raj Sahu
  Cc: Alexei Starovoitov, Siddharth Chintamaneni, bpf,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Eduard, Song Liu, Yonghong Song, John Fastabend,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa, Dan Williams,
	miloc, ericts, rahult, doniaghazy, quanzhif, Jinghao Jia, egor,
	Sai Roop Somaraju

On Tue, 8 Jul 2025 at 09:07, Raj Sahu <rjsu26@gmail.com> wrote:
>
> > If we have such bugs that prog in NMI can stall CPU indefinitely
> > they need to be fixed independently of fast-execute.
> > timed may_goto, tailcalls or whatever may need to have different
> > limits when it detects that the prog is running in NMI or with hard irqs
> > disabled. Fast-execute doesn't have to be a universal kill-bpf-prog
> > mechanism that can work in any context. I think fast-execute
> > is for progs that deadlocked in res_spin_lock, faulted arena,
> > or were slow for wrong reasons, but not fatal for the kernel reasons.
> > imo we can rely on schedule_work() and bpf_arch_text_poke() from there.
> > The alternative of clone of all progs and memory waste for a rare case
> > is not appealing. Unless we can detect "dangerous" progs and
> > clone with fast execute only for them, so that the majority of bpf progs
> > stay as single copy.
>
> I just want to confirm that we are on the same page here:
> While the RFC we sent was using prog cloning, Kumar's earlier
> suggestion of implementing offset tables can avoid the complete
> cloning process and the associated memory footprint. Is there
> something else which is concerning here in terms of memory overhead?
>
> Regarding the NMI issue, the fast-execute design was meant to take
> care of stalling in tracing and other task-context based programs
> running slow for some reason. While I do agree with your point that
> deadlocks in NMIs should be solved independently, kumar's point of
> having several BPF programs needing termination, running in hardIRQ,
> puts us in a fix. What should be the way forward here?

I would give the prog->aux->terminate bit idea we discussed in the
other thread a try.
If we can can know that it has acceptable overhead (see example
microbenchmarking I did here:
https://lore.kernel.org/bpf/20250304003239.2390751-1-memxor@gmail.com/)
then I think it seems the best option to go with.
You can also try loops with costs for the body, since it's more
appropriate as the % of cost of the loop body.
We can sample this bit and later on hook up enforcement to set it when
it detects a timeout, but let's keep both separate for the next
iteration.

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2025-07-10  0:54 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-06-14  6:40 [RFC bpf-next v2 0/4] bpf: Fast-Path approach for BPF program Siddharth Chintamaneni
2025-06-14  6:40 ` [RFC bpf-next v2 1/4] bpf: Introduce new structs and struct fields Siddharth Chintamaneni
2025-06-14  6:40 ` [RFC bpf-next v2 2/4] bpf: Generating a stubbed version of BPF program for termination Siddharth Chintamaneni
2025-06-30 12:47   ` Kumar Kartikeya Dwivedi
2025-07-04 17:32     ` Raj Sahu
2025-07-04 18:37       ` Kumar Kartikeya Dwivedi
2025-06-14  6:40 ` [RFC bpf-next v2 3/4] bpf: Runtime part of fast-path termination approach Siddharth Chintamaneni
2025-06-30 12:15   ` Kumar Kartikeya Dwivedi
2025-07-04 17:29     ` Raj Sahu
2025-07-04 19:10       ` Kumar Kartikeya Dwivedi
2025-07-07 17:40         ` Alexei Starovoitov
2025-07-07 19:16           ` Kumar Kartikeya Dwivedi
2025-07-07 20:09             ` Alexei Starovoitov
2025-07-07 22:08             ` Zvi Effron
2025-07-07 22:39               ` Kumar Kartikeya Dwivedi
2025-07-08  7:07           ` Raj Sahu
2025-07-10  0:54             ` Kumar Kartikeya Dwivedi
2025-06-14  6:40 ` [RFC bpf-next v2 4/4] selftests/bpf: Adds selftests to check termination of long running nested bpf loops Siddharth Chintamaneni
2025-06-30 13:03 ` [RFC bpf-next v2 0/4] bpf: Fast-Path approach for BPF program Kumar Kartikeya Dwivedi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).