* [PATCH 0/4] bpf: Fast-Path approach for BPF program termination
@ 2025-09-07 23:04 Siddharth Chintamaneni
2025-09-07 23:04 ` [PATCH 1/4] bpf: Introduce new structs and struct fields for fast path termination Siddharth Chintamaneni
` (4 more replies)
0 siblings, 5 replies; 12+ messages in thread
From: Siddharth Chintamaneni @ 2025-09-07 23:04 UTC (permalink / raw)
To: bpf
Cc: ast, daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, djwillia, miloc,
ericts, rahult, doniaghazy, quanzhif, jinghao7, sidchintamaneni,
memxor, egor, sairoop10, rjsu26
This is RFC v3 of
https://lore.kernel.org/all/20250614064056.237005-1-sidchintamaneni@gmail.com/
The termination handler call from the softlockup detector mainly
demonstrates the end-to-end workflow and serves as one potential use
case for discussion. The runtime mechanism itself, however, is modular
enough to be ported to other scenarios such as deadlocks, page faults,
userspace BPF management tools, and stack overflows.
The main change in this version: we have eliminated the memory overhead
caused by program cloning in previous versions. During normal program
execution, the termination logic adds no overhead.
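At a high level: the verifier records the offsets of all stubbable call
sites at load time, the JIT translates those offsets into offsets in the
JITed image, and on a detected stall each site is text_poked to a stub.
A rough sketch using the names introduced in patches 1-3 (control flow
heavily simplified):

  /* load time (patch 2): remember each RET_NULL helper/kfunc call site */
  call_states[n].call_bpf_insn_idx = insn_idx;
  call_states[n].is_helper_kfunc = 1;

  /* JIT time (patch 2): translate the BPF insn index into an offset
   * in the JITed image
   */
  call_states[n].jit_call_idx = addrs[call_states[n].call_bpf_insn_idx];

  /* stall time (patch 3): redirect each recorded call site to a stub
   * that returns NULL (or, for bpf_loop callbacks, returns 1 to stop
   * the loop); new_insn holds the 5-byte call to the stub
   */
  smp_text_poke_batch_add((unsigned char *)prog->bpf_func +
                          call_states[n].jit_call_idx, new_insn, 5, NULL);
  smp_text_poke_batch_finish();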
Change log:
v2 -> v3:
- Cloning of BPF programs has been removed.
- Created a call sites table to maintain helper/kfunc call instruction
offsets.
- Termination is triggered inside the softlockup detector, without
affecting any fast-path operations.
v1 -> v2:
- Patch generation has been moved after verification and before JIT.
- Now patch generation handles both helpers and kfuncs.
- Sanity check on original prog and patch prog after JIT.
- The runtime termination handler is now a global termination mechanism
using text_poke.
- Termination is triggered by watchdog timer.
arch/x86/net/bpf_jit_comp.c | 141 ++++++++++++++++++
include/linux/bpf.h | 77 ++++++----
include/linux/bpf_verifier.h | 1 +
include/linux/filter.h | 6 +
kernel/bpf/core.c | 67 +++++++++
kernel/bpf/verifier.c | 135 +++++++++++++++--
kernel/watchdog.c | 8 +
.../bpf/prog_tests/bpf_termination.c | 39 +++++
.../selftests/bpf/progs/bpf_termination.c | 47 ++++++
9 files changed, 482 insertions(+), 39 deletions(-)
create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_termination.c
create mode 100644 tools/testing/selftests/bpf/progs/bpf_termination.c
--
2.43.0
* [PATCH 1/4] bpf: Introduce new structs and struct fields for fast path termination
2025-09-07 23:04 [PATCH 0/4] bpf: Fast-Path approach for BPF program termination Siddharth Chintamaneni
@ 2025-09-07 23:04 ` Siddharth Chintamaneni
2025-09-17 2:11 ` Kumar Kartikeya Dwivedi
2025-09-07 23:04 ` [PATCH 2/4] bpf: Creating call sites table to stub instructions during runtime Siddharth Chintamaneni
` (3 subsequent siblings)
4 siblings, 1 reply; 12+ messages in thread
From: Siddharth Chintamaneni @ 2025-09-07 23:04 UTC (permalink / raw)
To: bpf
Cc: ast, daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, djwillia, miloc,
ericts, rahult, doniaghazy, quanzhif, jinghao7, sidchintamaneni,
memxor, egor, sairoop10, rjsu26
Introduced the definition of struct bpf_term_aux_states
required to support fast-path termination of BPF programs.
Added the memory allocation and free logic for newly added
term_states feild in struct bpf_prog.
Signed-off-by: Raj Sahu <rjsu26@gmail.com>
Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
---
include/linux/bpf.h | 75 +++++++++++++++++++++++++++++----------------
kernel/bpf/core.c | 31 +++++++++++++++++++
2 files changed, 79 insertions(+), 27 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8f6e87f0f3a8..caaee33744fc 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1584,6 +1584,25 @@ struct bpf_stream_stage {
int len;
};
+struct call_aux_states {
+ int call_bpf_insn_idx;
+ int jit_call_idx;
+ u8 is_helper_kfunc;
+ u8 is_bpf_loop;
+ u8 is_bpf_loop_cb_inline;
+};
+
+struct bpf_term_patch_call_sites {
+ u32 call_sites_cnt;
+ struct call_aux_states *call_states;
+};
+
+struct bpf_term_aux_states {
+ struct bpf_prog *prog;
+ struct work_struct work;
+ struct bpf_term_patch_call_sites *patch_call_sites;
+};
+
struct bpf_prog_aux {
atomic64_t refcnt;
u32 used_map_cnt;
@@ -1618,6 +1637,7 @@ struct bpf_prog_aux {
bool tail_call_reachable;
bool xdp_has_frags;
bool exception_cb;
+ bool is_bpf_loop_cb_non_inline;
bool exception_boundary;
bool is_extended; /* true if extended by freplace program */
bool jits_use_priv_stack;
@@ -1696,33 +1716,34 @@ struct bpf_prog_aux {
};
struct bpf_prog {
- u16 pages; /* Number of allocated pages */
- u16 jited:1, /* Is our filter JIT'ed? */
- jit_requested:1,/* archs need to JIT the prog */
- gpl_compatible:1, /* Is filter GPL compatible? */
- cb_access:1, /* Is control block accessed? */
- dst_needed:1, /* Do we need dst entry? */
- blinding_requested:1, /* needs constant blinding */
- blinded:1, /* Was blinded */
- is_func:1, /* program is a bpf function */
- kprobe_override:1, /* Do we override a kprobe? */
- has_callchain_buf:1, /* callchain buffer allocated? */
- enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
- call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
- call_get_func_ip:1, /* Do we call get_func_ip() */
- tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */
- sleepable:1; /* BPF program is sleepable */
- enum bpf_prog_type type; /* Type of BPF program */
- enum bpf_attach_type expected_attach_type; /* For some prog types */
- u32 len; /* Number of filter blocks */
- u32 jited_len; /* Size of jited insns in bytes */
- u8 tag[BPF_TAG_SIZE];
- struct bpf_prog_stats __percpu *stats;
- int __percpu *active;
- unsigned int (*bpf_func)(const void *ctx,
- const struct bpf_insn *insn);
- struct bpf_prog_aux *aux; /* Auxiliary fields */
- struct sock_fprog_kern *orig_prog; /* Original BPF program */
+ u16 pages; /* Number of allocated pages */
+ u16 jited:1, /* Is our filter JIT'ed? */
+ jit_requested:1,/* archs need to JIT the prog */
+ gpl_compatible:1, /* Is filter GPL compatible? */
+ cb_access:1, /* Is control block accessed? */
+ dst_needed:1, /* Do we need dst entry? */
+ blinding_requested:1, /* needs constant blinding */
+ blinded:1, /* Was blinded */
+ is_func:1, /* program is a bpf function */
+ kprobe_override:1, /* Do we override a kprobe? */
+ has_callchain_buf:1, /* callchain buffer allocated? */
+ enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
+ call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
+ call_get_func_ip:1, /* Do we call get_func_ip() */
+ tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */
+ sleepable:1; /* BPF program is sleepable */
+ enum bpf_prog_type type; /* Type of BPF program */
+ enum bpf_attach_type expected_attach_type; /* For some prog types */
+ u32 len; /* Number of filter blocks */
+ u32 jited_len; /* Size of jited insns in bytes */
+ u8 tag[BPF_TAG_SIZE];
+ struct bpf_prog_stats __percpu *stats;
+ int __percpu *active;
+ unsigned int (*bpf_func)(const void *ctx,
+ const struct bpf_insn *insn);
+ struct bpf_prog_aux *aux; /* Auxiliary fields */
+ struct sock_fprog_kern *orig_prog; /* Original BPF program */
+ struct bpf_term_aux_states *term_states;
/* Instructions for interpreter */
union {
DECLARE_FLEX_ARRAY(struct sock_filter, insns);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ef01cc644a96..740b5a3a6b55 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -100,6 +100,8 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
struct bpf_prog_aux *aux;
struct bpf_prog *fp;
+ struct bpf_term_aux_states *term_states = NULL;
+ struct bpf_term_patch_call_sites *patch_call_sites = NULL;
size = round_up(size, __PAGE_SIZE);
fp = __vmalloc(size, gfp_flags);
@@ -118,11 +120,24 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
return NULL;
}
+ term_states = kzalloc(sizeof(*term_states), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
+ if (!term_states)
+ goto free_alloc_percpu;
+
+ patch_call_sites = kzalloc(sizeof(*patch_call_sites), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
+ if (!patch_call_sites)
+ goto free_bpf_term_states;
+
fp->pages = size / PAGE_SIZE;
fp->aux = aux;
fp->aux->prog = fp;
fp->jit_requested = ebpf_jit_enabled();
fp->blinding_requested = bpf_jit_blinding_enabled(fp);
+ fp->term_states = term_states;
+ fp->term_states->patch_call_sites = patch_call_sites;
+ fp->term_states->patch_call_sites->call_sites_cnt = 0;
+ fp->term_states->prog = fp;
+
#ifdef CONFIG_CGROUP_BPF
aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
#endif
@@ -140,6 +155,15 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
#endif
return fp;
+
+free_bpf_term_states:
+ kfree(term_states);
+free_alloc_percpu:
+ free_percpu(fp->active);
+ kfree(aux);
+ vfree(fp);
+
+ return NULL;
}
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
@@ -266,6 +290,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
fp->pages = pages;
fp->aux->prog = fp;
+ fp->term_states->prog = fp;
/* We keep fp->aux from fp_old around in the new
* reallocated structure.
@@ -273,6 +298,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
fp_old->aux = NULL;
fp_old->stats = NULL;
fp_old->active = NULL;
+ fp_old->term_states = NULL;
__bpf_prog_free(fp_old);
}
@@ -287,6 +313,11 @@ void __bpf_prog_free(struct bpf_prog *fp)
kfree(fp->aux->poke_tab);
kfree(fp->aux);
}
+ if (fp->term_states) {
+ if (fp->term_states->patch_call_sites)
+ kfree(fp->term_states->patch_call_sites);
+ kfree(fp->term_states);
+ }
free_percpu(fp->stats);
free_percpu(fp->active);
vfree(fp);
--
2.43.0
* [PATCH 2/4] bpf: Creating call sites table to stub instructions during runtime
2025-09-07 23:04 [PATCH 0/4] bpf: Fast-Path approach for BPF program termination Siddharth Chintamaneni
2025-09-07 23:04 ` [PATCH 1/4] bpf: Introduce new structs and struct fields for fast path termination Siddharth Chintamaneni
@ 2025-09-07 23:04 ` Siddharth Chintamaneni
2025-09-07 23:04 ` [PATCH 3/4] bpf: runtime part of fast-path termination approach Siddharth Chintamaneni
` (2 subsequent siblings)
4 siblings, 0 replies; 12+ messages in thread
From: Siddharth Chintamaneni @ 2025-09-07 23:04 UTC (permalink / raw)
To: bpf
Cc: ast, daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, djwillia, miloc,
ericts, rahult, doniaghazy, quanzhif, jinghao7, sidchintamaneni,
memxor, egor, sairoop10, rjsu26
Create a call sites table and store the JIT indexes of RET_NULL calls
so they can be poked later with dummy functions. In addition to the JIT
indexes, metadata about helpers/kfuncs/loops is stored. This could
later be extended to the remaining potentially long-running iterator
helpers/kfuncs.
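As a concrete illustration (the indices are made up): for a program
whose BPF instruction 12 is a call to bpf_loop, the table would end up
holding roughly:

  struct call_aux_states st = {
          .call_bpf_insn_idx = 12,   /* BPF insn index of the call */
          .jit_call_idx = 0x8f,      /* addrs[12]: offset of the call in the JITed image */
          .is_bpf_loop = 1,
  };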
Signed-off-by: Raj Sahu <rjsu26@gmail.com>
Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
---
arch/x86/net/bpf_jit_comp.c | 9 +++
include/linux/bpf_verifier.h | 1 +
kernel/bpf/core.c | 5 +-
kernel/bpf/verifier.c | 135 +++++++++++++++++++++++++++++++----
4 files changed, 137 insertions(+), 13 deletions(-)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 7e3fca164620..107a44729675 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -3733,6 +3733,15 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
}
if (!image || !prog->is_func || extra_pass) {
+
+ if (addrs) {
+ struct bpf_term_patch_call_sites *patch_call_sites = prog->term_states->patch_call_sites;
+ for (int i = 0; i < patch_call_sites->call_sites_cnt; i++) {
+ struct call_aux_states *call_states = patch_call_sites->call_states + i;
+ call_states->jit_call_idx = addrs[call_states->call_bpf_insn_idx];
+ }
+ }
+
if (image)
bpf_prog_fill_jited_linfo(prog, addrs + 1);
out_addrs:
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 020de62bd09c..2c8bfde8191a 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -677,6 +677,7 @@ struct bpf_subprog_info {
bool is_cb: 1;
bool is_async_cb: 1;
bool is_exception_cb: 1;
+ bool is_bpf_loop_cb_non_inline: 1;
bool args_cached: 1;
/* true if bpf_fastcall stack region is used by functions that can't be inlined */
bool keep_fastcall_stack: 1;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 740b5a3a6b55..93442ab2acde 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -136,6 +136,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
fp->term_states = term_states;
fp->term_states->patch_call_sites = patch_call_sites;
fp->term_states->patch_call_sites->call_sites_cnt = 0;
+ fp->term_states->patch_call_sites->call_states = NULL;
fp->term_states->prog = fp;
#ifdef CONFIG_CGROUP_BPF
@@ -314,8 +315,10 @@ void __bpf_prog_free(struct bpf_prog *fp)
kfree(fp->aux);
}
if (fp->term_states) {
- if (fp->term_states->patch_call_sites)
+ if (fp->term_states->patch_call_sites) {
+ vfree(fp->term_states->patch_call_sites->call_states);
kfree(fp->term_states->patch_call_sites);
+ }
kfree(fp->term_states);
}
free_percpu(fp->stats);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b9394f8fac0e..1d27208e1078 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3491,6 +3491,7 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
* logic. 'subprog_cnt' should not be increased.
*/
subprog[env->subprog_cnt].start = insn_cnt;
+ subprog[env->subprog_cnt].is_bpf_loop_cb_non_inline = false;
if (env->log.level & BPF_LOG_LEVEL2)
for (i = 0; i < env->subprog_cnt; i++)
@@ -11319,19 +11320,30 @@ static bool loop_flag_is_zero(struct bpf_verifier_env *env)
static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
{
struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
+ struct bpf_subprog_info *prev_info, *info = subprog_info(env, subprogno);
if (!state->initialized) {
state->initialized = 1;
state->fit_for_inline = loop_flag_is_zero(env);
state->callback_subprogno = subprogno;
+ if (!state->fit_for_inline)
+ info->is_bpf_loop_cb_non_inline = 1;
return;
}
- if (!state->fit_for_inline)
+ if (!state->fit_for_inline) {
+ info->is_bpf_loop_cb_non_inline = 1;
return;
+ }
state->fit_for_inline = (loop_flag_is_zero(env) &&
state->callback_subprogno == subprogno);
+
+ if (state->callback_subprogno != subprogno) {
+ info->is_bpf_loop_cb_non_inline = 1;
+ prev_info = subprog_info(env, state->callback_subprogno);
+ prev_info->is_bpf_loop_cb_non_inline = 1;
+ }
}
/* Returns whether or not the given map type can potentially elide
@@ -21120,6 +21132,9 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
int i, patch_len, delta = 0, len = env->prog->len;
struct bpf_insn *insns = env->prog->insnsi;
struct bpf_prog *new_prog;
+ struct bpf_term_aux_states *term_states = env->prog->term_states;
+ u32 call_sites_cnt = term_states->patch_call_sites->call_sites_cnt;
+ struct call_aux_states *call_states = term_states->patch_call_sites->call_states;
bool rnd_hi32;
rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32;
@@ -21205,6 +21220,15 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
insns = new_prog->insnsi;
aux = env->insn_aux_data;
delta += patch_len - 1;
+
+ /* Adjust call instruction offsets
+ * w.r.t adj_idx
+ */
+ for (int iter = 0; iter < call_sites_cnt; iter++) {
+ if (call_states[iter].call_bpf_insn_idx < adj_idx)
+ continue;
+ call_states[iter].call_bpf_insn_idx += patch_len - 1;
+ }
}
return 0;
@@ -21597,6 +21621,26 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
func[i]->aux->poke_tab = prog->aux->poke_tab;
func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
+ func[i]->aux->is_bpf_loop_cb_non_inline = env->subprog_info[i].is_bpf_loop_cb_non_inline;
+
+ if (prog->term_states->patch_call_sites->call_sites_cnt != 0) {
+ int call_sites_cnt = 0;
+ struct call_aux_states *func_call_states;
+ func_call_states = vzalloc(sizeof(*func_call_states) * len);
+ if (!func_call_states)
+ goto out_free;
+ for (int iter = 0; iter < prog->term_states->patch_call_sites->call_sites_cnt; iter++) {
+ struct call_aux_states call_states = prog->term_states->patch_call_sites->call_states[iter];
+ if (call_states.call_bpf_insn_idx >= subprog_start
+ && call_states.call_bpf_insn_idx < subprog_end) {
+ func_call_states[call_sites_cnt] = call_states;
+ func_call_states[call_sites_cnt].call_bpf_insn_idx -= subprog_start;
+ call_sites_cnt++;
+ }
+ }
+ func[i]->term_states->patch_call_sites->call_sites_cnt = call_sites_cnt;
+ func[i]->term_states->patch_call_sites->call_states = func_call_states;
+ }
for (j = 0; j < prog->aux->size_poke_tab; j++) {
struct bpf_jit_poke_descriptor *poke;
@@ -21886,15 +21930,21 @@ static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
}
static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
- struct bpf_insn *insn_buf, int insn_idx, int *cnt)
+ struct bpf_insn *insn_buf, int insn_idx, int *cnt, int *kfunc_btf_id)
{
const struct bpf_kfunc_desc *desc;
+ struct bpf_kfunc_call_arg_meta meta;
+ int err;
if (!insn->imm) {
verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
return -EINVAL;
}
+ err = fetch_kfunc_meta(env, insn, &meta, NULL);
+ if (err)
+ return err;
+
*cnt = 0;
/* insn->imm has the btf func_id. Replace it with an offset relative to
@@ -21908,8 +21958,11 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EFAULT;
}
- if (!bpf_jit_supports_far_kfunc_call())
+ if (!bpf_jit_supports_far_kfunc_call()) {
+ if (meta.kfunc_flags & KF_RET_NULL)
+ *kfunc_btf_id = insn->imm;
insn->imm = BPF_CALL_IMM(desc->addr);
+ }
if (insn->off)
return 0;
if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl] ||
@@ -22019,6 +22072,13 @@ static int add_hidden_subprog(struct bpf_verifier_env *env, struct bpf_insn *pat
return 0;
}
+static bool is_bpf_loop_call(struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0 &&
+ insn->imm == BPF_FUNC_loop;
+}
+
/* Do various post-verification rewrites in a single program pass.
* These rewrites simplify JIT and interpreter implementations.
*/
@@ -22039,6 +22099,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
struct bpf_subprog_info *subprogs = env->subprog_info;
u16 stack_depth = subprogs[cur_subprog].stack_depth;
u16 stack_depth_extra = 0;
+ u32 call_sites_cnt = 0;
+ struct call_aux_states *call_states;
+
+ call_states = vzalloc(sizeof(*call_states) * prog->len);
+ if (!call_states)
+ return -ENOMEM;
if (env->seen_exception && !env->exception_callback_subprog) {
struct bpf_insn *patch = insn_buf;
@@ -22368,11 +22434,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
if (insn->src_reg == BPF_PSEUDO_CALL)
goto next_insn;
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
- ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
+ int kfunc_btf_id = 0;
+ ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt, &kfunc_btf_id);
if (ret)
return ret;
if (cnt == 0)
- goto next_insn;
+ goto store_call_indices;
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
@@ -22381,6 +22448,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
+store_call_indices:
+ if (kfunc_btf_id != 0) {
+ call_states[call_sites_cnt].call_bpf_insn_idx = i + delta;
+ call_states[call_sites_cnt].is_helper_kfunc = 1;
+ call_sites_cnt++;
+ }
goto next_insn;
}
@@ -22859,6 +22932,15 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
func_id_name(insn->imm), insn->imm);
return -EFAULT;
}
+
+ if ((fn->ret_type & PTR_MAYBE_NULL) || is_bpf_loop_call(insn)) {
+ call_states[call_sites_cnt].call_bpf_insn_idx = i + delta;
+ if (is_bpf_loop_call(insn))
+ call_states[call_sites_cnt].is_bpf_loop = 1;
+ else
+ call_states[call_sites_cnt].is_helper_kfunc = 1;
+ call_sites_cnt++;
+ }
insn->imm = fn->func - __bpf_call_base;
next_insn:
if (subprogs[cur_subprog + 1].start == i + delta + 1) {
@@ -22879,6 +22961,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
insn++;
}
+ env->prog->term_states->patch_call_sites->call_sites_cnt = call_sites_cnt;
+ env->prog->term_states->patch_call_sites->call_states = call_states;
env->prog->aux->stack_depth = subprogs[0].stack_depth;
for (i = 0; i < env->subprog_cnt; i++) {
int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1;
@@ -23014,17 +23098,12 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
call_insn_offset = position + 12;
callback_offset = callback_start - call_insn_offset - 1;
new_prog->insnsi[call_insn_offset].imm = callback_offset;
+ /* Marking offset field to identify loop cb */
+ new_prog->insnsi[call_insn_offset].off = 0x1;
return new_prog;
}
-static bool is_bpf_loop_call(struct bpf_insn *insn)
-{
- return insn->code == (BPF_JMP | BPF_CALL) &&
- insn->src_reg == 0 &&
- insn->imm == BPF_FUNC_loop;
-}
-
/* For all sub-programs in the program (including main) check
* insn_aux_data to see if there are bpf_loop calls that require
* inlining. If such calls are found the calls are replaced with a
@@ -24584,6 +24663,35 @@ static int compute_scc(struct bpf_verifier_env *env)
return err;
}
+static int fix_call_sites(struct bpf_verifier_env *env)
+{
+ int err = 0, i, subprog;
+ struct bpf_insn *insn;
+ struct bpf_prog *prog = env->prog;
+ struct bpf_term_aux_states *term_states = env->prog->term_states;
+ u32 *call_sites_cnt = &term_states->patch_call_sites->call_sites_cnt;
+ struct call_aux_states *call_states = term_states->patch_call_sites->call_states;
+
+ if (!env->subprog_cnt)
+ return 0;
+ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
+ if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn))
+ continue;
+
+ subprog = find_subprog(env, i + insn->imm + 1);
+ if (subprog < 0)
+ return -EFAULT;
+
+ if (insn->off == 0x1) {
+ call_states[*call_sites_cnt].call_bpf_insn_idx = i;
+ call_states[*call_sites_cnt].is_bpf_loop_cb_inline = 1;
+ *call_sites_cnt = *call_sites_cnt + 1;
+ prog->insnsi[i].off = 0x0; /* Removing the marker */
+ }
+ }
+ return err;
+}
+
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
u64 start_time = ktime_get_ns();
@@ -24769,6 +24877,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
: false;
}
+ if (ret == 0)
+ ret = fix_call_sites(env);
+
if (ret == 0)
ret = fixup_call_args(env);
--
2.43.0
* [PATCH 3/4] bpf: runtime part of fast-path termination approach
2025-09-07 23:04 [PATCH 0/4] bpf: Fast-Path approach for BPF program termination Siddharth Chintamaneni
2025-09-07 23:04 ` [PATCH 1/4] bpf: Introduce new structs and struct fields for fast path termination Siddharth Chintamaneni
2025-09-07 23:04 ` [PATCH 2/4] bpf: Creating call sites table to stub instructions during runtime Siddharth Chintamaneni
@ 2025-09-07 23:04 ` Siddharth Chintamaneni
2025-09-08 6:01 ` kernel test robot
` (2 more replies)
2025-09-07 23:04 ` [PATCH 4/4] selftests/bpf: Adds selftests to check termination of long running nested bpf loops Siddharth Chintamaneni
2025-09-17 2:13 ` [PATCH 0/4] bpf: Fast-Path approach for BPF program termination Kumar Kartikeya Dwivedi
4 siblings, 3 replies; 12+ messages in thread
From: Siddharth Chintamaneni @ 2025-09-07 23:04 UTC (permalink / raw)
To: bpf
Cc: ast, daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, djwillia, miloc,
ericts, rahult, doniaghazy, quanzhif, jinghao7, sidchintamaneni,
memxor, egor, sairoop10, rjsu26
Update softlockup detection logic to detect stalls caused by BPF
programs. When a softlockup is detected, bpf_die is queued on a
workqueue on a CPU. With this implementation the termination handler
only gets triggered when CONFIG_SOFTLOCKUP_DETECTOR is enabled.
Inside bpf_die, we perform the text_poke to stub helpers/kfuncs.
The current implementation handles termination of long running
bpf_loop iterators in both the inlined and non-inlined cases.
The limitation of this implementation is that the termination handler
needs at least one CPU to run.
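For reference, the stubbing rewrites each recorded call site with a
5-byte x86-64 near call (opcode 0xE8) whose rel32 operand is relative
to the end of the instruction. A minimal sketch of that encoding, using
the same arithmetic as in_place_patch_bpf_prog below:

  unsigned char *site = (unsigned char *)prog->bpf_func + jit_call_idx;
  s32 rel = (s32)((unsigned long)stub - (unsigned long)(site + 5));
  unsigned char insn[5] = { 0xE8, rel & 0xff, (rel >> 8) & 0xff,
                            (rel >> 16) & 0xff, (rel >> 24) & 0xff };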
Signed-off-by: Raj Sahu <rjsu26@gmail.com>
Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
---
arch/x86/net/bpf_jit_comp.c | 132 ++++++++++++++++++++++++++++++++++++
include/linux/bpf.h | 2 +
include/linux/filter.h | 6 ++
kernel/bpf/core.c | 35 +++++++++-
kernel/watchdog.c | 8 +++
5 files changed, 182 insertions(+), 1 deletion(-)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 107a44729675..4de9a8cdc465 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2606,6 +2606,10 @@ st: if (is_imm8(insn->off))
if (arena_vm_start)
pop_r12(&prog);
}
+ /* emiting 5 byte nop for non-inline bpf_loop callback */
+ if (bpf_is_subprog(bpf_prog) && bpf_prog->aux->is_bpf_loop_cb_non_inline) {
+ emit_nops(&prog, X86_PATCH_SIZE);
+ }
EMIT1(0xC9); /* leave */
emit_return(&prog, image + addrs[i - 1] + (prog - temp));
break;
@@ -3833,6 +3837,8 @@ bool bpf_jit_supports_private_stack(void)
return true;
}
+
+
void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
{
#if defined(CONFIG_UNWINDER_ORC)
@@ -3849,6 +3855,132 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp
#endif
}
+void in_place_patch_bpf_prog(struct bpf_prog *prog)
+{
+ struct call_aux_states *call_states;
+ unsigned long new_target;
+ unsigned char *addr;
+ u8 ret_jmp_size = 1;
+ if (cpu_wants_rethunk()) {
+ ret_jmp_size = 5;
+ }
+ call_states = prog->term_states->patch_call_sites->call_states;
+ for (int i = 0; i < prog->term_states->patch_call_sites->call_sites_cnt; i++) {
+
+ new_target = (unsigned long) bpf_termination_null_func;
+ if (call_states[i].is_bpf_loop_cb_inline) {
+ new_target = (unsigned long) bpf_loop_term_callback;
+ }
+ char new_insn[5];
+
+ addr = (unsigned char *)prog->bpf_func + call_states->jit_call_idx;
+
+ unsigned long new_rel = (unsigned long)(new_target - (unsigned long)(addr + 5));
+ new_insn[0] = 0xE8;
+ new_insn[1] = (new_rel >> 0) & 0xFF;
+ new_insn[2] = (new_rel >> 8) & 0xFF;
+ new_insn[3] = (new_rel >> 16) & 0xFF;
+ new_insn[4] = (new_rel >> 24) & 0xFF;
+
+ smp_text_poke_batch_add(addr, new_insn, 5 /* call instruction len */, NULL);
+ }
+
+ if (prog->aux->is_bpf_loop_cb_non_inline) {
+
+ char new_insn[5] = { 0xB8, 0x01, 0x00, 0x00, 0x00 };
+ char old_insn[5] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 };
+ smp_text_poke_batch_add(prog->bpf_func + prog->jited_len -
+ (1 + ret_jmp_size) /* leave, jmp/ ret */ - 5 /* nop size */, new_insn, 5 /* mov eax, 1 */, old_insn);
+ }
+
+
+ /* flush all text poke calls */
+ smp_text_poke_batch_finish();
+}
+
+void bpf_die(struct bpf_prog *prog)
+{
+ u8 ret_jmp_size = 1;
+ if (cpu_wants_rethunk()) {
+ ret_jmp_size = 5;
+ }
+
+ /*
+ * Replacing 5 byte nop in prologue with jmp instruction to ret
+ */
+ unsigned long jmp_offset = prog->jited_len - (4 /* First endbr is 4 bytes */
+ + 5 /* noop is 5 bytes */
+ + ret_jmp_size /* 5 bytes of jmp return_thunk or 1 byte ret*/);
+
+ char new_insn[5];
+ new_insn[0] = 0xE9;
+ new_insn[1] = (jmp_offset >> 0) & 0xFF;
+ new_insn[2] = (jmp_offset >> 8) & 0xFF;
+ new_insn[3] = (jmp_offset >> 16) & 0xFF;
+ new_insn[4] = (jmp_offset >> 24) & 0xFF;
+
+ smp_text_poke_batch_add(prog->bpf_func + 4, new_insn, 5, NULL);
+
+ if (prog->aux->func_cnt) {
+ for (int i = 0; i < prog->aux->func_cnt; i++) {
+ in_place_patch_bpf_prog(prog->aux->func[i]);
+ }
+ } else {
+ in_place_patch_bpf_prog(prog);
+ }
+
+}
+
+void bpf_prog_termination_deferred(struct work_struct *work)
+{
+ struct bpf_term_aux_states *term_states = container_of(work, struct bpf_term_aux_states,
+ work);
+ struct bpf_prog *prog = term_states->prog;
+
+ bpf_die(prog);
+}
+
+static struct workqueue_struct *bpf_termination_wq;
+
+void bpf_softlockup(u32 dur_s)
+{
+ unsigned long addr;
+ struct unwind_state state;
+ struct bpf_prog *prog;
+
+ for (unwind_start(&state, current, NULL, NULL); !unwind_done(&state);
+ unwind_next_frame(&state)) {
+ addr = unwind_get_return_address(&state);
+ if (!addr)
+ break;
+
+ if (!is_bpf_text_address(addr))
+ continue;
+
+ rcu_read_lock();
+ prog = bpf_prog_ksym_find(addr);
+ rcu_read_unlock();
+ if (bpf_is_subprog(prog))
+ continue;
+
+ if (atomic_cmpxchg(&prog->term_states->bpf_die_in_progress, 0, 1))
+ break;
+
+ bpf_termination_wq = alloc_workqueue("bpf_termination_wq", WQ_UNBOUND, 1);
+ if (!bpf_termination_wq)
+ pr_err("Failed to alloc workqueue for bpf termination.\n");
+
+ queue_work(bpf_termination_wq, &prog->term_states->work);
+
+ /* Currently nested programs are not terminated together.
+ * Removing this break will result in BPF trampolines being
+ * identified as is_bpf_text_address resulting in NULL ptr
+ * deref in next step.
+ */
+ break;
+ }
+}
+
void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
struct bpf_prog *new, struct bpf_prog *old)
{
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index caaee33744fc..03fce8f2c466 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -71,6 +71,7 @@ typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
typedef unsigned int (*bpf_func_t)(const void *,
const struct bpf_insn *);
+
struct bpf_iter_seq_info {
const struct seq_operations *seq_ops;
bpf_iter_init_seq_priv_t init_seq_private;
@@ -1600,6 +1601,7 @@ struct bpf_term_patch_call_sites {
struct bpf_term_aux_states {
struct bpf_prog *prog;
struct work_struct work;
+ atomic_t bpf_die_in_progress;
struct bpf_term_patch_call_sites *patch_call_sites;
};
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 9092d8ea95c8..4f0f8fe478bf 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1123,6 +1123,8 @@ int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len);
bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
+void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx);
u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
#define __bpf_call_base_args \
((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \
@@ -1257,6 +1259,10 @@ bpf_jit_binary_pack_hdr(const struct bpf_prog *fp);
void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns);
void bpf_prog_pack_free(void *ptr, u32 size);
+void bpf_softlockup(u32 dur_s);
+void bpf_prog_termination_deferred(struct work_struct *work);
+void bpf_die(struct bpf_prog *prog);
+void in_place_patch_bpf_prog(struct bpf_prog *prog);
static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
{
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 93442ab2acde..7b0552d15be3 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -41,6 +41,7 @@
#include <linux/execmem.h>
#include <asm/barrier.h>
+#include <asm/unwind.h>
#include <linux/unaligned.h>
/* Registers */
@@ -95,6 +96,37 @@ enum page_size_enum {
__PAGE_SIZE = PAGE_SIZE
};
+void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return NULL;
+}
+
+int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx)
+{
+ return 1;
+}
+
+
+void __weak in_place_patch_bpf_prog(struct bpf_prog *prog)
+{
+ return;
+}
+
+void __weak bpf_die(struct bpf_prog *prog)
+{
+ return;
+}
+
+void __weak bpf_prog_termination_deferred(struct work_struct *work)
+{
+ return;
+}
+
+void __weak bpf_softlockup(u32 dur_s)
+{
+ return;
+}
+
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
@@ -134,11 +166,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
fp->jit_requested = ebpf_jit_enabled();
fp->blinding_requested = bpf_jit_blinding_enabled(fp);
fp->term_states = term_states;
+ atomic_set(&fp->term_states->bpf_die_in_progress, 0);
fp->term_states->patch_call_sites = patch_call_sites;
fp->term_states->patch_call_sites->call_sites_cnt = 0;
fp->term_states->patch_call_sites->call_states = NULL;
fp->term_states->prog = fp;
-
+ INIT_WORK(&fp->term_states->work, bpf_prog_termination_deferred);
#ifdef CONFIG_CGROUP_BPF
aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
#endif
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 80b56c002c7f..59c91c18ca0e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -25,6 +25,7 @@
#include <linux/stop_machine.h>
#include <linux/sysctl.h>
#include <linux/tick.h>
+#include <linux/filter.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
@@ -700,6 +701,13 @@ static int is_softlockup(unsigned long touch_ts,
if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
scx_softlockup(now - touch_ts);
+ /*
+ * Long running BPF programs can cause CPU's to stall.
+ * So trigger fast path termination to terminate such BPF programs.
+ */
+ if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
+ bpf_softlockup(now - touch_ts);
+
/* Warn about unreasonable delays. */
if (time_after(now, period_ts + get_softlockup_thresh()))
return now - touch_ts;
--
2.43.0
* [PATCH 4/4] selftests/bpf: Adds selftests to check termination of long running nested bpf loops
2025-09-07 23:04 [PATCH 0/4] bpf: Fast-Path approach for BPF program termination Siddharth Chintamaneni
` (2 preceding siblings ...)
2025-09-07 23:04 ` [PATCH 3/4] bpf: runtime part of fast-path termination approach Siddharth Chintamaneni
@ 2025-09-07 23:04 ` Siddharth Chintamaneni
2025-09-17 2:13 ` [PATCH 0/4] bpf: Fast-Path approach for BPF program termination Kumar Kartikeya Dwivedi
4 siblings, 0 replies; 12+ messages in thread
From: Siddharth Chintamaneni @ 2025-09-07 23:04 UTC (permalink / raw)
To: bpf
Cc: ast, daniel, andrii, martin.lau, eddyz87, song, yonghong.song,
john.fastabend, kpsingh, sdf, haoluo, jolsa, djwillia, miloc,
ericts, rahult, doniaghazy, quanzhif, jinghao7, sidchintamaneni,
memxor, egor, sairoop10, rjsu26
Add selftests that check termination of nested long-running bpf_loop calls.
32/1 bpf_termination/bpf_termination:OK
32 bpf_termination:OK
Summary: 1/1 PASSED, 0 SKIPPED, 0 FAILED
Signed-off-by: Raj Sahu <rjsu26@gmail.com>
Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
---
.../bpf/prog_tests/bpf_termination.c | 39 +++++++++++++++
.../selftests/bpf/progs/bpf_termination.c | 47 +++++++++++++++++++
2 files changed, 86 insertions(+)
create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_termination.c
create mode 100644 tools/testing/selftests/bpf/progs/bpf_termination.c
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_termination.c b/tools/testing/selftests/bpf/prog_tests/bpf_termination.c
new file mode 100644
index 000000000000..d060073db8f9
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_termination.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include <sys/socket.h>
+
+#include "bpf_termination.skel.h"
+
+void test_loop_termination(void)
+{
+ struct bpf_termination *skel;
+ int err;
+
+ skel = bpf_termination__open();
+ if (!ASSERT_OK_PTR(skel, "bpf_termination__open"))
+ return;
+
+ err = bpf_termination__load(skel);
+ if (!ASSERT_OK(err, "bpf_termination__load"))
+ goto out;
+
+ skel->bss->pid = getpid();
+ err = bpf_termination__attach(skel);
+ if (!ASSERT_OK(err, "bpf_termination__attach"))
+ goto out;
+
+ /* Triggers long running BPF program */
+ socket(AF_UNSPEC, SOCK_DGRAM, 0);
+
+ /* If the program is not terminated, it doesn't reach this point */
+ ASSERT_TRUE(true, "Program is terminated");
+out:
+ bpf_termination__destroy(skel);
+}
+
+void test_bpf_termination(void)
+{
+ if (test__start_subtest("bpf_termination"))
+ test_loop_termination();
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_termination.c b/tools/testing/selftests/bpf/progs/bpf_termination.c
new file mode 100644
index 000000000000..36e97d84750b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_termination.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+int pid;
+
+#define LOOPS_CNT 1 << 10
+
+static int callback_fn4(void *ctx) {
+ return 0;
+}
+
+static int callback_fn3(void *ctx) {
+
+ bpf_loop(LOOPS_CNT, callback_fn4, NULL, 0);
+ return 0;
+
+}
+
+
+static int callback_fn2(void *ctx) {
+
+ bpf_loop(LOOPS_CNT, callback_fn3, NULL, 0);
+ return 0;
+
+}
+
+static int callback_fn(void *ctx) {
+
+ bpf_loop(LOOPS_CNT, callback_fn2, NULL, 0);
+ return 0;
+
+}
+
+SEC("tp/syscalls/sys_enter_socket")
+int bpf_loop_lr(void *ctx) {
+
+ if ((bpf_get_current_pid_tgid() >> 32) != pid)
+ return 0;
+
+ bpf_loop(LOOPS_CNT, callback_fn, NULL, 0);
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
--
2.43.0
* Re: [PATCH 3/4] bpf: runtime part of fast-path termination approach
2025-09-07 23:04 ` [PATCH 3/4] bpf: runtime part of fast-path termination approach Siddharth Chintamaneni
@ 2025-09-08 6:01 ` kernel test robot
2025-09-08 7:14 ` kernel test robot
2025-09-17 2:11 ` Kumar Kartikeya Dwivedi
2 siblings, 0 replies; 12+ messages in thread
From: kernel test robot @ 2025-09-08 6:01 UTC (permalink / raw)
To: Siddharth Chintamaneni, bpf
Cc: llvm, oe-kbuild-all, ast, daniel, andrii, martin.lau, eddyz87,
song, yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
djwillia, miloc, ericts, rahult, doniaghazy, quanzhif, jinghao7,
sidchintamaneni, memxor, egor, sairoop10, rjsu26
Hi Siddharth,
kernel test robot noticed the following build warnings:
[auto build test WARNING on bpf-next/net]
[also build test WARNING on bpf-next/master bpf/master linus/master v6.17-rc5 next-20250905]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Siddharth-Chintamaneni/bpf-Introduce-new-structs-and-struct-fields-for-fast-path-termination/20250908-070655
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git net
patch link: https://lore.kernel.org/r/20250907230415.289327-4-sidchintamaneni%40gmail.com
patch subject: [PATCH 3/4] bpf: runtime part of fast-path termination approach
config: s390-randconfig-001-20250908 (https://download.01.org/0day-ci/archive/20250908/202509081350.yqptixjh-lkp@intel.com/config)
compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project 7fb1dc08d2f025aad5777bb779dfac1197e9ef87)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250908/202509081350.yqptixjh-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202509081350.yqptixjh-lkp@intel.com/
All warnings (new ones prefixed by >>):
>> kernel/bpf/core.c:109:13: warning: no previous prototype for function 'in_place_patch_bpf_prog' [-Wmissing-prototypes]
109 | void __weak in_place_patch_bpf_prog(struct bpf_prog *prog)
| ^
kernel/bpf/core.c:109:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
109 | void __weak in_place_patch_bpf_prog(struct bpf_prog *prog)
| ^
| static
>> kernel/bpf/core.c:114:13: warning: no previous prototype for function 'bpf_die' [-Wmissing-prototypes]
114 | void __weak bpf_die(struct bpf_prog *prog)
| ^
kernel/bpf/core.c:114:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
114 | void __weak bpf_die(struct bpf_prog *prog)
| ^
| static
>> kernel/bpf/core.c:119:13: warning: no previous prototype for function 'bpf_prog_termination_deferred' [-Wmissing-prototypes]
119 | void __weak bpf_prog_termination_deferred(struct work_struct *work)
| ^
kernel/bpf/core.c:119:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
119 | void __weak bpf_prog_termination_deferred(struct work_struct *work)
| ^
| static
>> kernel/bpf/core.c:124:13: warning: no previous prototype for function 'bpf_softlockup' [-Wmissing-prototypes]
124 | void __weak bpf_softlockup(u32 dur_s)
| ^
kernel/bpf/core.c:124:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
124 | void __weak bpf_softlockup(u32 dur_s)
| ^
| static
4 warnings generated.
vim +/in_place_patch_bpf_prog +109 kernel/bpf/core.c
107
108
> 109 void __weak in_place_patch_bpf_prog(struct bpf_prog *prog)
110 {
111 return;
112 }
113
> 114 void __weak bpf_die(struct bpf_prog *prog)
115 {
116 return;
117 }
118
> 119 void __weak bpf_prog_termination_deferred(struct work_struct *work)
120 {
121 return;
122 }
123
> 124 void __weak bpf_softlockup(u32 dur_s)
125 {
126 return;
127 }
128
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* Re: [PATCH 3/4] bpf: runtime part of fast-path termination approach
2025-09-07 23:04 ` [PATCH 3/4] bpf: runtime part of fast-path termination approach Siddharth Chintamaneni
2025-09-08 6:01 ` kernel test robot
@ 2025-09-08 7:14 ` kernel test robot
2025-09-17 2:11 ` Kumar Kartikeya Dwivedi
2 siblings, 0 replies; 12+ messages in thread
From: kernel test robot @ 2025-09-08 7:14 UTC (permalink / raw)
To: Siddharth Chintamaneni, bpf
Cc: oe-kbuild-all, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
djwillia, miloc, ericts, rahult, doniaghazy, quanzhif, jinghao7,
sidchintamaneni, memxor, egor, sairoop10, rjsu26
Hi Siddharth,
kernel test robot noticed the following build errors:
[auto build test ERROR on bpf-next/net]
[also build test ERROR on bpf-next/master bpf/master linus/master v6.17-rc5 next-20250905]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Siddharth-Chintamaneni/bpf-Introduce-new-structs-and-struct-fields-for-fast-path-termination/20250908-070655
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git net
patch link: https://lore.kernel.org/r/20250907230415.289327-4-sidchintamaneni%40gmail.com
patch subject: [PATCH 3/4] bpf: runtime part of fast-path termination approach
config: sh-randconfig-001-20250908 (https://download.01.org/0day-ci/archive/20250908/202509081431.PcY1azAC-lkp@intel.com/config)
compiler: sh4-linux-gcc (GCC) 14.3.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250908/202509081431.PcY1azAC-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202509081431.PcY1azAC-lkp@intel.com/
All errors (new ones prefixed by >>):
kernel/watchdog.c: In function 'is_softlockup':
>> kernel/watchdog.c:709:25: error: implicit declaration of function 'bpf_softlockup'; did you mean 'is_softlockup'? [-Wimplicit-function-declaration]
709 | bpf_softlockup(now - touch_ts);
| ^~~~~~~~~~~~~~
| is_softlockup
vim +709 kernel/watchdog.c
678
679 static int is_softlockup(unsigned long touch_ts,
680 unsigned long period_ts,
681 unsigned long now)
682 {
683 if ((watchdog_enabled & WATCHDOG_SOFTOCKUP_ENABLED) && watchdog_thresh) {
684 /*
685 * If period_ts has not been updated during a sample_period, then
686 * in the subsequent few sample_periods, period_ts might also not
687 * be updated, which could indicate a potential softlockup. In
688 * this case, if we suspect the cause of the potential softlockup
689 * might be interrupt storm, then we need to count the interrupts
690 * to find which interrupt is storming.
691 */
692 if (time_after_eq(now, period_ts + get_softlockup_thresh() / NUM_SAMPLE_PERIODS) &&
693 need_counting_irqs())
694 start_counting_irqs();
695
696 /*
697 * A poorly behaving BPF scheduler can live-lock the system into
698 * soft lockups. Tell sched_ext to try ejecting the BPF
699 * scheduler when close to a soft lockup.
700 */
701 if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
702 scx_softlockup(now - touch_ts);
703
704 /*
705 * Long running BPF programs can cause CPU's to stall.
706 * So trigger fast path termination to terminate such BPF programs.
707 */
708 if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
> 709 bpf_softlockup(now - touch_ts);
710
711 /* Warn about unreasonable delays. */
712 if (time_after(now, period_ts + get_softlockup_thresh()))
713 return now - touch_ts;
714 }
715 return 0;
716 }
717
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* Re: [PATCH 3/4] bpf: runtime part of fast-path termination approach
2025-09-07 23:04 ` [PATCH 3/4] bpf: runtime part of fast-path termination approach Siddharth Chintamaneni
2025-09-08 6:01 ` kernel test robot
2025-09-08 7:14 ` kernel test robot
@ 2025-09-17 2:11 ` Kumar Kartikeya Dwivedi
2025-09-17 4:01 ` Siddharth Chintamaneni
2 siblings, 1 reply; 12+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2025-09-17 2:11 UTC (permalink / raw)
To: Siddharth Chintamaneni
Cc: bpf, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
djwillia, miloc, ericts, rahult, doniaghazy, quanzhif, jinghao7,
egor, sairoop10, rjsu26
On Mon, 8 Sept 2025 at 01:04, Siddharth Chintamaneni
<sidchintamaneni@gmail.com> wrote:
>
> Update softlockup detection logic to detect stalls caused by BPF
> programs. When a softlockup is detected, bpf_die is queued on a
> workqueue on a CPU. With this implementation the termination handler
> only gets triggered when CONFIG_SOFTLOCKUP_DETECTOR is enabled.
>
This is probably good for demonstration, but I'm not sure
piggy-backing off of optional softlockup detection is a good idea.
In any case, let's focus on the mechanism for now.
> Inside bpf_die, we perform the text_poke to stub helpers/kfuncs.
> The current implementation handles termination of long running
> bpf_loop iterators in both the inlined and non-inlined cases.
>
> The limitation of this implementation is that the termination handler
> needs at least one CPU to run.
Yeah, as discussed in v2, there were other options without this
limitation, i.e. take some overhead and check a terminate bit that
doesn't rely on punting work off to a wq to ensure prog is killed.
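For concreteness, a minimal sketch of that alternative (the terminate
flag and its placement are hypothetical, not an existing API): the JIT
would emit a cheap check of a per-prog flag at loop back-edges or
helper returns, e.g.

  /* hypothetical terminate bit, checked by the prog itself */
  if (unlikely(atomic_read(&prog->term_states->terminate)))
          return 0; /* bail out with a default return value */

so termination would not depend on another CPU being available to run
the patching work.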
>
> Signed-off-by: Raj Sahu <rjsu26@gmail.com>
> Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
> ---
> arch/x86/net/bpf_jit_comp.c | 132 ++++++++++++++++++++++++++++++++++++
> include/linux/bpf.h | 2 +
> include/linux/filter.h | 6 ++
> kernel/bpf/core.c | 35 +++++++++-
> kernel/watchdog.c | 8 +++
> 5 files changed, 182 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 107a44729675..4de9a8cdc465 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -2606,6 +2606,10 @@ st: if (is_imm8(insn->off))
> if (arena_vm_start)
> pop_r12(&prog);
> }
> + /* emiting 5 byte nop for non-inline bpf_loop callback */
typo: emitting
> + if (bpf_is_subprog(bpf_prog) && bpf_prog->aux->is_bpf_loop_cb_non_inline) {
> + emit_nops(&prog, X86_PATCH_SIZE);
> + }
But this is not the only source of potential stalls, right? E.g. bpf
iterators can stall (if nested), cond_break, etc.
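For example, open-coded iterator nesting along these lines (bpf_for
being the macro from the selftests' bpf_experimental.h) would not go
through the bpf_loop-specific paths patched here:

  int i, j;

  bpf_for(i, 0, 1 << 20)
          bpf_for(j, 0, 1 << 20)
                  ; /* verifier-bounded, but can still run very long */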
> EMIT1(0xC9); /* leave */
> emit_return(&prog, image + addrs[i - 1] + (prog - temp));
> break;
> @@ -3833,6 +3837,8 @@ bool bpf_jit_supports_private_stack(void)
> return true;
> }
>
> +
> +
> void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
> {
> #if defined(CONFIG_UNWINDER_ORC)
> @@ -3849,6 +3855,132 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp
> #endif
> }
>
> +void in_place_patch_bpf_prog(struct bpf_prog *prog)
> +{
> + struct call_aux_states *call_states;
> + unsigned long new_target;
> + unsigned char *addr;
> + u8 ret_jmp_size = 1;
> + if (cpu_wants_rethunk()) {
> + ret_jmp_size = 5;
> + }
> + call_states = prog->term_states->patch_call_sites->call_states;
> + for (int i = 0; i < prog->term_states->patch_call_sites->call_sites_cnt; i++) {
> +
> + new_target = (unsigned long) bpf_termination_null_func;
> + if (call_states[i].is_bpf_loop_cb_inline) {
> + new_target = (unsigned long) bpf_loop_term_callback;
> + }
> + char new_insn[5];
> +
> + addr = (unsigned char *)prog->bpf_func + call_states->jit_call_idx;
> +
> + unsigned long new_rel = (unsigned long)(new_target - (unsigned long)(addr + 5));
> + new_insn[0] = 0xE8;
> + new_insn[1] = (new_rel >> 0) & 0xFF;
> + new_insn[2] = (new_rel >> 8) & 0xFF;
> + new_insn[3] = (new_rel >> 16) & 0xFF;
> + new_insn[4] = (new_rel >> 24) & 0xFF;
> +
> + smp_text_poke_batch_add(addr, new_insn, 5 /* call instruction len */, NULL);
> + }
> +
> + if (prog->aux->is_bpf_loop_cb_non_inline) {
> +
> + char new_insn[5] = { 0xB8, 0x01, 0x00, 0x00, 0x00 };
> + char old_insn[5] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 };
> + smp_text_poke_batch_add(prog->bpf_func + prog->jited_len -
> + (1 + ret_jmp_size) /* leave, jmp/ ret */ - 5 /* nop size */, new_insn, 5 /* mov eax, 1 */, old_insn);
> + }
> +
> +
> + /* flush all text poke calls */
> + smp_text_poke_batch_finish();
> +}
> +
> +void bpf_die(struct bpf_prog *prog)
> +{
> + u8 ret_jmp_size = 1;
> + if (cpu_wants_rethunk()) {
> + ret_jmp_size = 5;
> + }
> +
> + /*
> + * Replacing 5 byte nop in prologue with jmp instruction to ret
> + */
> + unsigned long jmp_offset = prog->jited_len - (4 /* First endbr is 4 bytes */
> + + 5 /* noop is 5 bytes */
> + + ret_jmp_size /* 5 bytes of jmp return_thunk or 1 byte ret*/);
> +
> + char new_insn[5];
> + new_insn[0] = 0xE9;
> + new_insn[1] = (jmp_offset >> 0) & 0xFF;
> + new_insn[2] = (jmp_offset >> 8) & 0xFF;
> + new_insn[3] = (jmp_offset >> 16) & 0xFF;
> + new_insn[4] = (jmp_offset >> 24) & 0xFF;
> +
> + smp_text_poke_batch_add(prog->bpf_func + 4, new_insn, 5, NULL);
> +
> + if (prog->aux->func_cnt) {
> + for (int i = 0; i < prog->aux->func_cnt; i++) {
> + in_place_patch_bpf_prog(prog->aux->func[i]);
> + }
> + } else {
> + in_place_patch_bpf_prog(prog);
> + }
> +
Are you relying on batch finish() inside in_place_patch_bpf_prog()?
> +}
> +
> +void bpf_prog_termination_deferred(struct work_struct *work)
> +{
> + struct bpf_term_aux_states *term_states = container_of(work, struct bpf_term_aux_states,
> + work);
> + struct bpf_prog *prog = term_states->prog;
> +
> + bpf_die(prog);
> +}
> +
> +static struct workqueue_struct *bpf_termination_wq;
> +
> +void bpf_softlockup(u32 dur_s)
> +{
> + unsigned long addr;
> + struct unwind_state state;
> + struct bpf_prog *prog;
> +
> + for (unwind_start(&state, current, NULL, NULL); !unwind_done(&state);
> + unwind_next_frame(&state)) {
Why not use arch_bpf_stack_walk?
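A sketch of the same scan using it (untested, and assuming the
consume_fn callback returns false to stop the walk; RCU protection
around the ksym lookup elided):

  static bool bpf_softlockup_frame(void *cookie, u64 ip, u64 sp, u64 bp)
  {
          struct bpf_prog **progp = cookie;

          if (!is_bpf_text_address(ip))
                  return true;    /* keep walking */
          *progp = bpf_prog_ksym_find(ip);
          return false;           /* stop at the first BPF frame */
  }

  ...
  struct bpf_prog *prog = NULL;

  arch_bpf_stack_walk(bpf_softlockup_frame, &prog);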
> + addr = unwind_get_return_address(&state);
> + if (!addr)
> + break;
> +
> + if (!is_bpf_text_address(addr))
> + continue;
> +
> + rcu_read_lock();
> + prog = bpf_prog_ksym_find(addr);
> + rcu_read_unlock();
> + if (bpf_is_subprog(prog))
> + continue;
> +
> + if (atomic_cmpxchg(&prog->term_states->bpf_die_in_progress, 0, 1))
> + break;
> +
> + bpf_termination_wq = alloc_workqueue("bpf_termination_wq", WQ_UNBOUND, 1);
Err, even if you have to, I'd rather go with system_unbound_wq for
now. We have bpf_wq, and should probably have a dedicated bpf wq for
all wq executions coming from the BPF subsystem, but that's a
discussion for later.
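i.e., something like:

  queue_work(system_unbound_wq, &prog->term_states->work);

instead of allocating a fresh workqueue on every softlockup.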
> + if (!bpf_termination_wq)
> + pr_err("Failed to alloc workqueue for bpf termination.\n");
> +
> + queue_work(bpf_termination_wq, &prog->term_states->work);
> +
> + /* Currently nested programs are not terminated together.
> + * Removing this break will result in BPF trampolines being
> + * identified as is_bpf_text_address resulting in NULL ptr
> + * deref in next step.
> + */
> + break;
> + }
> +}
> +
> void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
> struct bpf_prog *new, struct bpf_prog *old)
> {
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index caaee33744fc..03fce8f2c466 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -71,6 +71,7 @@ typedef int (*bpf_iter_init_seq_priv_t)(void *private_data,
> typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data);
> typedef unsigned int (*bpf_func_t)(const void *,
> const struct bpf_insn *);
> +
> struct bpf_iter_seq_info {
> const struct seq_operations *seq_ops;
> bpf_iter_init_seq_priv_t init_seq_private;
> @@ -1600,6 +1601,7 @@ struct bpf_term_patch_call_sites {
> struct bpf_term_aux_states {
> struct bpf_prog *prog;
> struct work_struct work;
> + atomic_t bpf_die_in_progress;
> struct bpf_term_patch_call_sites *patch_call_sites;
> };
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 9092d8ea95c8..4f0f8fe478bf 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -1123,6 +1123,8 @@ int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len);
> bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
> void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
>
> +void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
> +int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx);
> u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
> #define __bpf_call_base_args \
> ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \
> @@ -1257,6 +1259,10 @@ bpf_jit_binary_pack_hdr(const struct bpf_prog *fp);
>
> void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns);
> void bpf_prog_pack_free(void *ptr, u32 size);
> +void bpf_softlockup(u32 dur_s);
> +void bpf_prog_termination_deferred(struct work_struct *work);
> +void bpf_die(struct bpf_prog *prog);
> +void in_place_patch_bpf_prog(struct bpf_prog *prog);
>
> static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
> {
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 93442ab2acde..7b0552d15be3 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -41,6 +41,7 @@
> #include <linux/execmem.h>
>
> #include <asm/barrier.h>
> +#include <asm/unwind.h>
> #include <linux/unaligned.h>
>
> /* Registers */
> @@ -95,6 +96,37 @@ enum page_size_enum {
> __PAGE_SIZE = PAGE_SIZE
> };
>
> +void *bpf_termination_null_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
> +{
> + return NULL;
> +}
> +
> +int bpf_loop_term_callback(u64 reg_loop_cnt, u64 *reg_loop_ctx)
> +{
> + return 1;
> +}
> +
> +
> +void __weak in_place_patch_bpf_prog(struct bpf_prog *prog)
> +{
> + return;
> +}
> +
> +void __weak bpf_die(struct bpf_prog *prog)
> +{
> + return;
> +}
> +
> +void __weak bpf_prog_termination_deferred(struct work_struct *work)
> +{
> + return;
> +}
> +
> +void __weak bpf_softlockup(u32 dur_s)
> +{
> + return;
> +}
> +
> struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
> {
> gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
> @@ -134,11 +166,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
> fp->jit_requested = ebpf_jit_enabled();
> fp->blinding_requested = bpf_jit_blinding_enabled(fp);
> fp->term_states = term_states;
> + atomic_set(&fp->term_states->bpf_die_in_progress, 0);
> fp->term_states->patch_call_sites = patch_call_sites;
> fp->term_states->patch_call_sites->call_sites_cnt = 0;
> fp->term_states->patch_call_sites->call_states = NULL;
> fp->term_states->prog = fp;
> -
> + INIT_WORK(&fp->term_states->work, bpf_prog_termination_deferred);
> #ifdef CONFIG_CGROUP_BPF
> aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
> #endif
> diff --git a/kernel/watchdog.c b/kernel/watchdog.c
> index 80b56c002c7f..59c91c18ca0e 100644
> --- a/kernel/watchdog.c
> +++ b/kernel/watchdog.c
> @@ -25,6 +25,7 @@
> #include <linux/stop_machine.h>
> #include <linux/sysctl.h>
> #include <linux/tick.h>
> +#include <linux/filter.h>
>
> #include <linux/sched/clock.h>
> #include <linux/sched/debug.h>
> @@ -700,6 +701,13 @@ static int is_softlockup(unsigned long touch_ts,
> if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
> scx_softlockup(now - touch_ts);
>
> + /*
> + * Long-running BPF programs can cause CPUs to stall.
> + * So trigger fast-path termination to terminate such BPF programs.
> + */
> + if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
> + bpf_softlockup(now - touch_ts);
> +
> /* Warn about unreasonable delays. */
> if (time_after(now, period_ts + get_softlockup_thresh()))
> return now - touch_ts;
> --
> 2.43.0
>
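A note on the __weak definitions above: they are what allow
kernel/watchdog.c to call bpf_softlockup() unconditionally. When
nothing overrides them, the weak no-ops satisfy the linker; in this
series the real implementations live in arch/x86/net/bpf_jit_comp.c.
As with scx_softlockup(), the hook fires once a stall exceeds three
quarters of the softlockup threshold. A minimal standalone sketch of
the weak-symbol pattern (illustrative names, not kernel code):

	/* file1.c: weak default, used only if nothing overrides it */
	void __attribute__((weak)) term_hook(unsigned int dur_s)
	{
		/* deliberately empty */
	}

	/* file2.c: strong definition; the linker silently prefers
	 * it over the weak default above */
	void term_hook(unsigned int dur_s)
	{
		/* real termination trigger would live here */
	}

Call sites simply invoke term_hook(); no #ifdef is needed.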
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 1/4] bpf: Introduce new structs and struct fields for fast path termination
2025-09-07 23:04 ` [PATCH 1/4] bpf: Introduce new structs and struct fields for fast path termination Siddharth Chintamaneni
@ 2025-09-17 2:11 ` Kumar Kartikeya Dwivedi
2025-09-17 3:38 ` Siddharth Chintamaneni
0 siblings, 1 reply; 12+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2025-09-17 2:11 UTC (permalink / raw)
To: Siddharth Chintamaneni
Cc: bpf, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
djwillia, miloc, ericts, rahult, doniaghazy, quanzhif, jinghao7,
egor, sairoop10, rjsu26
On Mon, 8 Sept 2025 at 01:04, Siddharth Chintamaneni
<sidchintamaneni@gmail.com> wrote:
>
> Introduced the definition of struct bpf_term_aux_states
> required to support fast-path termination of BPF programs.
> Added the memory allocation and free logic for newly added
> term_states feild in struct bpf_prog.
typo: field
>
> Signed-off-by: Raj Sahu <rjsu26@gmail.com>
> Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
> ---
> include/linux/bpf.h | 75 +++++++++++++++++++++++++++++----------------
> kernel/bpf/core.c | 31 +++++++++++++++++++
> 2 files changed, 79 insertions(+), 27 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 8f6e87f0f3a8..caaee33744fc 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1584,6 +1584,25 @@ struct bpf_stream_stage {
> int len;
> };
>
> +struct call_aux_states {
> + int call_bpf_insn_idx;
> + int jit_call_idx;
> + u8 is_helper_kfunc;
> + u8 is_bpf_loop;
> + u8 is_bpf_loop_cb_inline;
> +};
> +
> +struct bpf_term_patch_call_sites {
> + u32 call_sites_cnt;
> + struct call_aux_states *call_states;
> +};
> +
> +struct bpf_term_aux_states {
> + struct bpf_prog *prog;
> + struct work_struct work;
> + struct bpf_term_patch_call_sites *patch_call_sites;
> +};
> +
> struct bpf_prog_aux {
> atomic64_t refcnt;
> u32 used_map_cnt;
> @@ -1618,6 +1637,7 @@ struct bpf_prog_aux {
> bool tail_call_reachable;
> bool xdp_has_frags;
> bool exception_cb;
> + bool is_bpf_loop_cb_non_inline;
> bool exception_boundary;
> bool is_extended; /* true if extended by freplace program */
> bool jits_use_priv_stack;
> @@ -1696,33 +1716,34 @@ struct bpf_prog_aux {
> };
>
> struct bpf_prog {
> - u16 pages; /* Number of allocated pages */
> - u16 jited:1, /* Is our filter JIT'ed? */
> - jit_requested:1,/* archs need to JIT the prog */
> - gpl_compatible:1, /* Is filter GPL compatible? */
> - cb_access:1, /* Is control block accessed? */
> - dst_needed:1, /* Do we need dst entry? */
> - blinding_requested:1, /* needs constant blinding */
> - blinded:1, /* Was blinded */
> - is_func:1, /* program is a bpf function */
> - kprobe_override:1, /* Do we override a kprobe? */
> - has_callchain_buf:1, /* callchain buffer allocated? */
> - enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
> - call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
> - call_get_func_ip:1, /* Do we call get_func_ip() */
> - tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */
> - sleepable:1; /* BPF program is sleepable */
> - enum bpf_prog_type type; /* Type of BPF program */
> - enum bpf_attach_type expected_attach_type; /* For some prog types */
> - u32 len; /* Number of filter blocks */
> - u32 jited_len; /* Size of jited insns in bytes */
> - u8 tag[BPF_TAG_SIZE];
> - struct bpf_prog_stats __percpu *stats;
> - int __percpu *active;
> - unsigned int (*bpf_func)(const void *ctx,
> - const struct bpf_insn *insn);
> - struct bpf_prog_aux *aux; /* Auxiliary fields */
> - struct sock_fprog_kern *orig_prog; /* Original BPF program */
> + u16 pages; /* Number of allocated pages */
> + u16 jited:1, /* Is our filter JIT'ed? */
> + jit_requested:1,/* archs need to JIT the prog */
> + gpl_compatible:1, /* Is filter GPL compatible? */
> + cb_access:1, /* Is control block accessed? */
> + dst_needed:1, /* Do we need dst entry? */
> + blinding_requested:1, /* needs constant blinding */
> + blinded:1, /* Was blinded */
> + is_func:1, /* program is a bpf function */
> + kprobe_override:1, /* Do we override a kprobe? */
> + has_callchain_buf:1, /* callchain buffer allocated? */
> + enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
> + call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
> + call_get_func_ip:1, /* Do we call get_func_ip() */
> + tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */
> + sleepable:1; /* BPF program is sleepable */
> + enum bpf_prog_type type; /* Type of BPF program */
> + enum bpf_attach_type expected_attach_type; /* For some prog types */
> + u32 len; /* Number of filter blocks */
> + u32 jited_len; /* Size of jited insns in bytes */
> + u8 tag[BPF_TAG_SIZE];
> + struct bpf_prog_stats __percpu *stats;
> + int __percpu *active;
> + unsigned int (*bpf_func)(const void *ctx,
> + const struct bpf_insn *insn);
> + struct bpf_prog_aux *aux; /* Auxiliary fields */
> + struct sock_fprog_kern *orig_prog; /* Original BPF program */
> + struct bpf_term_aux_states *term_states;
> /* Instructions for interpreter */
This hunk looks bad, please only include what you touched, and don't
reformat the rest.
> union {
> DECLARE_FLEX_ARRAY(struct sock_filter, insns);
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index ef01cc644a96..740b5a3a6b55 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -100,6 +100,8 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
> gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
> struct bpf_prog_aux *aux;
> struct bpf_prog *fp;
> + struct bpf_term_aux_states *term_states = NULL;
> + struct bpf_term_patch_call_sites *patch_call_sites = NULL;
>
> size = round_up(size, __PAGE_SIZE);
> fp = __vmalloc(size, gfp_flags);
> @@ -118,11 +120,24 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
> return NULL;
> }
>
> + term_states = kzalloc(sizeof(*term_states), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
> + if (!term_states)
> + goto free_alloc_percpu;
> +
> + patch_call_sites = kzalloc(sizeof(*patch_call_sites), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
> + if (!patch_call_sites)
> + goto free_bpf_term_states;
> +
> fp->pages = size / PAGE_SIZE;
> fp->aux = aux;
> fp->aux->prog = fp;
> fp->jit_requested = ebpf_jit_enabled();
> fp->blinding_requested = bpf_jit_blinding_enabled(fp);
> + fp->term_states = term_states;
> + fp->term_states->patch_call_sites = patch_call_sites;
> + fp->term_states->patch_call_sites->call_sites_cnt = 0;
> + fp->term_states->prog = fp;
> +
> #ifdef CONFIG_CGROUP_BPF
> aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
> #endif
> @@ -140,6 +155,15 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
> #endif
>
> return fp;
> +
> +free_bpf_term_states:
> + kfree(term_states);
> +free_alloc_percpu:
> + free_percpu(fp->active);
> + kfree(aux);
> + vfree(fp);
> +
> + return NULL;
> }
>
> struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
> @@ -266,6 +290,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
> memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
> fp->pages = pages;
> fp->aux->prog = fp;
> + fp->term_states->prog = fp;
>
> /* We keep fp->aux from fp_old around in the new
> * reallocated structure.
> @@ -273,6 +298,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
> fp_old->aux = NULL;
> fp_old->stats = NULL;
> fp_old->active = NULL;
> + fp_old->term_states = NULL;
> __bpf_prog_free(fp_old);
> }
>
> @@ -287,6 +313,11 @@ void __bpf_prog_free(struct bpf_prog *fp)
> kfree(fp->aux->poke_tab);
> kfree(fp->aux);
> }
> + if (fp->term_states) {
> + if (fp->term_states->patch_call_sites)
> + kfree(fp->term_states->patch_call_sites);
> + kfree(fp->term_states);
> + }
> free_percpu(fp->stats);
> free_percpu(fp->active);
> vfree(fp);
> --
> 2.43.0
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 0/4] bpf: Fast-Path approach for BPF program termination
2025-09-07 23:04 [PATCH 0/4] bpf: Fast-Path approach for BPF program termination Siddharth Chintamaneni
` (3 preceding siblings ...)
2025-09-07 23:04 ` [PATCH 4/4] selftests/bpf: Adds selftests to check termination of long running nested bpf loops Siddharth Chintamaneni
@ 2025-09-17 2:13 ` Kumar Kartikeya Dwivedi
4 siblings, 0 replies; 12+ messages in thread
From: Kumar Kartikeya Dwivedi @ 2025-09-17 2:13 UTC (permalink / raw)
To: Siddharth Chintamaneni
Cc: bpf, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
djwillia, miloc, ericts, rahult, doniaghazy, quanzhif, jinghao7,
egor, sairoop10, rjsu26
On Mon, 8 Sept 2025 at 01:04, Siddharth Chintamaneni
<sidchintamaneni@gmail.com> wrote:
>
> This is RFC v3 of
> https://lore.kernel.org/all/20250614064056.237005-1-sidchintamaneni@gmail.com/
>
> The termination handler call from the softlockup detector is mainly for
> the demonstration of the entire workflow and also serves as a potential
> use case for discussion. However, the runtime mechanism is modular
> enough to be ported to different scenarios such as deadlocks, page
> faults, userspace BPF management tools, and stack overflows.
>
> The main changes that we bring in this version are: We have avoided the
> memory overhead caused by program cloning in previous versions. During
> normal program execution, none of the termination logic causes any
> additional overhead.
I think the bit missing from the changelog is why the sampling of a
terminate bit in various sources of stalls in a program was not
investigated.
https://lore.kernel.org/all/CAP01T760JcsZ0o5BfKZ7pi0viseocTQCUW6KjqbxzTW7TwXF9g@mail.gmail.com/
The last time we discussed v2 the conclusion was that it was possible
for all CPUs to simultaneously get stuck, such that the punting of
patching to wq or other non-prog context isn't useful in making any
progress.
It seems to me that the terminate bit sampling approach can apply more
broadly, to cond_break, iterators, bpf_loop(), etc.
Did you investigate that approach? Was there a reason to discard it
and continue with this in v3?
I am curious why you didn't look into it after the discussion in v2.
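To make the idea concrete, a rough sketch of what terminate-bit
sampling could look like (names are illustrative, and the
bpf_die_in_progress flag from patch 1 is reused purely as an example,
not as a proposal):

	static __always_inline bool bpf_term_requested(const struct bpf_prog *prog)
	{
		return atomic_read(&prog->term_states->bpf_die_in_progress) != 0;
	}

	/* what an inlined bpf_loop()/cond_break expansion could then
	 * emit, with a simplified callback signature: */
	static long demo_loop(struct bpf_prog *prog, u32 nr_loops,
			      long (*callback_fn)(u64 i, void *ctx), void *ctx)
	{
		for (u32 i = 0; i < nr_loops; i++) {
			if (bpf_term_requested(prog))
				return -EINTR;	/* bail out in-line, no
						 * external patcher needed */
			if (callback_fn(i, ctx))
				break;
		}
		return 0;
	}

The point being that the check makes progress even when every CPU is
stuck in a prog, at the cost of a test on each iteration.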
>
> Change log:
> v2 -> v3:
> - Cloning of BPF programs has been removed.
> - Created call sites table to maintain helper/ kfunc call instruction
> offsets.
> - Termination is triggered inside the softlockup detector not affecting
> any fast path operations.
>
> v1 -> v2:
> - Patch generation has been moved after verification and before JIT.
> - Now patch generation handles both helpers and kfuncs.
> - Sanity check on original prog and patch prog after JIT.
> - Runtime termination handler is now global termination mechanism using
> text_poke.
> - Termination is triggered by watchdog timer.
>
> arch/x86/net/bpf_jit_comp.c | 141 ++++++++++++++++++
> include/linux/bpf.h | 77 ++++++----
> include/linux/bpf_verifier.h | 1 +
> include/linux/filter.h | 6 +
> kernel/bpf/core.c | 67 +++++++++
> kernel/bpf/verifier.c | 135 +++++++++++++++--
> kernel/watchdog.c | 8 +
> .../bpf/prog_tests/bpf_termination.c | 39 +++++
> .../selftests/bpf/progs/bpf_termination.c | 47 ++++++
> 9 files changed, 482 insertions(+), 39 deletions(-)
> create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_termination.c
> create mode 100644 tools/testing/selftests/bpf/progs/bpf_termination.c
>
> --
> 2.43.0
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 1/4] bpf: Introduce new structs and struct fields for fast path termination
2025-09-17 2:11 ` Kumar Kartikeya Dwivedi
@ 2025-09-17 3:38 ` Siddharth Chintamaneni
0 siblings, 0 replies; 12+ messages in thread
From: Siddharth Chintamaneni @ 2025-09-17 3:38 UTC (permalink / raw)
To: Kumar Kartikeya Dwivedi
Cc: bpf, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
djwillia, miloc, ericts, rahult, doniaghazy, quanzhif, jinghao7,
egor, sairoop10, rjsu26
On Tue, 16 Sept 2025 at 19:11, Kumar Kartikeya Dwivedi <memxor@gmail.com> wrote:
>
> On Mon, 8 Sept 2025 at 01:04, Siddharth Chintamaneni
> <sidchintamaneni@gmail.com> wrote:
> >
> > Introduced the definition of struct bpf_term_aux_states
> > required to support fast-path termination of BPF programs.
> > Added the memory allocation and free logic for newly added
> > term_states feild in struct bpf_prog.
>
> typo: field
>
> >
> > Signed-off-by: Raj Sahu <rjsu26@gmail.com>
> > Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
> > ---
> > include/linux/bpf.h | 75 +++++++++++++++++++++++++++++----------------
> > kernel/bpf/core.c | 31 +++++++++++++++++++
> > 2 files changed, 79 insertions(+), 27 deletions(-)
> >
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 8f6e87f0f3a8..caaee33744fc 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -1584,6 +1584,25 @@ struct bpf_stream_stage {
> > int len;
> > };
> >
> > +struct call_aux_states {
> > + int call_bpf_insn_idx;
> > + int jit_call_idx;
> > + u8 is_helper_kfunc;
> > + u8 is_bpf_loop;
> > + u8 is_bpf_loop_cb_inline;
> > +};
> > +
> > +struct bpf_term_patch_call_sites {
> > + u32 call_sites_cnt;
> > + struct call_aux_states *call_states;
> > +};
> > +
> > +struct bpf_term_aux_states {
> > + struct bpf_prog *prog;
> > + struct work_struct work;
> > + struct bpf_term_patch_call_sites *patch_call_sites;
> > +};
> > +
> > struct bpf_prog_aux {
> > atomic64_t refcnt;
> > u32 used_map_cnt;
> > @@ -1618,6 +1637,7 @@ struct bpf_prog_aux {
> > bool tail_call_reachable;
> > bool xdp_has_frags;
> > bool exception_cb;
> > + bool is_bpf_loop_cb_non_inline;
> > bool exception_boundary;
> > bool is_extended; /* true if extended by freplace program */
> > bool jits_use_priv_stack;
> > @@ -1696,33 +1716,34 @@ struct bpf_prog_aux {
> > };
> >
> > struct bpf_prog {
> > - u16 pages; /* Number of allocated pages */
> > - u16 jited:1, /* Is our filter JIT'ed? */
> > - jit_requested:1,/* archs need to JIT the prog */
> > - gpl_compatible:1, /* Is filter GPL compatible? */
> > - cb_access:1, /* Is control block accessed? */
> > - dst_needed:1, /* Do we need dst entry? */
> > - blinding_requested:1, /* needs constant blinding */
> > - blinded:1, /* Was blinded */
> > - is_func:1, /* program is a bpf function */
> > - kprobe_override:1, /* Do we override a kprobe? */
> > - has_callchain_buf:1, /* callchain buffer allocated? */
> > - enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
> > - call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
> > - call_get_func_ip:1, /* Do we call get_func_ip() */
> > - tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */
> > - sleepable:1; /* BPF program is sleepable */
> > - enum bpf_prog_type type; /* Type of BPF program */
> > - enum bpf_attach_type expected_attach_type; /* For some prog types */
> > - u32 len; /* Number of filter blocks */
> > - u32 jited_len; /* Size of jited insns in bytes */
> > - u8 tag[BPF_TAG_SIZE];
> > - struct bpf_prog_stats __percpu *stats;
> > - int __percpu *active;
> > - unsigned int (*bpf_func)(const void *ctx,
> > - const struct bpf_insn *insn);
> > - struct bpf_prog_aux *aux; /* Auxiliary fields */
> > - struct sock_fprog_kern *orig_prog; /* Original BPF program */
> > + u16 pages; /* Number of allocated pages */
> > + u16 jited:1, /* Is our filter JIT'ed? */
> > + jit_requested:1,/* archs need to JIT the prog */
> > + gpl_compatible:1, /* Is filter GPL compatible? */
> > + cb_access:1, /* Is control block accessed? */
> > + dst_needed:1, /* Do we need dst entry? */
> > + blinding_requested:1, /* needs constant blinding */
> > + blinded:1, /* Was blinded */
> > + is_func:1, /* program is a bpf function */
> > + kprobe_override:1, /* Do we override a kprobe? */
> > + has_callchain_buf:1, /* callchain buffer allocated? */
> > + enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
> > + call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */
> > + call_get_func_ip:1, /* Do we call get_func_ip() */
> > + tstamp_type_access:1, /* Accessed __sk_buff->tstamp_type */
> > + sleepable:1; /* BPF program is sleepable */
> > + enum bpf_prog_type type; /* Type of BPF program */
> > + enum bpf_attach_type expected_attach_type; /* For some prog types */
> > + u32 len; /* Number of filter blocks */
> > + u32 jited_len; /* Size of jited insns in bytes */
> > + u8 tag[BPF_TAG_SIZE];
> > + struct bpf_prog_stats __percpu *stats;
> > + int __percpu *active;
> > + unsigned int (*bpf_func)(const void *ctx,
> > + const struct bpf_insn *insn);
> > + struct bpf_prog_aux *aux; /* Auxiliary fields */
> > + struct sock_fprog_kern *orig_prog; /* Original BPF program */
> > + struct bpf_term_aux_states *term_states;
> > /* Instructions for interpreter */
>
> This hunk looks bad, please only include what you touched, and don't
> reformat the rest.
>
>
I'll drop it
<SNIP>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 3/4] bpf: runtime part of fast-path termination approach
2025-09-17 2:11 ` Kumar Kartikeya Dwivedi
@ 2025-09-17 4:01 ` Siddharth Chintamaneni
0 siblings, 0 replies; 12+ messages in thread
From: Siddharth Chintamaneni @ 2025-09-17 4:01 UTC (permalink / raw)
To: Kumar Kartikeya Dwivedi
Cc: bpf, ast, daniel, andrii, martin.lau, eddyz87, song,
yonghong.song, john.fastabend, kpsingh, sdf, haoluo, jolsa,
djwillia, miloc, ericts, rahult, doniaghazy, quanzhif, jinghao7,
egor, sairoop10, rjsu26
On Tue, 16 Sept 2025 at 19:11, Kumar Kartikeya Dwivedi <memxor@gmail.com> wrote:
>
> On Mon, 8 Sept 2025 at 01:04, Siddharth Chintamaneni
> <sidchintamaneni@gmail.com> wrote:
> >
> > Update softlockup detection logic to detect any stalls due to
> > BPF programs. When a softlockup is detected, bpf_die will be
> > queued on a workqueue on a CPU. With this implementation, the
> > termination handler will only get triggered when
> > CONFIG_SOFTLOCKUP_DETECTOR is enabled.
> >
>
> This is probably good for demonstration, but I'm not sure
> piggy-backing off of optional softlockup detection is a good idea.
> In any case, let's focus on the mechanism for now.
>
> > Inside bpf_die, we perform the text_poke to stub helpers/kfuncs.
> > The current implementation handles termination of long-running
> > bpf_loop iterators in both the inlined and non-inlined cases.
> >
> > The limitation of this implementation is that the termination
> > handler needs at least a single CPU to run.
>
> Yeah, as discussed in v2, there were other options without this
> limitation, i.e. take some overhead and check a terminate bit that
> doesn't rely on punting work off to a wq to ensure prog is killed.
>
The termination bit approach will work if we are able to identify
predefined cancellation points in the helper function
implementations. The current termination handler implementation can
be easily ported there. We are also thinking of a termination case
where BPF programs have to run under strict execution-time
constraints: such programs might not involve any long-running
iterators, but the execution of multiple helpers can still accrue
cost. In those cases, the workqueue approach could work. We will
extend the current implementation to the termination bit approach in
the next iteration.
> >
> > Signed-off-by: Raj Sahu <rjsu26@gmail.com>
> > Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
> > ---
> > arch/x86/net/bpf_jit_comp.c | 132 ++++++++++++++++++++++++++++++++++++
> > include/linux/bpf.h | 2 +
> > include/linux/filter.h | 6 ++
> > kernel/bpf/core.c | 35 +++++++++-
> > kernel/watchdog.c | 8 +++
> > 5 files changed, 182 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> > index 107a44729675..4de9a8cdc465 100644
> > --- a/arch/x86/net/bpf_jit_comp.c
> > +++ b/arch/x86/net/bpf_jit_comp.c
> > @@ -2606,6 +2606,10 @@ st: if (is_imm8(insn->off))
> > if (arena_vm_start)
> > pop_r12(&prog);
> > }
> > + /* emiting 5 byte nop for non-inline bpf_loop callback */
>
> typo: emitting
>
> > + if (bpf_is_subprog(bpf_prog) && bpf_prog->aux->is_bpf_loop_cb_non_inline) {
> > + emit_nops(&prog, X86_PATCH_SIZE);
> > + }
>
> But this is not the only source of potential stalls, right? E.g. bpf
> iterators can stall (if nested), cond_break, etc.
>
We've implemented it to work only for bpf_loop and honestly didn't
look into other iterators, thinking that extending it should be
straightforward. In the next iteration, we will extend it to the
other potential sources of stalls.
> > EMIT1(0xC9); /* leave */
> > emit_return(&prog, image + addrs[i - 1] + (prog - temp));
> > break;
> > @@ -3833,6 +3837,8 @@ bool bpf_jit_supports_private_stack(void)
> > return true;
> > }
> >
> > +
> > +
> > void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
> > {
> > #if defined(CONFIG_UNWINDER_ORC)
> > @@ -3849,6 +3855,132 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp
> > #endif
> > }
> >
> > +void in_place_patch_bpf_prog(struct bpf_prog *prog)
> > +{
> > + struct call_aux_states *call_states;
> > + unsigned long new_target;
> > + unsigned char *addr;
> > + u8 ret_jmp_size = 1;
> > + if (cpu_wants_rethunk()) {
> > + ret_jmp_size = 5;
> > + }
> > + call_states = prog->term_states->patch_call_sites->call_states;
> > + for (int i = 0; i < prog->term_states->patch_call_sites->call_sites_cnt; i++) {
> > +
> > + new_target = (unsigned long) bpf_termination_null_func;
> > + if (call_states[i].is_bpf_loop_cb_inline) {
> > + new_target = (unsigned long) bpf_loop_term_callback;
> > + }
> > + char new_insn[5];
> > +
> > > + addr = (unsigned char *)prog->bpf_func + call_states[i].jit_call_idx;
> > +
> > + unsigned long new_rel = (unsigned long)(new_target - (unsigned long)(addr + 5));
> > + new_insn[0] = 0xE8;
> > + new_insn[1] = (new_rel >> 0) & 0xFF;
> > + new_insn[2] = (new_rel >> 8) & 0xFF;
> > + new_insn[3] = (new_rel >> 16) & 0xFF;
> > + new_insn[4] = (new_rel >> 24) & 0xFF;
> > +
> > + smp_text_poke_batch_add(addr, new_insn, 5 /* call instruction len */, NULL);
> > + }
> > +
> > + if (prog->aux->is_bpf_loop_cb_non_inline) {
> > +
> > + char new_insn[5] = { 0xB8, 0x01, 0x00, 0x00, 0x00 };
> > + char old_insn[5] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 };
> > + smp_text_poke_batch_add(prog->bpf_func + prog->jited_len -
> > + (1 + ret_jmp_size) /* leave, jmp/ ret */ - 5 /* nop size */, new_insn, 5 /* mov eax, 1 */, old_insn);
> > + }
> > +
> > +
> > + /* flush all text poke calls */
> > + smp_text_poke_batch_finish();
> > +}
> > +
> > +void bpf_die(struct bpf_prog *prog)
> > +{
> > + u8 ret_jmp_size = 1;
> > + if (cpu_wants_rethunk()) {
> > + ret_jmp_size = 5;
> > + }
> > +
> > + /*
> > + * Replacing 5 byte nop in prologue with jmp instruction to ret
> > + */
> > + unsigned long jmp_offset = prog->jited_len - (4 /* First endbr is 4 bytes */
> > + + 5 /* noop is 5 bytes */
> > + + ret_jmp_size /* 5 bytes of jmp return_thunk or 1 byte ret*/);
> > +
> > + char new_insn[5];
> > + new_insn[0] = 0xE9;
> > + new_insn[1] = (jmp_offset >> 0) & 0xFF;
> > + new_insn[2] = (jmp_offset >> 8) & 0xFF;
> > + new_insn[3] = (jmp_offset >> 16) & 0xFF;
> > + new_insn[4] = (jmp_offset >> 24) & 0xFF;
> > +
> > + smp_text_poke_batch_add(prog->bpf_func + 4, new_insn, 5, NULL);
> > +
> > + if (prog->aux->func_cnt) {
> > + for (int i = 0; i < prog->aux->func_cnt; i++) {
> > + in_place_patch_bpf_prog(prog->aux->func[i]);
> > + }
> > + } else {
> > + in_place_patch_bpf_prog(prog);
> > + }
> > +
>
> Are you relying on batch finish() inside in_place_patch_bpf_prog()?
Yes, the final smp_text_poke_batch_finish() in
in_place_patch_bpf_prog() flushes everything queued so far, including
the prologue poke added in bpf_die().
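In sketch form, the batching pattern relied on here:

	smp_text_poke_batch_add(addr1, insn1, 5, NULL);	/* queued */
	smp_text_poke_batch_add(addr2, insn2, 5, NULL);	/* queued */
	/* ... one add per call site ... */
	smp_text_poke_batch_finish();	/* applies every queued poke */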
> > +}
> > +
> > +void bpf_prog_termination_deferred(struct work_struct *work)
> > +{
> > + struct bpf_term_aux_states *term_states = container_of(work, struct bpf_term_aux_states,
> > + work);
> > + struct bpf_prog *prog = term_states->prog;
> > +
> > + bpf_die(prog);
> > +}
> > +
> > +static struct workqueue_struct *bpf_termination_wq;
> > +
> > +void bpf_softlockup(u32 dur_s)
> > +{
> > + unsigned long addr;
> > + struct unwind_state state;
> > + struct bpf_prog *prog;
> > +
> > + for (unwind_start(&state, current, NULL, NULL); !unwind_done(&state);
> > + unwind_next_frame(&state)) {
>
> Why not use arch_bpf_stack_walk?
We will reuse it; we kind of overlooked it. Also, we have a
question: is trampoline text supposed to be identified by
is_bpf_text_address?
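Roughly along these lines, as an untested sketch of reusing
arch_bpf_stack_walk() (error handling omitted):

	static bool bpf_term_find_prog(void *cookie, u64 ip, u64 sp, u64 bp)
	{
		struct bpf_prog **progp = cookie;
		struct bpf_prog *prog;

		if (!is_bpf_text_address(ip))
			return true;		/* keep walking */
		prog = bpf_prog_ksym_find(ip);	/* caller holds RCU */
		if (prog && !bpf_is_subprog(prog)) {
			*progp = prog;
			return false;		/* found the main prog, stop */
		}
		return true;
	}

	/* usage: */
	struct bpf_prog *prog = NULL;

	rcu_read_lock();
	arch_bpf_stack_walk(bpf_term_find_prog, &prog);
	rcu_read_unlock();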
>
> > + addr = unwind_get_return_address(&state);
> > + if (!addr)
> > + break;
> > +
> > + if (!is_bpf_text_address(addr))
> > + continue;
> > +
> > + rcu_read_lock();
> > + prog = bpf_prog_ksym_find(addr);
> > + rcu_read_unlock();
> > + if (bpf_is_subprog(prog))
> > + continue;
> > +
> > + if (atomic_cmpxchg(&prog->term_states->bpf_die_in_progress, 0, 1))
> > + break;
> > +
> > + bpf_termination_wq = alloc_workqueue("bpf_termination_wq", WQ_UNBOUND, 1);
>
> Err, even if you have to, I'd rather go with system_unbound_wq for
> now. We have bpf_wq, and should probably have a dedicated bpf wq for
> all wq executions coming from the BPF subsystem, but that's a
> discussion for later.
>
>
Will look into this.
<SNIP>
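For reference, the rel32 arithmetic used by in_place_patch_bpf_prog()
and bpf_die() above, as a standalone sketch (userspace C, not kernel
code):

	#include <stdint.h>
	#include <string.h>

	/* Encode an x86-64 "call rel32" (opcode 0xE8) placed at
	 * insn_addr and targeting 'target'. rel32 is relative to the
	 * end of the 5-byte instruction, hence the +5. */
	static void encode_call(uint8_t insn[5], uintptr_t insn_addr,
				uintptr_t target)
	{
		int32_t rel = (int32_t)(target - (insn_addr + 5));

		insn[0] = 0xE8;				/* CALL rel32 */
		memcpy(&insn[1], &rel, sizeof(rel));	/* little-endian */
	}

For example, a call at 0x1000 redirected to a stub at 0x2000 encodes
as E8 FB 0F 00 00, since 0x2000 - 0x1005 = 0xFFB. bpf_die()'s
prologue patch is the same arithmetic with opcode 0xE9 (JMP rel32)
instead.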
^ permalink raw reply [flat|nested] 12+ messages in thread
Thread overview: 12+ messages
2025-09-07 23:04 [PATCH 0/4] bpf: Fast-Path approach for BPF program termination Siddharth Chintamaneni
2025-09-07 23:04 ` [PATCH 1/4] bpf: Introduce new structs and struct fields for fast path termination Siddharth Chintamaneni
2025-09-17 2:11 ` Kumar Kartikeya Dwivedi
2025-09-17 3:38 ` Siddharth Chintamaneni
2025-09-07 23:04 ` [PATCH 2/4] bpf: Creating call sites table to stub instructions during runtime Siddharth Chintamaneni
2025-09-07 23:04 ` [PATCH 3/4] bpf: runtime part of fast-path termination approach Siddharth Chintamaneni
2025-09-08 6:01 ` kernel test robot
2025-09-08 7:14 ` kernel test robot
2025-09-17 2:11 ` Kumar Kartikeya Dwivedi
2025-09-17 4:01 ` Siddharth Chintamaneni
2025-09-07 23:04 ` [PATCH 4/4] selftests/bpf: Adds selftests to check termination of long running nested bpf loops Siddharth Chintamaneni
2025-09-17 2:13 ` [PATCH 0/4] bpf: Fast-Path approach for BPF program termination Kumar Kartikeya Dwivedi