Re: [RFC bpf-next 3/4] bpf: Generating a stubbed version of BPF program for termination

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Eduard Zingerman <eddyz87@gmail.com>
To: Raj Sahu <rjsu26@gmail.com>
Cc: bpf@vger.kernel.org,  ast@kernel.org,  daniel@iogearbox.net,
	andrii@kernel.org,  martin.lau@linux.dev,  song@kernel.org,
	yonghong.song@linux.dev,  john.fastabend@gmail.com,
	 kpsingh@kernel.org, sdf@fomichev.me,  haoluo@google.com,
	 jolsa@kernel.org,  djwillia@vt.edu, miloc@vt.edu,
	 ericts@vt.edu,  rahult@vt.edu,  doniaghazy@vt.edu,
	quanzhif@vt.edu,  jinghao7@illinois.edu,
	 sidchintamaneni@gmail.com, memxor@gmail.com
Subject: Re: [RFC bpf-next 3/4] bpf: Generating a stubbed version of BPF program for termination
Date: Mon, 12 May 2025 17:07:02 -0700	[thread overview]
Message-ID: <m27c2l1ihl.fsf@gmail.com> (raw)
In-Reply-To: <20250420105524.2115690-4-rjsu26@gmail.com> (Raj Sahu's message of "Sun, 20 Apr 2025 06:55:21 -0400")

Raj Sahu <rjsu26@gmail.com> writes:

[...]

> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
> index 380e9a7cac75..e5dceebfb302 100644
> --- a/kernel/bpf/bpf_iter.c
> +++ b/kernel/bpf/bpf_iter.c
> @@ -6,6 +6,7 @@
>  #include <linux/filter.h>
>  #include <linux/bpf.h>
>  #include <linux/rcupdate_trace.h>
> +#include <asm/unwind.h>
>  
>  struct bpf_iter_target_info {
>  	struct list_head list;
> @@ -775,6 +776,70 @@ const struct bpf_func_proto bpf_loop_proto = {
>  	.arg4_type	= ARG_ANYTHING,
>  };
>  
> +BPF_CALL_4(bpf_loop_termination, u32, nr_loops, void *, callback_fn,
> +		void *, callback_ctx, u64, flags)
> +{
> +	/* Since a patched BPF program for termination will want
> +	 * to finish as fast as possible,
> +	 * we simply don't run any loop in here.
> +	 */
> +
> +	/* Restoring the callee-saved registers and exit.
> +	 * Hacky/ err prone way of restoring the registers.
> +	 * We are figuring out a way to have arch independent
> +	 * way to do this.
> +	 */
> +
> +	asm volatile(
> +	"pop %rbx\n\t"
> +	"pop %rbp\n\t"
> +	"pop %r12\n\t"
> +	"pop %r13\n\t"
> +	);

Why do anything special here?
I'd expect a bpf_loop() version simplified to a single return to work.

> +
> +	return 0;
> +}

[...]

> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c

[...]

> +
> +static bool find_in_skiplist(int func_id) {

Nit: "skip list" is a name of an (unrelated) data structure,
     maybe disambiguate the name here?

> +	return is_verifier_inlined_function(func_id) ||
> +	       is_debug_function(func_id) ||
> +	       is_resource_release_function(func_id);
> +}
> +
> +static int get_replacement_helper(int func_id, enum bpf_return_type ret_type) {
> +
> +	switch (func_id) {
> +		case BPF_FUNC_loop:
> +			return BPF_FUNC_loop_termination;
> +		case BPF_FUNC_for_each_map_elem:
> +		case BPF_FUNC_user_ringbuf_drain:
> +			return -ENOTSUPP;
> +	}
> +
> +	switch (ret_type) {
> +		case RET_VOID:
> +			return BPF_FUNC_dummy_void;
> +		case RET_INTEGER:
> +			return BPF_FUNC_dummy_int;
> +		case RET_PTR_TO_MAP_VALUE_OR_NULL:
> +			return BPF_FUNC_dummy_ptr_to_map;
> +		case RET_PTR_TO_SOCKET_OR_NULL:
> +		case RET_PTR_TO_TCP_SOCK_OR_NULL:
> +		case RET_PTR_TO_SOCK_COMMON_OR_NULL:
> +		case RET_PTR_TO_RINGBUF_MEM_OR_NULL:
> +		case RET_PTR_TO_DYNPTR_MEM_OR_NULL:
> +		case RET_PTR_TO_BTF_ID_OR_NULL:
> +		case RET_PTR_TO_BTF_ID_TRUSTED:
> +		case RET_PTR_TO_MAP_VALUE:
> +		case RET_PTR_TO_SOCKET:
> +		case RET_PTR_TO_TCP_SOCK:
> +		case RET_PTR_TO_SOCK_COMMON:
> +		case RET_PTR_TO_MEM:
> +		case RET_PTR_TO_MEM_OR_BTF_ID:
> +		case RET_PTR_TO_BTF_ID:

I'm curious what's the plan for RET_PTR_TO_BTF_ID?
verifier.c:check_ptr_tp_btf_access() has the following comment:

  /* By default any pointer obtained from walking a trusted pointer is no
   * longer trusted, unless the field being accessed has explicitly been
   * marked as inheriting its parent's state of trust (either full or RCU).
   * For example:
   * 'cgroups' pointer is untrusted if task->cgroups dereference
   * happened in a sleepable program outside of bpf_rcu_read_lock()
   * section. In a non-sleepable program it's trusted while in RCU CS (aka MEM_RCU).
   * Note bpf_rcu_read_unlock() converts MEM_RCU pointers to PTR_UNTRUSTED.
   *
   * A regular RCU-protected pointer with __rcu tag can also be deemed
   * trusted if we are in an RCU CS. Such pointer can be NULL.
   */

> +		default:
> +			return -ENOTSUPP;
> +	}
> +}
> +
> +static void patch_generator(struct bpf_prog *prog)
> +{
> +	struct call_insn_aux{
> +		int insn_idx;
> +		int replacement_helper;
> +	};
> +
> +	struct call_insn_aux *call_indices;
> +	int num_calls=0;
> +	call_indices = vmalloc(sizeof(call_indices) * prog->len);

clang-tidy spotted a warning here, the expression should be `sizeof(*call_indices)`.

> +
> +	/* Find all call insns */
> +	for(int insn_idx =0 ;insn_idx < prog->len; insn_idx++)
> +	{
> +		struct bpf_insn *insn = &prog->insnsi[insn_idx] ;
> +		u8 class = BPF_CLASS(insn->code);
> +		if (class == BPF_JMP || class == BPF_JMP32) {
> +			if (BPF_OP(insn->code) == BPF_CALL){
> +				if (insn->src_reg == BPF_PSEUDO_CALL) {
> +					continue;
> +				}
> +				if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL){ /*kfunc */
> +					// TODO Need to use btf for getting proto
> +					// If release function --> skip
> +					// If acquire function --> find return type and add to list
> +				}
> +				else {
> +					int func_id = insn->imm;
> +					const struct bpf_func_proto *fn = NULL;
> +					int new_helper_id = -1;
> +
> +					if (find_in_skiplist(func_id)) {
> +						continue;
> +					}
> +
> +					fn = bpf_verifier_ops[prog->type]->get_func_proto(func_id, prog);

Nit: please reuse verifier.c:get_helper_proto() utility function here.

> +					if (!fn && !fn->func) {
> +						continue;
> +					}
> +
> +					new_helper_id = get_replacement_helper(func_id, fn->ret_type);
> +					if (new_helper_id < 0) {
> +						continue;

Nit: propagate error?

> +					}
> +
> +					call_indices[num_calls].insn_idx = insn_idx;
> +					call_indices[num_calls].replacement_helper= new_helper_id;
> +					num_calls++;
> +				}
> +			}
> +		}
> +	}
> +
> +	/* Patch all call insns */
> +	for(int k =0; k < num_calls; k++){
> +		prog->insnsi[call_indices[k].insn_idx].imm = call_indices[k].replacement_helper;

Nit: each instruction is visited only once, so it appears that patching
     can be done in-place w/o need for call_indices array.

> +	}
> +}
> +
> +static bool create_termination_prog(struct bpf_prog *prog,
> +					union bpf_attr *attr,
> +					bpfptr_t uattr,
> +					u32 uattr_size)
> +{
> +	if (prog->len < 10)
> +		return false;
> +
> +	int err;
> +	struct bpf_prog *patch_prog;
> +	patch_prog = bpf_prog_alloc_no_stats(bpf_prog_size(prog->len), 0);
> +	if (!patch_prog) {
> +		return false;
> +	}
> +
> +	patch_prog->termination_states->is_termination_prog = true;
> +
> +	err = clone_bpf_prog(patch_prog, prog);
> +	if (err)
> +			goto free_termination_prog;
> +
> +	patch_generator(patch_prog);
> +
> +	err = bpf_check(&patch_prog, attr, uattr, uattr_size);

There might be issues with program verification if bpf_check is called
for a patched program. For example, for a full implementation you'd need
to handle kfuncs, replacing loops like:

  while (bpf_iter_next()) {}

with loops like:

  while (dummy_return_null()) {}

won't work as verifier would assume that the loop is infinite.
In general, check_helper_call, check_kfunc_call and is_state_visited
have logic for specific helpers/kfuncs that modifies verifier state
basing not only on the helper return value type.

What do you plan to do with functions that unconditionally take
resources, e.g. bpf_spin_lock()?

- From verification point of view:
  this function is RET_VOID and is not in
  find_in_skiplist(), patch_generator() would replace its call with a
  dummy. However, a corresponding bpf_spin_unlock() would remain and thus
  bpf_check() will exit with error.
  So, you would need some special version of bpf_check, that collects
  all resources needed for program translation (e.g. maps), but does
  not perform semantic checks.
  Or patch_generator() has to be called for a program that is already
  verified.

- From termination point of view:
  this function cannot be replaced with a dummy w/o removing
  corresponding unlock. Do you plan to leave these as-is?

> +	if (err) {
> +		goto free_termination_prog;
> +	}
> +
> +	patch_prog = bpf_prog_select_runtime(patch_prog, &err);
> +	if (err) {
> +		goto free_termination_prog;
> +	}
> +
> +	prog->termination_states->patch_prog = patch_prog;
> +	return true;
> +
> +free_termination_prog:
> +	free_percpu(patch_prog->stats);
> +	free_percpu(patch_prog->active);
> +	kfree(patch_prog->aux);
> +	return false;


In case of a load failure of the main program bpf_prog_load_load() does
much more cleanup.

[...]

> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 54c6953a8b84..57b4fd1f6a72 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c

[...]

> @@ -22502,7 +22506,14 @@ static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
>  	 */
>  	insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt);
>  	insn_buf[cnt++] = BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx);
> -	insn_buf[cnt++] = BPF_CALL_REL(0);
> +
> +	if (termination_states && termination_states->is_termination_prog) {
> +		/* In a termination BPF prog, we want to exit - set R0 = 1 */
> +		insn_buf[cnt++] = BPF_MOV64_IMM(BPF_REG_0, 1);
> +	} else {
> +		insn_buf[cnt++] = BPF_CALL_REL(0);
> +	}
> +

Q: Do you need 1:1 instruction pointer correspondence between original
   and patched programs?
   Since you patch inlined bpf_loop instead of disallowing inlining in
   the patched program I assume you do. In this case, there is no
   guarantee that instructions above have the same size after jit.

>  	/* increment loop counter */
>  	insn_buf[cnt++] = BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1);
>  	/* jump to loop header if callback returned 0 */

[...]

next prev parent reply	other threads:[~2025-05-13  0:07 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-04-20 10:55 [RFC bpf-next 0/4] bpf: Fast-Path approach for BPF program Termination Raj Sahu
2025-04-20 10:55 ` [RFC bpf-next 1/4] bpf: Introduce new structs and struct fields Raj Sahu
2025-05-13  0:04   ` Eduard Zingerman
2025-04-20 10:55 ` [RFC bpf-next 2/4] bpftool/libbpf : Introduce bpf_prog_termination to trigger termination signal Raj Sahu
2025-04-20 10:55 ` [RFC bpf-next 3/4] bpf: Generating a stubbed version of BPF program for termination Raj Sahu
2025-04-20 13:01   ` kernel test robot
2025-04-20 13:01   ` kernel test robot
2025-04-20 13:01   ` kernel test robot
2025-05-13  0:07   ` Eduard Zingerman [this message]
2025-05-13  0:20     ` Alexei Starovoitov
2025-05-13  4:40       ` Eduard Zingerman
2025-05-13  5:16       ` Siddharth Chintamaneni
2025-05-15  1:59         ` Alexei Starovoitov
2025-05-15 17:43           ` Andrii Nakryiko
2025-05-17  5:01           ` Raj Sahu
2025-05-19  1:20             ` Alexei Starovoitov
2025-05-28  7:10               ` Raj Sahu
2025-06-04 23:02                 ` Alexei Starovoitov
2025-06-10 22:06                   ` Siddharth Chintamaneni
2025-06-11 15:59                     ` Alexei Starovoitov
2025-04-20 10:55 ` [RFC bpf-next 4/4] bpf: Runtime part of fast-path termination approach Raj Sahu
2025-04-20 12:50   ` kernel test robot
2025-04-20 13:12   ` kernel test robot
2025-04-20 14:15   ` kernel test robot
2025-05-05 20:13 ` [RFC bpf-next 0/4] bpf: Fast-Path approach for BPF program Termination Kumar Kartikeya Dwivedi
2025-05-06  5:55   ` Raj Sahu
2025-05-06 22:45     ` Alexei Starovoitov
2025-05-06 22:59       ` Kumar Kartikeya Dwivedi
2025-05-07  0:32         ` Alexei Starovoitov
2025-05-07  0:38           ` Kumar Kartikeya Dwivedi
2025-05-07  1:15             ` Alexei Starovoitov
2025-05-07  2:10               ` Kumar Kartikeya Dwivedi
2025-05-07  3:36                 ` Alexei Starovoitov
2025-05-07  5:04                   ` Kumar Kartikeya Dwivedi
2025-05-07 18:15 ` Zvi Effron
2025-05-07 20:01   ` Alexei Starovoitov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=m27c2l1ihl.fsf@gmail.com \
    --to=eddyz87@gmail.com \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=djwillia@vt.edu \
    --cc=doniaghazy@vt.edu \
    --cc=ericts@vt.edu \
    --cc=haoluo@google.com \
    --cc=jinghao7@illinois.edu \
    --cc=john.fastabend@gmail.com \
    --cc=jolsa@kernel.org \
    --cc=kpsingh@kernel.org \
    --cc=martin.lau@linux.dev \
    --cc=memxor@gmail.com \
    --cc=miloc@vt.edu \
    --cc=quanzhif@vt.edu \
    --cc=rahult@vt.edu \
    --cc=rjsu26@gmail.com \
    --cc=sdf@fomichev.me \
    --cc=sidchintamaneni@gmail.com \
    --cc=song@kernel.org \
    --cc=yonghong.song@linux.dev \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.