From 8eba407b1d5663bd767a4470bfb0e52ad1623475 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 2 Oct 2024 09:54:40 -0700 Subject: [PATCH] bpf: Implement private stack --- arch/x86/net/bpf_jit_comp.c | 29 +++++++++++++++++-- include/linux/bpf.h | 1 + kernel/bpf/core.c | 9 ++++++ kernel/bpf/verifier.c | 12 ++++++++ net/core/sysctl_net_core.c | 3 +- .../bpf/progs/task_storage_nodeadlock.c | 8 ++++- 6 files changed, 57 insertions(+), 5 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 06b080b61aa5..3f0cf410199a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -164,6 +164,7 @@ static int bpf_size_to_x86_bytes(int bpf_size) #define AUX_REG (MAX_BPF_JIT_REG + 1) #define X86_REG_R9 (MAX_BPF_JIT_REG + 2) #define X86_REG_R12 (MAX_BPF_JIT_REG + 3) +#define X86_REG_SP (MAX_BPF_JIT_REG + 4) /* * The following table maps BPF registers to x86-64 registers. @@ -191,6 +192,7 @@ static const int reg2hex[] = { [AUX_REG] = 3, /* R11 temp register */ [X86_REG_R9] = 1, /* R9 register, 6th function argument */ [X86_REG_R12] = 4, /* R12 callee saved */ + [X86_REG_SP] = 4, /* RSP */ }; static const int reg2pt_regs[] = { @@ -484,12 +486,15 @@ static void emit_prologue_tail_call(u8 **pprog, bool is_subprog) *pprog = prog; } +static void emit_mov_imm64(u8 **pprog, u32 dst_reg, const u32 imm32_hi, const u32 imm32_lo); + +extern int yhs; /* debug switch, defined in net/core/sysctl_net_core.c */ /* * Emit x86-64 prologue code for BPF program. 
* bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes * while jumping to another program */ -static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, +static void emit_prologue(struct bpf_prog *bpf_prog, u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, bool tail_call_reachable, bool is_subprog, bool is_exception_cb) { @@ -526,13 +531,31 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ } + if (yhs && !is_subprog && !tail_call_reachable && bpf_prog->aux->priv_stack_ptr) { + /* Load rsp with the end of the per-CPU private stack: priv_stack_ptr + 16KB */ + void __percpu *priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + 16 * 1024; + + /* movabs aux_reg, priv_frame_ptr */ + emit_mov_imm64(&prog, AUX_REG, (long) priv_frame_ptr >> 32, + (u32) (long) priv_frame_ptr); + + /* add aux_reg, gs:[this_cpu_off] */ + EMIT2(0x65, 0x4c); + EMIT3(0x03, 0x1c, 0x25); + EMIT((u32)(unsigned long)&this_cpu_off, 4); + /* mov rsp, aux_reg */ + EMIT3(0x4c, 0x89, 0xdc); + } /* X86_TAIL_CALL_OFFSET is here */ EMIT_ENDBR(); /* sub rsp, rounded_stack_depth */ - if (stack_depth) + if (yhs && !is_subprog && !tail_call_reachable && bpf_prog->aux->priv_stack_ptr) { + EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8) + 16); /* sub rsp, rounded_stack_depth + 16 */ + } else if (stack_depth) { EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8)); + } if (tail_call_reachable) emit_prologue_tail_call(&prog, is_subprog); *pprog = prog; @@ -1432,7 +1455,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image detect_reg_usage(insn, insn_cnt, callee_regs_used); - emit_prologue(&prog, bpf_prog->aux->stack_depth, + emit_prologue(bpf_prog, &prog, bpf_prog->aux->stack_depth, bpf_prog_was_classic(bpf_prog), tail_call_reachable, bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb); /* Exception callback will clobber callee regs for its own use, and diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 19d8ca8ac960..32f309e01d57 100644 --- 
a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1493,6 +1493,7 @@ struct bpf_prog_aux { struct bpf_jit_poke_descriptor *poke_tab; struct bpf_kfunc_desc_tab *kfunc_tab; struct bpf_kfunc_btf_tab *kfunc_btf_tab; + void __percpu *priv_stack_ptr; u32 size_poke_tab; #ifdef CONFIG_FINEIBT struct bpf_ksym ksym_prefix; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4e07cc057d6f..0c56124f66c2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2395,6 +2395,7 @@ static void bpf_prog_select_func(struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { + void __percpu *priv_stack_ptr; /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. */ @@ -2420,6 +2421,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) if (*err) return fp; + priv_stack_ptr = __alloc_percpu_gfp(1024 * 16, 8, GFP_KERNEL); /* 16KB per-CPU area, 8-byte aligned; read by the x86-64 JIT prologue */ + if (!priv_stack_ptr) { + *err = -ENOMEM; + return fp; + } + fp->aux->priv_stack_ptr = priv_stack_ptr; + fp = bpf_int_jit_compile(fp); bpf_prog_jit_attempt_done(fp); if (!fp->jited && jit_needed) { @@ -2800,6 +2808,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) #endif if (aux->dst_trampoline) bpf_trampoline_put(aux->dst_trampoline); + free_percpu(aux->priv_stack_ptr); /* set in bpf_prog_select_runtime() or jit_subprogs() */ for (i = 0; i < aux->real_func_cnt; i++) { /* We can just unlink the subprog poke descriptor table as * it was originally linked to the main program and is also diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7d9b38ffd220..81a750910489 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19940,6 +19940,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog, **func, *tmp; int i, j, subprog_start, subprog_end = 0, len, subprog; + void __percpu *priv_stack_ptr = NULL; struct bpf_map *map_ptr; struct bpf_insn *insn; void *old_bpf_func; @@ -20066,6 +20067,16 @@ static int jit_subprogs(struct bpf_verifier_env *env) 
func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb; if (!i) func[i]->aux->exception_boundary = env->seen_exception; + + if (!i) { + priv_stack_ptr = __alloc_percpu_gfp(1024 * 16, 8, GFP_KERNEL); + if (!priv_stack_ptr) { + err = -ENOMEM; + goto out_free; + } + func[0]->aux->priv_stack_ptr = priv_stack_ptr; + } + func[i] = bpf_int_jit_compile(func[i]); if (!func[i]->jited) { err = -ENOTSUPP; @@ -20154,6 +20165,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->bpf_func = func[0]->bpf_func; prog->jited_len = func[0]->jited_len; prog->aux->extable = func[0]->aux->extable; + prog->aux->priv_stack_ptr = priv_stack_ptr; prog->aux->num_exentries = func[0]->aux->num_exentries; prog->aux->func = func; prog->aux->func_cnt = env->subprog_cnt - env->hidden_subprog_cnt; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 86a2476678c4..2903ae849a5e 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -370,13 +370,14 @@ proc_dointvec_minmax_bpf_restricted(const struct ctl_table *table, int write, } # endif /* CONFIG_HAVE_EBPF_JIT */ +int yhs; /* debug switch: set to 1 by proc_dolongvec_minmax_bpf_restricted(); gates the private-stack prologue in the x86-64 JIT */ static int proc_dolongvec_minmax_bpf_restricted(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; - + yhs = 1; return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif diff --git a/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c b/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c index ea2dbb80f7b3..a9f66a24f14a 100644 --- a/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c +++ b/tools/testing/selftests/bpf/progs/task_storage_nodeadlock.c @@ -33,8 +33,14 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, int type, return 0; task = bpf_get_current_task_btf(); +if (0) { value = bpf_task_storage_get(&task_storage, task, &zero, BPF_LOCAL_STORAGE_GET_F_CREATE); +} else { + value = bpf_task_storage_get(&task_storage, task, NULL, + 
0); +} +if (0) { if (!value) __sync_fetch_and_add(&nr_get_errs, 1); @@ -42,6 +48,6 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, int type, bpf_get_current_task_btf()); if (ret == -EBUSY) __sync_fetch_and_add(&nr_del_errs, 1); - +} return 0; } -- 2.43.5