public inbox for bpf@vger.kernel.org
 help / color / mirror / Atom feed
From: Yonghong Song <yonghong.song@linux.dev>
To: Alexei Starovoitov <alexei.starovoitov@gmail.com>,
	Puranjay Mohan <puranjay@kernel.org>
Cc: bpf <bpf@vger.kernel.org>, Alexei Starovoitov <ast@kernel.org>,
	Andrii Nakryiko <andrii@kernel.org>,
	Daniel Borkmann <daniel@iogearbox.net>,
	"Jose E . Marchesi" <jose.marchesi@oracle.com>,
	Kernel Team <kernel-team@fb.com>,
	Martin KaFai Lau <martin.lau@kernel.org>
Subject: Re: [PATCH bpf-next v4 15/18] bpf,x86: Implement JIT support for stack arguments
Date: Tue, 14 Apr 2026 09:45:41 -0700	[thread overview]
Message-ID: <af8bb98f-5df7-4cba-9bd7-84eed5239db9@linux.dev> (raw)
In-Reply-To: <281485db-073e-45b6-8929-dad36fea5f87@linux.dev>



On 4/13/26 10:26 AM, Yonghong Song wrote:
>
>
> On 4/12/26 3:36 PM, Alexei Starovoitov wrote:
>> On Sat, Apr 11, 2026 at 10:00 PM Yonghong Song 
>> <yonghong.song@linux.dev> wrote:
>>> Add x86_64 JIT support for BPF functions and kfuncs with more than
>>> 5 arguments. The extra arguments are passed through a stack area
>>> addressed by register r12 (BPF_REG_STACK_ARG_BASE) in BPF bytecode,
>>> which the JIT translates to native code.
>>>
>>> The JIT follows the x86-64 calling convention for both BPF-to-BPF
>>> and kfunc calls:
>>>    - Arg 6 is passed in the R9 register
>>>    - Args 7+ are passed on the stack
>>>
>>> Incoming arg 6 (BPF r12+8) is translated to a MOV from R9 rather
>>> than a memory load. Incoming args 7+ (BPF r12+16, r12+24, ...) map
>>> directly to [rbp + 16], [rbp + 24], ..., matching the x86-64 stack
>>> layout after CALL + PUSH RBP, so no offset adjustment is needed.
>>>
>>> The verifier guarantees that neither tail_call_reachable nor
>>> priv_stack is set when outgoing stack args exist, so R9 is always
>>> available. When BPF bytecode writes to the arg-6 stack slot
>>> (the most negative outgoing offset), the JIT emits a MOV into R9
>>> instead of a memory store. Outgoing args 7+ are placed at [rsp]
>>> in a pre-allocated area below callee-saved registers, using:
>>>    native_off = outgoing_arg_base + bpf_off
>>>
>>> The native x86_64 stack layout:
>>>
>>>    high address
>>>    +-------------------------+
>>>    | incoming stack arg N    |  [rbp + 16 + (N-7)*8]  (from caller)
>>>    | ...                     |
>>>    | incoming stack arg 7    |  [rbp + 16]
>>>    +-------------------------+
>>>    | return address          |  [rbp + 8]
>>>    | saved rbp               |  [rbp]
>>>    +-------------------------+
>>>    | BPF program stack       |  (round_up(stack_depth, 8) bytes)
>>>    +-------------------------+
>>>    | callee-saved regs       |  (r12, rbx, r13, r14, r15 as needed)
>>>    +-------------------------+
>>>    | outgoing arg M          |  [rsp + (M-7)*8]
>>>    | ...                     |
>>>    | outgoing arg 7          |  [rsp]
>>>    +-------------------------+  rsp
>>>    low address
>>>
>>>    (Arg 6 is in R9, not on the stack)
>>>
>>>    [1] https://github.com/llvm/llvm-project/pull/189060
>>>
>>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>>> ---
>>>   arch/x86/net/bpf_jit_comp.c | 172 
>>> ++++++++++++++++++++++++++++++++++--
>>>   1 file changed, 164 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
>>> index 32864dbc2c4e..ec57b9a6b417 100644
>>> --- a/arch/x86/net/bpf_jit_comp.c
>>> +++ b/arch/x86/net/bpf_jit_comp.c
>>> @@ -390,6 +390,34 @@ static void pop_callee_regs(u8 **pprog, bool 
>>> *callee_regs_used)
>>>          *pprog = prog;
>>>   }
>>>
>>> +/* add rsp, depth */
>>> +static void emit_add_rsp(u8 **pprog, u16 depth)
>>> +{
>>> +       u8 *prog = *pprog;
>>> +
>>> +       if (!depth)
>>> +               return;
>>> +       if (is_imm8(depth))
>>> +               EMIT4(0x48, 0x83, 0xC4, depth); /* add rsp, imm8 */
>>> +       else
>>> +               EMIT3_off32(0x48, 0x81, 0xC4, depth); /* add rsp, 
>>> imm32 */
>>> +       *pprog = prog;
>>> +}
>>> +
>>> +/* sub rsp, depth */
>>> +static void emit_sub_rsp(u8 **pprog, u16 depth)
>>> +{
>>> +       u8 *prog = *pprog;
>>> +
>>> +       if (!depth)
>>> +               return;
>>> +       if (is_imm8(depth))
>>> +               EMIT4(0x48, 0x83, 0xEC, depth); /* sub rsp, imm8 */
>>> +       else
>>> +               EMIT3_off32(0x48, 0x81, 0xEC, depth); /* sub rsp, 
>>> imm32 */
>>> +       *pprog = prog;
>>> +}
>>> +
>>>   static void emit_nops(u8 **pprog, int len)
>>>   {
>>>          u8 *prog = *pprog;
>>> @@ -725,8 +753,8 @@ static void emit_return(u8 **pprog, u8 *ip)
>>>    */
>>>   static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
>>>                                          u8 **pprog, bool 
>>> *callee_regs_used,
>>> -                                       u32 stack_depth, u8 *ip,
>>> -                                       struct jit_context *ctx)
>>> +                                       u32 stack_depth, u16 
>>> outgoing_depth,
>>> +                                       u8 *ip, struct jit_context 
>>> *ctx)
>>>   {
>>>          int tcc_ptr_off = 
>>> BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack_depth);
>>>          u8 *prog = *pprog, *start = *pprog;
>>> @@ -775,6 +803,9 @@ static void emit_bpf_tail_call_indirect(struct 
>>> bpf_prog *bpf_prog,
>>>          /* Inc tail_call_cnt if the slot is populated. */
>>>          EMIT4(0x48, 0x83, 0x00, 0x01);            /* add qword ptr 
>>> [rax], 1 */
>>>
>>> +       /* Deallocate outgoing stack arg area. */
>>> +       emit_add_rsp(&prog, outgoing_depth);
>> leftover?
>> tailcalls and 6+ args don't mix.
>
> Ack. This is due to my negligence.
>
>>
>>> +
>>>          if (bpf_prog->aux->exception_boundary) {
>>>                  pop_callee_regs(&prog, all_callee_regs_used);
>>>                  pop_r12(&prog);
>>> @@ -815,6 +846,7 @@ static void emit_bpf_tail_call_direct(struct 
>>> bpf_prog *bpf_prog,
>>>                                        struct 
>>> bpf_jit_poke_descriptor *poke,
>>>                                        u8 **pprog, u8 *ip,
>>>                                        bool *callee_regs_used, u32 
>>> stack_depth,
>>> +                                     u16 outgoing_depth,
>>>                                        struct jit_context *ctx)
>>>   {
>>>          int tcc_ptr_off = 
>>> BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack_depth);
>>> @@ -842,6 +874,9 @@ static void emit_bpf_tail_call_direct(struct 
>>> bpf_prog *bpf_prog,
>>>          /* Inc tail_call_cnt if the slot is populated. */
>>>          EMIT4(0x48, 0x83, 0x00, 0x01);                /* add qword 
>>> ptr [rax], 1 */
>>>
>>> +       /* Deallocate outgoing stack arg area. */
>>> +       emit_add_rsp(&prog, outgoing_depth);
>>> +
>> another leftover?
>
> Ya. Will remove in the next revision.
>
>>
>>
>>>          if (bpf_prog->aux->exception_boundary) {
>>>                  pop_callee_regs(&prog, all_callee_regs_used);
>>>                  pop_r12(&prog);
>>> @@ -1664,16 +1699,48 @@ static int do_jit(struct bpf_prog *bpf_prog, 
>>> int *addrs, u8 *image, u8 *rw_image
>>>          int i, excnt = 0;
>>>          int ilen, proglen = 0;
>>>          u8 *prog = temp;
>>> +       u16 stack_arg_depth, incoming_stack_arg_depth, 
>>> outgoing_stack_arg_depth;
>>> +       u16 outgoing_rsp;
>>>          u32 stack_depth;
>>> +       int callee_saved_size;
>>> +       s32 outgoing_arg_base;
>>> +       bool has_stack_args;
>>>          int err;
>>>
>>>          stack_depth = bpf_prog->aux->stack_depth;
>>> +       stack_arg_depth = bpf_prog->aux->stack_arg_depth;
>>> +       incoming_stack_arg_depth = 
>>> bpf_prog->aux->incoming_stack_arg_depth;
>>> +       outgoing_stack_arg_depth = stack_arg_depth - 
>>> incoming_stack_arg_depth;
>>>          priv_stack_ptr = bpf_prog->aux->priv_stack_ptr;
>>>          if (priv_stack_ptr) {
>>>                  priv_frame_ptr = priv_stack_ptr + 
>>> PRIV_STACK_GUARD_SZ + round_up(stack_depth, 8);
>>>                  stack_depth = 0;
>>>          }
>>>
>>> +       /*
>>> +        * Follow x86-64 calling convention for both BPF-to-BPF and
>>> +        * kfunc calls:
>>> +        *   - Arg 6 is passed in R9 register
>>> +        *   - Args 7+ are passed on the stack at [rsp]
>>> +        *
>>> +        * Incoming arg 6 is read from R9 (BPF r12+8 → MOV from R9).
>>> +        * Incoming args 7+ are read from [rbp + 16], [rbp + 24], ...
>>> +        * (BPF r12+16, r12+24, ... map directly with no offset 
>>> change).
>>> +        *
>>> +        * The verifier guarantees that neither tail_call_reachable nor
>>> +        * priv_stack is set when outgoing stack args exist, so R9 is
>>> +        * always available.
>>> +        *
>>> +        * Stack layout (high to low):
>>> +        *   [rbp + 16 + ...]    incoming stack args 7+ (from caller)
>>> +        *   [rbp + 8]           return address
>>> +        *   [rbp]               saved rbp
>>> +        *   [rbp - prog_stack]  program stack
>>> +        *   [below]             callee-saved regs
>>> +        *   [below]             outgoing args 7+ (= rsp)
>>> +        */
>>> +       has_stack_args = stack_arg_depth > 0;
>>> +
>>>          arena_vm_start = 
>>> bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
>>>          user_vm_start = 
>>> bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
>>>
>>> @@ -1700,6 +1767,41 @@ static int do_jit(struct bpf_prog *bpf_prog, 
>>> int *addrs, u8 *image, u8 *rw_image
>>>                          push_r12(&prog);
>>>                  push_callee_regs(&prog, callee_regs_used);
>>>          }
>>> +
>>> +       /* Compute callee-saved register area size. */
>>> +       callee_saved_size = 0;
>>> +       if (bpf_prog->aux->exception_boundary || arena_vm_start)
>>> +               callee_saved_size += 8; /* r12 */
>>> +       if (bpf_prog->aux->exception_boundary) {
>>> +               callee_saved_size += 4 * 8; /* rbx, r13, r14, r15 */
>>> +       } else {
>>> +               int j;
>>> +
>>> +               for (j = 0; j < 4; j++)
>>> +                       if (callee_regs_used[j])
>>> +                               callee_saved_size += 8;
>>> +       }
>>> +       /*
>>> +        * Base offset from rbp for translating BPF outgoing args 7+
>>> +        * to native offsets:
>>> +        *   native_off = outgoing_arg_base + bpf_off
>>> +        *
>>> +        * BPF outgoing offsets are negative (r12 - N*8 for arg6,
>>> +        * ..., r12 - 8 for last arg). Arg 6 goes to R9 directly,
>>> +        * so only args 7+ occupy the outgoing stack area.
>>> +        *
>>> +        * Note that tail_call_reachable is guaranteed to be false when
>>> +        * stack args exist, so tcc pushes need not be accounted for.
>>> +        */
>>> +       outgoing_arg_base = -(round_up(stack_depth, 8) + 
>>> callee_saved_size);
>>> +
>>> +       /*
>>> +        * Allocate outgoing stack arg area for args 7+ only.
>>> +        * Arg 6 goes into r9 register, not on stack.
>>> +        */
>>> +       outgoing_rsp = outgoing_stack_arg_depth > 8 ? 
>>> outgoing_stack_arg_depth - 8 : 0;
>>> +       emit_sub_rsp(&prog, outgoing_rsp);
>>> +
>>>          if (arena_vm_start)
>>>                  emit_mov_imm64(&prog, X86_REG_R12,
>>>                                 arena_vm_start >> 32, (u32) 
>>> arena_vm_start);
>>> @@ -1715,13 +1817,14 @@ static int do_jit(struct bpf_prog *bpf_prog, 
>>> int *addrs, u8 *image, u8 *rw_image
>>>          prog = temp;
>>>
>>>          for (i = 1; i <= insn_cnt; i++, insn++) {
>>> +               bool adjust_stack_arg_off = false;
>> This bool signal within a single insn is hard to read.
>
> This can be removed as we can directly compare src_reg/dst_reg to 
> BPF_REG_STACK_ARG_BASE
> in the below.
>
>>
>>>                  const s32 imm32 = insn->imm;
>>>                  u32 dst_reg = insn->dst_reg;
>>>                  u32 src_reg = insn->src_reg;
>>>                  u8 b2 = 0, b3 = 0;
>>>                  u8 *start_of_ldx;
>>>                  s64 jmp_offset;
>>> -               s16 insn_off;
>>> +               s32 insn_off;
>>>                  u8 jmp_cond;
>>>                  u8 *func;
>>>                  int nops;
>>> @@ -1734,6 +1837,21 @@ static int do_jit(struct bpf_prog *bpf_prog, 
>>> int *addrs, u8 *image, u8 *rw_image
>>>                                  dst_reg = X86_REG_R9;
>>>                  }
>>>
>>> +               if (has_stack_args) {
>>> +                       u8 class = BPF_CLASS(insn->code);
>>> +
>>> +                       if (class == BPF_LDX &&
>>> +                           src_reg == BPF_REG_STACK_ARG_BASE) {
>>> +                               src_reg = BPF_REG_FP;
>>> +                               adjust_stack_arg_off = true;
>>> +                       }
>>> +                       if ((class == BPF_STX || class == BPF_ST) &&
>>> +                           dst_reg == BPF_REG_STACK_ARG_BASE) {
>>> +                               dst_reg = BPF_REG_FP;
>>> +                               adjust_stack_arg_off = true;
>>> +                       }
>>> +               }
>> All that stuff looks unnecessary.
>
> Ack.
>
>>
>>> +
>>>                  switch (insn->code) {
>>>                          /* ALU */
>>>                  case BPF_ALU | BPF_ADD | BPF_X:
>>> @@ -2129,12 +2247,20 @@ static int do_jit(struct bpf_prog *bpf_prog, 
>>> int *addrs, u8 *image, u8 *rw_image
>>>                                  EMIT1(0xC7);
>>>                          goto st;
>>>                  case BPF_ST | BPF_MEM | BPF_DW:
>>> +                       if (adjust_stack_arg_off && insn->off == 
>>> -outgoing_stack_arg_depth) {
>>> +                               /* Arg 6: store immediate in r9 
>>> register */
>>> +                               emit_mov_imm64(&prog, X86_REG_R9, 
>>> imm32 >> 31, (u32)imm32);
>>> +                               break;
>>> +                       }
>>>                          EMIT2(add_1mod(0x48, dst_reg), 0xC7);
>>>
>>> -st:                    if (is_imm8(insn->off))
>>> -                               EMIT2(add_1reg(0x40, dst_reg), 
>>> insn->off);
>>> +st:                    insn_off = insn->off;
>>> +                       if (adjust_stack_arg_off)
>>> +                               insn_off = outgoing_arg_base + 
>>> insn_off;
>> Since this part needs to be done anyway, match dst_reg==r11 here
>> and do the right thing without bool adjust_stack_arg_off ?
>
> Yes, we can do something like below:
>
>                 case BPF_ST | BPF_MEM | BPF_DW:
> -                       if (adjust_stack_arg_off && insn->off == 
> -outgoing_stack_arg_depth) {
> +                       if (dst_reg == BPF_REG_STACK_ARG_BASE && 
> insn->off == -outgoing_stack_arg_depth) {
>                                 /* Arg 6: store immediate in r9 
> register */
>                                 emit_mov_imm64(&prog, X86_REG_R9, 
> imm32 >> 31, (u32)imm32);
>                                 break;
> @@ -2255,8 +2230,10 @@ static int do_jit(struct bpf_prog *bpf_prog, 
> int *addrs, u8 *image, u8 *rw_image
>                         EMIT2(add_1mod(0x48, dst_reg), 0xC7);
>
>  st:                    insn_off = insn->off;
> -                       if (adjust_stack_arg_off)
> +                       if (dst_reg == BPF_REG_STACK_ARG_BASE) {
>                                 insn_off = outgoing_arg_base + insn_off;
> +                               dst_reg = BPF_REG_FP;
> +                       }
>                         if (is_imm8(insn_off))
>                                 EMIT2(add_1reg(0x40, dst_reg), insn_off);
>                         else

Alexei,

I think we have an issue in the above w.r.t. outgoing_stack_arg_depth.
Puranjay discovered this issue.

In the above, we have

    if (dst_reg == BPF_REG_STACK_ARG_BASE && insn->off == -outgoing_stack_arg_depth) {
	/* Arg 6: store immediate in r9 register */
	...
    }

The outgoing_stack_arg_depth is the *max* depth among all callees.
For example, bar() calls foo1() and foo2():
    foo1(a1, a2, a3, a4, a5, a6, a7, a8); // int type
    foo2(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10); // int type

In this case, the outgoing_stack_arg_depth will be 40 bytes.

for foo1:
    a6: off -24
    a7: off -16
    a8: off -8
for foo2:
    a6:  off -40
    a7:  off -32
    a8:  off -24
    a9:  off -16
    a10: off -8

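The current approach works for foo2(), but not for foo1(): the check
above only matches off -40 (-outgoing_stack_arg_depth), so foo1's a6
at off -24 is not recognized as arg 6 and is not moved into r9.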

I think we should do

for foo1 (from llvm)
   a6: off -8
   a7: off -16
   a8: off -24
for foo2 (from llvm)
   a6:  off -8
   a7:  off -16
   a8:  off -24
   a9:  off -32
   a10: off -40

For x86,
   a store to off -8 will become a move into r9,
and then the remaining arguments can be placed as follows:
foo1 (4 stack slots):
     <unknown>
     <unknown>
     off -24
     off -16
foo2 (4 stack slots):
     off -40
     off -32
     off -24
     off -16

Similarly for arm64,

The first 3 stack arguments (a6, a7, a8) correspond to three registers
(assuming w5 holds argument #6 on arm64):
    a6 (off -8) -> w5
    a7 (off -16) -> w6
    a8 (off -24) -> w7

The rest is similar to foo1() and foo2() above, with the bottom slot at 'off -32' if it exists.

Do you think this will work? If yes, LLVM needs to be updated accordingly.


  parent reply	other threads:[~2026-04-14 16:46 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-12  4:58 [PATCH bpf-next v4 00/18] bpf: Support stack arguments for BPF functions and kfuncs Yonghong Song
2026-04-12  4:58 ` [PATCH bpf-next v4 01/18] bpf: Remove unused parameter from check_map_kptr_access() Yonghong Song
2026-04-12  4:58 ` [PATCH bpf-next v4 02/18] bpf: Change from "arg #%d" to "arg#%d" in verifier log Yonghong Song
2026-04-12  4:58 ` [PATCH bpf-next v4 03/18] bpf: Refactor to avoid redundant calculation of bpf_reg_state Yonghong Song
2026-04-12  5:31   ` bot+bpf-ci
2026-04-13 14:25     ` Yonghong Song
2026-04-12  4:58 ` [PATCH bpf-next v4 04/18] bpf: Refactor to handle memory and size together Yonghong Song
2026-04-12  5:31   ` bot+bpf-ci
2026-04-13 14:27     ` Yonghong Song
2026-04-12  4:58 ` [PATCH bpf-next v4 05/18] bpf: Change some regno type from u32 to int type Yonghong Song
2026-04-12  4:58 ` [PATCH bpf-next v4 06/18] bpf: Use argument index instead of register index in kfunc verifier logs Yonghong Song
2026-04-12  5:43   ` bot+bpf-ci
2026-04-13 14:37     ` Yonghong Song
2026-04-12 22:01   ` Alexei Starovoitov
2026-04-13 14:45     ` Yonghong Song
2026-04-15 23:23     ` Amery Hung
2026-04-16 14:39       ` Yonghong Song
2026-04-12  4:59 ` [PATCH bpf-next v4 07/18] bpf: Introduce bpf register BPF_REG_STACK_ARG_BASE Yonghong Song
2026-04-12  4:59 ` [PATCH bpf-next v4 08/18] bpf: Reuse MAX_BPF_FUNC_ARGS for maximum number of arguments Yonghong Song
2026-04-12  4:59 ` [PATCH bpf-next v4 09/18] bpf: Support stack arguments for bpf functions Yonghong Song
2026-04-12  5:43   ` bot+bpf-ci
2026-04-13 15:22     ` Yonghong Song
2026-04-12 22:23   ` Alexei Starovoitov
2026-04-13 16:33     ` Yonghong Song
2026-04-12  5:00 ` [PATCH bpf-next v4 10/18] bpf: Fix interaction between stack argument PTR_TO_STACK and dead slot poisoning Yonghong Song
2026-04-12  5:43   ` bot+bpf-ci
2026-04-13 16:36     ` Yonghong Song
2026-04-15 22:32   ` Amery Hung
2026-04-16 14:21     ` Yonghong Song
2026-04-12  5:00 ` [PATCH bpf-next v4 11/18] bpf: Reject stack arguments in non-JITed programs Yonghong Song
2026-04-12  5:00 ` [PATCH bpf-next v4 12/18] bpf: Reject stack arguments if tail call reachable Yonghong Song
2026-04-12  5:43   ` bot+bpf-ci
2026-04-13 16:37     ` Yonghong Song
2026-04-12  5:00 ` [PATCH bpf-next v4 13/18] bpf: Support stack arguments for kfunc calls Yonghong Song
2026-04-12  5:43   ` bot+bpf-ci
2026-04-13 16:43     ` Yonghong Song
2026-04-12  5:00 ` [PATCH bpf-next v4 14/18] bpf: Enable stack argument support for x86_64 Yonghong Song
2026-04-12  5:00 ` [PATCH bpf-next v4 15/18] bpf,x86: Implement JIT support for stack arguments Yonghong Song
2026-04-12  5:43   ` bot+bpf-ci
2026-04-13 16:49     ` Yonghong Song
2026-04-12 22:36   ` Alexei Starovoitov
2026-04-13 17:26     ` Yonghong Song
2026-04-13 19:59       ` Alexei Starovoitov
2026-04-13 20:32         ` Yonghong Song
2026-04-13 20:38           ` Alexei Starovoitov
2026-04-13 21:10             ` Yonghong Song
2026-04-14 16:45       ` Yonghong Song [this message]
2026-04-14 17:51         ` Alexei Starovoitov
2026-04-12  5:00 ` [PATCH bpf-next v4 16/18] selftests/bpf: Add tests for BPF function " Yonghong Song
2026-04-12  5:00 ` [PATCH bpf-next v4 17/18] selftests/bpf: Add negative test for greater-than-8-byte kfunc stack argument Yonghong Song
2026-04-12  5:00 ` [PATCH bpf-next v4 18/18] selftests/bpf: Add verifier tests for stack argument validation Yonghong Song

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=af8bb98f-5df7-4cba-9bd7-84eed5239db9@linux.dev \
    --to=yonghong.song@linux.dev \
    --cc=alexei.starovoitov@gmail.com \
    --cc=andrii@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=jose.marchesi@oracle.com \
    --cc=kernel-team@fb.com \
    --cc=martin.lau@kernel.org \
    --cc=puranjay@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox