All of lore.kernel.org
 help / color / mirror / Atom feed
From: Arnaldo Carvalho de Melo <acme@kernel.org>
To: Lorenz Bauer <lmb@cloudflare.com>
Cc: ast@kernel.org, daniel@iogearbox.net, jakub@cloudflare.com,
	bpf@vger.kernel.org, kernel-team@cloudflare.com
Subject: Re: [PATCH bpf] bpf: plug hole in struct bpf_sk_lookup_kern
Date: Thu, 10 Sep 2020 12:19:12 -0300	[thread overview]
Message-ID: <20200910151912.GF4018363@kernel.org> (raw)
In-Reply-To: <20200910110248.198326-1-lmb@cloudflare.com>

Em Thu, Sep 10, 2020 at 12:02:48PM +0100, Lorenz Bauer escreveu:
> As Alexei points out, struct bpf_sk_lookup_kern has two 4-byte holes.
> This leads to suboptimal instructions being generated (IPv4, x86):
> 
>     1372                    struct bpf_sk_lookup_kern ctx = {
>        0xffffffff81b87f30 <+624>:   xor    %eax,%eax
>        0xffffffff81b87f32 <+626>:   mov    $0x6,%ecx
>        0xffffffff81b87f37 <+631>:   lea    0x90(%rsp),%rdi
>        0xffffffff81b87f3f <+639>:   movl   $0x110002,0x88(%rsp)
>        0xffffffff81b87f4a <+650>:   rep stos %rax,%es:(%rdi)
>        0xffffffff81b87f4d <+653>:   mov    0x8(%rsp),%eax
>        0xffffffff81b87f51 <+657>:   mov    %r13d,0x90(%rsp)
>        0xffffffff81b87f59 <+665>:   incl   %gs:0x7e4970a0(%rip)
>        0xffffffff81b87f60 <+672>:   mov    %eax,0x8c(%rsp)
>        0xffffffff81b87f67 <+679>:   movzwl 0x10(%rsp),%eax
>        0xffffffff81b87f6c <+684>:   mov    %ax,0xa8(%rsp)
>        0xffffffff81b87f74 <+692>:   movzwl 0x38(%rsp),%eax
>        0xffffffff81b87f79 <+697>:   mov    %ax,0xaa(%rsp)
> 
> Fix this by moving around sport and dport. pahole confirms there
> are no more holes:
> 
>     struct bpf_sk_lookup_kern {
>         u16                        family;       /*     0     2 */
>         u16                        protocol;     /*     2     2 */
>         __be16                     sport;        /*     4     2 */
>         u16                        dport;        /*     6     2 */
>         struct {
>                 __be32             saddr;        /*     8     4 */
>                 __be32             daddr;        /*    12     4 */
>         } v4;                                    /*     8     8 */
>         struct {
>                 const struct in6_addr  * saddr;  /*    16     8 */
>                 const struct in6_addr  * daddr;  /*    24     8 */
>         } v6;                                    /*    16    16 */
>         struct sock *              selected_sk;  /*    32     8 */
>         bool                       no_reuseport; /*    40     1 */
> 
>         /* size: 48, cachelines: 1, members: 8 */
>         /* padding: 7 */
>         /* last cacheline: 48 bytes */
>     };

Cool, looking at:

[root@five ~]# pahole --sizes | grep ^bpf | sort -nr -k2 | head
bpf_ringbuf	12288	4
bpf_ctx_convert	6960	0
bpf_verifier_env	4816	3
bpf_func_state	1360	1
bpf_trace_sample_data	1152	0
bpf_verifier_log	1048	1
bpf_dispatcher	1048	2
bpf_struct_ops	976	0
bpf_seq_printf_buf	768	0
bpf_htab	640	1
[root@five ~]# pahole --sizes | grep ^bpf | sort -nr -k3 | head
bpf_ringbuf	12288	4
bpf_verifier_env	4816	3
bpf_verifier_state	120	2
bpf_trampoline	376	2
bpf_sk_lookup_kern	56	2
bpf_reg_state	120	2
bpf_func_proto	64	2
bpf_dispatcher	1048	2
bpf_verifier_log	1048	1
bpf_struct_ops_value	64	1
[root@five ~]#

second column is size, third is number of holes (before your patch).

bpf_ringbuf is interesting (this is all using /sys/kernel/btf/vmlinux):

[root@five ~]# pahole bpf_ringbuf
struct bpf_ringbuf {
	wait_queue_head_t          waitq;                /*     0    24 */
	struct irq_work            work;                 /*    24    24 */
	u64                        mask;                 /*    48     8 */
	struct page * *            pages;                /*    56     8 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	int                        nr_pages;             /*    64     4 */

	/* XXX 60 bytes hole, try to pack */

	/* --- cacheline 2 boundary (128 bytes) --- */
	spinlock_t                 spinlock;             /*   128     4 */

	/* XXX 3964 bytes hole, try to pack */

	/* --- cacheline 64 boundary (4096 bytes) --- */
	long unsigned int          consumer_pos;         /*  4096     8 */

	/* XXX 4088 bytes hole, try to pack */

	/* --- cacheline 128 boundary (8192 bytes) --- */
	long unsigned int          producer_pos;         /*  8192     8 */

	/* XXX 4088 bytes hole, try to pack */

	/* --- cacheline 192 boundary (12288 bytes) --- */
	char                       data[];               /* 12288     0 */

	/* size: 12288, cachelines: 192, members: 9 */
	/* sum members: 88, holes: 4, sum holes: 12200 */
};
[root@five ~]#

Which seems crazy, lemme see using DWARF:

[root@five ~]# pahole -F dwarf bpf_ringbuf
struct bpf_ringbuf {
	wait_queue_head_t          waitq;                /*     0    24 */
	struct irq_work            work;                 /*    24    24 */
	u64                        mask;                 /*    48     8 */
	struct page * *            pages;                /*    56     8 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	int                        nr_pages;             /*    64     4 */

	/* XXX 60 bytes hole, try to pack */

	/* --- cacheline 2 boundary (128 bytes) --- */
	spinlock_t                 spinlock __attribute__((__aligned__(64))); /*   128     4 */

	/* XXX 3964 bytes hole, try to pack */

	/* --- cacheline 64 boundary (4096 bytes) --- */
	long unsigned int          consumer_pos __attribute__((__aligned__(4096))); /*  4096     8 */

	/* XXX 4088 bytes hole, try to pack */

	/* --- cacheline 128 boundary (8192 bytes) --- */
	long unsigned int          producer_pos __attribute__((__aligned__(4096))); /*  8192     8 */

	/* XXX 4088 bytes hole, try to pack */

	/* --- cacheline 192 boundary (12288 bytes) --- */
	char                       data[] __attribute__((__aligned__(4096))); /* 12288     0 */

	/* size: 12288, cachelines: 192, members: 9 */
	/* sum members: 88, holes: 4, sum holes: 12200 */
	/* forced alignments: 4, forced holes: 4, sum forced holes: 12200 */
} __attribute__((__aligned__(4096)));
[root@five ~]#

Yeah, matches the header files:

struct bpf_ringbuf {
        wait_queue_head_t waitq;
        struct irq_work work;
        u64 mask;
        struct page **pages;
        int nr_pages;
        spinlock_t spinlock ____cacheline_aligned_in_smp;
        /* Consumer and producer counters are put into separate pages to allow
         * mapping consumer page as r/w, but restrict producer page to r/o.
         * This protects producer position from being modified by user-space
         * application and ruining in-kernel position tracking.
         */
        unsigned long consumer_pos __aligned(PAGE_SIZE);
        unsigned long producer_pos __aligned(PAGE_SIZE);
        char data[] __aligned(PAGE_SIZE);
};


But:

[root@five ~]# pahole bpf_verifier_env
struct bpf_verifier_env {
	u32                        insn_idx;             /*     0     4 */
	u32                        prev_insn_idx;        /*     4     4 */
	struct bpf_prog *          prog;                 /*     8     8 */
	const struct bpf_verifier_ops  * ops;            /*    16     8 */
	struct bpf_verifier_stack_elem * head;           /*    24     8 */
	int                        stack_size;           /*    32     4 */
	bool                       strict_alignment;     /*    36     1 */
	bool                       test_state_freq;      /*    37     1 */

	/* XXX 2 bytes hole, try to pack */

	struct bpf_verifier_state * cur_state;           /*    40     8 */
	struct bpf_verifier_state_list * * explored_states; /*    48     8 */
	struct bpf_verifier_state_list * free_list;      /*    56     8 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct bpf_map *           used_maps[64];        /*    64   512 */
	/* --- cacheline 9 boundary (576 bytes) --- */
	u32                        used_map_cnt;         /*   576     4 */
	u32                        id_gen;               /*   580     4 */
	bool                       allow_ptr_leaks;      /*   584     1 */
	bool                       allow_ptr_to_map_access; /*   585     1 */
	bool                       bpf_capable;          /*   586     1 */
	bool                       bypass_spec_v1;       /*   587     1 */
	bool                       bypass_spec_v4;       /*   588     1 */
	bool                       seen_direct_write;    /*   589     1 */

	/* XXX 2 bytes hole, try to pack */

	struct bpf_insn_aux_data * insn_aux_data;        /*   592     8 */
	const struct bpf_line_info  * prev_linfo;        /*   600     8 */
	struct bpf_verifier_log    log;                  /*   608  1048 */
	/* --- cacheline 25 boundary (1600 bytes) was 56 bytes ago --- */
	struct bpf_subprog_info    subprog_info[257];    /*  1656  3084 */

	/* XXX 4 bytes hole, try to pack */

	/* --- cacheline 74 boundary (4736 bytes) was 8 bytes ago --- */
	struct {
		int *              insn_state;           /*  4744     8 */
		int *              insn_stack;           /*  4752     8 */
		int                cur_stack;            /*  4760     4 */
	} cfg;                                           /*  4744    24 */

	/* XXX last struct has 4 bytes of padding */

	u32                        pass_cnt;             /*  4768     4 */
	u32                        subprog_cnt;          /*  4772     4 */
	u32                        prev_insn_processed;  /*  4776     4 */
	u32                        insn_processed;       /*  4780     4 */
	u32                        prev_jmps_processed;  /*  4784     4 */
	u32                        jmps_processed;       /*  4788     4 */
	u64                        verification_time;    /*  4792     8 */
	/* --- cacheline 75 boundary (4800 bytes) --- */
	u32                        max_states_per_insn;  /*  4800     4 */
	u32                        total_states;         /*  4804     4 */
	u32                        peak_states;          /*  4808     4 */
	u32                        longest_mark_read_walk; /*  4812     4 */

	/* size: 4816, cachelines: 76, members: 36 */
	/* sum members: 4808, holes: 3, sum holes: 8 */
	/* paddings: 1, sum paddings: 4 */
	/* last cacheline: 16 bytes */
};
[root@five ~]#

[root@five ~]# pahole --reorganize bpf_verifier_env | tail
	u32                        jmps_processed;       /*  4788     4 */
	u64                        verification_time;    /*  4792     8 */
	/* --- cacheline 75 boundary (4800 bytes) --- */
	u32                        max_states_per_insn;  /*  4800     4 */
	u32                        total_states;         /*  4804     4 */

	/* size: 4808, cachelines: 76, members: 36 */
	/* paddings: 1, sum paddings: 4 */
	/* last cacheline: 8 bytes */
};   /* saved 8 bytes! */
[root@five ~]#


And headers have no explicit alignment in the headers. Maybe doing the
reorg will not help.

But looking at disasm for rep stos + pahole... humm...

:-)

One last comment:

[root@five ~]# pahole bpf_trampoline
struct bpf_trampoline {
	struct hlist_node          hlist;                /*     0    16 */
	struct mutex               mutex;                /*    16    32 */
	refcount_t                 refcnt;               /*    48     4 */

	/* XXX 4 bytes hole, try to pack */

	u64                        key;                  /*    56     8 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct {
		struct btf_func_model model;             /*    64    14 */

		/* XXX 2 bytes hole, try to pack */

		void *             addr;                 /*    80     8 */
		bool               ftrace_managed;       /*    88     1 */
	} func;                                          /*    64    32 */

	/* XXX last struct has 7 bytes of padding */

	struct bpf_prog *          extension_prog;       /*    96     8 */
	struct hlist_head          progs_hlist[3];       /*   104    24 */
	/* --- cacheline 2 boundary (128 bytes) --- */
	int                        progs_cnt[3];         /*   128    12 */

	/* XXX 4 bytes hole, try to pack */

	void *                     image;                /*   144     8 */
	u64                        selector;             /*   152     8 */
	struct bpf_ksym            ksym;                 /*   160   216 */

	/* XXX last struct has 7 bytes of padding */

	/* size: 376, cachelines: 6, members: 11 */
	/* sum members: 368, holes: 2, sum holes: 8 */
	/* paddings: 2, sum paddings: 14 */
	/* last cacheline: 56 bytes */
};
[root@five ~]#

perhaps moving ftrace_managed to before addr in that anonymous struct on
the 'func' member may help?

- Arnaldo


  parent reply	other threads:[~2020-09-10 19:47 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-09-10 11:02 [PATCH bpf] bpf: plug hole in struct bpf_sk_lookup_kern Lorenz Bauer
2020-09-10 12:21 ` Jesper Dangaard Brouer
2020-09-10 15:19 ` Arnaldo Carvalho de Melo [this message]
2020-09-11  0:53 ` Alexei Starovoitov
2020-09-11  8:05   ` Lorenz Bauer

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200910151912.GF4018363@kernel.org \
    --to=acme@kernel.org \
    --cc=ast@kernel.org \
    --cc=bpf@vger.kernel.org \
    --cc=daniel@iogearbox.net \
    --cc=jakub@cloudflare.com \
    --cc=kernel-team@cloudflare.com \
    --cc=lmb@cloudflare.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.