From: Arnaldo Carvalho de Melo <acme@kernel.org>
To: Lorenz Bauer <lmb@cloudflare.com>
Cc: ast@kernel.org, daniel@iogearbox.net, jakub@cloudflare.com,
bpf@vger.kernel.org, kernel-team@cloudflare.com
Subject: Re: [PATCH bpf] bpf: plug hole in struct bpf_sk_lookup_kern
Date: Thu, 10 Sep 2020 12:19:12 -0300 [thread overview]
Message-ID: <20200910151912.GF4018363@kernel.org> (raw)
In-Reply-To: <20200910110248.198326-1-lmb@cloudflare.com>
Em Thu, Sep 10, 2020 at 12:02:48PM +0100, Lorenz Bauer escreveu:
> As Alexei points out, struct bpf_sk_lookup_kern has two 4-byte holes.
> This leads to suboptimal instructions being generated (IPv4, x86):
>
> 1372 struct bpf_sk_lookup_kern ctx = {
> 0xffffffff81b87f30 <+624>: xor %eax,%eax
> 0xffffffff81b87f32 <+626>: mov $0x6,%ecx
> 0xffffffff81b87f37 <+631>: lea 0x90(%rsp),%rdi
> 0xffffffff81b87f3f <+639>: movl $0x110002,0x88(%rsp)
> 0xffffffff81b87f4a <+650>: rep stos %rax,%es:(%rdi)
> 0xffffffff81b87f4d <+653>: mov 0x8(%rsp),%eax
> 0xffffffff81b87f51 <+657>: mov %r13d,0x90(%rsp)
> 0xffffffff81b87f59 <+665>: incl %gs:0x7e4970a0(%rip)
> 0xffffffff81b87f60 <+672>: mov %eax,0x8c(%rsp)
> 0xffffffff81b87f67 <+679>: movzwl 0x10(%rsp),%eax
> 0xffffffff81b87f6c <+684>: mov %ax,0xa8(%rsp)
> 0xffffffff81b87f74 <+692>: movzwl 0x38(%rsp),%eax
> 0xffffffff81b87f79 <+697>: mov %ax,0xaa(%rsp)
>
> Fix this by moving around sport and dport. pahole confirms there
> are no more holes:
>
> struct bpf_sk_lookup_kern {
> u16 family; /* 0 2 */
> u16 protocol; /* 2 2 */
> __be16 sport; /* 4 2 */
> u16 dport; /* 6 2 */
> struct {
> __be32 saddr; /* 8 4 */
> __be32 daddr; /* 12 4 */
> } v4; /* 8 8 */
> struct {
> const struct in6_addr * saddr; /* 16 8 */
> const struct in6_addr * daddr; /* 24 8 */
> } v6; /* 16 16 */
> struct sock * selected_sk; /* 32 8 */
> bool no_reuseport; /* 40 1 */
>
> /* size: 48, cachelines: 1, members: 8 */
> /* padding: 7 */
> /* last cacheline: 48 bytes */
> };
Cool, looking at:
[root@five ~]# pahole --sizes | grep ^bpf | sort -nr -k2 | head
bpf_ringbuf 12288 4
bpf_ctx_convert 6960 0
bpf_verifier_env 4816 3
bpf_func_state 1360 1
bpf_trace_sample_data 1152 0
bpf_verifier_log 1048 1
bpf_dispatcher 1048 2
bpf_struct_ops 976 0
bpf_seq_printf_buf 768 0
bpf_htab 640 1
[root@five ~]# pahole --sizes | grep ^bpf | sort -nr -k3 | head
bpf_ringbuf 12288 4
bpf_verifier_env 4816 3
bpf_verifier_state 120 2
bpf_trampoline 376 2
bpf_sk_lookup_kern 56 2
bpf_reg_state 120 2
bpf_func_proto 64 2
bpf_dispatcher 1048 2
bpf_verifier_log 1048 1
bpf_struct_ops_value 64 1
[root@five ~]#
second column is size, third is number of holes (before your patch).
bpf_ringbuf is interesting (this is all using /sys/kernel/btf/vmlinux):
[root@five ~]# pahole bpf_ringbuf
struct bpf_ringbuf {
wait_queue_head_t waitq; /* 0 24 */
struct irq_work work; /* 24 24 */
u64 mask; /* 48 8 */
struct page * * pages; /* 56 8 */
/* --- cacheline 1 boundary (64 bytes) --- */
int nr_pages; /* 64 4 */
/* XXX 60 bytes hole, try to pack */
/* --- cacheline 2 boundary (128 bytes) --- */
spinlock_t spinlock; /* 128 4 */
/* XXX 3964 bytes hole, try to pack */
/* --- cacheline 64 boundary (4096 bytes) --- */
long unsigned int consumer_pos; /* 4096 8 */
/* XXX 4088 bytes hole, try to pack */
/* --- cacheline 128 boundary (8192 bytes) --- */
long unsigned int producer_pos; /* 8192 8 */
/* XXX 4088 bytes hole, try to pack */
/* --- cacheline 192 boundary (12288 bytes) --- */
char data[]; /* 12288 0 */
/* size: 12288, cachelines: 192, members: 9 */
/* sum members: 88, holes: 4, sum holes: 12200 */
};
[root@five ~]#
Which seems crazy, lemme see using DWARF:
[root@five ~]# pahole -F dwarf bpf_ringbuf
struct bpf_ringbuf {
wait_queue_head_t waitq; /* 0 24 */
struct irq_work work; /* 24 24 */
u64 mask; /* 48 8 */
struct page * * pages; /* 56 8 */
/* --- cacheline 1 boundary (64 bytes) --- */
int nr_pages; /* 64 4 */
/* XXX 60 bytes hole, try to pack */
/* --- cacheline 2 boundary (128 bytes) --- */
spinlock_t spinlock __attribute__((__aligned__(64))); /* 128 4 */
/* XXX 3964 bytes hole, try to pack */
/* --- cacheline 64 boundary (4096 bytes) --- */
long unsigned int consumer_pos __attribute__((__aligned__(4096))); /* 4096 8 */
/* XXX 4088 bytes hole, try to pack */
/* --- cacheline 128 boundary (8192 bytes) --- */
long unsigned int producer_pos __attribute__((__aligned__(4096))); /* 8192 8 */
/* XXX 4088 bytes hole, try to pack */
/* --- cacheline 192 boundary (12288 bytes) --- */
char data[] __attribute__((__aligned__(4096))); /* 12288 0 */
/* size: 12288, cachelines: 192, members: 9 */
/* sum members: 88, holes: 4, sum holes: 12200 */
/* forced alignments: 4, forced holes: 4, sum forced holes: 12200 */
} __attribute__((__aligned__(4096)));
[root@five ~]#
Yeah, matches the header files:
struct bpf_ringbuf {
wait_queue_head_t waitq;
struct irq_work work;
u64 mask;
struct page **pages;
int nr_pages;
spinlock_t spinlock ____cacheline_aligned_in_smp;
/* Consumer and producer counters are put into separate pages to allow
* mapping consumer page as r/w, but restrict producer page to r/o.
* This protects producer position from being modified by user-space
* application and ruining in-kernel position tracking.
*/
unsigned long consumer_pos __aligned(PAGE_SIZE);
unsigned long producer_pos __aligned(PAGE_SIZE);
char data[] __aligned(PAGE_SIZE);
};
But:
[root@five ~]# pahole bpf_verifier_env
struct bpf_verifier_env {
u32 insn_idx; /* 0 4 */
u32 prev_insn_idx; /* 4 4 */
struct bpf_prog * prog; /* 8 8 */
const struct bpf_verifier_ops * ops; /* 16 8 */
struct bpf_verifier_stack_elem * head; /* 24 8 */
int stack_size; /* 32 4 */
bool strict_alignment; /* 36 1 */
bool test_state_freq; /* 37 1 */
/* XXX 2 bytes hole, try to pack */
struct bpf_verifier_state * cur_state; /* 40 8 */
struct bpf_verifier_state_list * * explored_states; /* 48 8 */
struct bpf_verifier_state_list * free_list; /* 56 8 */
/* --- cacheline 1 boundary (64 bytes) --- */
struct bpf_map * used_maps[64]; /* 64 512 */
/* --- cacheline 9 boundary (576 bytes) --- */
u32 used_map_cnt; /* 576 4 */
u32 id_gen; /* 580 4 */
bool allow_ptr_leaks; /* 584 1 */
bool allow_ptr_to_map_access; /* 585 1 */
bool bpf_capable; /* 586 1 */
bool bypass_spec_v1; /* 587 1 */
bool bypass_spec_v4; /* 588 1 */
bool seen_direct_write; /* 589 1 */
/* XXX 2 bytes hole, try to pack */
struct bpf_insn_aux_data * insn_aux_data; /* 592 8 */
const struct bpf_line_info * prev_linfo; /* 600 8 */
struct bpf_verifier_log log; /* 608 1048 */
/* --- cacheline 25 boundary (1600 bytes) was 56 bytes ago --- */
struct bpf_subprog_info subprog_info[257]; /* 1656 3084 */
/* XXX 4 bytes hole, try to pack */
/* --- cacheline 74 boundary (4736 bytes) was 8 bytes ago --- */
struct {
int * insn_state; /* 4744 8 */
int * insn_stack; /* 4752 8 */
int cur_stack; /* 4760 4 */
} cfg; /* 4744 24 */
/* XXX last struct has 4 bytes of padding */
u32 pass_cnt; /* 4768 4 */
u32 subprog_cnt; /* 4772 4 */
u32 prev_insn_processed; /* 4776 4 */
u32 insn_processed; /* 4780 4 */
u32 prev_jmps_processed; /* 4784 4 */
u32 jmps_processed; /* 4788 4 */
u64 verification_time; /* 4792 8 */
/* --- cacheline 75 boundary (4800 bytes) --- */
u32 max_states_per_insn; /* 4800 4 */
u32 total_states; /* 4804 4 */
u32 peak_states; /* 4808 4 */
u32 longest_mark_read_walk; /* 4812 4 */
/* size: 4816, cachelines: 76, members: 36 */
/* sum members: 4808, holes: 3, sum holes: 8 */
/* paddings: 1, sum paddings: 4 */
/* last cacheline: 16 bytes */
};
[root@five ~]#
[root@five ~]# pahole --reorganize bpf_verifier_env | tail
u32 jmps_processed; /* 4788 4 */
u64 verification_time; /* 4792 8 */
/* --- cacheline 75 boundary (4800 bytes) --- */
u32 max_states_per_insn; /* 4800 4 */
u32 total_states; /* 4804 4 */
/* size: 4808, cachelines: 76, members: 36 */
/* paddings: 1, sum paddings: 4 */
/* last cacheline: 8 bytes */
}; /* saved 8 bytes! */
[root@five ~]#
And headers have no explicit alignment in the headers. Maybe doing the
reorg will not help.
But looking at disasm for rep stos + pahole... humm...
:-)
One last comment:
[root@five ~]# pahole bpf_trampoline
struct bpf_trampoline {
struct hlist_node hlist; /* 0 16 */
struct mutex mutex; /* 16 32 */
refcount_t refcnt; /* 48 4 */
/* XXX 4 bytes hole, try to pack */
u64 key; /* 56 8 */
/* --- cacheline 1 boundary (64 bytes) --- */
struct {
struct btf_func_model model; /* 64 14 */
/* XXX 2 bytes hole, try to pack */
void * addr; /* 80 8 */
bool ftrace_managed; /* 88 1 */
} func; /* 64 32 */
/* XXX last struct has 7 bytes of padding */
struct bpf_prog * extension_prog; /* 96 8 */
struct hlist_head progs_hlist[3]; /* 104 24 */
/* --- cacheline 2 boundary (128 bytes) --- */
int progs_cnt[3]; /* 128 12 */
/* XXX 4 bytes hole, try to pack */
void * image; /* 144 8 */
u64 selector; /* 152 8 */
struct bpf_ksym ksym; /* 160 216 */
/* XXX last struct has 7 bytes of padding */
/* size: 376, cachelines: 6, members: 11 */
/* sum members: 368, holes: 2, sum holes: 8 */
/* paddings: 2, sum paddings: 14 */
/* last cacheline: 56 bytes */
};
[root@five ~]#
perhaps moving ftrace_managed to before addr in that anonymous struct on
the 'func' member may help?
- Arnaldo
next prev parent reply other threads:[~2020-09-10 19:47 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-09-10 11:02 [PATCH bpf] bpf: plug hole in struct bpf_sk_lookup_kern Lorenz Bauer
2020-09-10 12:21 ` Jesper Dangaard Brouer
2020-09-10 15:19 ` Arnaldo Carvalho de Melo [this message]
2020-09-11 0:53 ` Alexei Starovoitov
2020-09-11 8:05 ` Lorenz Bauer
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200910151912.GF4018363@kernel.org \
--to=acme@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=jakub@cloudflare.com \
--cc=kernel-team@cloudflare.com \
--cc=lmb@cloudflare.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.