From: Jakub Sitnicki <jakub@cloudflare.com>
To: Lorenz Bauer <lmb@cloudflare.com>
Cc: ast@kernel.org, yhs@fb.com, daniel@iogearbox.net,
john.fastabend@gmail.com, bpf@vger.kernel.org,
kernel-team@cloudflare.com
Subject: Re: [PATCH bpf-next 1/3] net: Allow iterating sockmap and sockhash
Date: Mon, 31 Aug 2020 12:03:57 +0200 [thread overview]
Message-ID: <87eennrv1u.fsf@cloudflare.com> (raw)
In-Reply-To: <20200828094834.23290-2-lmb@cloudflare.com>
On Fri, Aug 28, 2020 at 11:48 AM CEST, Lorenz Bauer wrote:
> Add bpf_iter support for sockmap / sockhash, based on the bpf_sk_storage and
> hashtable implementation. sockmap and sockhash share the same iteration
> context: a pointer to an arbitrary key and a pointer to a socket. Both
> pointers may be NULL, and so BPF has to perform a NULL check before accessing
> them. Technically it's not possible for sockhash iteration to yield a NULL
> socket, but we ignore this to be able to use a single iteration point.
>
> Iteration will visit all keys that remain unmodified during the lifetime of
> the iterator. It may or may not visit newly added ones.
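
Just to confirm my reading of the contract: a program consuming this context would look roughly like the sketch below. This is only an illustration based on the struct bpf_iter__sockmap layout in the patch; the section name follows the DEFINE_BPF_ITER_FUNC(sockmap, ...) target, while the include style and the counter globals are made up for the example.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

long visited_keys = 0;
long visited_socks = 0;

SEC("iter/sockmap")
int count_elems(struct bpf_iter__sockmap *ctx)
{
	/* Both pointers may be NULL, e.g. on the end-of-iteration
	 * callback, so check them before dereferencing.
	 */
	if (ctx->key)
		visited_keys++;
	if (ctx->sk)
		visited_socks++;

	return 0;
}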
>
> Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> ---
> net/core/sock_map.c | 283 ++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 283 insertions(+)
>
> diff --git a/net/core/sock_map.c b/net/core/sock_map.c
> index d6c6e1e312fc..31c4332f06e4 100644
> --- a/net/core/sock_map.c
> +++ b/net/core/sock_map.c
> @@ -703,6 +703,116 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = {
> .arg4_type = ARG_ANYTHING,
> };
>
> +struct sock_map_seq_info {
> + struct bpf_map *map;
> + struct sock *sk;
> + u32 index;
> +};
> +
> +struct bpf_iter__sockmap {
> + __bpf_md_ptr(struct bpf_iter_meta *, meta);
> + __bpf_md_ptr(struct bpf_map *, map);
> + __bpf_md_ptr(void *, key);
> + __bpf_md_ptr(struct bpf_sock *, sk);
> +};
> +
> +DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta,
> + struct bpf_map *map, void *key,
> + struct sock *sk)
> +
> +static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info)
> +{
> + if (unlikely(info->index >= info->map->max_entries))
> + return NULL;
> +
> + info->sk = __sock_map_lookup_elem(info->map, info->index);
> + if (!info->sk || !sk_fullsock(info->sk))
As we've talked off-line, we expect neither timewait nor request sockets
in sockmap, so the sk_fullsock() check is likely not needed. A simplified
lookup could look like the sketch below the quoted function.
> + info->sk = NULL;
> +
> + /* continue iterating */
> + return info;
> +}
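
For illustration, dropping the check would leave something like this
(untested sketch, just to show what I mean):

static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info)
{
	if (unlikely(info->index >= info->map->max_entries))
		return NULL;

	/* __sock_map_lookup_elem() already returns NULL for empty slots */
	info->sk = __sock_map_lookup_elem(info->map, info->index);

	/* continue iterating */
	return info;
}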
> +
> +static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> + struct sock_map_seq_info *info = seq->private;
> +
> + if (*pos == 0)
> + ++*pos;
> +
> + /* pairs with sock_map_seq_stop */
> + rcu_read_lock();
> + return sock_map_seq_lookup_elem(info);
> +}
> +
> +static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> + struct sock_map_seq_info *info = seq->private;
> +
> + ++*pos;
> + ++info->index;
> +
> + return sock_map_seq_lookup_elem(info);
> +}
> +
> +static int __sock_map_seq_show(struct seq_file *seq, void *v)
> +{
> + struct sock_map_seq_info *info = seq->private;
> + struct bpf_iter__sockmap ctx = {};
> + struct bpf_iter_meta meta;
> + struct bpf_prog *prog;
> +
> + meta.seq = seq;
> + prog = bpf_iter_get_info(&meta, !v);
> + if (!prog)
> + return 0;
> +
> + ctx.meta = &meta;
> + ctx.map = info->map;
> + if (v) {
> + ctx.key = &info->index;
> + ctx.sk = (struct bpf_sock *)info->sk;
> + }
> +
> + return bpf_iter_run_prog(prog, &ctx);
> +}
> +
> +static int sock_map_seq_show(struct seq_file *seq, void *v)
> +{
> + return __sock_map_seq_show(seq, v);
> +}
> +
> +static void sock_map_seq_stop(struct seq_file *seq, void *v)
> +{
> + if (!v)
> + (void)__sock_map_seq_show(seq, NULL);
> +
> + /* pairs with sock_map_seq_start */
> + rcu_read_unlock();
> +}
> +
> +static const struct seq_operations sock_map_seq_ops = {
> + .start = sock_map_seq_start,
> + .next = sock_map_seq_next,
> + .stop = sock_map_seq_stop,
> + .show = sock_map_seq_show,
> +};
> +
> +static int sock_map_init_seq_private(void *priv_data,
> + struct bpf_iter_aux_info *aux)
> +{
> + struct sock_map_seq_info *info = priv_data;
> +
> + info->map = aux->map;
> + return 0;
> +}
> +
> +static const struct bpf_iter_seq_info sock_map_iter_seq_info = {
> + .seq_ops = &sock_map_seq_ops,
> + .init_seq_private = sock_map_init_seq_private,
> + .seq_priv_size = sizeof(struct sock_map_seq_info),
> +};
> +
> static int sock_map_btf_id;
> const struct bpf_map_ops sock_map_ops = {
> .map_alloc = sock_map_alloc,
[...]
> @@ -1198,6 +1309,120 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
> .arg4_type = ARG_ANYTHING,
> };
>
> +struct sock_hash_seq_info {
> + struct bpf_map *map;
> + struct bpf_shtab *htab;
> + u32 bucket_id;
> +};
> +
> +static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info,
> + struct bpf_shtab_elem *prev_elem)
> +{
> + const struct bpf_shtab *htab = info->htab;
> + struct bpf_shtab_bucket *bucket;
> + struct bpf_shtab_elem *elem;
> + struct hlist_node *node;
> +
> + /* try to find next elem in the same bucket */
> + if (prev_elem) {
> + node = rcu_dereference_raw(hlist_next_rcu(&prev_elem->node));
I'm not convinced we need to go for the rcu_dereference_raw()
variant. The access happens inside a read-side critical section, which we
entered with rcu_read_lock() in sock_hash_seq_start().
That's the typical case, so plain rcu_dereference() seems appropriate. I'm
basing this on what I read in Documentation/RCU/rcu_dereference.rst.
See the sketch below the quoted function for what I have in mind.
> + elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
> + if (elem)
> + return elem;
> +
> + /* no more elements, continue in the next bucket */
> + info->bucket_id++;
> + }
> +
> + for (; info->bucket_id < htab->buckets_num; info->bucket_id++) {
> + bucket = &htab->buckets[info->bucket_id];
> + node = rcu_dereference_raw(hlist_first_rcu(&bucket->head));
> + elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
> + if (elem)
> + return elem;
> + }
> +
> + return NULL;
> +}
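
Concretely, what I have in mind is just (untested, on top of this patch):

-		node = rcu_dereference_raw(hlist_next_rcu(&prev_elem->node));
+		node = rcu_dereference(hlist_next_rcu(&prev_elem->node));

-		node = rcu_dereference_raw(hlist_first_rcu(&bucket->head));
+		node = rcu_dereference(hlist_first_rcu(&bucket->head));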
> +
> +static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> + struct sock_hash_seq_info *info = seq->private;
> +
> + if (*pos == 0)
> + ++*pos;
> +
> + /* pairs with sock_hash_seq_stop */
> + rcu_read_lock();
> + return sock_hash_seq_find_next(info, NULL);
> +}
> +
> +static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> + struct sock_hash_seq_info *info = seq->private;
> +
> + ++*pos;
> + return sock_hash_seq_find_next(info, v);
> +}
> +
> +static int __sock_hash_seq_show(struct seq_file *seq, struct bpf_shtab_elem *elem)
> +{
> + struct sock_hash_seq_info *info = seq->private;
> + struct bpf_iter__sockmap ctx = {};
> + struct bpf_iter_meta meta;
> + struct bpf_prog *prog;
> +
> + meta.seq = seq;
> + prog = bpf_iter_get_info(&meta, !elem);
> + if (!prog)
> + return 0;
> +
> + ctx.meta = &meta;
> + ctx.map = info->map;
> + if (elem) {
> + ctx.key = elem->key;
> + ctx.sk = (struct bpf_sock *)elem->sk;
> + }
> +
> + return bpf_iter_run_prog(prog, &ctx);
> +}
> +
> +static int sock_hash_seq_show(struct seq_file *seq, void *v)
> +{
> + return __sock_hash_seq_show(seq, v);
> +}
> +
> +static void sock_hash_seq_stop(struct seq_file *seq, void *v)
> +{
> + if (!v)
> + (void)__sock_hash_seq_show(seq, NULL);
> +
> + /* pairs with sock_hash_seq_start */
> + rcu_read_unlock();
> +}
> +
> +static const struct seq_operations sock_hash_seq_ops = {
> + .start = sock_hash_seq_start,
> + .next = sock_hash_seq_next,
> + .stop = sock_hash_seq_stop,
> + .show = sock_hash_seq_show,
> +};
> +
> +static int sock_hash_init_seq_private(void *priv_data,
> + struct bpf_iter_aux_info *aux)
> +{
> + struct sock_hash_seq_info *info = priv_data;
> +
> + info->map = aux->map;
> + return 0;
> +}
> +
> +static const struct bpf_iter_seq_info sock_hash_iter_seq_info = {
> + .seq_ops = &sock_hash_seq_ops,
> + .init_seq_private = sock_hash_init_seq_private,
> + .seq_priv_size = sizeof(struct sock_hash_seq_info),
> +};
> +
> static int sock_hash_map_btf_id;
> const struct bpf_map_ops sock_hash_ops = {
> .map_alloc = sock_hash_alloc,
[...]