From: Jakub Sitnicki <jakub@cloudflare.com>
To: Lorenz Bauer <lmb@cloudflare.com>
Cc: ast@kernel.org, yhs@fb.com, daniel@iogearbox.net,
john.fastabend@gmail.com, bpf@vger.kernel.org,
kernel-team@cloudflare.com
Subject: Re: [PATCH bpf-next 1/3] net: Allow iterating sockmap and sockhash
Date: Mon, 31 Aug 2020 12:03:57 +0200 [thread overview]
Message-ID: <87eennrv1u.fsf@cloudflare.com> (raw)
In-Reply-To: <20200828094834.23290-2-lmb@cloudflare.com>
On Fri, Aug 28, 2020 at 11:48 AM CEST, Lorenz Bauer wrote:
> Add bpf_iter support for sockmap / sockhash, based on the bpf_sk_storage and
> hashtable implementation. sockmap and sockhash share the same iteration
> context: a pointer to an arbitrary key and a pointer to a socket. Both
> pointers may be NULL, and so BPF has to perform a NULL check before accessing
> them. Technically it's not possible for sockhash iteration to yield a NULL
> socket, but we ignore this to be able to use a single iteration point.
>
> Iteration will visit all keys that remain unmodified during the lifetime of
> the iterator. It may or may not visit newly added ones.
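
Just to confirm my reading of the contract: a program consuming this context would look roughly like the sketch below. This is only an illustration based on the struct bpf_iter__sockmap layout in the patch; the section name follows the DEFINE_BPF_ITER_FUNC(sockmap, ...) target, while the include style and the counter globals are made up for the example.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

long visited_keys = 0;
long visited_socks = 0;

SEC("iter/sockmap")
int count_elems(struct bpf_iter__sockmap *ctx)
{
	/* Both pointers may be NULL, e.g. on the end-of-iteration
	 * callback, so check them before dereferencing.
	 */
	if (ctx->key)
		visited_keys++;
	if (ctx->sk)
		visited_socks++;

	return 0;
}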
>
> Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
> ---
> net/core/sock_map.c | 283 ++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 283 insertions(+)
>
> diff --git a/net/core/sock_map.c b/net/core/sock_map.c
> index d6c6e1e312fc..31c4332f06e4 100644
> --- a/net/core/sock_map.c
> +++ b/net/core/sock_map.c
> @@ -703,6 +703,116 @@ const struct bpf_func_proto bpf_msg_redirect_map_proto = {
> .arg4_type = ARG_ANYTHING,
> };
>
> +struct sock_map_seq_info {
> + struct bpf_map *map;
> + struct sock *sk;
> + u32 index;
> +};
> +
> +struct bpf_iter__sockmap {
> + __bpf_md_ptr(struct bpf_iter_meta *, meta);
> + __bpf_md_ptr(struct bpf_map *, map);
> + __bpf_md_ptr(void *, key);
> + __bpf_md_ptr(struct bpf_sock *, sk);
> +};
> +
> +DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta,
> + struct bpf_map *map, void *key,
> + struct sock *sk)
> +
> +static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info)
> +{
> + if (unlikely(info->index >= info->map->max_entries))
> + return NULL;
> +
> + info->sk = __sock_map_lookup_elem(info->map, info->index);
> + if (!info->sk || !sk_fullsock(info->sk))
As we've talked off-line, we expect neither timewait nor request sockets
in sockmap, so the sk_fullsock() check is likely not needed. A simplified
lookup could look like the sketch below the quoted function.
> + info->sk = NULL;
> +
> + /* continue iterating */
> + return info;
> +}
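
For illustration, dropping the check would leave something like this
(untested sketch, just to show what I mean):

static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info)
{
	if (unlikely(info->index >= info->map->max_entries))
		return NULL;

	/* __sock_map_lookup_elem() already returns NULL for empty slots */
	info->sk = __sock_map_lookup_elem(info->map, info->index);

	/* continue iterating */
	return info;
}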
> +
> +static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> + struct sock_map_seq_info *info = seq->private;
> +
> + if (*pos == 0)
> + ++*pos;
> +
> + /* pairs with sock_map_seq_stop */
> + rcu_read_lock();
> + return sock_map_seq_lookup_elem(info);
> +}
> +
> +static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> + struct sock_map_seq_info *info = seq->private;
> +
> + ++*pos;
> + ++info->index;
> +
> + return sock_map_seq_lookup_elem(info);
> +}
> +
> +static int __sock_map_seq_show(struct seq_file *seq, void *v)
> +{
> + struct sock_map_seq_info *info = seq->private;
> + struct bpf_iter__sockmap ctx = {};
> + struct bpf_iter_meta meta;
> + struct bpf_prog *prog;
> +
> + meta.seq = seq;
> + prog = bpf_iter_get_info(&meta, !v);
> + if (!prog)
> + return 0;
> +
> + ctx.meta = &meta;
> + ctx.map = info->map;
> + if (v) {
> + ctx.key = &info->index;
> + ctx.sk = (struct bpf_sock *)info->sk;
> + }
> +
> + return bpf_iter_run_prog(prog, &ctx);
> +}
> +
> +static int sock_map_seq_show(struct seq_file *seq, void *v)
> +{
> + return __sock_map_seq_show(seq, v);
> +}
> +
> +static void sock_map_seq_stop(struct seq_file *seq, void *v)
> +{
> + if (!v)
> + (void)__sock_map_seq_show(seq, NULL);
> +
> + /* pairs with sock_map_seq_start */
> + rcu_read_unlock();
> +}
> +
> +static const struct seq_operations sock_map_seq_ops = {
> + .start = sock_map_seq_start,
> + .next = sock_map_seq_next,
> + .stop = sock_map_seq_stop,
> + .show = sock_map_seq_show,
> +};
> +
> +static int sock_map_init_seq_private(void *priv_data,
> + struct bpf_iter_aux_info *aux)
> +{
> + struct sock_map_seq_info *info = priv_data;
> +
> + info->map = aux->map;
> + return 0;
> +}
> +
> +static const struct bpf_iter_seq_info sock_map_iter_seq_info = {
> + .seq_ops = &sock_map_seq_ops,
> + .init_seq_private = sock_map_init_seq_private,
> + .seq_priv_size = sizeof(struct sock_map_seq_info),
> +};
> +
> static int sock_map_btf_id;
> const struct bpf_map_ops sock_map_ops = {
> .map_alloc = sock_map_alloc,
[...]
> @@ -1198,6 +1309,120 @@ const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
> .arg4_type = ARG_ANYTHING,
> };
>
> +struct sock_hash_seq_info {
> + struct bpf_map *map;
> + struct bpf_shtab *htab;
> + u32 bucket_id;
> +};
> +
> +static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info,
> + struct bpf_shtab_elem *prev_elem)
> +{
> + const struct bpf_shtab *htab = info->htab;
> + struct bpf_shtab_bucket *bucket;
> + struct bpf_shtab_elem *elem;
> + struct hlist_node *node;
> +
> + /* try to find next elem in the same bucket */
> + if (prev_elem) {
> + node = rcu_dereference_raw(hlist_next_rcu(&prev_elem->node));
I'm not convinced we need to go for the rcu_dereference_raw()
variant. The access happens inside a read-side critical section, which we
entered with rcu_read_lock() in sock_hash_seq_start().
That's the typical case, so plain rcu_dereference() seems appropriate. I'm
basing this on what I read in Documentation/RCU/rcu_dereference.rst.
See the sketch below the quoted function for what I have in mind.
> + elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
> + if (elem)
> + return elem;
> +
> + /* no more elements, continue in the next bucket */
> + info->bucket_id++;
> + }
> +
> + for (; info->bucket_id < htab->buckets_num; info->bucket_id++) {
> + bucket = &htab->buckets[info->bucket_id];
> + node = rcu_dereference_raw(hlist_first_rcu(&bucket->head));
> + elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
> + if (elem)
> + return elem;
> + }
> +
> + return NULL;
> +}
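
Concretely, what I have in mind is just (untested, on top of this patch):

-		node = rcu_dereference_raw(hlist_next_rcu(&prev_elem->node));
+		node = rcu_dereference(hlist_next_rcu(&prev_elem->node));

-		node = rcu_dereference_raw(hlist_first_rcu(&bucket->head));
+		node = rcu_dereference(hlist_first_rcu(&bucket->head));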
> +
> +static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos)
> +{
> + struct sock_hash_seq_info *info = seq->private;
> +
> + if (*pos == 0)
> + ++*pos;
> +
> + /* pairs with sock_hash_seq_stop */
> + rcu_read_lock();
> + return sock_hash_seq_find_next(info, NULL);
> +}
> +
> +static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> +{
> + struct sock_hash_seq_info *info = seq->private;
> +
> + ++*pos;
> + return sock_hash_seq_find_next(info, v);
> +}
> +
> +static int __sock_hash_seq_show(struct seq_file *seq, struct bpf_shtab_elem *elem)
> +{
> + struct sock_hash_seq_info *info = seq->private;
> + struct bpf_iter__sockmap ctx = {};
> + struct bpf_iter_meta meta;
> + struct bpf_prog *prog;
> +
> + meta.seq = seq;
> + prog = bpf_iter_get_info(&meta, !elem);
> + if (!prog)
> + return 0;
> +
> + ctx.meta = &meta;
> + ctx.map = info->map;
> + if (elem) {
> + ctx.key = elem->key;
> + ctx.sk = (struct bpf_sock *)elem->sk;
> + }
> +
> + return bpf_iter_run_prog(prog, &ctx);
> +}
> +
> +static int sock_hash_seq_show(struct seq_file *seq, void *v)
> +{
> + return __sock_hash_seq_show(seq, v);
> +}
> +
> +static void sock_hash_seq_stop(struct seq_file *seq, void *v)
> +{
> + if (!v)
> + (void)__sock_hash_seq_show(seq, NULL);
> +
> + /* pairs with sock_hash_seq_start */
> + rcu_read_unlock();
> +}
> +
> +static const struct seq_operations sock_hash_seq_ops = {
> + .start = sock_hash_seq_start,
> + .next = sock_hash_seq_next,
> + .stop = sock_hash_seq_stop,
> + .show = sock_hash_seq_show,
> +};
> +
> +static int sock_hash_init_seq_private(void *priv_data,
> + struct bpf_iter_aux_info *aux)
> +{
> + struct sock_hash_seq_info *info = priv_data;
> +
> + info->map = aux->map;
> + return 0;
> +}
> +
> +static const struct bpf_iter_seq_info sock_hash_iter_seq_info = {
> + .seq_ops = &sock_hash_seq_ops,
> + .init_seq_private = sock_hash_init_seq_private,
> + .seq_priv_size = sizeof(struct sock_hash_seq_info),
> +};
> +
> static int sock_hash_map_btf_id;
> const struct bpf_map_ops sock_hash_ops = {
> .map_alloc = sock_hash_alloc,
[...]