All of lore.kernel.org
 help / color / mirror / Atom feed
From: Stanislav Fomichev <sdf@google.com>
To: Cong Wang <xiyou.wangcong@gmail.com>
Cc: netdev@vger.kernel.org, bpf@vger.kernel.org,
	Cong Wang <cong.wang@bytedance.com>,
	John Fastabend <john.fastabend@gmail.com>,
	Jakub Sitnicki <jakub@cloudflare.com>
Subject: Re: [Patch net-next v2] sock_map: dump socket map id via diag
Date: Tue, 14 Mar 2023 10:58:35 -0700	[thread overview]
Message-ID: <ZBC1y4tuZgcQ1YZz@google.com> (raw)
In-Reply-To: <20230313041619.394914-1-xiyou.wangcong@gmail.com>

On 03/12, Cong Wang wrote:
> From: Cong Wang <cong.wang@bytedance.com>

> Currently there is no way to know from the outside which sockmap a socket
> has been added to, especially since a socket can be added to multiple
> sockmaps. We could dump this via socket diag, as shown below.

> Sample output:

>    # ./iproute2/misc/ss -tnaie --sockmap
>    ESTAB  0      344329     127.0.0.1:1234     127.0.0.1:40912 ino:21098 sk:5 cgroup:/user.slice/user-0.slice/session-c1.scope <-> sockmap: 1

>    # bpftool map
>    1: sockmap  flags 0x0
>    	key 4B  value 4B  max_entries 2  memlock 4096B
> 	pids echo-sockmap(549)
>    4: array  name pid_iter.rodata  flags 0x480
> 	key 4B  value 4B  max_entries 1  memlock 4096B
> 	btf_id 10  frozen
> 	pids bpftool(624)

> In the future, we could dump other sockmap-related stats too, hence I
> made it a nested attribute.

> Cc: John Fastabend <john.fastabend@gmail.com>
> Cc: Jakub Sitnicki <jakub@cloudflare.com>
> Signed-off-by: Cong Wang <cong.wang@bytedance.com>
> ---
> v2: rename enums with more generic names
>      sock_map_idiag_dump -> sock_map_diag_dump()
>      make sock_map_diag_dump() return number of maps

>   include/linux/bpf.h            |  1 +
>   include/uapi/linux/inet_diag.h |  1 +
>   include/uapi/linux/sock_diag.h |  8 ++++++
>   include/uapi/linux/unix_diag.h |  1 +
>   net/core/sock_map.c            | 51 ++++++++++++++++++++++++++++++++++
>   net/ipv4/inet_diag.c           |  5 ++++
>   net/unix/diag.c                |  6 ++++
>   7 files changed, 73 insertions(+)

> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 6792a7940e1e..4cc315ce26a9 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -2638,6 +2638,7 @@ int sock_map_bpf_prog_query(const union bpf_attr  
> *attr,
>   void sock_map_unhash(struct sock *sk);
>   void sock_map_destroy(struct sock *sk);
>   void sock_map_close(struct sock *sk, long timeout);
> +int sock_map_diag_dump(struct sock *sk, struct sk_buff *skb, int attr);
>   #else
>   static inline int bpf_dev_bound_kfunc_check(struct bpf_verifier_log *log,
>   					    struct bpf_prog_aux *prog_aux)
> diff --git a/include/uapi/linux/inet_diag.h  
> b/include/uapi/linux/inet_diag.h
> index 50655de04c9b..d1f1e4522633 100644
> --- a/include/uapi/linux/inet_diag.h
> +++ b/include/uapi/linux/inet_diag.h
> @@ -161,6 +161,7 @@ enum {
>   	INET_DIAG_SK_BPF_STORAGES,
>   	INET_DIAG_CGROUP_ID,
>   	INET_DIAG_SOCKOPT,
> +	INET_DIAG_BPF_MAP,
>   	__INET_DIAG_MAX,
>   };

> diff --git a/include/uapi/linux/sock_diag.h  
> b/include/uapi/linux/sock_diag.h
> index 5f74a5f6091d..7c961940b408 100644
> --- a/include/uapi/linux/sock_diag.h
> +++ b/include/uapi/linux/sock_diag.h
> @@ -62,4 +62,12 @@ enum {

>   #define SK_DIAG_BPF_STORAGE_MAX        (__SK_DIAG_BPF_STORAGE_MAX - 1)

> +enum {
> +	SK_DIAG_BPF_MAP_NONE,
> +	SK_DIAG_BPF_MAP_IDS,
> +	__SK_DIAG_BPF_MAP_MAX,
> +};
> +
> +#define SK_DIAG_BPF_MAP_MAX        (__SK_DIAG_BPF_MAP_MAX - 1)
> +
>   #endif /* _UAPI__SOCK_DIAG_H__ */
> diff --git a/include/uapi/linux/unix_diag.h  
> b/include/uapi/linux/unix_diag.h
> index a1988576fa8a..b95a2b33521d 100644
> --- a/include/uapi/linux/unix_diag.h
> +++ b/include/uapi/linux/unix_diag.h
> @@ -42,6 +42,7 @@ enum {
>   	UNIX_DIAG_MEMINFO,
>   	UNIX_DIAG_SHUTDOWN,
>   	UNIX_DIAG_UID,
> +	UNIX_DIAG_BPF_MAP,

>   	__UNIX_DIAG_MAX,
>   };
> diff --git a/net/core/sock_map.c b/net/core/sock_map.c
> index 9b854e236d23..8c4b3044e7a9 100644
> --- a/net/core/sock_map.c
> +++ b/net/core/sock_map.c
> @@ -1656,6 +1656,57 @@ void sock_map_close(struct sock *sk, long timeout)
>   }
>   EXPORT_SYMBOL_GPL(sock_map_close);

> +int sock_map_diag_dump(struct sock *sk, struct sk_buff *skb, int  
> attrtype)
> +{
> +	struct sk_psock_link *link;
> +	struct nlattr *nla, *attr;
> +	int nr_links = 0, ret = 0;
> +	struct sk_psock *psock;
> +	u32 *ids;
> +

[..]

> +	rcu_read_lock();

Looks like sk_psock_get() already takes the RCU read lock internally and
returns a refcounted psock, so the outer RCU read-side section seems
unnecessary. Maybe drop the rcu_read_lock()/rcu_read_unlock() calls here?

> +	psock = sk_psock_get(sk);
> +	if (unlikely(!psock)) {
> +		rcu_read_unlock();
> +		return 0;
> +	}
> +
> +	nla = nla_nest_start_noflag(skb, attrtype);
> +	if (!nla) {
> +		sk_psock_put(sk, psock);
> +		rcu_read_unlock();
> +		return -EMSGSIZE;
> +	}
> +	spin_lock_bh(&psock->link_lock);
> +	list_for_each_entry(link, &psock->link, list)
> +		nr_links++;
> +
> +	attr = nla_reserve(skb, SK_DIAG_BPF_MAP_IDS,
> +			   sizeof(link->map->id) * nr_links);
> +	if (!attr) {
> +		ret = -EMSGSIZE;
> +		goto unlock;
> +	}
> +
> +	ids = nla_data(attr);
> +	list_for_each_entry(link, &psock->link, list) {
> +		*ids = link->map->id;
> +		ids++;
> +	}
> +unlock:
> +	spin_unlock_bh(&psock->link_lock);
> +	sk_psock_put(sk, psock);
> +	rcu_read_unlock();
> +	if (ret) {
> +		nla_nest_cancel(skb, nla);
> +	} else {
> +		ret = nr_links;
> +		nla_nest_end(skb, nla);
> +	}
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(sock_map_diag_dump);
> +
>   static int sock_map_iter_attach_target(struct bpf_prog *prog,
>   				       union bpf_iter_link_info *linfo,
>   				       struct bpf_iter_aux_info *aux)
> diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
> index b812eb36f0e3..0949909d5b46 100644
> --- a/net/ipv4/inet_diag.c
> +++ b/net/ipv4/inet_diag.c
> @@ -197,6 +197,11 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct  
> sk_buff *skb,
>   		    &inet_sockopt))
>   		goto errout;

> +#ifdef CONFIG_BPF_SYSCALL
> +	if (sock_map_diag_dump(sk, skb, INET_DIAG_BPF_MAP) < 0)
> +		goto errout;
> +#endif
> +
>   	return 0;
>   errout:
>   	return 1;
> diff --git a/net/unix/diag.c b/net/unix/diag.c
> index 616b55c5b890..54aa8da2831e 100644
> --- a/net/unix/diag.c
> +++ b/net/unix/diag.c
> @@ -6,6 +6,7 @@
>   #include <linux/skbuff.h>
>   #include <linux/module.h>
>   #include <linux/uidgid.h>
> +#include <linux/bpf.h>
>   #include <net/netlink.h>
>   #include <net/af_unix.h>
>   #include <net/tcp_states.h>
> @@ -172,6 +173,11 @@ static int sk_diag_fill(struct sock *sk, struct  
> sk_buff *skb, struct unix_diag_r
>   	    sk_diag_dump_uid(sk, skb, user_ns))
>   		goto out_nlmsg_trim;

> +#ifdef CONFIG_BPF_SYSCALL
> +	if (sock_map_diag_dump(sk, skb, UNIX_DIAG_BPF_MAP) < 0)
> +		goto out_nlmsg_trim;
> +#endif
> +
>   	nlmsg_end(skb, nlh);
>   	return 0;

> --
> 2.34.1


  reply	other threads:[~2023-03-14 17:58 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-03-13  4:16 [Patch net-next v2] sock_map: dump socket map id via diag Cong Wang
2023-03-14 17:58 ` Stanislav Fomichev [this message]
2023-03-15  6:14 ` John Fastabend

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ZBC1y4tuZgcQ1YZz@google.com \
    --to=sdf@google.com \
    --cc=bpf@vger.kernel.org \
    --cc=cong.wang@bytedance.com \
    --cc=jakub@cloudflare.com \
    --cc=john.fastabend@gmail.com \
    --cc=netdev@vger.kernel.org \
    --cc=xiyou.wangcong@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.