netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next v2] tcp: Dump bound-only sockets in inet_diag.
@ 2023-11-23 23:11 Guillaume Nault
  2023-11-25  1:39 ` Kuniyuki Iwashima
  2023-11-28 10:14 ` Eric Dumazet
  0 siblings, 2 replies; 9+ messages in thread
From: Guillaume Nault @ 2023-11-23 23:11 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski, Paolo Abeni, Eric Dumazet
  Cc: netdev, David Ahern, Kuniyuki Iwashima, Michal Kubecek

Walk the hashinfo->bhash2 table so that inet_diag can dump TCP sockets
that are bound but haven't yet called connect() or listen().

This allows ss to dump bound-only TCP sockets, together with listening
sockets (as there's no specific state for bound-only sockets). This is
similar to the UDP behaviour for which bound-only sockets are already
dumped by ss -lu.

The code is inspired by the ->lhash2 loop. However there's no manual
test of the source port, since this kind of filtering is already
handled by inet_diag_bc_sk(). Also, a maximum of 16 sockets are dumped
at a time, to avoid running with bh disabled for too long.

No change is needed for ss. With an IPv4, an IPv6 and an IPv6-only
socket, bound respectively to 40000, 64000, 60000, the result is:

  $ ss -lt
  State  Recv-Q Send-Q Local Address:Port  Peer Address:PortProcess
  UNCONN 0      0            0.0.0.0:40000      0.0.0.0:*
  UNCONN 0      0               [::]:60000         [::]:*
  UNCONN 0      0                  *:64000            *:*

Signed-off-by: Guillaume Nault <gnault@redhat.com>
---

v2:
  * Use ->bhash2 instead of ->bhash (Kuniyuki Iwashima).
  * Process no more than 16 sockets at a time (Kuniyuki Iwashima).

 net/ipv4/inet_diag.c | 88 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 87 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 7d0e7aaa71e0..d7fb6a625cb7 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -1077,10 +1077,96 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 		s_i = num = s_num = 0;
 	}
 
+/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets
+ * with bh disabled.
+ */
+#define SKARR_SZ 16
+
+	/* Dump bound-only sockets */
+	if (cb->args[0] == 1) {
+		if (!(idiag_states & TCPF_CLOSE))
+			goto skip_bind_ht;
+
+		for (i = s_i; i < hashinfo->bhash_size; i++) {
+			struct inet_bind_hashbucket *ibb;
+			struct inet_bind2_bucket *tb2;
+			struct sock *sk_arr[SKARR_SZ];
+			int num_arr[SKARR_SZ];
+			int idx, accum, res;
+
+resume_bind_walk:
+			num = 0;
+			accum = 0;
+			ibb = &hashinfo->bhash2[i];
+
+			spin_lock_bh(&ibb->lock);
+			inet_bind_bucket_for_each(tb2, &ibb->chain) {
+				if (!net_eq(ib2_net(tb2), net))
+					continue;
+
+				sk_for_each_bound_bhash2(sk, &tb2->owners) {
+					struct inet_sock *inet = inet_sk(sk);
+
+					if (num < s_num)
+						goto next_bind;
+
+					if (sk->sk_state != TCP_CLOSE ||
+					    !inet->inet_num)
+						goto next_bind;
+
+					if (r->sdiag_family != AF_UNSPEC &&
+					    r->sdiag_family != sk->sk_family)
+						goto next_bind;
+
+					if (!inet_diag_bc_sk(bc, sk))
+						goto next_bind;
+
+					if (!refcount_inc_not_zero(&sk->sk_refcnt))
+						goto next_bind;
+
+					num_arr[accum] = num;
+					sk_arr[accum] = sk;
+					if (++accum == SKARR_SZ)
+						goto pause_bind_walk;
+next_bind:
+					num++;
+				}
+			}
+pause_bind_walk:
+			spin_unlock_bh(&ibb->lock);
+
+			res = 0;
+			for (idx = 0; idx < accum; idx++) {
+				if (res >= 0) {
+					res = inet_sk_diag_fill(sk_arr[idx],
+								NULL, skb, cb,
+								r, NLM_F_MULTI,
+								net_admin);
+					if (res < 0)
+						num = num_arr[idx];
+				}
+				sock_gen_put(sk_arr[idx]);
+			}
+			if (res < 0)
+				goto done;
+
+			cond_resched();
+
+			if (accum == SKARR_SZ) {
+				s_num = num + 1;
+				goto resume_bind_walk;
+			}
+
+			s_num = 0;
+		}
+skip_bind_ht:
+		cb->args[0] = 2;
+		s_i = num = s_num = 0;
+	}
+
 	if (!(idiag_states & ~TCPF_LISTEN))
 		goto out;
 
-#define SKARR_SZ 16
 	for (i = s_i; i <= hashinfo->ehash_mask; i++) {
 		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
 		spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next v2] tcp: Dump bound-only sockets in inet_diag.
  2023-11-23 23:11 [PATCH net-next v2] tcp: Dump bound-only sockets in inet_diag Guillaume Nault
@ 2023-11-25  1:39 ` Kuniyuki Iwashima
  2023-11-27 17:26   ` Guillaume Nault
  2023-11-28 10:14 ` Eric Dumazet
  1 sibling, 1 reply; 9+ messages in thread
From: Kuniyuki Iwashima @ 2023-11-25  1:39 UTC (permalink / raw)
  To: gnault; +Cc: davem, dsahern, edumazet, kuba, kuniyu, mkubecek, netdev, pabeni

From: Guillaume Nault <gnault@redhat.com>
Date: Fri, 24 Nov 2023 00:11:42 +0100
> Walk the hashinfo->bhash2 table so that inet_diag can dump TCP sockets
> that are bound but haven't yet called connect() or listen().
> 
> This allows ss to dump bound-only TCP sockets, together with listening
> sockets (as there's no specific state for bound-only sockets). This is
> similar to the UDP behaviour for which bound-only sockets are already
> dumped by ss -lu.
> 
> The code is inspired by the ->lhash2 loop. However there's no manual
> test of the source port, since this kind of filtering is already
> handled by inet_diag_bc_sk(). Also, a maximum of 16 sockets are dumped
> at a time, to avoid running with bh disabled for too long.
> 
> No change is needed for ss. With an IPv4, an IPv6 and an IPv6-only
> socket, bound respectively to 40000, 64000, 60000, the result is:
> 
>   $ ss -lt
>   State  Recv-Q Send-Q Local Address:Port  Peer Address:PortProcess
>   UNCONN 0      0            0.0.0.0:40000      0.0.0.0:*
>   UNCONN 0      0               [::]:60000         [::]:*
>   UNCONN 0      0                  *:64000            *:*
> 
> Signed-off-by: Guillaume Nault <gnault@redhat.com>
> ---
> 
> v2:
>   * Use ->bhash2 instead of ->bhash (Kuniyuki Iwashima).
>   * Process no more than 16 sockets at a time (Kuniyuki Iwashima).
> 
>  net/ipv4/inet_diag.c | 88 +++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 87 insertions(+), 1 deletion(-)
> 
> diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
> index 7d0e7aaa71e0..d7fb6a625cb7 100644
> --- a/net/ipv4/inet_diag.c
> +++ b/net/ipv4/inet_diag.c
> @@ -1077,10 +1077,96 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
>  		s_i = num = s_num = 0;
>  	}
>  
> +/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets
> + * with bh disabled.
> + */
> +#define SKARR_SZ 16
> +
> +	/* Dump bound-only sockets */
> +	if (cb->args[0] == 1) {
> +		if (!(idiag_states & TCPF_CLOSE))
> +			goto skip_bind_ht;
> +
> +		for (i = s_i; i < hashinfo->bhash_size; i++) {
> +			struct inet_bind_hashbucket *ibb;
> +			struct inet_bind2_bucket *tb2;
> +			struct sock *sk_arr[SKARR_SZ];
> +			int num_arr[SKARR_SZ];
> +			int idx, accum, res;
> +
> +resume_bind_walk:
> +			num = 0;
> +			accum = 0;
> +			ibb = &hashinfo->bhash2[i];
> +
> +			spin_lock_bh(&ibb->lock);
> +			inet_bind_bucket_for_each(tb2, &ibb->chain) {
> +				if (!net_eq(ib2_net(tb2), net))
> +					continue;
> +
> +				sk_for_each_bound_bhash2(sk, &tb2->owners) {
> +					struct inet_sock *inet = inet_sk(sk);
> +
> +					if (num < s_num)
> +						goto next_bind;
> +
> +					if (sk->sk_state != TCP_CLOSE ||
> +					    !inet->inet_num)
> +						goto next_bind;
> +
> +					if (r->sdiag_family != AF_UNSPEC &&
> +					    r->sdiag_family != sk->sk_family)
> +						goto next_bind;
> +
> +					if (!inet_diag_bc_sk(bc, sk))
> +						goto next_bind;
> +
> +					if (!refcount_inc_not_zero(&sk->sk_refcnt))
> +						goto next_bind;

I guess this is copied from the ehash code below, but could
refcount_inc_not_zero() fail for bhash2 under spin_lock_bh() ?


> +
> +					num_arr[accum] = num;
> +					sk_arr[accum] = sk;
> +					if (++accum == SKARR_SZ)
> +						goto pause_bind_walk;
> +next_bind:
> +					num++;
> +				}
> +			}
> +pause_bind_walk:
> +			spin_unlock_bh(&ibb->lock);
> +
> +			res = 0;
> +			for (idx = 0; idx < accum; idx++) {
> +				if (res >= 0) {
> +					res = inet_sk_diag_fill(sk_arr[idx],
> +								NULL, skb, cb,
> +								r, NLM_F_MULTI,
> +								net_admin);
> +					if (res < 0)
> +						num = num_arr[idx];
> +				}
> +				sock_gen_put(sk_arr[idx]);
> +			}
> +			if (res < 0)
> +				goto done;
> +
> +			cond_resched();
> +
> +			if (accum == SKARR_SZ) {
> +				s_num = num + 1;
> +				goto resume_bind_walk;
> +			}
> +
> +			s_num = 0;
> +		}
> +skip_bind_ht:
> +		cb->args[0] = 2;
> +		s_i = num = s_num = 0;
> +	}
> +
>  	if (!(idiag_states & ~TCPF_LISTEN))
>  		goto out;
>  
> -#define SKARR_SZ 16
>  	for (i = s_i; i <= hashinfo->ehash_mask; i++) {
>  		struct inet_ehash_bucket *head = &hashinfo->ehash[i];
>  		spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
> -- 
> 2.39.2

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next v2] tcp: Dump bound-only sockets in inet_diag.
  2023-11-25  1:39 ` Kuniyuki Iwashima
@ 2023-11-27 17:26   ` Guillaume Nault
  2023-11-27 17:56     ` Kuniyuki Iwashima
  0 siblings, 1 reply; 9+ messages in thread
From: Guillaume Nault @ 2023-11-27 17:26 UTC (permalink / raw)
  To: Kuniyuki Iwashima
  Cc: davem, dsahern, edumazet, kuba, mkubecek, netdev, pabeni

On Fri, Nov 24, 2023 at 05:39:42PM -0800, Kuniyuki Iwashima wrote:
> > +			spin_lock_bh(&ibb->lock);
> > +			inet_bind_bucket_for_each(tb2, &ibb->chain) {
> > +				if (!net_eq(ib2_net(tb2), net))
> > +					continue;
> > +
> > +				sk_for_each_bound_bhash2(sk, &tb2->owners) {
> > +					struct inet_sock *inet = inet_sk(sk);
> > +
> > +					if (num < s_num)
> > +						goto next_bind;
> > +
> > +					if (sk->sk_state != TCP_CLOSE ||
> > +					    !inet->inet_num)
> > +						goto next_bind;
> > +
> > +					if (r->sdiag_family != AF_UNSPEC &&
> > +					    r->sdiag_family != sk->sk_family)
> > +						goto next_bind;
> > +
> > +					if (!inet_diag_bc_sk(bc, sk))
> > +						goto next_bind;
> > +
> > +					if (!refcount_inc_not_zero(&sk->sk_refcnt))
> > +						goto next_bind;
> 
> I guess this is copied from the ehash code below, but could
> refcount_inc_not_zero() fail for bhash2 under spin_lock_bh() ?

My understanding is that it can't fail, but I prefered to keep the test
to be on the safe side.

I can post a v3 using a plain sock_hold(), if you prefer.

> > +
> > +					num_arr[accum] = num;
> > +					sk_arr[accum] = sk;
> > +					if (++accum == SKARR_SZ)
> > +						goto pause_bind_walk;
> > +next_bind:
> > +					num++;
> > +				}
> > +			}


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next v2] tcp: Dump bound-only sockets in inet_diag.
  2023-11-27 17:26   ` Guillaume Nault
@ 2023-11-27 17:56     ` Kuniyuki Iwashima
  2023-11-28  1:13       ` Guillaume Nault
  0 siblings, 1 reply; 9+ messages in thread
From: Kuniyuki Iwashima @ 2023-11-27 17:56 UTC (permalink / raw)
  To: gnault; +Cc: davem, dsahern, edumazet, kuba, kuniyu, mkubecek, netdev, pabeni

From: Guillaume Nault <gnault@redhat.com>
Date: Mon, 27 Nov 2023 18:26:05 +0100
> On Fri, Nov 24, 2023 at 05:39:42PM -0800, Kuniyuki Iwashima wrote:
> > > +			spin_lock_bh(&ibb->lock);
> > > +			inet_bind_bucket_for_each(tb2, &ibb->chain) {
> > > +				if (!net_eq(ib2_net(tb2), net))
> > > +					continue;
> > > +
> > > +				sk_for_each_bound_bhash2(sk, &tb2->owners) {
> > > +					struct inet_sock *inet = inet_sk(sk);
> > > +
> > > +					if (num < s_num)
> > > +						goto next_bind;
> > > +
> > > +					if (sk->sk_state != TCP_CLOSE ||
> > > +					    !inet->inet_num)
> > > +						goto next_bind;
> > > +
> > > +					if (r->sdiag_family != AF_UNSPEC &&
> > > +					    r->sdiag_family != sk->sk_family)
> > > +						goto next_bind;
> > > +
> > > +					if (!inet_diag_bc_sk(bc, sk))
> > > +						goto next_bind;
> > > +
> > > +					if (!refcount_inc_not_zero(&sk->sk_refcnt))
> > > +						goto next_bind;
> > 
> > I guess this is copied from the ehash code below, but could
> > refcount_inc_not_zero() fail for bhash2 under spin_lock_bh() ?
> 
> My understanding is that it can't fail, but I prefered to keep the test
> to be on the safe side.
> 
> I can post a v3 using a plain sock_hold(), if you prefer.

I prefer sock_hold() because refcount_inc_not_zero() implies that it could
fail and is confusing if it never fails.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next v2] tcp: Dump bound-only sockets in inet_diag.
  2023-11-27 17:56     ` Kuniyuki Iwashima
@ 2023-11-28  1:13       ` Guillaume Nault
  0 siblings, 0 replies; 9+ messages in thread
From: Guillaume Nault @ 2023-11-28  1:13 UTC (permalink / raw)
  To: Kuniyuki Iwashima
  Cc: davem, dsahern, edumazet, kuba, mkubecek, netdev, pabeni

On Mon, Nov 27, 2023 at 09:56:43AM -0800, Kuniyuki Iwashima wrote:
> From: Guillaume Nault <gnault@redhat.com>
> Date: Mon, 27 Nov 2023 18:26:05 +0100
> > On Fri, Nov 24, 2023 at 05:39:42PM -0800, Kuniyuki Iwashima wrote:
> > > > +			spin_lock_bh(&ibb->lock);
> > > > +			inet_bind_bucket_for_each(tb2, &ibb->chain) {
> > > > +				if (!net_eq(ib2_net(tb2), net))
> > > > +					continue;
> > > > +
> > > > +				sk_for_each_bound_bhash2(sk, &tb2->owners) {
> > > > +					struct inet_sock *inet = inet_sk(sk);
> > > > +
> > > > +					if (num < s_num)
> > > > +						goto next_bind;
> > > > +
> > > > +					if (sk->sk_state != TCP_CLOSE ||
> > > > +					    !inet->inet_num)
> > > > +						goto next_bind;
> > > > +
> > > > +					if (r->sdiag_family != AF_UNSPEC &&
> > > > +					    r->sdiag_family != sk->sk_family)
> > > > +						goto next_bind;
> > > > +
> > > > +					if (!inet_diag_bc_sk(bc, sk))
> > > > +						goto next_bind;
> > > > +
> > > > +					if (!refcount_inc_not_zero(&sk->sk_refcnt))
> > > > +						goto next_bind;
> > > 
> > > I guess this is copied from the ehash code below, but could
> > > refcount_inc_not_zero() fail for bhash2 under spin_lock_bh() ?
> > 
> > My understanding is that it can't fail, but I prefered to keep the test
> > to be on the safe side.
> > 
> > I can post a v3 using a plain sock_hold(), if you prefer.
> 
> I prefer sock_hold() because refcount_inc_not_zero() implies that it could
> fail and is confusing if it never fails.

Ok, I'll do that for v3.


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next v2] tcp: Dump bound-only sockets in inet_diag.
  2023-11-23 23:11 [PATCH net-next v2] tcp: Dump bound-only sockets in inet_diag Guillaume Nault
  2023-11-25  1:39 ` Kuniyuki Iwashima
@ 2023-11-28 10:14 ` Eric Dumazet
  2023-11-28 22:18   ` Guillaume Nault
  1 sibling, 1 reply; 9+ messages in thread
From: Eric Dumazet @ 2023-11-28 10:14 UTC (permalink / raw)
  To: Guillaume Nault
  Cc: David Miller, Jakub Kicinski, Paolo Abeni, netdev, David Ahern,
	Kuniyuki Iwashima, Michal Kubecek

On Fri, Nov 24, 2023 at 12:11 AM Guillaume Nault <gnault@redhat.com> wrote:
>
> Walk the hashinfo->bhash2 table so that inet_diag can dump TCP sockets
> that are bound but haven't yet called connect() or listen().
>
> This allows ss to dump bound-only TCP sockets, together with listening
> sockets (as there's no specific state for bound-only sockets). This is
> similar to the UDP behaviour for which bound-only sockets are already
> dumped by ss -lu.
>
> The code is inspired by the ->lhash2 loop. However there's no manual
> test of the source port, since this kind of filtering is already
> handled by inet_diag_bc_sk(). Also, a maximum of 16 sockets are dumped
> at a time, to avoid running with bh disabled for too long.
>
> No change is needed for ss. With an IPv4, an IPv6 and an IPv6-only
> socket, bound respectively to 40000, 64000, 60000, the result is:
>
>   $ ss -lt
>   State  Recv-Q Send-Q Local Address:Port  Peer Address:PortProcess
>   UNCONN 0      0            0.0.0.0:40000      0.0.0.0:*
>   UNCONN 0      0               [::]:60000         [::]:*
>   UNCONN 0      0                  *:64000            *:*


Hmm...   "ss -l" is supposed to only list listening sockets.

So this change might confuse some users ?

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next v2] tcp: Dump bound-only sockets in inet_diag.
  2023-11-28 10:14 ` Eric Dumazet
@ 2023-11-28 22:18   ` Guillaume Nault
  2023-11-29 15:39     ` Eric Dumazet
  0 siblings, 1 reply; 9+ messages in thread
From: Guillaume Nault @ 2023-11-28 22:18 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Jakub Kicinski, Paolo Abeni, netdev, David Ahern,
	Kuniyuki Iwashima, Michal Kubecek

On Tue, Nov 28, 2023 at 11:14:28AM +0100, Eric Dumazet wrote:
> On Fri, Nov 24, 2023 at 12:11 AM Guillaume Nault <gnault@redhat.com> wrote:
> >
> > Walk the hashinfo->bhash2 table so that inet_diag can dump TCP sockets
> > that are bound but haven't yet called connect() or listen().
> >
> > This allows ss to dump bound-only TCP sockets, together with listening
> > sockets (as there's no specific state for bound-only sockets). This is
> > similar to the UDP behaviour for which bound-only sockets are already
> > dumped by ss -lu.
> >
> > The code is inspired by the ->lhash2 loop. However there's no manual
> > test of the source port, since this kind of filtering is already
> > handled by inet_diag_bc_sk(). Also, a maximum of 16 sockets are dumped
> > at a time, to avoid running with bh disabled for too long.
> >
> > No change is needed for ss. With an IPv4, an IPv6 and an IPv6-only
> > socket, bound respectively to 40000, 64000, 60000, the result is:
> >
> >   $ ss -lt
> >   State  Recv-Q Send-Q Local Address:Port  Peer Address:PortProcess
> >   UNCONN 0      0            0.0.0.0:40000      0.0.0.0:*
> >   UNCONN 0      0               [::]:60000         [::]:*
> >   UNCONN 0      0                  *:64000            *:*
> 
> 
> Hmm...   "ss -l" is supposed to only list listening sockets.
> 
> So this change might confuse some users ?
> 

On the other hand I can't find a more sensible solution. The problem is
that "ss -l" sets both the TCPF_LISTEN and the TCPF_CLOSE flags. And
since we don't have a way to express "bound but not yet listening"
sockets, these sockets fall into the CLOSE category. So we're really
just returning what ss asked for.

If we can't rely on TCPF_CLOSE, then I don't see what kind of filter we
could use to request a dump of these TCP sockets. Using "-a" doesn't
help as it just sets all the TCPF_* flags (appart from
TCPF_NEW_SYN_RECV). Adding a new option wouldn't help either as we
couldn't map it to any of the TCPF_* flags. In any case, we still need
to rely on TCPF_CLOSE.

So maybe we can just improve the ss man page for "-l" and explain that
it also lists closed sockets, which includes the bound-only ones
(this is already true for non-TCP sockets anyway). We could also tell
the user to run "ss state listening" for getting listening sockets
exclusively (or we could implement a new option, like "-L", to make
that shorter if necessary).

What do you think?


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next v2] tcp: Dump bound-only sockets in inet_diag.
  2023-11-28 22:18   ` Guillaume Nault
@ 2023-11-29 15:39     ` Eric Dumazet
  2023-11-29 17:52       ` Guillaume Nault
  0 siblings, 1 reply; 9+ messages in thread
From: Eric Dumazet @ 2023-11-29 15:39 UTC (permalink / raw)
  To: Guillaume Nault
  Cc: David Miller, Jakub Kicinski, Paolo Abeni, netdev, David Ahern,
	Kuniyuki Iwashima, Michal Kubecek

On Tue, Nov 28, 2023 at 11:18 PM Guillaume Nault <gnault@redhat.com> wrote:
>
> On Tue, Nov 28, 2023 at 11:14:28AM +0100, Eric Dumazet wrote:
> > On Fri, Nov 24, 2023 at 12:11 AM Guillaume Nault <gnault@redhat.com> wrote:
> > >
> > > Walk the hashinfo->bhash2 table so that inet_diag can dump TCP sockets
> > > that are bound but haven't yet called connect() or listen().
> > >
> > > This allows ss to dump bound-only TCP sockets, together with listening
> > > sockets (as there's no specific state for bound-only sockets). This is
> > > similar to the UDP behaviour for which bound-only sockets are already
> > > dumped by ss -lu.
> > >
> > > The code is inspired by the ->lhash2 loop. However there's no manual
> > > test of the source port, since this kind of filtering is already
> > > handled by inet_diag_bc_sk(). Also, a maximum of 16 sockets are dumped
> > > at a time, to avoid running with bh disabled for too long.
> > >
> > > No change is needed for ss. With an IPv4, an IPv6 and an IPv6-only
> > > socket, bound respectively to 40000, 64000, 60000, the result is:
> > >
> > >   $ ss -lt
> > >   State  Recv-Q Send-Q Local Address:Port  Peer Address:PortProcess
> > >   UNCONN 0      0            0.0.0.0:40000      0.0.0.0:*
> > >   UNCONN 0      0               [::]:60000         [::]:*
> > >   UNCONN 0      0                  *:64000            *:*
> >
> >
> > Hmm...   "ss -l" is supposed to only list listening sockets.
> >
> > So this change might confuse some users ?
> >
>
> On the other hand I can't find a more sensible solution. The problem is
> that "ss -l" sets both the TCPF_LISTEN and the TCPF_CLOSE flags. And
> since we don't have a way to express "bound but not yet listening"
> sockets, these sockets fall into the CLOSE category. So we're really
> just returning what ss asked for.
>
> If we can't rely on TCPF_CLOSE, then I don't see what kind of filter we
> could use to request a dump of these TCP sockets. Using "-a" doesn't
> help as it just sets all the TCPF_* flags (appart from
> TCPF_NEW_SYN_RECV). Adding a new option wouldn't help either as we
> couldn't map it to any of the TCPF_* flags. In any case, we still need
> to rely on TCPF_CLOSE.
>
> So maybe we can just improve the ss man page for "-l" and explain that
> it also lists closed sockets, which includes the bound-only ones
> (this is already true for non-TCP sockets anyway). We could also tell
> the user to run "ss state listening" for getting listening sockets
> exclusively (or we could implement a new option, like "-L", to make
> that shorter if necessary).

This exists already : ss -t state LISTENING

>
> What do you think?
>

We might need a new bit in r->idiag_state (we have a lot of free bits
there), different from
the combination used by "ss -l"  which unfortunately used ( (1 <<
SS_LISTEN) | (1 << SS_CLOSE) ) )

"ss -t state bound" (or ss -tB ???)  would then set this new bit ( 1
<< SS_BOUND) and the kernel would handle this pseudo state ?

(mapped to CLOSED and in bhash2)


diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h
index cc00118acca1b695a534bd73984b9d1f1794db25..97238c0f64aa6643cf492a856e8d67ddcca1a729
100644
--- a/include/net/tcp_states.h
+++ b/include/net/tcp_states.h
@@ -22,6 +22,7 @@ enum {
        TCP_LISTEN,
        TCP_CLOSING,    /* Now a valid state */
        TCP_NEW_SYN_RECV,
+       TCP_BOUND,

        TCP_MAX_STATES  /* Leave at the end! */
 };
@@ -43,6 +44,7 @@ enum {
        TCPF_LISTEN      = (1 << TCP_LISTEN),
        TCPF_CLOSING     = (1 << TCP_CLOSING),
        TCPF_NEW_SYN_RECV = (1 << TCP_NEW_SYN_RECV),
+       TCPF_BOUND       = (1 << TCP_BOUND),
 };

 #endif /* _LINUX_TCP_STATES_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a62423360a5681525496f8840bfe1d37ea3dc504..c3d22edb9e9563672356750153167884c92eae67
100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6570,6 +6570,7 @@ enum {
        BPF_TCP_LISTEN,
        BPF_TCP_CLOSING,        /* Now a valid state */
        BPF_TCP_NEW_SYN_RECV,
+       BPF_TCP_BOUND,

        BPF_TCP_MAX_STATES      /* Leave at the end! */
 };

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next v2] tcp: Dump bound-only sockets in inet_diag.
  2023-11-29 15:39     ` Eric Dumazet
@ 2023-11-29 17:52       ` Guillaume Nault
  0 siblings, 0 replies; 9+ messages in thread
From: Guillaume Nault @ 2023-11-29 17:52 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Jakub Kicinski, Paolo Abeni, netdev, David Ahern,
	Kuniyuki Iwashima, Michal Kubecek

On Wed, Nov 29, 2023 at 04:39:55PM +0100, Eric Dumazet wrote:
> On Tue, Nov 28, 2023 at 11:18 PM Guillaume Nault <gnault@redhat.com> wrote:
> >
> > On Tue, Nov 28, 2023 at 11:14:28AM +0100, Eric Dumazet wrote:
> > > On Fri, Nov 24, 2023 at 12:11 AM Guillaume Nault <gnault@redhat.com> wrote:
> > > >
> > > > Walk the hashinfo->bhash2 table so that inet_diag can dump TCP sockets
> > > > that are bound but haven't yet called connect() or listen().
> > > >
> > > > This allows ss to dump bound-only TCP sockets, together with listening
> > > > sockets (as there's no specific state for bound-only sockets). This is
> > > > similar to the UDP behaviour for which bound-only sockets are already
> > > > dumped by ss -lu.
> > > >
> > > > The code is inspired by the ->lhash2 loop. However there's no manual
> > > > test of the source port, since this kind of filtering is already
> > > > handled by inet_diag_bc_sk(). Also, a maximum of 16 sockets are dumped
> > > > at a time, to avoid running with bh disabled for too long.
> > > >
> > > > No change is needed for ss. With an IPv4, an IPv6 and an IPv6-only
> > > > socket, bound respectively to 40000, 64000, 60000, the result is:
> > > >
> > > >   $ ss -lt
> > > >   State  Recv-Q Send-Q Local Address:Port  Peer Address:PortProcess
> > > >   UNCONN 0      0            0.0.0.0:40000      0.0.0.0:*
> > > >   UNCONN 0      0               [::]:60000         [::]:*
> > > >   UNCONN 0      0                  *:64000            *:*
> > >
> > >
> > > Hmm...   "ss -l" is supposed to only list listening sockets.
> > >
> > > So this change might confuse some users ?
> > >
> >
> > On the other hand I can't find a more sensible solution. The problem is
> > that "ss -l" sets both the TCPF_LISTEN and the TCPF_CLOSE flags. And
> > since we don't have a way to express "bound but not yet listening"
> > sockets, these sockets fall into the CLOSE category. So we're really
> > just returning what ss asked for.
> >
> > If we can't rely on TCPF_CLOSE, then I don't see what kind of filter we
> > could use to request a dump of these TCP sockets. Using "-a" doesn't
> > help as it just sets all the TCPF_* flags (appart from
> > TCPF_NEW_SYN_RECV). Adding a new option wouldn't help either as we
> > couldn't map it to any of the TCPF_* flags. In any case, we still need
> > to rely on TCPF_CLOSE.
> >
> > So maybe we can just improve the ss man page for "-l" and explain that
> > it also lists closed sockets, which includes the bound-only ones
> > (this is already true for non-TCP sockets anyway). We could also tell
> > the user to run "ss state listening" for getting listening sockets
> > exclusively (or we could implement a new option, like "-L", to make
> > that shorter if necessary).
> 
> This exists already : ss -t state LISTENING
> 
> >
> > What do you think?
> >
> 
> We might need a new bit in r->idiag_state (we have a lot of free bits
> there), different from
> the combination used by "ss -l"  which unfortunately used ( (1 <<
> SS_LISTEN) | (1 << SS_CLOSE) ) )
> 
> "ss -t state bound" (or ss -tB ???)  would then set this new bit ( 1
> << SS_BOUND) and the kernel would handle this pseudo state ?
> 
> (mapped to CLOSED and in bhash2)

I'll do that in v3. Thanks.

> diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h
> index cc00118acca1b695a534bd73984b9d1f1794db25..97238c0f64aa6643cf492a856e8d67ddcca1a729
> 100644
> --- a/include/net/tcp_states.h
> +++ b/include/net/tcp_states.h
> @@ -22,6 +22,7 @@ enum {
>         TCP_LISTEN,
>         TCP_CLOSING,    /* Now a valid state */
>         TCP_NEW_SYN_RECV,
> +       TCP_BOUND,
> 
>         TCP_MAX_STATES  /* Leave at the end! */
>  };
> @@ -43,6 +44,7 @@ enum {
>         TCPF_LISTEN      = (1 << TCP_LISTEN),
>         TCPF_CLOSING     = (1 << TCP_CLOSING),
>         TCPF_NEW_SYN_RECV = (1 << TCP_NEW_SYN_RECV),
> +       TCPF_BOUND       = (1 << TCP_BOUND),
>  };
> 
>  #endif /* _LINUX_TCP_STATES_H */
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index a62423360a5681525496f8840bfe1d37ea3dc504..c3d22edb9e9563672356750153167884c92eae67
> 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -6570,6 +6570,7 @@ enum {
>         BPF_TCP_LISTEN,
>         BPF_TCP_CLOSING,        /* Now a valid state */
>         BPF_TCP_NEW_SYN_RECV,
> +       BPF_TCP_BOUND,
> 
>         BPF_TCP_MAX_STATES      /* Leave at the end! */
>  };
> 


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2023-11-29 17:53 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-11-23 23:11 [PATCH net-next v2] tcp: Dump bound-only sockets in inet_diag Guillaume Nault
2023-11-25  1:39 ` Kuniyuki Iwashima
2023-11-27 17:26   ` Guillaume Nault
2023-11-27 17:56     ` Kuniyuki Iwashima
2023-11-28  1:13       ` Guillaume Nault
2023-11-28 10:14 ` Eric Dumazet
2023-11-28 22:18   ` Guillaume Nault
2023-11-29 15:39     ` Eric Dumazet
2023-11-29 17:52       ` Guillaume Nault

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).