* [PATCH net-next 1/2] inet: returns a bool from inet_sk_get_local_port_range()
2023-12-14 19:29 [PATCH net-next 0/2] tcp/dccp: refine source port selection Eric Dumazet
@ 2023-12-14 19:29 ` Eric Dumazet
2023-12-15 1:50 ` Kuniyuki Iwashima
2023-12-14 19:29 ` [PATCH net-next 2/2] tcp/dccp: change source port selection at connect() time Eric Dumazet
` (2 subsequent siblings)
3 siblings, 1 reply; 9+ messages in thread
From: Eric Dumazet @ 2023-12-14 19:29 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Jakub Sitnicki, netdev, eric.dumazet, Eric Dumazet
Change inet_sk_get_local_port_range() to return a boolean,
telling the callers if the port range was provided by
IP_LOCAL_PORT_RANGE socket option.
Adds documentation while we are at it.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/ip.h | 2 +-
net/ipv4/inet_connection_sock.c | 21 ++++++++++++++++-----
2 files changed, 17 insertions(+), 6 deletions(-)
diff --git a/include/net/ip.h b/include/net/ip.h
index b31be912489af8b01cc0393a27ffc80b086feaa0..de0c69c57e3cb7485e3d8473bc0b109e4280d2f6 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -356,7 +356,7 @@ static inline void inet_get_local_port_range(const struct net *net, int *low, in
*low = range & 0xffff;
*high = range >> 16;
}
-void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);
+bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);
#ifdef CONFIG_SYSCTL
static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 70be0f6fe879ea671bf6686b04edf32bf5e0d4b6..bd325b029dd12c9fad754ded266ae232ee7ec260 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -117,16 +117,25 @@ bool inet_rcv_saddr_any(const struct sock *sk)
return !sk->sk_rcv_saddr;
}
-void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
+/**
+ * inet_sk_get_local_port_range - fetch ephemeral ports range
+ * @sk: socket
+ * @low: pointer to low port
+ * @high: pointer to high port
+ *
+ * Fetch netns port range (/proc/sys/net/ipv4/ip_local_port_range)
+ * Range can be overridden if socket got IP_LOCAL_PORT_RANGE option.
+ * Returns true if IP_LOCAL_PORT_RANGE was set on this socket.
+ */
+bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
{
- const struct inet_sock *inet = inet_sk(sk);
- const struct net *net = sock_net(sk);
int lo, hi, sk_lo, sk_hi;
+ bool local_range = false;
u32 sk_range;
- inet_get_local_port_range(net, &lo, &hi);
+ inet_get_local_port_range(sock_net(sk), &lo, &hi);
- sk_range = READ_ONCE(inet->local_port_range);
+ sk_range = READ_ONCE(inet_sk(sk)->local_port_range);
if (unlikely(sk_range)) {
sk_lo = sk_range & 0xffff;
sk_hi = sk_range >> 16;
@@ -135,10 +144,12 @@ void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
lo = sk_lo;
if (lo <= sk_hi && sk_hi <= hi)
hi = sk_hi;
+ local_range = true;
}
*low = lo;
*high = hi;
+ return local_range;
}
EXPORT_SYMBOL(inet_sk_get_local_port_range);
--
2.43.0.472.g3155946c3a-goog
^ permalink raw reply related [flat|nested] 9+ messages in thread* Re: [PATCH net-next 1/2] inet: returns a bool from inet_sk_get_local_port_range()
2023-12-14 19:29 ` [PATCH net-next 1/2] inet: returns a bool from inet_sk_get_local_port_range() Eric Dumazet
@ 2023-12-15 1:50 ` Kuniyuki Iwashima
0 siblings, 0 replies; 9+ messages in thread
From: Kuniyuki Iwashima @ 2023-12-15 1:50 UTC (permalink / raw)
To: edumazet; +Cc: davem, eric.dumazet, jakub, kuba, netdev, pabeni, kuniyu
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 14 Dec 2023 19:29:38 +0000
> Change inet_sk_get_local_port_range() to return a boolean,
> telling the callers if the port range was provided by
> IP_LOCAL_PORT_RANGE socket option.
>
> Adds documentation while we are at it.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> ---
> include/net/ip.h | 2 +-
> net/ipv4/inet_connection_sock.c | 21 ++++++++++++++++-----
> 2 files changed, 17 insertions(+), 6 deletions(-)
>
> diff --git a/include/net/ip.h b/include/net/ip.h
> index b31be912489af8b01cc0393a27ffc80b086feaa0..de0c69c57e3cb7485e3d8473bc0b109e4280d2f6 100644
> --- a/include/net/ip.h
> +++ b/include/net/ip.h
> @@ -356,7 +356,7 @@ static inline void inet_get_local_port_range(const struct net *net, int *low, in
> *low = range & 0xffff;
> *high = range >> 16;
> }
> -void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);
> +bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high);
>
> #ifdef CONFIG_SYSCTL
> static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port)
> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index 70be0f6fe879ea671bf6686b04edf32bf5e0d4b6..bd325b029dd12c9fad754ded266ae232ee7ec260 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -117,16 +117,25 @@ bool inet_rcv_saddr_any(const struct sock *sk)
> return !sk->sk_rcv_saddr;
> }
>
> -void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
> +/**
> + * inet_sk_get_local_port_range - fetch ephemeral ports range
> + * @sk: socket
> + * @low: pointer to low port
> + * @high: pointer to high port
> + *
> + * Fetch netns port range (/proc/sys/net/ipv4/ip_local_port_range)
> + * Range can be overridden if socket got IP_LOCAL_PORT_RANGE option.
> + * Returns true if IP_LOCAL_PORT_RANGE was set on this socket.
> + */
> +bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
> {
> - const struct inet_sock *inet = inet_sk(sk);
> - const struct net *net = sock_net(sk);
> int lo, hi, sk_lo, sk_hi;
> + bool local_range = false;
> u32 sk_range;
>
> - inet_get_local_port_range(net, &lo, &hi);
> + inet_get_local_port_range(sock_net(sk), &lo, &hi);
>
> - sk_range = READ_ONCE(inet->local_port_range);
> + sk_range = READ_ONCE(inet_sk(sk)->local_port_range);
> if (unlikely(sk_range)) {
> sk_lo = sk_range & 0xffff;
> sk_hi = sk_range >> 16;
> @@ -135,10 +144,12 @@ void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
> lo = sk_lo;
> if (lo <= sk_hi && sk_hi <= hi)
> hi = sk_hi;
> + local_range = true;
> }
>
> *low = lo;
> *high = hi;
> + return local_range;
> }
> EXPORT_SYMBOL(inet_sk_get_local_port_range);
>
> --
> 2.43.0.472.g3155946c3a-goog
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH net-next 2/2] tcp/dccp: change source port selection at connect() time
2023-12-14 19:29 [PATCH net-next 0/2] tcp/dccp: refine source port selection Eric Dumazet
2023-12-14 19:29 ` [PATCH net-next 1/2] inet: returns a bool from inet_sk_get_local_port_range() Eric Dumazet
@ 2023-12-14 19:29 ` Eric Dumazet
2023-12-15 1:58 ` Kuniyuki Iwashima
2023-12-15 2:26 ` Jason Xing
2023-12-16 2:10 ` [PATCH net-next 0/2] tcp/dccp: refine source port selection patchwork-bot+netdevbpf
2024-01-03 14:17 ` Jakub Sitnicki
3 siblings, 2 replies; 9+ messages in thread
From: Eric Dumazet @ 2023-12-14 19:29 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Jakub Sitnicki, netdev, eric.dumazet, Eric Dumazet
In commit 1580ab63fc9a ("tcp/dccp: better use of ephemeral ports in connect()")
we added a heuristic to select even ports for connect() and odd ports for bind().
This was nice because no application changes were needed.
But it added more costs when all even ports are in use,
when there are few listeners and many active connections.
Since then, IP_LOCAL_PORT_RANGE has been added to permit an application
to partition ephemeral port range at will.
This patch extends the idea so that if IP_LOCAL_PORT_RANGE is set on
a socket before connect(), port selection no longer favors even ports.
This means that connect() can find a suitable source port faster,
and applications can use a different split between connect() and bind()
users.
This should give more entropy to Toeplitz hash used in RSS: Using even
ports was wasting one bit from the 16bit sport.
A similar change can be done in inet_csk_find_open_port() if needed.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
---
net/ipv4/inet_hashtables.c | 27 ++++++++++++++++-----------
1 file changed, 16 insertions(+), 11 deletions(-)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index a532f749e47781cc951f2003f621cec4387a2384..9ff201bc4e6d2da04735e8c160d446602e0adde1 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -1012,7 +1012,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
bool tb_created = false;
u32 remaining, offset;
int ret, i, low, high;
- int l3mdev;
+ bool local_ports;
+ int step, l3mdev;
u32 index;
if (port) {
@@ -1024,10 +1025,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
l3mdev = inet_sk_bound_l3mdev(sk);
- inet_sk_get_local_port_range(sk, &low, &high);
+ local_ports = inet_sk_get_local_port_range(sk, &low, &high);
+ step = local_ports ? 1 : 2;
+
high++; /* [32768, 60999] -> [32768, 61000[ */
remaining = high - low;
- if (likely(remaining > 1))
+ if (!local_ports && remaining > 1)
remaining &= ~1U;
get_random_sleepable_once(table_perturb,
@@ -1040,10 +1043,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
/* In first pass we try ports of @low parity.
* inet_csk_get_port() does the opposite choice.
*/
- offset &= ~1U;
+ if (!local_ports)
+ offset &= ~1U;
other_parity_scan:
port = low + offset;
- for (i = 0; i < remaining; i += 2, port += 2) {
+ for (i = 0; i < remaining; i += step, port += step) {
if (unlikely(port >= high))
port -= remaining;
if (inet_is_local_reserved_port(net, port))
@@ -1083,10 +1087,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
cond_resched();
}
- offset++;
- if ((offset & 1) && remaining > 1)
- goto other_parity_scan;
-
+ if (!local_ports) {
+ offset++;
+ if ((offset & 1) && remaining > 1)
+ goto other_parity_scan;
+ }
return -EADDRNOTAVAIL;
ok:
@@ -1109,8 +1114,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
* on low contention the randomness is maximal and on high contention
* it may be inexistent.
*/
- i = max_t(int, i, get_random_u32_below(8) * 2);
- WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);
+ i = max_t(int, i, get_random_u32_below(8) * step);
+ WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step);
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, tb2, port);
--
2.43.0.472.g3155946c3a-goog
^ permalink raw reply related [flat|nested] 9+ messages in thread* [PATCH net-next 2/2] tcp/dccp: change source port selection at connect() time
2023-12-14 19:29 ` [PATCH net-next 2/2] tcp/dccp: change source port selection at connect() time Eric Dumazet
@ 2023-12-15 1:58 ` Kuniyuki Iwashima
2023-12-15 2:26 ` Jason Xing
1 sibling, 0 replies; 9+ messages in thread
From: Kuniyuki Iwashima @ 2023-12-15 1:58 UTC (permalink / raw)
To: edumazet
Cc: davem, eric.dumazet, jakub, kuba, netdev, pabeni,
Kuniyuki Iwashima
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 14 Dec 2023 19:29:39 +0000
> In commit 1580ab63fc9a ("tcp/dccp: better use of ephemeral ports in connect()")
> we added an heuristic to select even ports for connect() and odd ports for bind().
>
> This was nice because no applications changes were needed.
>
> But it added more costs when all even ports are in use,
> when there are few listeners and many active connections.
>
> Since then, IP_LOCAL_PORT_RANGE has been added to permit an application
> to partition ephemeral port range at will.
>
> This patch extends the idea so that if IP_LOCAL_PORT_RANGE is set on
> a socket before accept(), port selection no longer favors even ports.
>
> This means that connect() can find a suitable source port faster,
> and applications can use a different split between connect() and bind()
> users.
>
> This should give more entropy to Toeplitz hash used in RSS: Using even
> ports was wasting one bit from the 16bit sport.
>
> A similar change can be done in inet_csk_find_open_port() if needed.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
> Cc: Jakub Sitnicki <jakub@cloudflare.com>
> ---
> net/ipv4/inet_hashtables.c | 27 ++++++++++++++++-----------
> 1 file changed, 16 insertions(+), 11 deletions(-)
>
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index a532f749e47781cc951f2003f621cec4387a2384..9ff201bc4e6d2da04735e8c160d446602e0adde1 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -1012,7 +1012,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
> bool tb_created = false;
> u32 remaining, offset;
> int ret, i, low, high;
> - int l3mdev;
> + bool local_ports;
> + int step, l3mdev;
> u32 index;
>
> if (port) {
> @@ -1024,10 +1025,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
>
> l3mdev = inet_sk_bound_l3mdev(sk);
>
> - inet_sk_get_local_port_range(sk, &low, &high);
> + local_ports = inet_sk_get_local_port_range(sk, &low, &high);
> + step = local_ports ? 1 : 2;
> +
> high++; /* [32768, 60999] -> [32768, 61000[ */
> remaining = high - low;
> - if (likely(remaining > 1))
> + if (!local_ports && remaining > 1)
> remaining &= ~1U;
>
> get_random_sleepable_once(table_perturb,
> @@ -1040,10 +1043,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
> /* In first pass we try ports of @low parity.
> * inet_csk_get_port() does the opposite choice.
> */
> - offset &= ~1U;
> + if (!local_ports)
> + offset &= ~1U;
> other_parity_scan:
> port = low + offset;
> - for (i = 0; i < remaining; i += 2, port += 2) {
> + for (i = 0; i < remaining; i += step, port += step) {
> if (unlikely(port >= high))
> port -= remaining;
> if (inet_is_local_reserved_port(net, port))
> @@ -1083,10 +1087,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
> cond_resched();
> }
>
> - offset++;
> - if ((offset & 1) && remaining > 1)
> - goto other_parity_scan;
> -
> + if (!local_ports) {
> + offset++;
> + if ((offset & 1) && remaining > 1)
> + goto other_parity_scan;
> + }
> return -EADDRNOTAVAIL;
>
> ok:
> @@ -1109,8 +1114,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
> * on low contention the randomness is maximal and on high contention
> * it may be inexistent.
> */
> - i = max_t(int, i, get_random_u32_below(8) * 2);
> - WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);
> + i = max_t(int, i, get_random_u32_below(8) * step);
> + WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step);
>
> /* Head lock still held and bh's disabled */
> inet_bind_hash(sk, tb, tb2, port);
> --
> 2.43.0.472.g3155946c3a-goog
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [PATCH net-next 2/2] tcp/dccp: change source port selection at connect() time
2023-12-14 19:29 ` [PATCH net-next 2/2] tcp/dccp: change source port selection at connect() time Eric Dumazet
2023-12-15 1:58 ` Kuniyuki Iwashima
@ 2023-12-15 2:26 ` Jason Xing
1 sibling, 0 replies; 9+ messages in thread
From: Jason Xing @ 2023-12-15 2:26 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Jakub Sitnicki,
netdev, eric.dumazet
On Fri, Dec 15, 2023 at 3:30 AM Eric Dumazet <edumazet@google.com> wrote:
>
> In commit 1580ab63fc9a ("tcp/dccp: better use of ephemeral ports in connect()")
> we added an heuristic to select even ports for connect() and odd ports for bind().
>
> This was nice because no applications changes were needed.
>
[...]
> But it added more costs when all even ports are in use,
> when there are few listeners and many active connections.
Yes, I have encountered this issue several times. Internally, we addressed
it by adding a switch that decides which port selection algorithm the
connect() phase should use, so that it can go back to the original algorithm
from many years ago (the one that does not split the port range).
>
> Since then, IP_LOCAL_PORT_RANGE has been added to permit an application
> to partition ephemeral port range at will.
>
> This patch extends the idea so that if IP_LOCAL_PORT_RANGE is set on
> a socket before accept(), port selection no longer favors even ports.
>
> This means that connect() can find a suitable source port faster,
> and applications can use a different split between connect() and bind()
> users.
Great :)
>
> This should give more entropy to Toeplitz hash used in RSS: Using even
> ports was wasting one bit from the 16bit sport.
>
> A similar change can be done in inet_csk_find_open_port() if needed.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Jakub Sitnicki <jakub@cloudflare.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Thanks!
> ---
> net/ipv4/inet_hashtables.c | 27 ++++++++++++++++-----------
> 1 file changed, 16 insertions(+), 11 deletions(-)
>
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index a532f749e47781cc951f2003f621cec4387a2384..9ff201bc4e6d2da04735e8c160d446602e0adde1 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -1012,7 +1012,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
> bool tb_created = false;
> u32 remaining, offset;
> int ret, i, low, high;
> - int l3mdev;
> + bool local_ports;
> + int step, l3mdev;
> u32 index;
>
> if (port) {
> @@ -1024,10 +1025,12 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
>
> l3mdev = inet_sk_bound_l3mdev(sk);
>
> - inet_sk_get_local_port_range(sk, &low, &high);
> + local_ports = inet_sk_get_local_port_range(sk, &low, &high);
> + step = local_ports ? 1 : 2;
> +
> high++; /* [32768, 60999] -> [32768, 61000[ */
> remaining = high - low;
> - if (likely(remaining > 1))
> + if (!local_ports && remaining > 1)
> remaining &= ~1U;
>
> get_random_sleepable_once(table_perturb,
> @@ -1040,10 +1043,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
> /* In first pass we try ports of @low parity.
> * inet_csk_get_port() does the opposite choice.
> */
> - offset &= ~1U;
> + if (!local_ports)
> + offset &= ~1U;
> other_parity_scan:
> port = low + offset;
> - for (i = 0; i < remaining; i += 2, port += 2) {
> + for (i = 0; i < remaining; i += step, port += step) {
> if (unlikely(port >= high))
> port -= remaining;
> if (inet_is_local_reserved_port(net, port))
> @@ -1083,10 +1087,11 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
> cond_resched();
> }
>
> - offset++;
> - if ((offset & 1) && remaining > 1)
> - goto other_parity_scan;
> -
> + if (!local_ports) {
> + offset++;
> + if ((offset & 1) && remaining > 1)
> + goto other_parity_scan;
> + }
> return -EADDRNOTAVAIL;
>
> ok:
> @@ -1109,8 +1114,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
> * on low contention the randomness is maximal and on high contention
> * it may be inexistent.
> */
> - i = max_t(int, i, get_random_u32_below(8) * 2);
> - WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);
> + i = max_t(int, i, get_random_u32_below(8) * step);
> + WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step);
>
> /* Head lock still held and bh's disabled */
> inet_bind_hash(sk, tb, tb2, port);
> --
> 2.43.0.472.g3155946c3a-goog
>
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH net-next 0/2] tcp/dccp: refine source port selection
2023-12-14 19:29 [PATCH net-next 0/2] tcp/dccp: refine source port selection Eric Dumazet
2023-12-14 19:29 ` [PATCH net-next 1/2] inet: returns a bool from inet_sk_get_local_port_range() Eric Dumazet
2023-12-14 19:29 ` [PATCH net-next 2/2] tcp/dccp: change source port selection at connect() time Eric Dumazet
@ 2023-12-16 2:10 ` patchwork-bot+netdevbpf
2024-01-03 14:17 ` Jakub Sitnicki
3 siblings, 0 replies; 9+ messages in thread
From: patchwork-bot+netdevbpf @ 2023-12-16 2:10 UTC (permalink / raw)
To: Eric Dumazet; +Cc: davem, kuba, pabeni, jakub, netdev, eric.dumazet
Hello:
This series was applied to netdev/net-next.git (main)
by Jakub Kicinski <kuba@kernel.org>:
On Thu, 14 Dec 2023 19:29:37 +0000 you wrote:
> This patch series leverages IP_LOCAL_PORT_RANGE option
> to no longer favor even source port selection at connect() time.
>
> This should lower time taken by connect() for hosts having
> many active connections to the same destination.
>
> Eric Dumazet (2):
> inet: returns a bool from inet_sk_get_local_port_range()
> tcp/dccp: change source port selection at connect() time
>
> [...]
Here is the summary with links:
- [net-next,1/2] inet: returns a bool from inet_sk_get_local_port_range()
https://git.kernel.org/netdev/net-next/c/41db7626b732
- [net-next,2/2] tcp/dccp: change source port selection at connect() time
https://git.kernel.org/netdev/net-next/c/207184853dbd
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [PATCH net-next 0/2] tcp/dccp: refine source port selection
2023-12-14 19:29 [PATCH net-next 0/2] tcp/dccp: refine source port selection Eric Dumazet
` (2 preceding siblings ...)
2023-12-16 2:10 ` [PATCH net-next 0/2] tcp/dccp: refine source port selection patchwork-bot+netdevbpf
@ 2024-01-03 14:17 ` Jakub Sitnicki
2024-01-03 16:48 ` Eric Dumazet
3 siblings, 1 reply; 9+ messages in thread
From: Jakub Sitnicki @ 2024-01-03 14:17 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, netdev,
eric.dumazet, kernel-team
On Thu, Dec 14, 2023 at 07:29 PM GMT, Eric Dumazet wrote:
> This patch series leverages IP_LOCAL_PORT_RANGE option
> to no longer favor even source port selection at connect() time.
>
> This should lower time taken by connect() for hosts having
> many active connections to the same destination.
>
> Eric Dumazet (2):
> inet: returns a bool from inet_sk_get_local_port_range()
> tcp/dccp: change source port selection at connect() time
>
> include/net/ip.h | 2 +-
> net/ipv4/inet_connection_sock.c | 21 ++++++++++++++++-----
> net/ipv4/inet_hashtables.c | 27 ++++++++++++++++-----------
> 3 files changed, 33 insertions(+), 17 deletions(-)
This is great. Thank you.
# sysctl net.ipv4.ip_local_port_range
net.ipv4.ip_local_port_range = 32768 60999
# { sleep 3; stress-ng --sockmany 1 --sockmany-ops 20000; } & \
> /usr/share/bcc/tools/funclatency inet_hash_connect
[1] 240
Tracing 1 functions for "inet_hash_connect"... Hit Ctrl-C to end.
stress-ng: info: [243] defaulting to a 1 day, 0 secs run per stressor
stress-ng: info: [243] dispatching hogs: 1 sockmany
stress-ng: info: [243] skipped: 0
stress-ng: info: [243] passed: 1: sockmany (1)
stress-ng: info: [243] failed: 0
stress-ng: info: [243] metrics untrustworthy: 0
stress-ng: info: [243] successful run completed in 27.60 secs
^C
nsecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 0 | |
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 511 |** |
1024 -> 2047 : 8698 |****************************************|
2048 -> 4095 : 2870 |************* |
4096 -> 8191 : 1471 |****** |
8192 -> 16383 : 389 |* |
16384 -> 32767 : 114 | |
32768 -> 65535 : 43 | |
65536 -> 131071 : 15 | |
131072 -> 262143 : 0 | |
262144 -> 524287 : 1 | |
524288 -> 1048575 : 1 | |
1048576 -> 2097151 : 3 | |
2097152 -> 4194303 : 1609 |******* |
4194304 -> 8388607 : 4272 |******************* |
8388608 -> 16777215 : 4 | |
avg = 1314821 nsecs, total: 26297744706 nsecs, count: 20001
Detaching...
[1]+ Done { sleep 3; stress-ng --sockmany 1 --sockmany-ops 20000; }
# { sleep 3; LD_PRELOAD=./setsockopt_ip_local_port_range.so stress-ng --sockmany 1 --sockmany-ops 20000; } & \
> /usr/share/bcc/tools/funclatency inet_hash_connect
[1] 246
Tracing 1 functions for "inet_hash_connect"... Hit Ctrl-C to end.
stress-ng: info: [249] defaulting to a 1 day, 0 secs run per stressor
stress-ng: info: [249] dispatching hogs: 1 sockmany
stress-ng: info: [249] skipped: 0
stress-ng: info: [249] passed: 1: sockmany (1)
stress-ng: info: [249] failed: 0
stress-ng: info: [249] metrics untrustworthy: 0
stress-ng: info: [249] successful run completed in 1.01 secs
^C
nsecs : count distribution
0 -> 1 : 0 | |
2 -> 3 : 0 | |
4 -> 7 : 0 | |
8 -> 15 : 0 | |
16 -> 31 : 0 | |
32 -> 63 : 0 | |
64 -> 127 : 0 | |
128 -> 255 : 0 | |
256 -> 511 : 0 | |
512 -> 1023 : 2085 |****** |
1024 -> 2047 : 13401 |****************************************|
2048 -> 4095 : 3877 |*********** |
4096 -> 8191 : 561 |* |
8192 -> 16383 : 60 | |
16384 -> 32767 : 16 | |
32768 -> 65535 : 2 | |
avg = 1768 nsecs, total: 35376609 nsecs, count: 20002
Detaching...
[1]+ Done { sleep 3; LD_PRELOAD=./setsockopt_ip_local_port_range.so stress-ng --sockmany 1 --sockmany-ops 20000; }
# cat ./setsockopt_ip_local_port_range.c
#include <dlfcn.h>
#include <errno.h>
#include <linux/in.h>
#include <sys/socket.h>
/*
 * LD_PRELOAD interposer for socket().
 *
 * Creates the socket through the next "socket" symbol in the lookup order
 * and, for AF_INET / AF_INET6 sockets, sets IP_LOCAL_PORT_RANGE on it so the
 * socket carries a non-zero per-socket port range value.
 *
 * Returns the new descriptor, or -1 on failure. errno is ENOSYS when the
 * next "socket" symbol cannot be resolved; otherwise it is whatever the
 * underlying socket() call left behind.
 */
int socket(int domain, int type, int protocol)
{
	int (*socket_fn)(int, int, int) = dlsym(RTLD_NEXT, "socket");
	int fd;

	/* Calling through a NULL pointer would crash the preloaded process. */
	if (!socket_fn) {
		errno = ENOSYS;
		return -1;
	}

	fd = socket_fn(domain, type, protocol);
	if (fd < 0)
		return -1;

	if (domain == AF_INET || domain == AF_INET6) {
		/*
		 * Option value layout: low port in bits 0-15 (0 here), high
		 * port in bits 16-31 (0xffff here).
		 * Best effort: if the option cannot be set, the socket is
		 * still handed back to the caller unchanged.
		 */
		setsockopt(fd, IPPROTO_IP, IP_LOCAL_PORT_RANGE,
			   &(__u32){ 0xffffU << 16 }, sizeof(__u32));
	}

	return fd;
}
#
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [PATCH net-next 0/2] tcp/dccp: refine source port selection
2024-01-03 14:17 ` Jakub Sitnicki
@ 2024-01-03 16:48 ` Eric Dumazet
0 siblings, 0 replies; 9+ messages in thread
From: Eric Dumazet @ 2024-01-03 16:48 UTC (permalink / raw)
To: Jakub Sitnicki
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, netdev,
eric.dumazet, kernel-team
On Wed, Jan 3, 2024 at 3:19 PM Jakub Sitnicki <jakub@cloudflare.com> wrote:
>
> On Thu, Dec 14, 2023 at 07:29 PM GMT, Eric Dumazet wrote:
> > This patch series leverages IP_LOCAL_PORT_RANGE option
> > to no longer favor even source port selection at connect() time.
> >
> > This should lower time taken by connect() for hosts having
> > many active connections to the same destination.
> >
> > Eric Dumazet (2):
> > inet: returns a bool from inet_sk_get_local_port_range()
> > tcp/dccp: change source port selection at connect() time
> >
> > include/net/ip.h | 2 +-
> > net/ipv4/inet_connection_sock.c | 21 ++++++++++++++++-----
> > net/ipv4/inet_hashtables.c | 27 ++++++++++++++++-----------
> > 3 files changed, 33 insertions(+), 17 deletions(-)
>
> This is great. Thank you.
>
> # sysctl net.ipv4.ip_local_port_range
> net.ipv4.ip_local_port_range = 32768 60999
> # { sleep 3; stress-ng --sockmany 1 --sockmany-ops 20000; } & \
> > /usr/share/bcc/tools/funclatency inet_hash_connect
> [1] 240
> Tracing 1 functions for "inet_hash_connect"... Hit Ctrl-C to end.
> stress-ng: info: [243] defaulting to a 1 day, 0 secs run per stressor
> stress-ng: info: [243] dispatching hogs: 1 sockmany
> stress-ng: info: [243] skipped: 0
> stress-ng: info: [243] passed: 1: sockmany (1)
> stress-ng: info: [243] failed: 0
> stress-ng: info: [243] metrics untrustworthy: 0
> stress-ng: info: [243] successful run completed in 27.60 secs
> ^C
> nsecs : count distribution
> 0 -> 1 : 0 | |
> 2 -> 3 : 0 | |
> 4 -> 7 : 0 | |
> 8 -> 15 : 0 | |
> 16 -> 31 : 0 | |
> 32 -> 63 : 0 | |
> 64 -> 127 : 0 | |
> 128 -> 255 : 0 | |
> 256 -> 511 : 0 | |
> 512 -> 1023 : 511 |** |
> 1024 -> 2047 : 8698 |****************************************|
> 2048 -> 4095 : 2870 |************* |
> 4096 -> 8191 : 1471 |****** |
> 8192 -> 16383 : 389 |* |
> 16384 -> 32767 : 114 | |
> 32768 -> 65535 : 43 | |
> 65536 -> 131071 : 15 | |
> 131072 -> 262143 : 0 | |
> 262144 -> 524287 : 1 | |
> 524288 -> 1048575 : 1 | |
> 1048576 -> 2097151 : 3 | |
> 2097152 -> 4194303 : 1609 |******* |
> 4194304 -> 8388607 : 4272 |******************* |
> 8388608 -> 16777215 : 4 | |
>
> avg = 1314821 nsecs, total: 26297744706 nsecs, count: 20001
>
> Detaching...
> [1]+ Done { sleep 3; stress-ng --sockmany 1 --sockmany-ops 20000; }
> # { sleep 3; LD_PRELOAD=./setsockopt_ip_local_port_range.so stress-ng --sockmany 1 --sockmany-ops 20000; } & \
> > /usr/share/bcc/tools/funclatency inet_hash_connect
> [1] 246
> Tracing 1 functions for "inet_hash_connect"... Hit Ctrl-C to end.
> stress-ng: info: [249] defaulting to a 1 day, 0 secs run per stressor
> stress-ng: info: [249] dispatching hogs: 1 sockmany
> stress-ng: info: [249] skipped: 0
> stress-ng: info: [249] passed: 1: sockmany (1)
> stress-ng: info: [249] failed: 0
> stress-ng: info: [249] metrics untrustworthy: 0
> stress-ng: info: [249] successful run completed in 1.01 secs
> ^C
> nsecs : count distribution
> 0 -> 1 : 0 | |
> 2 -> 3 : 0 | |
> 4 -> 7 : 0 | |
> 8 -> 15 : 0 | |
> 16 -> 31 : 0 | |
> 32 -> 63 : 0 | |
> 64 -> 127 : 0 | |
> 128 -> 255 : 0 | |
> 256 -> 511 : 0 | |
> 512 -> 1023 : 2085 |****** |
> 1024 -> 2047 : 13401 |****************************************|
> 2048 -> 4095 : 3877 |*********** |
> 4096 -> 8191 : 561 |* |
> 8192 -> 16383 : 60 | |
> 16384 -> 32767 : 16 | |
> 32768 -> 65535 : 2 | |
>
> avg = 1768 nsecs, total: 35376609 nsecs, count: 20002
>
> Detaching...
> [1]+ Done { sleep 3; LD_PRELOAD=./setsockopt_ip_local_port_range.so stress-ng --sockmany 1 --sockmany-ops 20000; }
> # cat ./setsockopt_ip_local_port_range.c
> #include <dlfcn.h>
> #include <linux/in.h>
> #include <sys/socket.h>
>
> int socket(int domain, int type, int protocol)
> {
> int (*socket_fn)(int, int, int) = dlsym(RTLD_NEXT, "socket");
> int fd;
>
> fd = socket_fn(domain, type, protocol);
> if (fd < 0)
> return -1;
>
> if (domain == AF_INET || domain == AF_INET6) {
> setsockopt(fd, IPPROTO_IP, IP_LOCAL_PORT_RANGE,
> &(__u32){ 0xffffU << 16 }, sizeof(__u32));
> }
>
> return fd;
> }
> #
Nice tests, thanks for them !
^ permalink raw reply [flat|nested] 9+ messages in thread