* [PATCH net v3 2/2] net/smc: Use correct buffer sizes when switching between TCP and SMC
2023-08-04 17:06 [PATCH net v3 0/2] net/smc: Fix effective buffer size Gerd Bayer
@ 2023-08-04 17:06 ` Gerd Bayer
0 siblings, 0 replies; 2+ messages in thread
From: Gerd Bayer @ 2023-08-04 17:06 UTC (permalink / raw)
To: Wenjia Zhang, Jan Karcher, Tony Lu, Paolo Abeni
Cc: Karsten Graul, D . Wythe, Wen Gu, David S . Miller, Eric Dumazet,
Jakub Kicinski, linux-s390, netdev, linux-kernel
Tuning of the effective buffer size through setsockopts was working for
SMC traffic only but not for TCP fall-back connections even before
commit 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and
make them tunable"). That change made it apparent that TCP fall-back
connections would use net.smc.[rw]mem as buffer size instead of
net.ipv4_tcp_[rw]mem.
Amend the code that copies attributes between the (TCP) clcsock and the
SMC socket and adjust buffer sizes appropriately:
- Copy over sk_userlocks so that both sockets agree on whether tuning
via setsockopt is active.
- When falling back to TCP use sk_sndbuf or sk_rcvbuf as specified with
setsockopt. Otherwise, use the sysctl value for TCP/IPv4.
- Likewise, use either values from setsockopt or from sysctl for SMC
(duplicated) on successful SMC connect.
In smc_tcp_listen_work() drop the explicit copy of buffer sizes as that
is taken care of by the attribute copy.
Fixes: 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and make them tunable")
Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
Signed-off-by: Gerd Bayer <gbayer@linux.ibm.com>
---
net/smc/af_smc.c | 73 +++++++++++++++++++++++++++++++++---------------
1 file changed, 51 insertions(+), 22 deletions(-)
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 5b878e523abf..f5834af5fad5 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -436,13 +436,60 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
return rc;
}
+/* copy only relevant settings and flags of SOL_SOCKET level from smc to
+ * clc socket (since smc is not called for these options from net/core)
+ */
+
+#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
+ (1UL << SOCK_KEEPOPEN) | \
+ (1UL << SOCK_LINGER) | \
+ (1UL << SOCK_BROADCAST) | \
+ (1UL << SOCK_TIMESTAMP) | \
+ (1UL << SOCK_DBG) | \
+ (1UL << SOCK_RCVTSTAMP) | \
+ (1UL << SOCK_RCVTSTAMPNS) | \
+ (1UL << SOCK_LOCALROUTE) | \
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
+ (1UL << SOCK_RXQ_OVFL) | \
+ (1UL << SOCK_WIFI_STATUS) | \
+ (1UL << SOCK_NOFCS) | \
+ (1UL << SOCK_FILTER_LOCKED) | \
+ (1UL << SOCK_TSTAMP_NEW))
+
+/* if set, use value set by setsockopt() - else use IPv4 or SMC sysctl value */
+static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk,
+ unsigned long mask)
+{
+ struct net *nnet = sock_net(nsk);
+
+ nsk->sk_userlocks = osk->sk_userlocks;
+ if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) {
+ nsk->sk_sndbuf = osk->sk_sndbuf;
+ } else {
+ if (mask == SK_FLAGS_SMC_TO_CLC)
+ WRITE_ONCE(nsk->sk_sndbuf,
+ READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1]));
+ else
+ WRITE_ONCE(nsk->sk_sndbuf,
+ 2 * READ_ONCE(nnet->smc.sysctl_wmem));
+ }
+ if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) {
+ nsk->sk_rcvbuf = osk->sk_rcvbuf;
+ } else {
+ if (mask == SK_FLAGS_SMC_TO_CLC)
+ WRITE_ONCE(nsk->sk_rcvbuf,
+ READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1]));
+ else
+ WRITE_ONCE(nsk->sk_rcvbuf,
+ 2 * READ_ONCE(nnet->smc.sysctl_rmem));
+ }
+}
+
static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
unsigned long mask)
{
/* options we don't get control via setsockopt for */
nsk->sk_type = osk->sk_type;
- nsk->sk_sndbuf = osk->sk_sndbuf;
- nsk->sk_rcvbuf = osk->sk_rcvbuf;
nsk->sk_sndtimeo = osk->sk_sndtimeo;
nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
nsk->sk_mark = READ_ONCE(osk->sk_mark);
@@ -453,26 +500,10 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
nsk->sk_flags &= ~mask;
nsk->sk_flags |= osk->sk_flags & mask;
+
+ smc_adjust_sock_bufsizes(nsk, osk, mask);
}
-#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
- (1UL << SOCK_KEEPOPEN) | \
- (1UL << SOCK_LINGER) | \
- (1UL << SOCK_BROADCAST) | \
- (1UL << SOCK_TIMESTAMP) | \
- (1UL << SOCK_DBG) | \
- (1UL << SOCK_RCVTSTAMP) | \
- (1UL << SOCK_RCVTSTAMPNS) | \
- (1UL << SOCK_LOCALROUTE) | \
- (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
- (1UL << SOCK_RXQ_OVFL) | \
- (1UL << SOCK_WIFI_STATUS) | \
- (1UL << SOCK_NOFCS) | \
- (1UL << SOCK_FILTER_LOCKED) | \
- (1UL << SOCK_TSTAMP_NEW))
-/* copy only relevant settings and flags of SOL_SOCKET level from smc to
- * clc socket (since smc is not called for these options from net/core)
- */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
@@ -2479,8 +2510,6 @@ static void smc_tcp_listen_work(struct work_struct *work)
sock_hold(lsk); /* sock_put in smc_listen_work */
INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
smc_copy_sock_settings_to_smc(new_smc);
- new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
- new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
sock_hold(&new_smc->sk); /* sock_put in passive closing */
if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work))
sock_put(&new_smc->sk);
--
2.41.0
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH net v3 2/2] net/smc: Use correct buffer sizes when switching between TCP and SMC
@ 2024-05-30 9:20 Wen Gu
0 siblings, 0 replies; 2+ messages in thread
From: Wen Gu @ 2024-05-30 9:20 UTC (permalink / raw)
To: Gerd Bayer, Wenjia Zhang, Jan Karcher
Cc: linux-s390@vger.kernel.org, netdev@vger.kernel.org, LKML
[-- Attachment #1: Type: text/plain, Size: 5283 bytes --]
> Tuning of the effective buffer size through setsockopts was working for
> SMC traffic only but not for TCP fall-back connections even before
> commit 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and
> make them tunable"). That change made it apparent that TCP fall-back
> connections would use net.smc.[rw]mem as buffer size instead of
> net.ipv4_tcp_[rw]mem.
>
> Amend the code that copies attributes between the (TCP) clcsock and the
> SMC socket and adjust buffer sizes appropriately:
> - Copy over sk_userlocks so that both sockets agree on whether tuning
> via setsockopt is active.
> - When falling back to TCP use sk_sndbuf or sk_rcvbuf as specified with
> setsockopt. Otherwise, use the sysctl value for TCP/IPv4.
> - Likewise, use either values from setsockopt or from sysctl for SMC
> (duplicated) on successful SMC connect.
>
> In smc_tcp_listen_work() drop the explicit copy of buffer sizes as that
> is taken care of by the attribute copy.
[...]
> +/* if set, use value set by setsockopt() - else use IPv4 or SMC sysctl value */
> +static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk,
> + unsigned long mask)
> +{
> + struct net *nnet = sock_net(nsk);
> +
> + nsk->sk_userlocks = osk->sk_userlocks;
> + if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) {
> + nsk->sk_sndbuf = osk->sk_sndbuf;
> + } else {
> + if (mask == SK_FLAGS_SMC_TO_CLC)
> + WRITE_ONCE(nsk->sk_sndbuf,
> + READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1]));
Hi Gerd,
I noticed that during TCP connection establishment, tcp_sndbuf_expand()
will tune sk->sk_sndbuf, that causes clcsock's sk_sndbuf to no longer
be sysctl_tcp_wmem[1]. But here we set it back to sysctl_tcp_wmem[1].
So I did some tests to see if the values of sk_sndbuf and sk_rcvbuf are
as expected in SMC and fallback cases (see the attached server.c and
client.c for the reproducer and here are the sysctl values in my environment)
net.ipv4.tcp_wmem = 4096 4096 16777216
net.ipv4.tcp_rmem = 4096 4096 16777216
net.smc.wmem = 65536
net.smc.rmem = 65536
1. No additional sk_{snd|rcv}buf settings
1.1 TCP
./server
./client -i <serv_ip>
results:
- server: sndbuf_size 87040, rcvbuf_size 4096
- client: sndbuf_size 87040, rcvbuf_size 4096
1.2 SMC
smc_run ./server
smc_run ./client -i <serv_ip>
results:
- server: sndbuf_size 131072, rcvbuf_size 131072
- client: sndbuf_size 131072, rcvbuf_size 131072
1.3 SMC, but server fallback
smc_run ./server
./client -i <serv_ip>
results:
- server: sndbuf_size 87040, rcvbuf_size 4096
- client: sndbuf_size 87040, rcvbuf_size 4096
1.4 SMC, but client fallback
./server
smc_run ./client -i <serv_ip>
results:
- server: sndbuf_size 87040, rcvbuf_size 4096
- client: sndbuf_size 4096, rcvbuf_size 4096 <--- I think clcsock's sk_sndbuf should
be the same as 1.1 after fallback?
2. Set server listen sock's and client sock's sk_{snd|rcv}buf
as 16KB by setsockopt() before connection establishment.
2.1 TCP
./server -s 16384
./client -i <serv_ip> -s 16384
results:
- server: sndbuf_size 32768, rcvbuf_size 32768
- client: sndbuf_size 32768, rcvbuf_size 32768
2.2 SMC
smc_run ./server -s 16384
smc_run ./client -i <serv_ip> -s 16384
results:
- server: sndbuf_size 32768, rcvbuf_size 32768
- client: sndbuf_size 32768, rcvbuf_size 32768
2.3 SMC, but server fallback
smc_run ./server -s 16384
./client -i <serv_ip> -s 16384
results:
- server: sndbuf_size 32768, rcvbuf_size 32768
- client: sndbuf_size 32768, rcvbuf_size 32768
2.4 SMC, but client fallback
./server -s 16384
smc_run ./client -i <serv_ip> -s 16384
results:
- server: sndbuf_size 32768, rcvbuf_size 32768
- client: sndbuf_size 32768, rcvbuf_size 32768
In the above 8 sets of tests, 1.4 does not seem to meet expectations.
It is because we reset clcsock's sk_sndbuf to sysctl_tcp_wmem[1] in
smc_copy_sock_settings_to_clc(). I think it should be like 1.1 TCP values
after fallback. What do you think?
If so, we may need to avoid setting sysctl value to clcsock's sk_sndbuf
in smc_adjust_sock_bufsizes(). Furthermore, maybe all the setting-sysctl-value
can be omitted, since smc sock's and clcsock's sk_{snd|rcv}buf have been
set to sysctl value during their sock initialization (smc_sock_alloc() and
tcp_init_sock()).
And another question is why 1.3 is as expected? The direct cause is that
server does not call smc_copy_sock_settings_to_clc() when fallback, like the
client smc_connect_fallback() does. But I didn't figure out what is the
reason for the different behavior? Do you have any information? Thanks a lot!
Best regards,
Wen Gu
> + else
> + WRITE_ONCE(nsk->sk_sndbuf,
> + 2 * READ_ONCE(nnet->smc.sysctl_wmem));
> + }
> + if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) {
> + nsk->sk_rcvbuf = osk->sk_rcvbuf;
> + } else {
> + if (mask == SK_FLAGS_SMC_TO_CLC)
> + WRITE_ONCE(nsk->sk_rcvbuf,
> + READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1]));
> + else
> + WRITE_ONCE(nsk->sk_rcvbuf,
> + 2 * READ_ONCE(nnet->smc.sysctl_rmem));
> + }
> +}
> +
[...]
[-- Attachment #2: client.c --]
[-- Type: text/plain, Size: 3094 bytes --]
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdbool.h>
#include <errno.h>
#include <netinet/tcp.h>
#ifndef AF_SMC
#define AF_SMC 43
#endif
#define NET_PROTOCAL AF_INET
#define SERV_IP "11.213.5.33"
#define SERV_PORT 10012
char *ip;
int net_clnt(int buf_size, int port)
{
int sndbuf_size, rcvbuf_size;
struct sockaddr_in s_addr;
char msg[128] = { 0 };
int optlen = 4;
int sock;
int rc;
if (!port)
port = SERV_PORT;
sock = socket(NET_PROTOCAL, SOCK_STREAM, 0);
if (buf_size) {
sndbuf_size = rcvbuf_size = buf_size;
/* set sndbuf and rcvbuf */
if (setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
&sndbuf_size, sizeof(int))) {
printf("set sndbuf failed\n");
return 0;
}
if (setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
&rcvbuf_size, sizeof(int))) {
printf("set rcvbuf failed\n");
return 0;
}
}
memset(&s_addr, 0, sizeof(s_addr));
s_addr.sin_family = NET_PROTOCAL;
if (ip)
s_addr.sin_addr.s_addr = inet_addr(ip);
else
s_addr.sin_addr.s_addr = inet_addr(SERV_IP);
s_addr.sin_port = htons(port);
if (connect(sock, (struct sockaddr*)&s_addr, sizeof(s_addr))){
printf("connect fail\n");
return 0;
}
sndbuf_size = 0; rcvbuf_size = 0;
getsockopt(sock, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, &optlen);
getsockopt(sock, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, &optlen);
printf("client: sndbuf_size %d, rcvbuf_size %d\n", sndbuf_size, rcvbuf_size);
recv(sock, msg, sizeof(msg), 0);
printf("get msg: %s\n", msg);
send(sock, "Response", sizeof("Response"), MSG_NOSIGNAL);
close(sock);
}
int main(int argc, char **argv){
bool wrong_param = false;
int buf_size = 0, port = 0;
int c;
while(!wrong_param &&
(-1 != (c = getopt(argc, argv, "p:s:i:")))) {
switch (c) {
case 's':
buf_size = atoi(optarg);
break;
case 'i':
ip = strdup(optarg);
break;
case 'p':
port = atoi(optarg);
break;
case '?':
printf("usage: ./client -s <bufsize> -i <ip> -p <port>\n");
wrong_param = true;
break;
}
}
if (!wrong_param)
net_clnt(buf_size, port);
return 0;
}
[-- Attachment #3: server.c --]
[-- Type: text/plain, Size: 3551 bytes --]
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <errno.h>
#include <stdbool.h>
#include <netinet/tcp.h>
#ifndef AF_SMC
#define AF_SMC 43
#endif
#define NET_PROTOCAL AF_INET
#define SERV_IP "0.0.0.0"
#define SERV_PORT 10012
int net_serv(int buf_size, int port)
{
int sndbuf_size, rcvbuf_size;
struct sockaddr_in s_addr;
struct sockaddr_in c_addr;
char msg[128] = "Request";
int l_sock, s_sock;
int optlen = 4;
if (!port)
port = SERV_PORT;
l_sock = socket(NET_PROTOCAL, SOCK_STREAM, 0);
if (buf_size) {
sndbuf_size = rcvbuf_size = buf_size;
/* set sndbuf and rcvbuf */
if (setsockopt(l_sock, SOL_SOCKET, SO_SNDBUF,
&sndbuf_size, sizeof(int))) {
printf("set sndbuf failed\n");
return 0;
}
if (setsockopt(l_sock, SOL_SOCKET, SO_RCVBUF,
&rcvbuf_size, sizeof(int))) {
printf("set rcvbuf failed\n", rcvbuf_size);
return 0;
}
}
memset(&s_addr, 0, sizeof(struct sockaddr_in));
s_addr.sin_family = NET_PROTOCAL;
s_addr.sin_addr.s_addr = inet_addr(SERV_IP);
s_addr.sin_port = htons(port);
if (bind(l_sock, (struct sockaddr*)&s_addr, sizeof(s_addr))) {
printf("bind listen socket error %d\n", errno);
return 0;
}
if (listen(l_sock, 20)) {
printf("listen error\n");
return 0;
}
socklen_t c_addr_len = sizeof(c_addr);
s_sock = accept(l_sock, (struct sockaddr*)&c_addr,
&c_addr_len);
if (s_sock < 0) {
printf("accept fail\n");
return 0;
} else {
char ip[16] = { 0 };
inet_ntop(NET_PROTOCAL, &(c_addr.sin_addr), ip, INET_ADDRSTRLEN);
printf("accept connection: ip %s port %d\n",
ip, c_addr.sin_port);
}
getsockopt(s_sock, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, &optlen);
getsockopt(s_sock, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, &optlen);
printf("server: sndbuf_size %d, rcvbuf_size %d\n", sndbuf_size, rcvbuf_size);
send(s_sock, "Request", sizeof("Request"), MSG_NOSIGNAL);
recv(s_sock, msg, sizeof(msg), 0);
printf("get msg: %s\n", msg);
close(s_sock);
close(l_sock);
return 0;
}
int main(int argc, char **argv)
{
bool wrong_param = false;
int buf_size = 0, port = 0;
int c;
while(!wrong_param &&
(-1 != (c = getopt(argc, argv, "p:s:")))) {
switch (c) {
case 's':
buf_size = atoi(optarg);
break;
case 'p':
port = atoi(optarg);
break;
case '?':
printf("usage: ./server -s <bufsize> -p <port>\n");
wrong_param = true;
break;
}
}
if (!wrong_param)
net_serv(buf_size, port);
return 0;
}
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2024-05-30 9:20 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-05-30 9:20 [PATCH net v3 2/2] net/smc: Use correct buffer sizes when switching between TCP and SMC Wen Gu
-- strict thread matches above, loose matches on Subject: below --
2023-08-04 17:06 [PATCH net v3 0/2] net/smc: Fix effective buffer size Gerd Bayer
2023-08-04 17:06 ` [PATCH net v3 2/2] net/smc: Use correct buffer sizes when switching between TCP and SMC Gerd Bayer
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox