* [PATCH net-next 1/8] inet: implement lockless IP_MULTICAST_TTL
2023-09-21 13:30 [PATCH net-next 0/8] inet: more data-race fixes Eric Dumazet
@ 2023-09-21 13:30 ` Eric Dumazet
2023-09-21 19:03 ` David Ahern
2023-09-21 13:30 ` [PATCH net-next 2/8] inet: implement lockless IP_MTU_DISCOVER Eric Dumazet
` (6 subsequent siblings)
7 siblings, 1 reply; 18+ messages in thread
From: Eric Dumazet @ 2023-09-21 13:30 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: David Ahern, netdev, eric.dumazet, Eric Dumazet
inet->mc_ttl can be read locklessly.
Implement proper lockless reads and writes to inet->mc_ttl
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv4/ip_output.c | 2 +-
net/ipv4/ip_sockglue.c | 31 ++++++++++++++++---------------
net/netfilter/ipvs/ip_vs_sync.c | 2 +-
3 files changed, 18 insertions(+), 17 deletions(-)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4ab877cf6d35f229761986d5c6a17eb2a3ad4043..adad16f1e872ce20941a087b3965fdb040868d4e 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1430,7 +1430,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
if (cork->ttl != 0)
ttl = cork->ttl;
else if (rt->rt_type == RTN_MULTICAST)
- ttl = inet->mc_ttl;
+ ttl = READ_ONCE(inet->mc_ttl);
else
ttl = ip_select_ttl(inet, &rt->dst);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index cce9cb25f3b31cd57fa883ae0dedb6829d8da2fa..4ad3003378ae6b186513000264f77b54a7babe6d 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1039,6 +1039,17 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
WRITE_ONCE(inet->min_ttl, val);
return 0;
+ case IP_MULTICAST_TTL:
+ if (sk->sk_type == SOCK_STREAM)
+ return -EINVAL;
+ if (optlen < 1)
+ return -EINVAL;
+ if (val == -1)
+ val = 1;
+ if (val < 0 || val > 255)
+ return -EINVAL;
+ WRITE_ONCE(inet->mc_ttl, val);
+ return 0;
}
err = 0;
@@ -1101,17 +1112,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
goto e_inval;
inet->pmtudisc = val;
break;
- case IP_MULTICAST_TTL:
- if (sk->sk_type == SOCK_STREAM)
- goto e_inval;
- if (optlen < 1)
- goto e_inval;
- if (val == -1)
- val = 1;
- if (val < 0 || val > 255)
- goto e_inval;
- inet->mc_ttl = val;
- break;
case IP_UNICAST_IF:
{
struct net_device *dev = NULL;
@@ -1592,6 +1592,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_MINTTL:
val = READ_ONCE(inet->min_ttl);
goto copyval;
+ case IP_MULTICAST_TTL:
+ val = READ_ONCE(inet->mc_ttl);
+ goto copyval;
}
if (needs_rtnl)
@@ -1649,9 +1652,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
}
break;
}
- case IP_MULTICAST_TTL:
- val = inet->mc_ttl;
- break;
case IP_UNICAST_IF:
val = (__force int)htonl((__u32) inet->uc_index);
break;
@@ -1718,7 +1718,8 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
}
if (inet_test_bit(TTL, sk)) {
- int hlim = inet->mc_ttl;
+ int hlim = READ_ONCE(inet->mc_ttl);
+
put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
}
if (inet_test_bit(TOS, sk)) {
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 5820a8156c4701bb163f569d735c389d7a8e3820..3eed1670224888acf639cff06537ddf2505461bb 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1316,7 +1316,7 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl)
/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
lock_sock(sk);
- inet->mc_ttl = ttl;
+ WRITE_ONCE(inet->mc_ttl, ttl);
#ifdef CONFIG_IP_VS_IPV6
if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
--
2.42.0.459.ge4e396fd5e-goog
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH net-next 1/8] inet: implement lockless IP_MULTICAST_TTL
2023-09-21 13:30 ` [PATCH net-next 1/8] inet: implement lockless IP_MULTICAST_TTL Eric Dumazet
@ 2023-09-21 19:03 ` David Ahern
0 siblings, 0 replies; 18+ messages in thread
From: David Ahern @ 2023-09-21 19:03 UTC (permalink / raw)
To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, eric.dumazet
On 9/21/23 7:30 AM, Eric Dumazet wrote:
> inet->mc_ttl can be read locklessly.
>
> Implement proper lockless reads and writes to inet->mc_ttl
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
> net/ipv4/ip_output.c | 2 +-
> net/ipv4/ip_sockglue.c | 31 ++++++++++++++++---------------
> net/netfilter/ipvs/ip_vs_sync.c | 2 +-
> 3 files changed, 18 insertions(+), 17 deletions(-)
>
Reviewed-by: David Ahern <dsahern@kernel.org>
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH net-next 2/8] inet: implement lockless IP_MTU_DISCOVER
2023-09-21 13:30 [PATCH net-next 0/8] inet: more data-race fixes Eric Dumazet
2023-09-21 13:30 ` [PATCH net-next 1/8] inet: implement lockless IP_MULTICAST_TTL Eric Dumazet
@ 2023-09-21 13:30 ` Eric Dumazet
2023-09-21 19:03 ` David Ahern
2023-09-21 13:30 ` [PATCH net-next 3/8] inet: implement lockless IP_TOS Eric Dumazet
` (5 subsequent siblings)
7 siblings, 1 reply; 18+ messages in thread
From: Eric Dumazet @ 2023-09-21 13:30 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: David Ahern, netdev, eric.dumazet, Eric Dumazet
inet->pmtudisc can be read locklessly.
Implement proper lockless reads and writes to inet->pmtudisc
ip_sock_set_mtu_discover() can now be called from arbitrary
contexts.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/ip.h | 13 ++++++++-----
net/ipv4/ip_output.c | 7 ++++---
net/ipv4/ip_sockglue.c | 17 ++++++-----------
net/ipv4/ping.c | 2 +-
net/ipv4/raw.c | 2 +-
net/ipv4/udp.c | 2 +-
net/netfilter/ipvs/ip_vs_sync.c | 2 +-
7 files changed, 22 insertions(+), 23 deletions(-)
diff --git a/include/net/ip.h b/include/net/ip.h
index 3489a1cca5e7bc315ba646f6bc125b2b6ded9416..46933a0d98eac2db40c2e88006125588b8f8143e 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -434,19 +434,22 @@ int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst)
static inline bool ip_sk_accept_pmtu(const struct sock *sk)
{
- return inet_sk(sk)->pmtudisc != IP_PMTUDISC_INTERFACE &&
- inet_sk(sk)->pmtudisc != IP_PMTUDISC_OMIT;
+ u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc);
+
+ return pmtudisc != IP_PMTUDISC_INTERFACE &&
+ pmtudisc != IP_PMTUDISC_OMIT;
}
static inline bool ip_sk_use_pmtu(const struct sock *sk)
{
- return inet_sk(sk)->pmtudisc < IP_PMTUDISC_PROBE;
+ return READ_ONCE(inet_sk(sk)->pmtudisc) < IP_PMTUDISC_PROBE;
}
static inline bool ip_sk_ignore_df(const struct sock *sk)
{
- return inet_sk(sk)->pmtudisc < IP_PMTUDISC_DO ||
- inet_sk(sk)->pmtudisc == IP_PMTUDISC_OMIT;
+ u8 pmtudisc = READ_ONCE(inet_sk(sk)->pmtudisc);
+
+ return pmtudisc < IP_PMTUDISC_DO || pmtudisc == IP_PMTUDISC_OMIT;
}
static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index adad16f1e872ce20941a087b3965fdb040868d4e..2be281f184a5fe5a695ccd51fabe69fa45bea0b8 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1387,8 +1387,8 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
struct ip_options *opt = NULL;
struct rtable *rt = (struct rtable *)cork->dst;
struct iphdr *iph;
+ u8 pmtudisc, ttl;
__be16 df = 0;
- __u8 ttl;
skb = __skb_dequeue(queue);
if (!skb)
@@ -1418,8 +1418,9 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
/* DF bit is set when we want to see DF on outgoing frames.
* If ignore_df is set too, we still allow to fragment this frame
* locally. */
- if (inet->pmtudisc == IP_PMTUDISC_DO ||
- inet->pmtudisc == IP_PMTUDISC_PROBE ||
+ pmtudisc = READ_ONCE(inet->pmtudisc);
+ if (pmtudisc == IP_PMTUDISC_DO ||
+ pmtudisc == IP_PMTUDISC_PROBE ||
(skb->len <= dst_mtu(&rt->dst) &&
ip_dont_fragment(sk, &rt->dst)))
df = htons(IP_DF);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 4ad3003378ae6b186513000264f77b54a7babe6d..6d874cc03c8b4e88d79ebc50a6db105606b6ae60 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -622,9 +622,7 @@ int ip_sock_set_mtu_discover(struct sock *sk, int val)
{
if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
return -EINVAL;
- lock_sock(sk);
- inet_sk(sk)->pmtudisc = val;
- release_sock(sk);
+ WRITE_ONCE(inet_sk(sk)->pmtudisc, val);
return 0;
}
EXPORT_SYMBOL(ip_sock_set_mtu_discover);
@@ -1050,6 +1048,8 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
return -EINVAL;
WRITE_ONCE(inet->mc_ttl, val);
return 0;
+ case IP_MTU_DISCOVER:
+ return ip_sock_set_mtu_discover(sk, val);
}
err = 0;
@@ -1107,11 +1107,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
case IP_TOS: /* This sets both TOS and Precedence */
__ip_sock_set_tos(sk, val);
break;
- case IP_MTU_DISCOVER:
- if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
- goto e_inval;
- inet->pmtudisc = val;
- break;
case IP_UNICAST_IF:
{
struct net_device *dev = NULL;
@@ -1595,6 +1590,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_MULTICAST_TTL:
val = READ_ONCE(inet->mc_ttl);
goto copyval;
+ case IP_MTU_DISCOVER:
+ val = READ_ONCE(inet->pmtudisc);
+ goto copyval;
}
if (needs_rtnl)
@@ -1634,9 +1632,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_TOS:
val = inet->tos;
break;
- case IP_MTU_DISCOVER:
- val = inet->pmtudisc;
- break;
case IP_MTU:
{
struct dst_entry *dst;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 4dd809b7b18867154df42bc28809b886913e253c..50d12b0c8d46fdcd9b448c3ebc90395ebf426075 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -551,7 +551,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
case ICMP_DEST_UNREACH:
if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
ipv4_sk_update_pmtu(skb, sk, info);
- if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
+ if (READ_ONCE(inet_sock->pmtudisc) != IP_PMTUDISC_DONT) {
err = EMSGSIZE;
harderr = 1;
break;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 4b5db5d1edc279df1fd7412af2845a7a79c95ec8..ade1aecd7c71184d753a28a67bc9b30087247db4 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -239,7 +239,7 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
if (code > NR_ICMP_UNREACH)
break;
if (code == ICMP_FRAG_NEEDED) {
- harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
+ harderr = READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT;
err = EMSGSIZE;
} else {
err = icmp_err_convert[code].errno;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c3ff984b63547daf0ecfb4ab96956aee2f8d589d..731a723dc80816f0b5b0803d7397f7e9e8cd8b09 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -750,7 +750,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
case ICMP_DEST_UNREACH:
if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
ipv4_sk_update_pmtu(skb, sk, info);
- if (inet->pmtudisc != IP_PMTUDISC_DONT) {
+ if (READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT) {
err = EMSGSIZE;
harderr = 1;
break;
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 3eed1670224888acf639cff06537ddf2505461bb..4f6c795588fbdbf084154025b8172e0fd2ea7384 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1335,7 +1335,7 @@ static void set_mcast_pmtudisc(struct sock *sk, int val)
/* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */
lock_sock(sk);
- inet->pmtudisc = val;
+ WRITE_ONCE(inet->pmtudisc, val);
#ifdef CONFIG_IP_VS_IPV6
if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
--
2.42.0.459.ge4e396fd5e-goog
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH net-next 2/8] inet: implement lockless IP_MTU_DISCOVER
2023-09-21 13:30 ` [PATCH net-next 2/8] inet: implement lockless IP_MTU_DISCOVER Eric Dumazet
@ 2023-09-21 19:03 ` David Ahern
0 siblings, 0 replies; 18+ messages in thread
From: David Ahern @ 2023-09-21 19:03 UTC (permalink / raw)
To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, eric.dumazet
On 9/21/23 7:30 AM, Eric Dumazet wrote:
> inet->pmtudisc can be read locklessly.
>
> Implement proper lockless reads and writes to inet->pmtudisc
>
> ip_sock_set_mtu_discover() can now be called from arbitrary
> contexts.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
> include/net/ip.h | 13 ++++++++-----
> net/ipv4/ip_output.c | 7 ++++---
> net/ipv4/ip_sockglue.c | 17 ++++++-----------
> net/ipv4/ping.c | 2 +-
> net/ipv4/raw.c | 2 +-
> net/ipv4/udp.c | 2 +-
> net/netfilter/ipvs/ip_vs_sync.c | 2 +-
> 7 files changed, 22 insertions(+), 23 deletions(-)
>
Reviewed-by: David Ahern <dsahern@kernel.org>
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH net-next 3/8] inet: implement lockless IP_TOS
2023-09-21 13:30 [PATCH net-next 0/8] inet: more data-race fixes Eric Dumazet
2023-09-21 13:30 ` [PATCH net-next 1/8] inet: implement lockless IP_MULTICAST_TTL Eric Dumazet
2023-09-21 13:30 ` [PATCH net-next 2/8] inet: implement lockless IP_MTU_DISCOVER Eric Dumazet
@ 2023-09-21 13:30 ` Eric Dumazet
2023-09-21 19:10 ` David Ahern
2023-09-21 13:30 ` [PATCH net-next 4/8] inet: lockless getsockopt(IP_OPTIONS) Eric Dumazet
` (4 subsequent siblings)
7 siblings, 1 reply; 18+ messages in thread
From: Eric Dumazet @ 2023-09-21 13:30 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: David Ahern, netdev, eric.dumazet, Eric Dumazet
Some reads of inet->tos are racy.
Add the needed READ_ONCE() annotations and make the IP_TOS option lockless.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/ip.h | 1 -
net/dccp/ipv4.c | 2 +-
net/ipv4/inet_diag.c | 2 +-
net/ipv4/ip_output.c | 4 +--
net/ipv4/ip_sockglue.c | 29 ++++++++-----------
net/ipv4/tcp_ipv4.c | 9 +++---
net/mptcp/sockopt.c | 8 ++---
net/sctp/protocol.c | 4 +--
.../selftests/net/mptcp/mptcp_connect.sh | 2 +-
9 files changed, 28 insertions(+), 33 deletions(-)
diff --git a/include/net/ip.h b/include/net/ip.h
index 46933a0d98eac2db40c2e88006125588b8f8143e..8836ee5502669f6d3a5fc35a045695bac1c3b1a9 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -810,6 +810,5 @@ int ip_sock_set_mtu_discover(struct sock *sk, int val);
void ip_sock_set_pktinfo(struct sock *sk);
void ip_sock_set_recverr(struct sock *sk);
void ip_sock_set_tos(struct sock *sk, int val);
-void __ip_sock_set_tos(struct sock *sk, int val);
#endif /* _IP_H */
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 8f56e8723c7386c9f9344f1376823bfd0077c8c2..ef55e4c99e5109d68da5016e5c30e01fa50722be 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -516,7 +516,7 @@ static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
rcu_dereference(ireq->ireq_opt),
- inet_sk(sk)->tos);
+ READ_ONCE(inet_sk(sk)->tos));
rcu_read_unlock();
err = net_xmit_eval(err);
}
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e13a84433413ed88088435ff8e11efeb30fc3cca..1f2d7a8bd060e59baeb00fcb1c6aabfcb3bb213d 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -134,7 +134,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
* hence this needs to be included regardless of socket family.
*/
if (ext & (1 << (INET_DIAG_TOS - 1)))
- if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
+ if (nla_put_u8(skb, INET_DIAG_TOS, READ_ONCE(inet->tos)) < 0)
goto errout;
#if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 2be281f184a5fe5a695ccd51fabe69fa45bea0b8..85320f92e8363d59e92c54139044cbab7e0561fa 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -544,7 +544,7 @@ EXPORT_SYMBOL(__ip_queue_xmit);
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
- return __ip_queue_xmit(sk, skb, fl, inet_sk(sk)->tos);
+ return __ip_queue_xmit(sk, skb, fl, READ_ONCE(inet_sk(sk)->tos));
}
EXPORT_SYMBOL(ip_queue_xmit);
@@ -1438,7 +1438,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = 5;
- iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
+ iph->tos = (cork->tos != -1) ? cork->tos : READ_ONCE(inet->tos);
iph->frag_off = df;
iph->ttl = ttl;
iph->protocol = sk->sk_protocol;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 6d874cc03c8b4e88d79ebc50a6db105606b6ae60..50c008efbb6de7303621dd30b178c90cb3f5a2fc 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -585,25 +585,20 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
return err;
}
-void __ip_sock_set_tos(struct sock *sk, int val)
+void ip_sock_set_tos(struct sock *sk, int val)
{
+ u8 old_tos = READ_ONCE(inet_sk(sk)->tos);
+
if (sk->sk_type == SOCK_STREAM) {
val &= ~INET_ECN_MASK;
- val |= inet_sk(sk)->tos & INET_ECN_MASK;
+ val |= old_tos & INET_ECN_MASK;
}
- if (inet_sk(sk)->tos != val) {
- inet_sk(sk)->tos = val;
+ if (old_tos != val) {
+ WRITE_ONCE(inet_sk(sk)->tos, val);
WRITE_ONCE(sk->sk_priority, rt_tos2priority(val));
sk_dst_reset(sk);
}
}
-
-void ip_sock_set_tos(struct sock *sk, int val)
-{
- lock_sock(sk);
- __ip_sock_set_tos(sk, val);
- release_sock(sk);
-}
EXPORT_SYMBOL(ip_sock_set_tos);
void ip_sock_set_freebind(struct sock *sk)
@@ -1050,6 +1045,9 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
return 0;
case IP_MTU_DISCOVER:
return ip_sock_set_mtu_discover(sk, val);
+ case IP_TOS: /* This sets both TOS and Precedence */
+ ip_sock_set_tos(sk, val);
+ return 0;
}
err = 0;
@@ -1104,9 +1102,6 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
}
}
break;
- case IP_TOS: /* This sets both TOS and Precedence */
- __ip_sock_set_tos(sk, val);
- break;
case IP_UNICAST_IF:
{
struct net_device *dev = NULL;
@@ -1593,6 +1588,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_MTU_DISCOVER:
val = READ_ONCE(inet->pmtudisc);
goto copyval;
+ case IP_TOS:
+ val = READ_ONCE(inet->tos);
+ goto copyval;
}
if (needs_rtnl)
@@ -1629,9 +1627,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
return 0;
}
- case IP_TOS:
- val = inet->tos;
- break;
case IP_MTU:
{
struct dst_entry *dst;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f13eb7e23d03f3681055257e6ebea0612ae3f9b3..1f89ba58e71eff74d8ed75019de9e70d2f4d5926 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1024,10 +1024,11 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
- tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
- (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
- (inet_sk(sk)->tos & INET_ECN_MASK) :
- inet_sk(sk)->tos;
+ tos = READ_ONCE(inet_sk(sk)->tos);
+
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
+ tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
+ (tos & INET_ECN_MASK);
if (!INET_ECN_is_capable(tos) &&
tcp_bpf_ca_needs_ecn((struct sock *)req))
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 8260202c00669fd7d2eed2f94a3c2cf225a0d89c..155e8472ba9b83c35c6f827b2bb35c0be4127917 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -734,11 +734,11 @@ static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname,
lock_sock(sk);
sockopt_seq_inc(msk);
- val = inet_sk(sk)->tos;
+ val = READ_ONCE(inet_sk(sk)->tos);
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
- __ip_sock_set_tos(ssk, val);
+ ip_sock_set_tos(ssk, val);
}
release_sock(sk);
@@ -1343,7 +1343,7 @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname,
switch (optname) {
case IP_TOS:
- return mptcp_put_int_option(msk, optval, optlen, inet_sk(sk)->tos);
+ return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->tos));
}
return -EOPNOTSUPP;
@@ -1411,7 +1411,7 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
ssk->sk_bound_dev_if = sk->sk_bound_dev_if;
ssk->sk_incoming_cpu = sk->sk_incoming_cpu;
ssk->sk_ipv6only = sk->sk_ipv6only;
- __ip_sock_set_tos(ssk, inet_sk(sk)->tos);
+ ip_sock_set_tos(ssk, inet_sk(sk)->tos);
if (sk->sk_userlocks & tx_rx_locks) {
ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks;
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 2185f44198deb002bc8ed7f1b0f3fe02d6bb9f09..94c6dd53cd62d1fa6236d07946e8d5ff68eb587d 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -426,7 +426,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
struct dst_entry *dst = NULL;
union sctp_addr *daddr = &t->ipaddr;
union sctp_addr dst_saddr;
- __u8 tos = inet_sk(sk)->tos;
+ u8 tos = READ_ONCE(inet_sk(sk)->tos);
if (t->dscp & SCTP_DSCP_SET_MASK)
tos = t->dscp & SCTP_DSCP_VAL_MASK;
@@ -1057,7 +1057,7 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t)
struct flowi4 *fl4 = &t->fl.u.ip4;
struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
- __u8 dscp = inet->tos;
+ __u8 dscp = READ_ONCE(inet->tos);
__be16 df = 0;
pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb,
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
index b1fc8afd072dc6ddde8d561a675a5549a9a37dba..61a2a1988ce69ffa17e0dd8e629eac550f4f7d99 100755
--- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
@@ -716,7 +716,7 @@ run_test_transparent()
# the required infrastructure in MPTCP sockopt code. To support TOS, the
# following function has been exported (T). Not great but better than
# checking for a specific kernel version.
- if ! mptcp_lib_kallsyms_has "T __ip_sock_set_tos$"; then
+ if ! mptcp_lib_kallsyms_has "T ip_sock_set_tos$"; then
echo "INFO: ${msg} not supported by the kernel: SKIP"
mptcp_lib_result_skip "${TEST_GROUP}"
return
--
2.42.0.459.ge4e396fd5e-goog
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH net-next 3/8] inet: implement lockless IP_TOS
2023-09-21 13:30 ` [PATCH net-next 3/8] inet: implement lockless IP_TOS Eric Dumazet
@ 2023-09-21 19:10 ` David Ahern
2023-09-21 19:15 ` Eric Dumazet
0 siblings, 1 reply; 18+ messages in thread
From: David Ahern @ 2023-09-21 19:10 UTC (permalink / raw)
To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, eric.dumazet
On 9/21/23 7:30 AM, Eric Dumazet wrote:
> Some reads of inet->tos are racy.
>
> Add needed READ_ONCE() annotations and convert IP_TOS option lockless.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
> include/net/ip.h | 1 -
> net/dccp/ipv4.c | 2 +-
> net/ipv4/inet_diag.c | 2 +-
> net/ipv4/ip_output.c | 4 +--
> net/ipv4/ip_sockglue.c | 29 ++++++++-----------
> net/ipv4/tcp_ipv4.c | 9 +++---
> net/mptcp/sockopt.c | 8 ++---
> net/sctp/protocol.c | 4 +--
> .../selftests/net/mptcp/mptcp_connect.sh | 2 +-
> 9 files changed, 28 insertions(+), 33 deletions(-)
>
include/net/route.h dereferences sk tos as well.
net/ipv4/icmp.c has a setting of it
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH net-next 3/8] inet: implement lockless IP_TOS
2023-09-21 19:10 ` David Ahern
@ 2023-09-21 19:15 ` Eric Dumazet
0 siblings, 0 replies; 18+ messages in thread
From: Eric Dumazet @ 2023-09-21 19:15 UTC (permalink / raw)
To: David Ahern
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, netdev,
eric.dumazet
On Thu, Sep 21, 2023 at 9:10 PM David Ahern <dsahern@kernel.org> wrote:
>
> On 9/21/23 7:30 AM, Eric Dumazet wrote:
> > Some reads of inet->tos are racy.
> >
> > Add needed READ_ONCE() annotations and convert IP_TOS option lockless.
> >
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > ---
> > include/net/ip.h | 1 -
> > net/dccp/ipv4.c | 2 +-
> > net/ipv4/inet_diag.c | 2 +-
> > net/ipv4/ip_output.c | 4 +--
> > net/ipv4/ip_sockglue.c | 29 ++++++++-----------
> > net/ipv4/tcp_ipv4.c | 9 +++---
> > net/mptcp/sockopt.c | 8 ++---
> > net/sctp/protocol.c | 4 +--
> > .../selftests/net/mptcp/mptcp_connect.sh | 2 +-
> > 9 files changed, 28 insertions(+), 33 deletions(-)
> >
>
> include/net/route.h dereferences sk tos as well.
Right, thanks for catching this.
> net/ipv4/icmp.c has a setting of it
This is safe: the socket is private to the current thread (sk =
icmp_xmit_lock()) and is not visible to other threads.
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH net-next 4/8] inet: lockless getsockopt(IP_OPTIONS)
2023-09-21 13:30 [PATCH net-next 0/8] inet: more data-race fixes Eric Dumazet
` (2 preceding siblings ...)
2023-09-21 13:30 ` [PATCH net-next 3/8] inet: implement lockless IP_TOS Eric Dumazet
@ 2023-09-21 13:30 ` Eric Dumazet
2023-09-21 19:12 ` David Ahern
2023-09-21 13:30 ` [PATCH net-next 5/8] inet: lockless getsockopt(IP_MTU) Eric Dumazet
` (3 subsequent siblings)
7 siblings, 1 reply; 18+ messages in thread
From: Eric Dumazet @ 2023-09-21 13:30 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: David Ahern, netdev, eric.dumazet, Eric Dumazet
inet->inet_opt being RCU protected, we can use RCU instead
of locking the socket.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv4/ip_sockglue.c | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 50c008efbb6de7303621dd30b178c90cb3f5a2fc..45d89487914a12061f05c192004ad79f0abbf756 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1591,27 +1591,20 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_TOS:
val = READ_ONCE(inet->tos);
goto copyval;
- }
-
- if (needs_rtnl)
- rtnl_lock();
- sockopt_lock_sock(sk);
-
- switch (optname) {
case IP_OPTIONS:
{
unsigned char optbuf[sizeof(struct ip_options)+40];
struct ip_options *opt = (struct ip_options *)optbuf;
struct ip_options_rcu *inet_opt;
- inet_opt = rcu_dereference_protected(inet->inet_opt,
- lockdep_sock_is_held(sk));
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
opt->optlen = 0;
if (inet_opt)
memcpy(optbuf, &inet_opt->opt,
sizeof(struct ip_options) +
inet_opt->opt.optlen);
- sockopt_release_sock(sk);
+ rcu_read_unlock();
if (opt->optlen == 0) {
len = 0;
@@ -1627,6 +1620,13 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
return 0;
}
+ }
+
+ if (needs_rtnl)
+ rtnl_lock();
+ sockopt_lock_sock(sk);
+
+ switch (optname) {
case IP_MTU:
{
struct dst_entry *dst;
--
2.42.0.459.ge4e396fd5e-goog
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH net-next 5/8] inet: lockless getsockopt(IP_MTU)
2023-09-21 13:30 [PATCH net-next 0/8] inet: more data-race fixes Eric Dumazet
` (3 preceding siblings ...)
2023-09-21 13:30 ` [PATCH net-next 4/8] inet: lockless getsockopt(IP_OPTIONS) Eric Dumazet
@ 2023-09-21 13:30 ` Eric Dumazet
2023-09-21 19:13 ` David Ahern
2023-09-21 13:30 ` [PATCH net-next 6/8] inet: implement lockless getsockopt(IP_UNICAST_IF) Eric Dumazet
` (2 subsequent siblings)
7 siblings, 1 reply; 18+ messages in thread
From: Eric Dumazet @ 2023-09-21 13:30 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: David Ahern, netdev, eric.dumazet, Eric Dumazet
sk_dst_get() does not require socket lock.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv4/ip_sockglue.c | 20 +++++++++-----------
1 file changed, 9 insertions(+), 11 deletions(-)
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 45d89487914a12061f05c192004ad79f0abbf756..04579e390ddd4dadb8a107ef0b5da15e7a60f1ff 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1620,13 +1620,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
return 0;
}
- }
-
- if (needs_rtnl)
- rtnl_lock();
- sockopt_lock_sock(sk);
-
- switch (optname) {
case IP_MTU:
{
struct dst_entry *dst;
@@ -1636,12 +1629,17 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
val = dst_mtu(dst);
dst_release(dst);
}
- if (!val) {
- sockopt_release_sock(sk);
+ if (!val)
return -ENOTCONN;
- }
- break;
+ goto copyval;
+ }
}
+
+ if (needs_rtnl)
+ rtnl_lock();
+ sockopt_lock_sock(sk);
+
+ switch (optname) {
case IP_UNICAST_IF:
val = (__force int)htonl((__u32) inet->uc_index);
break;
--
2.42.0.459.ge4e396fd5e-goog
^ permalink raw reply related [flat|nested] 18+ messages in thread
* [PATCH net-next 6/8] inet: implement lockless getsockopt(IP_UNICAST_IF)
2023-09-21 13:30 [PATCH net-next 0/8] inet: more data-race fixes Eric Dumazet
` (4 preceding siblings ...)
2023-09-21 13:30 ` [PATCH net-next 5/8] inet: lockless getsockopt(IP_MTU) Eric Dumazet
@ 2023-09-21 13:30 ` Eric Dumazet
2023-09-21 19:19 ` David Ahern
2023-09-21 13:30 ` [PATCH net-next 7/8] inet: lockless IP_PKTOPTIONS implementation Eric Dumazet
2023-09-21 13:30 ` [PATCH net-next 8/8] inet: implement lockless getsockopt(IP_MULTICAST_IF) Eric Dumazet
7 siblings, 1 reply; 18+ messages in thread
From: Eric Dumazet @ 2023-09-21 13:30 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: David Ahern, netdev, eric.dumazet, Eric Dumazet
Add missing READ_ONCE() annotations when reading inet->uc_index.
Implementing getsockopt(IP_UNICAST_IF) locklessly seems possible;
the setsockopt() part might not be possible at the moment.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv4/datagram.c | 2 +-
net/ipv4/ip_sockglue.c | 10 +++++-----
net/ipv4/ping.c | 2 +-
net/ipv4/raw.c | 13 +++++++------
net/ipv4/udp.c | 12 +++++++-----
5 files changed, 21 insertions(+), 18 deletions(-)
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index cb5dbee9e018fbba1bc1e5705e8bec6c4203af56..1480e9ebdfef445960e1f70f34f33a0e0c52b65b 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -43,7 +43,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
if (!saddr)
saddr = inet->mc_addr;
} else if (!oif) {
- oif = inet->uc_index;
+ oif = READ_ONCE(inet->uc_index);
}
fl4 = &inet->cork.fl.u.ip4;
rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, oif,
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 04579e390ddd4dadb8a107ef0b5da15e7a60f1ff..58995526c6e965d613b8cdea61b84916d608a6fb 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1113,7 +1113,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
ifindex = (__force int)ntohl((__force __be32)val);
if (ifindex == 0) {
- inet->uc_index = 0;
+ WRITE_ONCE(inet->uc_index, 0);
err = 0;
break;
}
@@ -1130,7 +1130,7 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
if (sk->sk_bound_dev_if && midx != sk->sk_bound_dev_if)
break;
- inet->uc_index = ifindex;
+ WRITE_ONCE(inet->uc_index, ifindex);
err = 0;
break;
}
@@ -1633,6 +1633,9 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
return -ENOTCONN;
goto copyval;
}
+ case IP_UNICAST_IF:
+ val = (__force int)htonl((__u32) READ_ONCE(inet->uc_index));
+ goto copyval;
}
if (needs_rtnl)
@@ -1640,9 +1643,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
sockopt_lock_sock(sk);
switch (optname) {
- case IP_UNICAST_IF:
- val = (__force int)htonl((__u32) inet->uc_index);
- break;
case IP_MULTICAST_IF:
{
struct in_addr addr;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 50d12b0c8d46fdcd9b448c3ebc90395ebf426075..66ad1f95af49f222afe0ee75b9163dd0af0a2c49 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -777,7 +777,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (!saddr)
saddr = inet->mc_addr;
} else if (!ipc.oif)
- ipc.oif = inet->uc_index;
+ ipc.oif = READ_ONCE(inet->uc_index);
flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope,
sk->sk_protocol, inet_sk_flowi_flags(sk), faddr,
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ade1aecd7c71184d753a28a67bc9b30087247db4..e2357d23202e5a39832bb1550c365de9a836c363 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -482,7 +482,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int free = 0;
__be32 daddr;
__be32 saddr;
- int err;
+ int uc_index, err;
struct ip_options_data opt_copy;
struct raw_frag_vec rfv;
int hdrincl;
@@ -576,24 +576,25 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
tos = get_rttos(&ipc, inet);
scope = ip_sendmsg_scope(inet, &ipc, msg);
+ uc_index = READ_ONCE(inet->uc_index);
if (ipv4_is_multicast(daddr)) {
if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = inet->mc_index;
if (!saddr)
saddr = inet->mc_addr;
} else if (!ipc.oif) {
- ipc.oif = inet->uc_index;
- } else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
+ ipc.oif = uc_index;
+ } else if (ipv4_is_lbcast(daddr) && uc_index) {
/* oif is set, packet is to local broadcast
* and uc_index is set. oif is most likely set
* by sk_bound_dev_if. If uc_index != oif check if the
* oif is an L3 master and uc_index is an L3 slave.
* If so, we want to allow the send using the uc_index.
*/
- if (ipc.oif != inet->uc_index &&
+ if (ipc.oif != uc_index &&
ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
- inet->uc_index)) {
- ipc.oif = inet->uc_index;
+ uc_index)) {
+ ipc.oif = uc_index;
}
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 731a723dc80816f0b5b0803d7397f7e9e8cd8b09..1e0c3aba1e5a88c7ba50a28511412a1710f1bab5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1055,6 +1055,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
struct sk_buff *skb;
struct ip_options_data opt_copy;
+ int uc_index;
if (len > 0xFFFF)
return -EMSGSIZE;
@@ -1173,6 +1174,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (scope == RT_SCOPE_LINK)
connected = 0;
+ uc_index = READ_ONCE(inet->uc_index);
if (ipv4_is_multicast(daddr)) {
if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
ipc.oif = inet->mc_index;
@@ -1180,18 +1182,18 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
saddr = inet->mc_addr;
connected = 0;
} else if (!ipc.oif) {
- ipc.oif = inet->uc_index;
- } else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
+ ipc.oif = uc_index;
+ } else if (ipv4_is_lbcast(daddr) && uc_index) {
/* oif is set, packet is to local broadcast and
* uc_index is set. oif is most likely set
* by sk_bound_dev_if. If uc_index != oif check if the
* oif is an L3 master and uc_index is an L3 slave.
* If so, we want to allow the send using the uc_index.
*/
- if (ipc.oif != inet->uc_index &&
+ if (ipc.oif != uc_index &&
ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
- inet->uc_index)) {
- ipc.oif = inet->uc_index;
+ uc_index)) {
+ ipc.oif = uc_index;
}
}
--
2.42.0.459.ge4e396fd5e-goog
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH net-next 6/8] inet: implement lockless getsockopt(IP_UNICAST_IF)
2023-09-21 13:30 ` [PATCH net-next 6/8] inet: implement lockless getsockopt(IP_UNICAST_IF) Eric Dumazet
@ 2023-09-21 19:19 ` David Ahern
0 siblings, 0 replies; 18+ messages in thread
From: David Ahern @ 2023-09-21 19:19 UTC (permalink / raw)
To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, eric.dumazet
On 9/21/23 7:30 AM, Eric Dumazet wrote:
> Add missing READ_ONCE() annotations when reading inet->uc_index
>
> Implementing getsockopt(IP_UNICAST_IF) locklessly seems possible,
> the setsockopt() part might not be possible at the moment.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
> net/ipv4/datagram.c | 2 +-
> net/ipv4/ip_sockglue.c | 10 +++++-----
> net/ipv4/ping.c | 2 +-
> net/ipv4/raw.c | 13 +++++++------
> net/ipv4/udp.c | 12 +++++++-----
> 5 files changed, 21 insertions(+), 18 deletions(-)
>
Reviewed-by: David Ahern <dsahern@kernel.org>
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH net-next 7/8] inet: lockless IP_PKTOPTIONS implementation
2023-09-21 13:30 [PATCH net-next 0/8] inet: more data-race fixes Eric Dumazet
` (5 preceding siblings ...)
2023-09-21 13:30 ` [PATCH net-next 6/8] inet: implement lockless getsockopt(IP_UNICAST_IF) Eric Dumazet
@ 2023-09-21 13:30 ` Eric Dumazet
2023-09-21 19:23 ` David Ahern
2023-09-21 13:30 ` [PATCH net-next 8/8] inet: implement lockless getsockopt(IP_MULTICAST_IF) Eric Dumazet
7 siblings, 1 reply; 18+ messages in thread
From: Eric Dumazet @ 2023-09-21 13:30 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: David Ahern, netdev, eric.dumazet, Eric Dumazet
Current implementation is already lockless, because the socket
lock is released before reading socket fields.
Add missing READ_ONCE() annotations.
Note that the corresponding WRITE_ONCE() annotations are needed; the order
of the patches does not really matter.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv4/ip_sockglue.c | 76 ++++++++++++++++++++----------------------
1 file changed, 37 insertions(+), 39 deletions(-)
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 58995526c6e965d613b8cdea61b84916d608a6fb..1ee01ff64171c94b6b244589518a53ce807a212d 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1633,6 +1633,43 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
return -ENOTCONN;
goto copyval;
}
+ case IP_PKTOPTIONS:
+ {
+ struct msghdr msg;
+
+ if (sk->sk_type != SOCK_STREAM)
+ return -ENOPROTOOPT;
+
+ if (optval.is_kernel) {
+ msg.msg_control_is_user = false;
+ msg.msg_control = optval.kernel;
+ } else {
+ msg.msg_control_is_user = true;
+ msg.msg_control_user = optval.user;
+ }
+ msg.msg_controllen = len;
+ msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0;
+
+ if (inet_test_bit(PKTINFO, sk)) {
+ struct in_pktinfo info;
+
+ info.ipi_addr.s_addr = READ_ONCE(inet->inet_rcv_saddr);
+ info.ipi_spec_dst.s_addr = READ_ONCE(inet->inet_rcv_saddr);
+ info.ipi_ifindex = READ_ONCE(inet->mc_index);
+ put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+ }
+ if (inet_test_bit(TTL, sk)) {
+ int hlim = READ_ONCE(inet->mc_ttl);
+
+ put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
+ }
+ if (inet_test_bit(TOS, sk)) {
+ int tos = READ_ONCE(inet->rcv_tos);
+ put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
+ }
+ len -= msg.msg_controllen;
+ return copy_to_sockptr(optlen, &len, sizeof(int));
+ }
case IP_UNICAST_IF:
val = (__force int)htonl((__u32) READ_ONCE(inet->uc_index));
goto copyval;
@@ -1678,45 +1715,6 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
else
err = ip_get_mcast_msfilter(sk, optval, optlen, len);
goto out;
- case IP_PKTOPTIONS:
- {
- struct msghdr msg;
-
- sockopt_release_sock(sk);
-
- if (sk->sk_type != SOCK_STREAM)
- return -ENOPROTOOPT;
-
- if (optval.is_kernel) {
- msg.msg_control_is_user = false;
- msg.msg_control = optval.kernel;
- } else {
- msg.msg_control_is_user = true;
- msg.msg_control_user = optval.user;
- }
- msg.msg_controllen = len;
- msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0;
-
- if (inet_test_bit(PKTINFO, sk)) {
- struct in_pktinfo info;
-
- info.ipi_addr.s_addr = inet->inet_rcv_saddr;
- info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr;
- info.ipi_ifindex = inet->mc_index;
- put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
- }
- if (inet_test_bit(TTL, sk)) {
- int hlim = READ_ONCE(inet->mc_ttl);
-
- put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
- }
- if (inet_test_bit(TOS, sk)) {
- int tos = inet->rcv_tos;
- put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
- }
- len -= msg.msg_controllen;
- return copy_to_sockptr(optlen, &len, sizeof(int));
- }
case IP_LOCAL_PORT_RANGE:
val = inet->local_port_range.hi << 16 | inet->local_port_range.lo;
break;
--
2.42.0.459.ge4e396fd5e-goog
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH net-next 7/8] inet: lockless IP_PKTOPTIONS implementation
2023-09-21 13:30 ` [PATCH net-next 7/8] inet: lockless IP_PKTOPTIONS implementation Eric Dumazet
@ 2023-09-21 19:23 ` David Ahern
0 siblings, 0 replies; 18+ messages in thread
From: David Ahern @ 2023-09-21 19:23 UTC (permalink / raw)
To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, eric.dumazet
On 9/21/23 7:30 AM, Eric Dumazet wrote:
> Current implementation is already lockless, because the socket
> lock is released before reading socket fields.
>
> Add missing READ_ONCE() annotations.
>
> Note that the corresponding WRITE_ONCE() annotations are needed; the order
> of the patches does not really matter.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
> net/ipv4/ip_sockglue.c | 76 ++++++++++++++++++++----------------------
> 1 file changed, 37 insertions(+), 39 deletions(-)
>
Reviewed-by: David Ahern <dsahern@kernel.org>
^ permalink raw reply [flat|nested] 18+ messages in thread
* [PATCH net-next 8/8] inet: implement lockless getsockopt(IP_MULTICAST_IF)
2023-09-21 13:30 [PATCH net-next 0/8] inet: more data-race fixes Eric Dumazet
` (6 preceding siblings ...)
2023-09-21 13:30 ` [PATCH net-next 7/8] inet: lockless IP_PKTOPTIONS implementation Eric Dumazet
@ 2023-09-21 13:30 ` Eric Dumazet
2023-09-21 19:24 ` David Ahern
7 siblings, 1 reply; 18+ messages in thread
From: Eric Dumazet @ 2023-09-21 13:30 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: David Ahern, netdev, eric.dumazet, Eric Dumazet
Add missing annotations to inet->mc_index and inet->mc_addr
to fix data-races.
getsockopt(IP_MULTICAST_IF) can be lockless.
setsockopt() side is left for later.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv4/datagram.c | 4 ++--
net/ipv4/ip_sockglue.c | 25 ++++++++++++-------------
net/ipv4/ping.c | 4 ++--
net/ipv4/raw.c | 4 ++--
net/ipv4/udp.c | 4 ++--
5 files changed, 20 insertions(+), 21 deletions(-)
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 1480e9ebdfef445960e1f70f34f33a0e0c52b65b..2cc50cbfc2a31ec91fbdc4a541cb89df689cd9ae 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -39,9 +39,9 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
saddr = inet->inet_saddr;
if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
if (!oif || netif_index_is_l3_master(sock_net(sk), oif))
- oif = inet->mc_index;
+ oif = READ_ONCE(inet->mc_index);
if (!saddr)
- saddr = inet->mc_addr;
+ saddr = READ_ONCE(inet->mc_addr);
} else if (!oif) {
oif = READ_ONCE(inet->uc_index);
}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 1ee01ff64171c94b6b244589518a53ce807a212d..0b74ac49d6a6f82f5e8ffe5279dba3baf30f874e 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1168,8 +1168,8 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
if (!mreq.imr_ifindex) {
if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) {
- inet->mc_index = 0;
- inet->mc_addr = 0;
+ WRITE_ONCE(inet->mc_index, 0);
+ WRITE_ONCE(inet->mc_addr, 0);
err = 0;
break;
}
@@ -1194,8 +1194,8 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname,
midx != sk->sk_bound_dev_if)
break;
- inet->mc_index = mreq.imr_ifindex;
- inet->mc_addr = mreq.imr_address.s_addr;
+ WRITE_ONCE(inet->mc_index, mreq.imr_ifindex);
+ WRITE_ONCE(inet->mc_addr, mreq.imr_address.s_addr);
err = 0;
break;
}
@@ -1673,19 +1673,11 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_UNICAST_IF:
val = (__force int)htonl((__u32) READ_ONCE(inet->uc_index));
goto copyval;
- }
-
- if (needs_rtnl)
- rtnl_lock();
- sockopt_lock_sock(sk);
-
- switch (optname) {
case IP_MULTICAST_IF:
{
struct in_addr addr;
len = min_t(unsigned int, len, sizeof(struct in_addr));
- addr.s_addr = inet->mc_addr;
- sockopt_release_sock(sk);
+ addr.s_addr = READ_ONCE(inet->mc_addr);
if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
@@ -1693,6 +1685,13 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
return 0;
}
+ }
+
+ if (needs_rtnl)
+ rtnl_lock();
+ sockopt_lock_sock(sk);
+
+ switch (optname) {
case IP_MSFILTER:
{
struct ip_msfilter msf;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 66ad1f95af49f222afe0ee75b9163dd0af0a2c49..2c61f444e1c7d322e75e020c41af02977d8814f0 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -773,9 +773,9 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (ipv4_is_multicast(daddr)) {
if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
- ipc.oif = inet->mc_index;
+ ipc.oif = READ_ONCE(inet->mc_index);
if (!saddr)
- saddr = inet->mc_addr;
+ saddr = READ_ONCE(inet->mc_addr);
} else if (!ipc.oif)
ipc.oif = READ_ONCE(inet->uc_index);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index e2357d23202e5a39832bb1550c365de9a836c363..27da9d7294c0b4fb9027bb7feb704063dc6302db 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -579,9 +579,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
uc_index = READ_ONCE(inet->uc_index);
if (ipv4_is_multicast(daddr)) {
if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
- ipc.oif = inet->mc_index;
+ ipc.oif = READ_ONCE(inet->mc_index);
if (!saddr)
- saddr = inet->mc_addr;
+ saddr = READ_ONCE(inet->mc_addr);
} else if (!ipc.oif) {
ipc.oif = uc_index;
} else if (ipv4_is_lbcast(daddr) && uc_index) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1e0c3aba1e5a88c7ba50a28511412a1710f1bab5..7f7724beca33781f8ff12750d1c9c9ccc420f481 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1177,9 +1177,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
uc_index = READ_ONCE(inet->uc_index);
if (ipv4_is_multicast(daddr)) {
if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
- ipc.oif = inet->mc_index;
+ ipc.oif = READ_ONCE(inet->mc_index);
if (!saddr)
- saddr = inet->mc_addr;
+ saddr = READ_ONCE(inet->mc_addr);
connected = 0;
} else if (!ipc.oif) {
ipc.oif = uc_index;
--
2.42.0.459.ge4e396fd5e-goog
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH net-next 8/8] inet: implement lockless getsockopt(IP_MULTICAST_IF)
2023-09-21 13:30 ` [PATCH net-next 8/8] inet: implement lockless getsockopt(IP_MULTICAST_IF) Eric Dumazet
@ 2023-09-21 19:24 ` David Ahern
0 siblings, 0 replies; 18+ messages in thread
From: David Ahern @ 2023-09-21 19:24 UTC (permalink / raw)
To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, eric.dumazet
On 9/21/23 7:30 AM, Eric Dumazet wrote:
> Add missing annotations to inet->mc_index and inet->mc_addr
> to fix data-races.
>
> getsockopt(IP_MULTICAST_IF) can be lockless.
>
> setsockopt() side is left for later.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
> net/ipv4/datagram.c | 4 ++--
> net/ipv4/ip_sockglue.c | 25 ++++++++++++-------------
> net/ipv4/ping.c | 4 ++--
> net/ipv4/raw.c | 4 ++--
> net/ipv4/udp.c | 4 ++--
> 5 files changed, 20 insertions(+), 21 deletions(-)
>
Reviewed-by: David Ahern <dsahern@kernel.org>
^ permalink raw reply [flat|nested] 18+ messages in thread