* [PATCH v3 net 01/16] net: add dev_net_rcu() helper
2025-02-04 13:23 [PATCH v3 net 00/16] net: first round to use dev_net_rcu() Eric Dumazet
@ 2025-02-04 13:23 ` Eric Dumazet
2025-02-04 13:23 ` [PATCH v3 net 02/16] ipv4: add RCU protection to ip4_dst_hoplimit() Eric Dumazet
` (14 subsequent siblings)
15 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2025-02-04 13:23 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, Kuniyuki Iwashima, Simon Horman, eric.dumazet,
Eric Dumazet
dev->nd_net can change; readers should hold either
rcu_read_lock() or RTNL.
We currently use a generic helper, dev_net(), with
no debugging support. We probably have many hidden bugs.
Add dev_net_rcu() helper for callers using rcu_read_lock()
protection.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
include/linux/netdevice.h | 6 ++++++
include/net/net_namespace.h | 2 +-
2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 03bb584c62cf8a920b12c673dcc438eb1cc41499..c0a86afb85daa2b50e26a1ca238707a24a1842ad 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2663,6 +2663,12 @@ struct net *dev_net(const struct net_device *dev)
return read_pnet(&dev->nd_net);
}
+static inline
+struct net *dev_net_rcu(const struct net_device *dev)
+{
+ return read_pnet_rcu(&dev->nd_net);
+}
+
static inline
void dev_net_set(struct net_device *dev, struct net *net)
{
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 0f5eb9db0c6264efc1ac83ab577511fd6823f4fe..7ba1402ca7796663bed3373b1a0c6a0249cd1599 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -398,7 +398,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
#endif
}
-static inline struct net *read_pnet_rcu(possible_net_t *pnet)
+static inline struct net *read_pnet_rcu(const possible_net_t *pnet)
{
#ifdef CONFIG_NET_NS
return rcu_dereference(pnet->net);
--
2.48.1.362.g079036d154-goog
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH v3 net 02/16] ipv4: add RCU protection to ip4_dst_hoplimit()
2025-02-04 13:23 [PATCH v3 net 00/16] net: first round to use dev_net_rcu() Eric Dumazet
2025-02-04 13:23 ` [PATCH v3 net 01/16] net: add dev_net_rcu() helper Eric Dumazet
@ 2025-02-04 13:23 ` Eric Dumazet
2025-02-04 13:23 ` [PATCH v3 net 03/16] ipv4: use RCU protection in ip_dst_mtu_maybe_forward() Eric Dumazet
` (13 subsequent siblings)
15 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2025-02-04 13:23 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, Kuniyuki Iwashima, Simon Horman, eric.dumazet,
Eric Dumazet
ip4_dst_hoplimit() must use RCU protection to make
sure the net structure it reads does not disappear.
Fixes: fa50d974d104 ("ipv4: Namespaceify ip_default_ttl sysctl knob")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
include/net/route.h | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/include/net/route.h b/include/net/route.h
index f86775be3e2934697533a61f566aca1ef196d74e..c605fd5ec0c08cc7658c3cf6aa6223790d463ede 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -382,10 +382,15 @@ static inline int inet_iif(const struct sk_buff *skb)
static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
{
int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
- struct net *net = dev_net(dst->dev);
- if (hoplimit == 0)
+ if (hoplimit == 0) {
+ const struct net *net;
+
+ rcu_read_lock();
+ net = dev_net_rcu(dst->dev);
hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl);
+ rcu_read_unlock();
+ }
return hoplimit;
}
--
2.48.1.362.g079036d154-goog
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH v3 net 03/16] ipv4: use RCU protection in ip_dst_mtu_maybe_forward()
2025-02-04 13:23 [PATCH v3 net 00/16] net: first round to use dev_net_rcu() Eric Dumazet
2025-02-04 13:23 ` [PATCH v3 net 01/16] net: add dev_net_rcu() helper Eric Dumazet
2025-02-04 13:23 ` [PATCH v3 net 02/16] ipv4: add RCU protection to ip4_dst_hoplimit() Eric Dumazet
@ 2025-02-04 13:23 ` Eric Dumazet
2025-02-04 13:23 ` [PATCH v3 net 04/16] ipv4: use RCU protection in ipv4_default_advmss() Eric Dumazet
` (12 subsequent siblings)
15 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2025-02-04 13:23 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, Kuniyuki Iwashima, Simon Horman, eric.dumazet,
Eric Dumazet
ip_dst_mtu_maybe_forward() must use RCU protection to make
sure the net structure it reads does not disappear.
Fixes: f87c10a8aa1e8 ("ipv4: introduce ip_dst_mtu_maybe_forward and protect forwarding path against pmtu spoofing")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
include/net/ip.h | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/include/net/ip.h b/include/net/ip.h
index 9f5e33e371fcdd8ea88c54584b8d4b6c50e7d0c9..ba7b43447775e51b3b9a8cbf5c3345d6308bb525 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -471,9 +471,12 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
bool forwarding)
{
const struct rtable *rt = dst_rtable(dst);
- struct net *net = dev_net(dst->dev);
- unsigned int mtu;
+ unsigned int mtu, res;
+ struct net *net;
+
+ rcu_read_lock();
+ net = dev_net_rcu(dst->dev);
if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) ||
ip_mtu_locked(dst) ||
!forwarding) {
@@ -497,7 +500,11 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
out:
mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
- return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
+ res = mtu - lwtunnel_headroom(dst->lwtstate, mtu);
+
+ rcu_read_unlock();
+
+ return res;
}
static inline unsigned int ip_skb_dst_mtu(struct sock *sk,
--
2.48.1.362.g079036d154-goog
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH v3 net 04/16] ipv4: use RCU protection in ipv4_default_advmss()
2025-02-04 13:23 [PATCH v3 net 00/16] net: first round to use dev_net_rcu() Eric Dumazet
` (2 preceding siblings ...)
2025-02-04 13:23 ` [PATCH v3 net 03/16] ipv4: use RCU protection in ip_dst_mtu_maybe_forward() Eric Dumazet
@ 2025-02-04 13:23 ` Eric Dumazet
2025-02-04 13:23 ` [PATCH v3 net 05/16] ipv4: use RCU protection in rt_is_expired() Eric Dumazet
` (11 subsequent siblings)
15 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2025-02-04 13:23 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, Kuniyuki Iwashima, Simon Horman, eric.dumazet,
Eric Dumazet
ipv4_default_advmss() must use RCU protection to make
sure the net structure it reads does not disappear.
Fixes: 2e9589ff809e ("ipv4: Namespaceify min_adv_mss sysctl knob")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/ipv4/route.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 577b88a43293aa801c3ee736d7e5cc4d97917717..74c074f45758be5ae78a87edb31837481cc40278 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1307,10 +1307,15 @@ static void set_class_tag(struct rtable *rt, u32 tag)
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
- struct net *net = dev_net(dst->dev);
unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
- unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
- net->ipv4.ip_rt_min_advmss);
+ unsigned int advmss;
+ struct net *net;
+
+ rcu_read_lock();
+ net = dev_net_rcu(dst->dev);
+ advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
+ net->ipv4.ip_rt_min_advmss);
+ rcu_read_unlock();
return min(advmss, IPV4_MAX_PMTU - header_size);
}
--
2.48.1.362.g079036d154-goog
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH v3 net 05/16] ipv4: use RCU protection in rt_is_expired()
2025-02-04 13:23 [PATCH v3 net 00/16] net: first round to use dev_net_rcu() Eric Dumazet
` (3 preceding siblings ...)
2025-02-04 13:23 ` [PATCH v3 net 04/16] ipv4: use RCU protection in ipv4_default_advmss() Eric Dumazet
@ 2025-02-04 13:23 ` Eric Dumazet
2025-02-04 13:23 ` [PATCH v3 net 06/16] tcp: convert to dev_net_rcu() Eric Dumazet
` (10 subsequent siblings)
15 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2025-02-04 13:23 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, Kuniyuki Iwashima, Simon Horman, eric.dumazet,
Eric Dumazet
rt_is_expired() must use RCU protection to make
sure the net structure it reads does not disappear.
Fixes: e84f84f27647 ("netns: place rt_genid into struct net")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/ipv4/route.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 74c074f45758be5ae78a87edb31837481cc40278..e959327c0ba8979ce5c7ca8c46ae41068824edc6 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -390,7 +390,13 @@ static inline int ip_rt_proc_init(void)
static inline bool rt_is_expired(const struct rtable *rth)
{
- return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
+ bool res;
+
+ rcu_read_lock();
+ res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev));
+ rcu_read_unlock();
+
+ return res;
}
void rt_cache_flush(struct net *net)
--
2.48.1.362.g079036d154-goog
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH v3 net 06/16] tcp: convert to dev_net_rcu()
2025-02-04 13:23 [PATCH v3 net 00/16] net: first round to use dev_net_rcu() Eric Dumazet
` (4 preceding siblings ...)
2025-02-04 13:23 ` [PATCH v3 net 05/16] ipv4: use RCU protection in rt_is_expired() Eric Dumazet
@ 2025-02-04 13:23 ` Eric Dumazet
2025-02-04 13:23 ` [PATCH v3 net 07/16] net: gro: convert four dev_net() calls Eric Dumazet
` (9 subsequent siblings)
15 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2025-02-04 13:23 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, Kuniyuki Iwashima, Simon Horman, eric.dumazet,
Eric Dumazet
TCP uses of dev_net() are safe; change them to dev_net_rcu()
to get LOCKDEP support.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
include/net/inet6_hashtables.h | 2 +-
include/net/inet_hashtables.h | 2 +-
net/ipv4/tcp_ipv4.c | 8 ++++----
net/ipv4/tcp_metrics.c | 6 +++---
net/ipv6/tcp_ipv6.c | 10 +++++-----
5 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 74dd90ff5f129fe4c8adad67a642ae5070410518..c32878c69179dac5a7fcfa098a297420d9adfab2 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -150,7 +150,7 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
int iif, int sdif,
bool *refcounted)
{
- struct net *net = dev_net(skb_dst(skb)->dev);
+ struct net *net = dev_net_rcu(skb_dst(skb)->dev);
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
struct sock *sk;
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 5eea47f135a421ce8275d4cd83c5771b3f448e5c..da818fb0205fed6b4120946bc032e67e046b716f 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -492,7 +492,7 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
const int sdif,
bool *refcounted)
{
- struct net *net = dev_net(skb_dst(skb)->dev);
+ struct net *net = dev_net_rcu(skb_dst(skb)->dev);
const struct iphdr *iph = ip_hdr(skb);
struct sock *sk;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index cc2b5194a18d2e64595f474f62c6f2fd3eff319f..3bd835220d43d6d6491fd5c8d5e9954c37303f83 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -503,7 +503,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
struct request_sock *fastopen;
u32 seq, snd_una;
int err;
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
iph->daddr, th->dest, iph->saddr,
@@ -788,7 +788,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
arg.iov[0].iov_base = (unsigned char *)&rep;
arg.iov[0].iov_len = sizeof(rep.th);
- net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
+ net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev);
/* Invalid TCP option size or twice included auth */
if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
@@ -1967,7 +1967,7 @@ EXPORT_SYMBOL(tcp_v4_do_rcv);
int tcp_v4_early_demux(struct sk_buff *skb)
{
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
const struct iphdr *iph;
const struct tcphdr *th;
struct sock *sk;
@@ -2178,7 +2178,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
int tcp_v4_rcv(struct sk_buff *skb)
{
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
enum skb_drop_reason drop_reason;
int sdif = inet_sdif(skb);
int dif = inet_iif(skb);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 95669935494ef8003a1877e2b86c76bd27307afd..4251670e328c83b55eff7bbda3cc3d97d78563a8 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -170,7 +170,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
bool reclaim = false;
spin_lock_bh(&tcp_metrics_lock);
- net = dev_net(dst->dev);
+ net = dev_net_rcu(dst->dev);
/* While waiting for the spin-lock the cache might have been populated
* with this entry and so we have to check again.
@@ -273,7 +273,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
return NULL;
}
- net = dev_net(dst->dev);
+ net = dev_net_rcu(dst->dev);
hash ^= net_hash_mix(net);
hash = hash_32(hash, tcp_metrics_hash_log);
@@ -318,7 +318,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
else
return NULL;
- net = dev_net(dst->dev);
+ net = dev_net_rcu(dst->dev);
hash ^= net_hash_mix(net);
hash = hash_32(hash, tcp_metrics_hash_log);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2debdf085a3b4d2452b2b316cb5368507b17efc8..429f8a5ab511b671aa405ae20f7c1b3163839779 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -376,7 +376,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
{
const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
struct request_sock *fastopen;
struct ipv6_pinfo *np;
struct tcp_sock *tp;
@@ -868,7 +868,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
struct tcphdr *t1;
struct sk_buff *buff;
struct flowi6 fl6;
- struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
+ struct net *net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev);
struct sock *ctl_sk = net->ipv6.tcp_sk;
unsigned int tot_len = sizeof(struct tcphdr);
__be32 mrst = 0, *topt;
@@ -1039,7 +1039,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
if (!sk && !ipv6_unicast_destination(skb))
return;
- net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
+ net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev);
/* Invalid TCP option size or twice included auth */
if (tcp_parse_auth_options(th, &md5_hash_location, &aoh))
return;
@@ -1744,6 +1744,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
{
+ struct net *net = dev_net_rcu(skb->dev);
enum skb_drop_reason drop_reason;
int sdif = inet6_sdif(skb);
int dif = inet6_iif(skb);
@@ -1753,7 +1754,6 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
bool refcounted;
int ret;
u32 isn;
- struct net *net = dev_net(skb->dev);
drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (skb->pkt_type != PACKET_HOST)
@@ -2004,7 +2004,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
void tcp_v6_early_demux(struct sk_buff *skb)
{
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
const struct ipv6hdr *hdr;
const struct tcphdr *th;
struct sock *sk;
--
2.48.1.362.g079036d154-goog
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH v3 net 07/16] net: gro: convert four dev_net() calls
2025-02-04 13:23 [PATCH v3 net 00/16] net: first round to use dev_net_rcu() Eric Dumazet
` (5 preceding siblings ...)
2025-02-04 13:23 ` [PATCH v3 net 06/16] tcp: convert to dev_net_rcu() Eric Dumazet
@ 2025-02-04 13:23 ` Eric Dumazet
2025-02-04 13:23 ` [PATCH v3 net 08/16] udp: convert to dev_net_rcu() Eric Dumazet
` (8 subsequent siblings)
15 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2025-02-04 13:23 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, Kuniyuki Iwashima, Simon Horman, eric.dumazet,
Eric Dumazet
tcp4_check_fraglist_gro(), tcp6_check_fraglist_gro(),
udp4_gro_lookup_skb() and udp6_gro_lookup_skb()
assume RCU is held so that the net structure does not disappear.
Use dev_net_rcu() instead of dev_net() to get LOCKDEP support.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/ipv4/tcp_offload.c | 2 +-
net/ipv4/udp_offload.c | 2 +-
net/ipv6/tcpv6_offload.c | 2 +-
net/ipv6/udp_offload.c | 2 +-
4 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 2308665b51c5388814e5b61a262a1636d897c4a9..ecef16c58c07146cbeebade0620a5ec7251ddbc5 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -425,7 +425,7 @@ static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
inet_get_iif_sdif(skb, &iif, &sdif);
iph = skb_gro_network_header(skb);
- net = dev_net(skb->dev);
+ net = dev_net_rcu(skb->dev);
sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
iph->saddr, th->source,
iph->daddr, ntohs(th->dest),
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index a5be6e4ed326fbdc6a9b3889db4da903f7f25d37..c1a85b300ee87758ee683a834248a600a3e7f18d 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -630,7 +630,7 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
__be16 dport)
{
const struct iphdr *iph = skb_gro_network_header(skb);
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
int iif, sdif;
inet_get_iif_sdif(skb, &iif, &sdif);
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index a45bf17cb2a172d4612cb42f51481b97bbf364cd..91b88daa5b555cb1af591db7680b7d829ce7b1b7 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -35,7 +35,7 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
inet6_get_iif_sdif(skb, &iif, &sdif);
hdr = skb_gro_network_header(skb);
- net = dev_net(skb->dev);
+ net = dev_net_rcu(skb->dev);
sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
&hdr->saddr, th->source,
&hdr->daddr, ntohs(th->dest),
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index b41152dd424697a9fc3cef13fbb430de49dcb913..404212dfc99abba4d48fc27a574b48ab53731d39 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -117,7 +117,7 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
__be16 dport)
{
const struct ipv6hdr *iph = skb_gro_network_header(skb);
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
int iif, sdif;
inet6_get_iif_sdif(skb, &iif, &sdif);
--
2.48.1.362.g079036d154-goog
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH v3 net 08/16] udp: convert to dev_net_rcu()
2025-02-04 13:23 [PATCH v3 net 00/16] net: first round to use dev_net_rcu() Eric Dumazet
` (6 preceding siblings ...)
2025-02-04 13:23 ` [PATCH v3 net 07/16] net: gro: convert four dev_net() calls Eric Dumazet
@ 2025-02-04 13:23 ` Eric Dumazet
2025-02-04 13:23 ` [PATCH v3 net 09/16] ipv4: icmp: " Eric Dumazet
` (7 subsequent siblings)
15 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2025-02-04 13:23 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, Kuniyuki Iwashima, Simon Horman, eric.dumazet,
Eric Dumazet
UDP uses of dev_net() are safe; change them to dev_net_rcu()
to get LOCKDEP support.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/ipv4/udp.c | 19 ++++++++++---------
net/ipv6/udp.c | 18 +++++++++---------
2 files changed, 19 insertions(+), 18 deletions(-)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index a9bb9ce5438eaa9f9ceede1e4ac080dc6ab74588..fc1e37eb49190cb7e2671ebd54ac4fca54b77ac2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -750,7 +750,7 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
{
const struct iphdr *iph = ip_hdr(skb);
- return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
+ return __udp4_lib_lookup(dev_net_rcu(skb->dev), iph->saddr, sport,
iph->daddr, dport, inet_iif(skb),
inet_sdif(skb), udptable, skb);
}
@@ -760,7 +760,7 @@ struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
{
const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
const struct iphdr *iph = (struct iphdr *)(skb->data + offset);
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
int iif, sdif;
inet_get_iif_sdif(skb, &iif, &sdif);
@@ -934,13 +934,13 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
struct inet_sock *inet;
const struct iphdr *iph = (const struct iphdr *)skb->data;
struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
+ struct net *net = dev_net_rcu(skb->dev);
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
bool tunnel = false;
struct sock *sk;
int harderr;
int err;
- struct net *net = dev_net(skb->dev);
sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
iph->saddr, uh->source, skb->dev->ifindex,
@@ -1025,7 +1025,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
int udp_err(struct sk_buff *skb, u32 info)
{
- return __udp4_lib_err(skb, info, dev_net(skb->dev)->ipv4.udp_table);
+ return __udp4_lib_err(skb, info, dev_net_rcu(skb->dev)->ipv4.udp_table);
}
/*
@@ -2466,7 +2466,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
udp_post_segment_fix_csum(skb);
ret = udp_queue_rcv_one_skb(sk, skb);
if (ret > 0)
- ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret);
+ ip_protocol_deliver_rcu(dev_net_rcu(skb->dev), skb, ret);
}
return 0;
}
@@ -2632,12 +2632,12 @@ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
int proto)
{
+ struct net *net = dev_net_rcu(skb->dev);
+ struct rtable *rt = skb_rtable(skb);
struct sock *sk = NULL;
struct udphdr *uh;
unsigned short ulen;
- struct rtable *rt = skb_rtable(skb);
__be32 saddr, daddr;
- struct net *net = dev_net(skb->dev);
bool refcounted;
int drop_reason;
@@ -2804,7 +2804,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
int udp_v4_early_demux(struct sk_buff *skb)
{
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
struct in_device *in_dev = NULL;
const struct iphdr *iph;
const struct udphdr *uh;
@@ -2873,7 +2873,8 @@ int udp_v4_early_demux(struct sk_buff *skb)
int udp_rcv(struct sk_buff *skb)
{
- return __udp4_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP);
+ return __udp4_lib_rcv(skb, dev_net_rcu(skb->dev)->ipv4.udp_table,
+ IPPROTO_UDP);
}
void udp_destroy_sock(struct sock *sk)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index c6ea438b5c7588edd2971997f21382c26446a45c..d0b8f724e4362ec35352dae547e916c912716cab 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -410,7 +410,7 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
- return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
+ return __udp6_lib_lookup(dev_net_rcu(skb->dev), &iph->saddr, sport,
&iph->daddr, dport, inet6_iif(skb),
inet6_sdif(skb), udptable, skb);
}
@@ -420,7 +420,7 @@ struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
{
const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset);
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
int iif, sdif;
inet6_get_iif_sdif(skb, &iif, &sdif);
@@ -702,16 +702,16 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info,
struct udp_table *udptable)
{
- struct ipv6_pinfo *np;
const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
const struct in6_addr *saddr = &hdr->saddr;
const struct in6_addr *daddr = seg6_get_daddr(skb, opt) ? : &hdr->daddr;
struct udphdr *uh = (struct udphdr *)(skb->data+offset);
+ struct net *net = dev_net_rcu(skb->dev);
+ struct ipv6_pinfo *np;
bool tunnel = false;
struct sock *sk;
int harderr;
int err;
- struct net *net = dev_net(skb->dev);
sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
inet6_iif(skb), inet6_sdif(skb), udptable, NULL);
@@ -818,7 +818,7 @@ static __inline__ int udpv6_err(struct sk_buff *skb,
u8 code, int offset, __be32 info)
{
return __udp6_lib_err(skb, opt, type, code, offset, info,
- dev_net(skb->dev)->ipv4.udp_table);
+ dev_net_rcu(skb->dev)->ipv4.udp_table);
}
static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
@@ -929,7 +929,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
udp_post_segment_fix_csum(skb);
ret = udpv6_queue_rcv_one_skb(sk, skb);
if (ret > 0)
- ip6_protocol_deliver_rcu(dev_net(skb->dev), skb, ret,
+ ip6_protocol_deliver_rcu(dev_net_rcu(skb->dev), skb, ret,
true);
}
return 0;
@@ -1071,8 +1071,8 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
int proto)
{
enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ struct net *net = dev_net_rcu(skb->dev);
const struct in6_addr *saddr, *daddr;
- struct net *net = dev_net(skb->dev);
struct sock *sk = NULL;
struct udphdr *uh;
bool refcounted;
@@ -1220,7 +1220,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
void udp_v6_early_demux(struct sk_buff *skb)
{
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
const struct udphdr *uh;
struct sock *sk;
struct dst_entry *dst;
@@ -1262,7 +1262,7 @@ void udp_v6_early_demux(struct sk_buff *skb)
INDIRECT_CALLABLE_SCOPE int udpv6_rcv(struct sk_buff *skb)
{
- return __udp6_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP);
+ return __udp6_lib_rcv(skb, dev_net_rcu(skb->dev)->ipv4.udp_table, IPPROTO_UDP);
}
/*
--
2.48.1.362.g079036d154-goog
^ permalink raw reply related [flat|nested] 26+ messages in thread* [PATCH v3 net 09/16] ipv4: icmp: convert to dev_net_rcu()
2025-02-04 13:23 [PATCH v3 net 00/16] net: first round to use dev_net_rcu() Eric Dumazet
` (7 preceding siblings ...)
2025-02-04 13:23 ` [PATCH v3 net 08/16] udp: convert to dev_net_rcu() Eric Dumazet
@ 2025-02-04 13:23 ` Eric Dumazet
2025-02-04 13:23 ` [PATCH v3 net 10/16] ipv6: " Eric Dumazet
` (6 subsequent siblings)
15 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2025-02-04 13:23 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, Kuniyuki Iwashima, Simon Horman, eric.dumazet,
Eric Dumazet
__icmp_send() must ensure rcu_read_lock() is held, as spotted
by Jakub.
Other ICMP uses of dev_net() seem safe; change them to dev_net_rcu()
to get LOCKDEP support.
Fixes: dde1bc0e6f86 ("[NETNS]: Add namespace for ICMP replying code.")
Closes: https://lore.kernel.org/netdev/20250203153633.46ce0337@kernel.org/
Reported-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv4/icmp.c | 31 +++++++++++++++++--------------
1 file changed, 17 insertions(+), 14 deletions(-)
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 094084b61bff8a17c4e85c99019b84e9cba21599..5482edb5aade2bc25a39d75ab16feba476bb08ac 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -399,10 +399,10 @@ static void icmp_push_reply(struct sock *sk,
static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
{
- struct ipcm_cookie ipc;
struct rtable *rt = skb_rtable(skb);
- struct net *net = dev_net(rt->dst.dev);
+ struct net *net = dev_net_rcu(rt->dst.dev);
bool apply_ratelimit = false;
+ struct ipcm_cookie ipc;
struct flowi4 fl4;
struct sock *sk;
struct inet_sock *inet;
@@ -608,12 +608,14 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
struct sock *sk;
if (!rt)
- goto out;
+ return;
+
+ rcu_read_lock();
if (rt->dst.dev)
- net = dev_net(rt->dst.dev);
+ net = dev_net_rcu(rt->dst.dev);
else if (skb_in->dev)
- net = dev_net(skb_in->dev);
+ net = dev_net_rcu(skb_in->dev);
else
goto out;
@@ -785,7 +787,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
icmp_xmit_unlock(sk);
out_bh_enable:
local_bh_enable();
-out:;
+out:
+ rcu_read_unlock();
}
EXPORT_SYMBOL(__icmp_send);
@@ -834,7 +837,7 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
* avoid additional coding at protocol handlers.
*/
if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) {
- __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS);
return;
}
@@ -868,7 +871,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb)
struct net *net;
u32 info = 0;
- net = dev_net(skb_dst(skb)->dev);
+ net = dev_net_rcu(skb_dst(skb)->dev);
/*
* Incomplete header ?
@@ -979,7 +982,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb)
static enum skb_drop_reason icmp_redirect(struct sk_buff *skb)
{
if (skb->len < sizeof(struct iphdr)) {
- __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS);
return SKB_DROP_REASON_PKT_TOO_SMALL;
}
@@ -1011,7 +1014,7 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb)
struct icmp_bxm icmp_param;
struct net *net;
- net = dev_net(skb_dst(skb)->dev);
+ net = dev_net_rcu(skb_dst(skb)->dev);
/* should there be an ICMP stat for ignored echos? */
if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all))
return SKB_NOT_DROPPED_YET;
@@ -1040,9 +1043,9 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb)
bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr)
{
+ struct net *net = dev_net_rcu(skb->dev);
struct icmp_ext_hdr *ext_hdr, _ext_hdr;
struct icmp_ext_echo_iio *iio, _iio;
- struct net *net = dev_net(skb->dev);
struct inet6_dev *in6_dev;
struct in_device *in_dev;
struct net_device *dev;
@@ -1181,7 +1184,7 @@ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb)
return SKB_NOT_DROPPED_YET;
out_err:
- __ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(dev_net_rcu(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
return SKB_DROP_REASON_PKT_TOO_SMALL;
}
@@ -1198,7 +1201,7 @@ int icmp_rcv(struct sk_buff *skb)
{
enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct rtable *rt = skb_rtable(skb);
- struct net *net = dev_net(rt->dst.dev);
+ struct net *net = dev_net_rcu(rt->dst.dev);
struct icmphdr *icmph;
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
@@ -1371,9 +1374,9 @@ int icmp_err(struct sk_buff *skb, u32 info)
struct iphdr *iph = (struct iphdr *)skb->data;
int offset = iph->ihl<<2;
struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset);
+ struct net *net = dev_net_rcu(skb->dev);
int type = icmp_hdr(skb)->type;
int code = icmp_hdr(skb)->code;
- struct net *net = dev_net(skb->dev);
/*
* Use ping_err to handle all icmp errors except those
--
2.48.1.362.g079036d154-goog
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [PATCH v3 net 10/16] ipv6: icmp: convert to dev_net_rcu()
2025-02-04 13:23 [PATCH v3 net 00/16] net: first round to use dev_net_rcu() Eric Dumazet
` (8 preceding siblings ...)
2025-02-04 13:23 ` [PATCH v3 net 09/16] ipv4: icmp: " Eric Dumazet
@ 2025-02-04 13:23 ` Eric Dumazet
2025-02-04 13:23 ` [PATCH v3 net 11/16] ipv6: input: " Eric Dumazet
` (5 subsequent siblings)
15 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2025-02-04 13:23 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, Kuniyuki Iwashima, Simon Horman, eric.dumazet,
Eric Dumazet
icmp6_send() must acquire rcu_read_lock() sooner to ensure
that its dev_net() call is made from a safe context.
Other ICMPv6 uses of dev_net() seem safe, change them to
dev_net_rcu() to get LOCKDEP support to catch bugs.
Fixes: 9a43b709a230 ("[NETNS][IPV6] icmp6 - make icmpv6_socket per namespace")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv6/icmp.c | 42 +++++++++++++++++++++++-------------------
1 file changed, 23 insertions(+), 19 deletions(-)
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index a6984a29fdb9dd972a11ca9f8d5e794c443bac6f..4d14ab7f7e99f152cd5f5adaa023f0280957f275 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -76,7 +76,7 @@ static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
{
/* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */
struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset);
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
if (type == ICMPV6_PKT_TOOBIG)
ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL));
@@ -473,7 +473,10 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
if (!skb->dev)
return;
- net = dev_net(skb->dev);
+
+ rcu_read_lock();
+
+ net = dev_net_rcu(skb->dev);
mark = IP6_REPLY_MARK(net, skb->mark);
/*
* Make sure we respect the rules
@@ -496,7 +499,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
!(type == ICMPV6_PARAMPROB &&
code == ICMPV6_UNK_OPTION &&
(opt_unrec(skb, info))))
- return;
+ goto out;
saddr = NULL;
}
@@ -526,7 +529,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) {
net_dbg_ratelimited("icmp6_send: addr_any/mcast source [%pI6c > %pI6c]\n",
&hdr->saddr, &hdr->daddr);
- return;
+ goto out;
}
/*
@@ -535,7 +538,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
if (is_ineligible(skb)) {
net_dbg_ratelimited("icmp6_send: no reply to icmp error [%pI6c > %pI6c]\n",
&hdr->saddr, &hdr->daddr);
- return;
+ goto out;
}
/* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */
@@ -582,7 +585,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
np = inet6_sk(sk);
if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit))
- goto out;
+ goto out_unlock;
tmp_hdr.icmp6_type = type;
tmp_hdr.icmp6_code = code;
@@ -600,7 +603,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
dst = icmpv6_route_lookup(net, skb, sk, &fl6);
if (IS_ERR(dst))
- goto out;
+ goto out_unlock;
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
@@ -616,7 +619,6 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
goto out_dst_release;
}
- rcu_read_lock();
idev = __in6_dev_get(skb->dev);
if (ip6_append_data(sk, icmpv6_getfrag, &msg,
@@ -630,13 +632,15 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
len + sizeof(struct icmp6hdr));
}
- rcu_read_unlock();
+
out_dst_release:
dst_release(dst);
-out:
+out_unlock:
icmpv6_xmit_unlock(sk);
out_bh_enable:
local_bh_enable();
+out:
+ rcu_read_unlock();
}
EXPORT_SYMBOL(icmp6_send);
@@ -679,8 +683,8 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
skb_pull(skb2, nhs);
skb_reset_network_header(skb2);
- rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0,
- skb, 0);
+ rt = rt6_lookup(dev_net_rcu(skb->dev), &ipv6_hdr(skb2)->saddr,
+ NULL, 0, skb, 0);
if (rt && rt->dst.dev)
skb2->dev = rt->dst.dev;
@@ -717,7 +721,7 @@ EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach);
static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb)
{
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
struct sock *sk;
struct inet6_dev *idev;
struct ipv6_pinfo *np;
@@ -832,7 +836,7 @@ enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type,
u8 code, __be32 info)
{
struct inet6_skb_parm *opt = IP6CB(skb);
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
const struct inet6_protocol *ipprot;
enum skb_drop_reason reason;
int inner_offset;
@@ -889,7 +893,7 @@ enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type,
static int icmpv6_rcv(struct sk_buff *skb)
{
enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
struct net_device *dev = icmp6_dev(skb);
struct inet6_dev *idev = __in6_dev_get(dev);
const struct in6_addr *saddr, *daddr;
@@ -921,7 +925,7 @@ static int icmpv6_rcv(struct sk_buff *skb)
skb_set_network_header(skb, nh);
}
- __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INMSGS);
+ __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INMSGS);
saddr = &ipv6_hdr(skb)->saddr;
daddr = &ipv6_hdr(skb)->daddr;
@@ -939,7 +943,7 @@ static int icmpv6_rcv(struct sk_buff *skb)
type = hdr->icmp6_type;
- ICMP6MSGIN_INC_STATS(dev_net(dev), idev, type);
+ ICMP6MSGIN_INC_STATS(dev_net_rcu(dev), idev, type);
switch (type) {
case ICMPV6_ECHO_REQUEST:
@@ -1034,9 +1038,9 @@ static int icmpv6_rcv(struct sk_buff *skb)
csum_error:
reason = SKB_DROP_REASON_ICMP_CSUM;
- __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS);
+ __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_CSUMERRORS);
discard_it:
- __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INERRORS);
+ __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INERRORS);
drop_no_count:
kfree_skb_reason(skb, reason);
return 0;
--
2.48.1.362.g079036d154-goog
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [PATCH v3 net 11/16] ipv6: input: convert to dev_net_rcu()
2025-02-04 13:23 [PATCH v3 net 00/16] net: first round to use dev_net_rcu() Eric Dumazet
` (9 preceding siblings ...)
2025-02-04 13:23 ` [PATCH v3 net 10/16] ipv6: " Eric Dumazet
@ 2025-02-04 13:23 ` Eric Dumazet
2025-02-04 20:09 ` Jakub Kicinski
2025-02-04 13:23 ` [PATCH v3 net 12/16] ipv6: output: " Eric Dumazet
` (4 subsequent siblings)
15 siblings, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2025-02-04 13:23 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, Kuniyuki Iwashima, Simon Horman, eric.dumazet,
Eric Dumazet
dev_net() calls from net/ipv6/ip6_input.c seem to
happen under RCU protection.
Convert them to dev_net_rcu() to ensure LOCKDEP support.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/ipv6/ip6_input.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 70c0e16c0ae6837d1c64d0036829c8b61799578b..4030527ebe098e86764f37c9068d2f2f9af2d183 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -301,7 +301,7 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
skb = ip6_rcv_core(skb, dev, net);
if (skb == NULL)
@@ -330,7 +330,7 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
list_for_each_entry_safe(skb, next, head, list) {
struct net_device *dev = skb->dev;
- struct net *net = dev_net(dev);
+ struct net *net = dev_net_rcu(dev);
skb_list_del_init(skb);
skb = ip6_rcv_core(skb, dev, net);
@@ -488,7 +488,7 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk
int ip6_input(struct sk_buff *skb)
{
return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
- dev_net(skb->dev), NULL, skb, skb->dev, NULL,
+ dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL,
ip6_input_finish);
}
EXPORT_SYMBOL_GPL(ip6_input);
@@ -500,14 +500,14 @@ int ip6_mc_input(struct sk_buff *skb)
struct net_device *dev;
bool deliver;
- __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev),
+ __IP6_UPD_PO_STATS(dev_net_rcu(skb_dst(skb)->dev),
__in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST,
skb->len);
/* skb->dev passed may be master dev for vrfs. */
if (sdif) {
rcu_read_lock();
- dev = dev_get_by_index_rcu(dev_net(skb->dev), sdif);
+ dev = dev_get_by_index_rcu(dev_net_rcu(skb->dev), sdif);
if (!dev) {
rcu_read_unlock();
kfree_skb(skb);
@@ -526,7 +526,7 @@ int ip6_mc_input(struct sk_buff *skb)
/*
* IPv6 multicast router mode is now supported ;)
*/
- if (atomic_read(&dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding) &&
+ if (atomic_read(&dev_net_rcu(skb->dev)->ipv6.devconf_all->mc_forwarding) &&
!(ipv6_addr_type(&hdr->daddr) &
(IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) &&
likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) {
--
2.48.1.362.g079036d154-goog
^ permalink raw reply related [flat|nested] 26+ messages in thread
* Re: [PATCH v3 net 11/16] ipv6: input: convert to dev_net_rcu()
2025-02-04 13:23 ` [PATCH v3 net 11/16] ipv6: input: " Eric Dumazet
@ 2025-02-04 20:09 ` Jakub Kicinski
2025-02-04 20:10 ` Eric Dumazet
0 siblings, 1 reply; 26+ messages in thread
From: Jakub Kicinski @ 2025-02-04 20:09 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Paolo Abeni, netdev, Kuniyuki Iwashima,
Simon Horman, eric.dumazet
On Tue, 4 Feb 2025 13:23:52 +0000 Eric Dumazet wrote:
> @@ -488,7 +488,7 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk
> int ip6_input(struct sk_buff *skb)
> {
> return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
> - dev_net(skb->dev), NULL, skb, skb->dev, NULL,
> + dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL,
> ip6_input_finish);
> }
> EXPORT_SYMBOL_GPL(ip6_input);
One more here:
[ 4326.034939][ T50] =============================
[ 4326.035125][ T50] WARNING: suspicious RCU usage
[ 4326.035299][ T50] 6.13.0-virtme #1 Not tainted
[ 4326.035955][ T50] -----------------------------
[ 4326.036124][ T50] ./include/net/net_namespace.h:404 suspicious rcu_dereference_check() usage!
[ 4326.036398][ T50]
[ 4326.036398][ T50] other info that might help us debug this:
[ 4326.036398][ T50]
[ 4326.036684][ T50]
[ 4326.036684][ T50] rcu_scheduler_active = 2, debug_locks = 1
[ 4326.036910][ T50] 2 locks held by kworker/2:1/50:
[ 4326.037111][ T50] #0: ffff8880010a9548 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x7ec/0x16d0
[ 4326.037439][ T50] #1: ffffc9000036fd40 ((work_completion)(&trans->work)){+.+.}-{0:0}, at: process_one_work+0xe0b/0x16d0
[ 4326.037741][ T50]
[ 4326.037741][ T50] stack backtrace:
[ 4326.037930][ T50] CPU: 2 UID: 0 PID: 50 Comm: kworker/2:1 Not tainted 6.13.0-virtme #1
[ 4326.037935][ T50] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[ 4326.037937][ T50] Workqueue: events xfrm_trans_reinject
[ 4326.037947][ T50] Call Trace:
[ 4326.037949][ T50] <TASK>
[ 4326.037952][ T50] dump_stack_lvl+0xb0/0xd0
[ 4326.037963][ T50] lockdep_rcu_suspicious+0x1ea/0x280
[ 4326.037975][ T50] ip6_input+0x262/0x3e0
[ 4326.038009][ T50] xfrm_trans_reinject+0x2a2/0x460
[ 4326.038055][ T50] process_one_work+0xe55/0x16d0
[ 4326.038098][ T50] worker_thread+0x58c/0xce0
[ 4326.038121][ T50] kthread+0x359/0x5d0
[ 4326.038141][ T50] ret_from_fork+0x31/0x70
[ 4326.038150][ T50] ret_from_fork_asm+0x1a/0x30
Test output:
https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/61-l2tp-sh/
Decoded:
https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/vm-crash-thr2-0
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v3 net 11/16] ipv6: input: convert to dev_net_rcu()
2025-02-04 20:09 ` Jakub Kicinski
@ 2025-02-04 20:10 ` Eric Dumazet
2025-02-04 21:00 ` Jakub Kicinski
0 siblings, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2025-02-04 20:10 UTC (permalink / raw)
To: Jakub Kicinski
Cc: David S . Miller, Paolo Abeni, netdev, Kuniyuki Iwashima,
Simon Horman, eric.dumazet
On Tue, Feb 4, 2025 at 9:09 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Tue, 4 Feb 2025 13:23:52 +0000 Eric Dumazet wrote:
> > @@ -488,7 +488,7 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk
> > int ip6_input(struct sk_buff *skb)
> > {
> > return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
> > - dev_net(skb->dev), NULL, skb, skb->dev, NULL,
> > + dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL,
> > ip6_input_finish);
> > }
> > EXPORT_SYMBOL_GPL(ip6_input);
>
> One more here:
>
> [ 4326.034939][ T50] =============================
> [ 4326.035125][ T50] WARNING: suspicious RCU usage
> [ 4326.035299][ T50] 6.13.0-virtme #1 Not tainted
> [ 4326.035955][ T50] -----------------------------
> [ 4326.036124][ T50] ./include/net/net_namespace.h:404 suspicious rcu_dereference_check() usage!
> [ 4326.036398][ T50]
> [ 4326.036398][ T50] other info that might help us debug this:
> [ 4326.036398][ T50]
> [ 4326.036684][ T50]
> [ 4326.036684][ T50] rcu_scheduler_active = 2, debug_locks = 1
> [ 4326.036910][ T50] 2 locks held by kworker/2:1/50:
> [ 4326.037111][ T50] #0: ffff8880010a9548 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x7ec/0x16d0
> [ 4326.037439][ T50] #1: ffffc9000036fd40 ((work_completion)(&trans->work)){+.+.}-{0:0}, at: process_one_work+0xe0b/0x16d0
> [ 4326.037741][ T50]
> [ 4326.037741][ T50] stack backtrace:
> [ 4326.037930][ T50] CPU: 2 UID: 0 PID: 50 Comm: kworker/2:1 Not tainted 6.13.0-virtme #1
> [ 4326.037935][ T50] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> [ 4326.037937][ T50] Workqueue: events xfrm_trans_reinject
> [ 4326.037947][ T50] Call Trace:
> [ 4326.037949][ T50] <TASK>
> [ 4326.037952][ T50] dump_stack_lvl+0xb0/0xd0
> [ 4326.037963][ T50] lockdep_rcu_suspicious+0x1ea/0x280
> [ 4326.037975][ T50] ip6_input+0x262/0x3e0
> [ 4326.038009][ T50] xfrm_trans_reinject+0x2a2/0x460
> [ 4326.038055][ T50] process_one_work+0xe55/0x16d0
> [ 4326.038098][ T50] worker_thread+0x58c/0xce0
> [ 4326.038121][ T50] kthread+0x359/0x5d0
> [ 4326.038141][ T50] ret_from_fork+0x31/0x70
> [ 4326.038150][ T50] ret_from_fork_asm+0x1a/0x30
>
> Test output:
> https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/61-l2tp-sh/
> Decoded:
> https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/vm-crash-thr2-0
Oh well. So many bugs.
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v3 net 11/16] ipv6: input: convert to dev_net_rcu()
2025-02-04 20:10 ` Eric Dumazet
@ 2025-02-04 21:00 ` Jakub Kicinski
2025-02-04 21:06 ` Eric Dumazet
0 siblings, 1 reply; 26+ messages in thread
From: Jakub Kicinski @ 2025-02-04 21:00 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Paolo Abeni, netdev, Kuniyuki Iwashima,
Simon Horman, eric.dumazet, rcu
On Tue, 4 Feb 2025 21:10:59 +0100 Eric Dumazet wrote:
> > Test output:
> > https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/61-l2tp-sh/
> > Decoded:
> > https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/vm-crash-thr2-0
>
> Oh well. So many bugs.
TBH I'm slightly confused by this, and the previous warnings.
The previous one was from a timer callback.
This one is with BH disabled.
I thought BH implies RCU protection. We certainly depend on that
in NAPI for XDP. And threaded NAPI does the exact same thing as
xfrm_trans_reinject(), a bare local_bh_disable().
RCU folks, did something change or is just holes in my brain again?
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v3 net 11/16] ipv6: input: convert to dev_net_rcu()
2025-02-04 21:00 ` Jakub Kicinski
@ 2025-02-04 21:06 ` Eric Dumazet
2025-02-04 21:17 ` Paul E. McKenney
0 siblings, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2025-02-04 21:06 UTC (permalink / raw)
To: Jakub Kicinski
Cc: David S . Miller, Paolo Abeni, netdev, Kuniyuki Iwashima,
Simon Horman, eric.dumazet, rcu
On Tue, Feb 4, 2025 at 10:00 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Tue, 4 Feb 2025 21:10:59 +0100 Eric Dumazet wrote:
> > > Test output:
> > > https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/61-l2tp-sh/
> > > Decoded:
> > > https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/vm-crash-thr2-0
> >
> > Oh well. So many bugs.
>
> TBH I'm slightly confused by this, and the previous warnings.
>
> The previous one was from a timer callback.
>
> This one is with BH disabled.
>
> I thought BH implies RCU protection. We certainly depend on that
> in NAPI for XDP. And threaded NAPI does the exact same thing as
> xfrm_trans_reinject(), a bare local_bh_disable().
>
> RCU folks, did something change or is just holes in my brain again?
Nope, BH does not imply rcu_read_lock()
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v3 net 11/16] ipv6: input: convert to dev_net_rcu()
2025-02-04 21:06 ` Eric Dumazet
@ 2025-02-04 21:17 ` Paul E. McKenney
2025-02-04 21:30 ` Jakub Kicinski
0 siblings, 1 reply; 26+ messages in thread
From: Paul E. McKenney @ 2025-02-04 21:17 UTC (permalink / raw)
To: Eric Dumazet
Cc: Jakub Kicinski, David S . Miller, Paolo Abeni, netdev,
Kuniyuki Iwashima, Simon Horman, eric.dumazet, rcu
On Tue, Feb 04, 2025 at 10:06:15PM +0100, Eric Dumazet wrote:
> On Tue, Feb 4, 2025 at 10:00 PM Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > On Tue, 4 Feb 2025 21:10:59 +0100 Eric Dumazet wrote:
> > > > Test output:
> > > > https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/61-l2tp-sh/
> > > > Decoded:
> > > > https://netdev-3.bots.linux.dev/vmksft-net-dbg/results/978202/vm-crash-thr2-0
> > >
> > > Oh well. So many bugs.
> >
> > TBH I'm slightly confused by this, and the previous warnings.
> >
> > The previous one was from a timer callback.
> >
> > This one is with BH disabled.
> >
> > I thought BH implies RCU protection. We certainly depend on that
> > in NAPI for XDP. And threaded NAPI does the exact same thing as
> > xfrm_trans_reinject(), a bare local_bh_disable().
> >
> > RCU folks, did something change or is just holes in my brain again?
>
> Nope, BH does not imply rcu_read_lock()
You are both right? ;-)
The synchronize_rcu() function will wait for all types of RCU readers,
including BH-disabled regions of code. However, lockdep can distinguish
between the various sorts of readers. So for example
lockdep_assert_in_rcu_read_lock_bh();
will complain unless you did rcu_read_lock_bh(), even if you did something
like local_bh_disable(). If you don't want to distinguish and are happy with
any type of RCU reader, you can use
lockdep_assert_in_rcu_reader();
I have been expecting that CONFIG_PREEMPT_RT=y kernels will break this
any day now, but so far so good. ;-)
Thanx, Paul
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v3 net 11/16] ipv6: input: convert to dev_net_rcu()
2025-02-04 21:17 ` Paul E. McKenney
@ 2025-02-04 21:30 ` Jakub Kicinski
2025-02-04 23:25 ` Paul E. McKenney
2025-02-05 7:57 ` Eric Dumazet
0 siblings, 2 replies; 26+ messages in thread
From: Jakub Kicinski @ 2025-02-04 21:30 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Eric Dumazet, David S . Miller, Paolo Abeni, netdev,
Kuniyuki Iwashima, Simon Horman, eric.dumazet, rcu
On Tue, 4 Feb 2025 13:17:08 -0800 Paul E. McKenney wrote:
> > > TBH I'm slightly confused by this, and the previous warnings.
> > >
> > > The previous one was from a timer callback.
> > >
> > > This one is with BH disabled.
> > >
> > > I thought BH implies RCU protection. We certainly depend on that
> > > in NAPI for XDP. And threaded NAPI does the exact same thing as
> > > xfrm_trans_reinject(), a bare local_bh_disable().
> > >
> > > RCU folks, did something change or is just holes in my brain again?
> >
> > Nope, BH does not imply rcu_read_lock()
>
> You are both right? ;-)
>
> The synchronize_rcu() function will wait for all types of RCU readers,
> including BH-disabled regions of code. However, lockdep can distinguish
> between the various sorts of readers. So for example
>
> lockdep_assert_in_rcu_read_lock_bh();
>
> will complain unless you did rcu_read_lock_bh(), even if you did something
> like disable_bh(). If you don't want to distinguish and are happy with
> any type of RCU reader, you can use
>
> lockdep_assert_in_rcu_reader();
>
> I have been expecting that CONFIG_PREEMPT_RT=y kernels will break this
> any day now, but so far so good. ;-)
Thanks Paul! So IIUC in this case we could:
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 0f5eb9db0c62..58ec1eb9ae6a 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -401,7 +401,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
static inline struct net *read_pnet_rcu(possible_net_t *pnet)
{
#ifdef CONFIG_NET_NS
- return rcu_dereference(pnet->net);
+ return rcu_dereference_check(pnet->net, rcu_read_lock_bh_held());
#else
return &init_net;
#endif
Sorry for the sideline, Eric, up to you how to proceed..
I'll try to remember the details better next time :)
^ permalink raw reply related [flat|nested] 26+ messages in thread
* Re: [PATCH v3 net 11/16] ipv6: input: convert to dev_net_rcu()
2025-02-04 21:30 ` Jakub Kicinski
@ 2025-02-04 23:25 ` Paul E. McKenney
2025-02-05 7:57 ` Eric Dumazet
1 sibling, 0 replies; 26+ messages in thread
From: Paul E. McKenney @ 2025-02-04 23:25 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Eric Dumazet, David S . Miller, Paolo Abeni, netdev,
Kuniyuki Iwashima, Simon Horman, eric.dumazet, rcu
On Tue, Feb 04, 2025 at 01:30:25PM -0800, Jakub Kicinski wrote:
> On Tue, 4 Feb 2025 13:17:08 -0800 Paul E. McKenney wrote:
> > > > TBH I'm slightly confused by this, and the previous warnings.
> > > >
> > > > The previous one was from a timer callback.
> > > >
> > > > This one is with BH disabled.
> > > >
> > > > I thought BH implies RCU protection. We certainly depend on that
> > > > in NAPI for XDP. And threaded NAPI does the exact same thing as
> > > > xfrm_trans_reinject(), a bare local_bh_disable().
> > > >
> > > > RCU folks, did something change or is just holes in my brain again?
> > >
> > > Nope, BH does not imply rcu_read_lock()
> >
> > You are both right? ;-)
> >
> > The synchronize_rcu() function will wait for all types of RCU readers,
> > including BH-disabled regions of code. However, lockdep can distinguish
> > between the various sorts of readers. So for example
> >
> > lockdep_assert_in_rcu_read_lock_bh();
> >
> > will complain unless you did rcu_read_lock_bh(), even if you did something
> > like disable_bh(). If you don't want to distinguish and are happy with
> > any type of RCU reader, you can use
> >
> > lockdep_assert_in_rcu_reader();
> >
> > I have been expecting that CONFIG_PREEMPT_RT=y kernels will break this
> > any day now, but so far so good. ;-)
>
> Thanks Paul! So IIUC in this case we could:
>
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index 0f5eb9db0c62..58ec1eb9ae6a 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -401,7 +401,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
> static inline struct net *read_pnet_rcu(possible_net_t *pnet)
> {
> #ifdef CONFIG_NET_NS
> - return rcu_dereference(pnet->net);
> + return rcu_dereference_check(pnet->net, rcu_read_lock_bh_held());
That should do it!
Thanx, Paul
> #else
> return &init_net;
> #endif
>
> Sorry for the sideline, Eric, up to you how to proceed..
> I'll try to remember the details better next time :)
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v3 net 11/16] ipv6: input: convert to dev_net_rcu()
2025-02-04 21:30 ` Jakub Kicinski
2025-02-04 23:25 ` Paul E. McKenney
@ 2025-02-05 7:57 ` Eric Dumazet
2025-02-05 8:05 ` Eric Dumazet
1 sibling, 1 reply; 26+ messages in thread
From: Eric Dumazet @ 2025-02-05 7:57 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Paul E. McKenney, David S . Miller, Paolo Abeni, netdev,
Kuniyuki Iwashima, Simon Horman, eric.dumazet, rcu
On Tue, Feb 4, 2025 at 10:30 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Tue, 4 Feb 2025 13:17:08 -0800 Paul E. McKenney wrote:
> > > > TBH I'm slightly confused by this, and the previous warnings.
> > > >
> > > > The previous one was from a timer callback.
> > > >
> > > > This one is with BH disabled.
> > > >
> > > > I thought BH implies RCU protection. We certainly depend on that
> > > > in NAPI for XDP. And threaded NAPI does the exact same thing as
> > > > xfrm_trans_reinject(), a bare local_bh_disable().
> > > >
> > > > RCU folks, did something change or is just holes in my brain again?
> > >
> > > Nope, BH does not imply rcu_read_lock()
> >
> > You are both right? ;-)
> >
> > The synchronize_rcu() function will wait for all types of RCU readers,
> > including BH-disabled regions of code. However, lockdep can distinguish
> > between the various sorts of readers. So for example
> >
> > lockdep_assert_in_rcu_read_lock_bh();
> >
> > will complain unless you did rcu_read_lock_bh(), even if you did something
> > like disable_bh(). If you don't want to distinguish and are happy with
> > any type of RCU reader, you can use
> >
> > lockdep_assert_in_rcu_reader();
> >
> > I have been expecting that CONFIG_PREEMPT_RT=y kernels will break this
> > any day now, but so far so good. ;-)
>
> Thanks Paul! So IIUC in this case we could:
>
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index 0f5eb9db0c62..58ec1eb9ae6a 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -401,7 +401,7 @@ static inline struct net *read_pnet(const possible_net_t *pnet)
> static inline struct net *read_pnet_rcu(possible_net_t *pnet)
> {
> #ifdef CONFIG_NET_NS
> - return rcu_dereference(pnet->net);
> + return rcu_dereference_check(pnet->net, rcu_read_lock_bh_held());
> #else
> return &init_net;
> #endif
>
> Sorry for the sideline, Eric, up to you how to proceed..
I will squash this diff into the next iteration, and keep rcu_dereference().
Note that nf_hook() also grabs rcu_read_lock().
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 4030527ebe098e86764f37c9068d2f2f9af2d183..ee5a69fdc67a30e55c5a073455e1d7299f168f34 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -477,9 +477,7 @@ void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr,
static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
skb_clear_delivery_time(skb);
- rcu_read_lock();
ip6_protocol_deliver_rcu(net, skb, 0, false);
- rcu_read_unlock();
return 0;
}
@@ -487,9 +485,14 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk
int ip6_input(struct sk_buff *skb)
{
- return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
- dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL,
- ip6_input_finish);
+ int res;
+
+ rcu_read_lock();
+ res = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
+ dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL,
+ ip6_input_finish);
+ rcu_read_unlock();
+ return res;
}
EXPORT_SYMBOL_GPL(ip6_input);
^ permalink raw reply [flat|nested] 26+ messages in thread
* Re: [PATCH v3 net 11/16] ipv6: input: convert to dev_net_rcu()
2025-02-05 7:57 ` Eric Dumazet
@ 2025-02-05 8:05 ` Eric Dumazet
0 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2025-02-05 8:05 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Paul E. McKenney, David S . Miller, Paolo Abeni, netdev,
Kuniyuki Iwashima, Simon Horman, eric.dumazet, rcu
On Wed, Feb 5, 2025 at 8:57 AM Eric Dumazet <edumazet@google.com> wrote:
>
>
> I will squash this diff to the following iteration, and keep rcu_dereference()
>
>
I will shrink the series in V4 to only include known bug fixes, to
lower the risk of having 10 more iterations.
^ permalink raw reply [flat|nested] 26+ messages in thread
* [PATCH v3 net 12/16] ipv6: output: convert to dev_net_rcu()
2025-02-04 13:23 [PATCH v3 net 00/16] net: first round to use dev_net_rcu() Eric Dumazet
` (10 preceding siblings ...)
2025-02-04 13:23 ` [PATCH v3 net 11/16] ipv6: input: " Eric Dumazet
@ 2025-02-04 13:23 ` Eric Dumazet
2025-02-04 13:23 ` [PATCH v3 net 13/16] ipv6: use RCU protection in ip6_default_advmss() Eric Dumazet
` (3 subsequent siblings)
15 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2025-02-04 13:23 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, Kuniyuki Iwashima, Simon Horman, eric.dumazet,
Eric Dumazet
dev_net() calls from net/ipv6/ip6_output.c
and net/ipv6/output_core.c are happening under RCU
protection.
Convert them to dev_net_rcu() to ensure LOCKDEP support.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/ipv6/ip6_output.c | 4 ++--
net/ipv6/output_core.c | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index d577bf2f3053873d27b241029592cdbb0a124ad7..4c73a4cdcb23f76d81e572d5b1bd0f6902447c0e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -393,7 +393,7 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
sk->sk_bound_dev_if == skb->dev->ifindex)) {
if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
- !net_eq(sock_net(sk), dev_net(skb->dev))) {
+ !net_eq(sock_net(sk), dev_net_rcu(skb->dev))) {
continue;
}
if (last) {
@@ -503,7 +503,7 @@ int ip6_forward(struct sk_buff *skb)
struct dst_entry *dst = skb_dst(skb);
struct ipv6hdr *hdr = ipv6_hdr(skb);
struct inet6_skb_parm *opt = IP6CB(skb);
- struct net *net = dev_net(dst->dev);
+ struct net *net = dev_net_rcu(dst->dev);
struct inet6_dev *idev;
SKB_DR(reason);
u32 mtu;
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 806d4b5dd1e60b27726facbb59bbef97d6fee7f5..94438fd4f0e833bb8f5ea4822c7312376ea79304 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -113,7 +113,7 @@ int ip6_dst_hoplimit(struct dst_entry *dst)
if (idev)
hoplimit = READ_ONCE(idev->cnf.hop_limit);
else
- hoplimit = READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit);
+ hoplimit = READ_ONCE(dev_net_rcu(dev)->ipv6.devconf_all->hop_limit);
rcu_read_unlock();
}
return hoplimit;
--
2.48.1.362.g079036d154-goog
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [PATCH v3 net 13/16] ipv6: use RCU protection in ip6_default_advmss()
2025-02-04 13:23 [PATCH v3 net 00/16] net: first round to use dev_net_rcu() Eric Dumazet
` (11 preceding siblings ...)
2025-02-04 13:23 ` [PATCH v3 net 12/16] ipv6: output: " Eric Dumazet
@ 2025-02-04 13:23 ` Eric Dumazet
2025-02-04 13:23 ` [PATCH v3 net 14/16] net: filter: convert to dev_net_rcu() Eric Dumazet
` (2 subsequent siblings)
15 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2025-02-04 13:23 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, Kuniyuki Iwashima, Simon Horman, eric.dumazet,
Eric Dumazet
ip6_default_advmss() needs rcu protection to make
sure the net structure it reads does not disappear.
Fixes: 5578689a4e3c ("[NETNS][IPV6] route6 - make route6 per namespace")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/ipv6/route.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 78362822b9070df138a0724dc76003b63026f9e2..ef2d23a1e3d532f5db37ca94ca482c5522dddffc 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3196,13 +3196,18 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
struct net_device *dev = dst->dev;
unsigned int mtu = dst_mtu(dst);
- struct net *net = dev_net(dev);
+ struct net *net;
mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
+ rcu_read_lock();
+
+ net = dev_net_rcu(dev);
if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
+ rcu_read_unlock();
+
/*
* Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
* corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
--
2.48.1.362.g079036d154-goog
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [PATCH v3 net 14/16] net: filter: convert to dev_net_rcu()
2025-02-04 13:23 [PATCH v3 net 00/16] net: first round to use dev_net_rcu() Eric Dumazet
` (12 preceding siblings ...)
2025-02-04 13:23 ` [PATCH v3 net 13/16] ipv6: use RCU protection in ip6_default_advmss() Eric Dumazet
@ 2025-02-04 13:23 ` Eric Dumazet
2025-02-04 13:23 ` [PATCH v3 net 15/16] flow_dissector: use rcu protection to fetch dev_net() Eric Dumazet
2025-02-04 13:23 ` [PATCH v3 net 16/16] ipv4: use RCU protection in inet_select_addr() Eric Dumazet
15 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2025-02-04 13:23 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, Kuniyuki Iwashima, Simon Horman, eric.dumazet,
Eric Dumazet
All calls to dev_net() from net/core/filter.c are currently
done under rcu_read_lock().
Convert them to dev_net_rcu() to ensure LOCKDEP support.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/core/filter.c | 40 ++++++++++++++++++++--------------------
1 file changed, 20 insertions(+), 20 deletions(-)
diff --git a/net/core/filter.c b/net/core/filter.c
index 2ec162dd83c463640dcf3c151327206f519b217a..4db537a982d55fa9b42aaa70820cb337d5283299 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2244,7 +2244,7 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
struct bpf_nh_params *nh)
{
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
- struct net *net = dev_net(dev);
+ struct net *net = dev_net_rcu(dev);
int err, ret = NET_XMIT_DROP;
if (!nh) {
@@ -2348,7 +2348,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
struct bpf_nh_params *nh)
{
const struct iphdr *ip4h = ip_hdr(skb);
- struct net *net = dev_net(dev);
+ struct net *net = dev_net_rcu(dev);
int err, ret = NET_XMIT_DROP;
if (!nh) {
@@ -2438,7 +2438,7 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
return -EINVAL;
- dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
+ dev = dev_get_by_index_rcu(dev_net_rcu(skb->dev), ifindex);
if (unlikely(!dev))
return -EINVAL;
@@ -2482,7 +2482,7 @@ static struct net_device *skb_get_peer_dev(struct net_device *dev)
int skb_do_redirect(struct sk_buff *skb)
{
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
struct net_device *dev;
u32 flags = ri->flags;
@@ -2497,7 +2497,7 @@ int skb_do_redirect(struct sk_buff *skb)
dev = skb_get_peer_dev(dev);
if (unlikely(!dev ||
!(dev->flags & IFF_UP) ||
- net_eq(net, dev_net(dev))))
+ net_eq(net, dev_net_rcu(dev))))
goto out_drop;
skb->dev = dev;
dev_sw_netstats_rx_add(dev, skb->len);
@@ -4425,7 +4425,7 @@ __xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev,
break;
case BPF_MAP_TYPE_UNSPEC:
if (map_id == INT_MAX) {
- fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
+ fwd = dev_get_by_index_rcu(dev_net_rcu(dev), ri->tgt_index);
if (unlikely(!fwd)) {
err = -EINVAL;
break;
@@ -4550,7 +4550,7 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
ri->map_type = BPF_MAP_TYPE_UNSPEC;
if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
- fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
+ fwd = dev_get_by_index_rcu(dev_net_rcu(dev), ri->tgt_index);
if (unlikely(!fwd)) {
err = -EINVAL;
goto err;
@@ -6203,12 +6203,12 @@ BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
case AF_INET:
- return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
+ return bpf_ipv4_fib_lookup(dev_net_rcu(ctx->rxq->dev), params,
flags, true);
#endif
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
+ return bpf_ipv6_fib_lookup(dev_net_rcu(ctx->rxq->dev), params,
flags, true);
#endif
}
@@ -6228,7 +6228,7 @@ static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
struct bpf_fib_lookup *, params, int, plen, u32, flags)
{
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
int rc = -EAFNOSUPPORT;
bool check_mtu = false;
@@ -6283,7 +6283,7 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
static struct net_device *__dev_via_ifindex(struct net_device *dev_curr,
u32 ifindex)
{
- struct net *netns = dev_net(dev_curr);
+ struct net *netns = dev_net_rcu(dev_curr);
/* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */
if (ifindex == 0)
@@ -6806,7 +6806,7 @@ bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
int ifindex;
if (skb->dev) {
- caller_net = dev_net(skb->dev);
+ caller_net = dev_net_rcu(skb->dev);
ifindex = skb->dev->ifindex;
} else {
caller_net = sock_net(skb->sk);
@@ -6906,7 +6906,7 @@ BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb,
{
struct net_device *dev = skb->dev;
int ifindex = dev->ifindex, sdif = dev_sdif(dev);
- struct net *caller_net = dev_net(dev);
+ struct net *caller_net = dev_net_rcu(dev);
return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net,
ifindex, IPPROTO_TCP, netns_id,
@@ -6930,7 +6930,7 @@ BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb,
{
struct net_device *dev = skb->dev;
int ifindex = dev->ifindex, sdif = dev_sdif(dev);
- struct net *caller_net = dev_net(dev);
+ struct net *caller_net = dev_net_rcu(dev);
return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net,
ifindex, IPPROTO_TCP, netns_id,
@@ -6954,7 +6954,7 @@ BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb,
{
struct net_device *dev = skb->dev;
int ifindex = dev->ifindex, sdif = dev_sdif(dev);
- struct net *caller_net = dev_net(dev);
+ struct net *caller_net = dev_net_rcu(dev);
return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net,
ifindex, IPPROTO_UDP, netns_id,
@@ -6992,7 +6992,7 @@ BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
{
struct net_device *dev = ctx->rxq->dev;
int ifindex = dev->ifindex, sdif = dev_sdif(dev);
- struct net *caller_net = dev_net(dev);
+ struct net *caller_net = dev_net_rcu(dev);
return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
ifindex, IPPROTO_UDP, netns_id,
@@ -7016,7 +7016,7 @@ BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx,
{
struct net_device *dev = ctx->rxq->dev;
int ifindex = dev->ifindex, sdif = dev_sdif(dev);
- struct net *caller_net = dev_net(dev);
+ struct net *caller_net = dev_net_rcu(dev);
return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net,
ifindex, IPPROTO_TCP, netns_id,
@@ -7040,7 +7040,7 @@ BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,
{
struct net_device *dev = ctx->rxq->dev;
int ifindex = dev->ifindex, sdif = dev_sdif(dev);
- struct net *caller_net = dev_net(dev);
+ struct net *caller_net = dev_net_rcu(dev);
return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
ifindex, IPPROTO_TCP, netns_id,
@@ -7510,7 +7510,7 @@ BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
return -EINVAL;
if (!skb_at_tc_ingress(skb))
return -EOPNOTSUPP;
- if (unlikely(dev_net(skb->dev) != sock_net(sk)))
+ if (unlikely(dev_net_rcu(skb->dev) != sock_net(sk)))
return -ENETUNREACH;
if (sk_unhashed(sk))
return -EOPNOTSUPP;
@@ -11985,7 +11985,7 @@ __bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk,
if (!skb_at_tc_ingress(skb))
return -EINVAL;
- net = dev_net(skb->dev);
+ net = dev_net_rcu(skb->dev);
if (net != sock_net(sk))
return -ENETUNREACH;
--
2.48.1.362.g079036d154-goog
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [PATCH v3 net 15/16] flow_dissector: use rcu protection to fetch dev_net()
2025-02-04 13:23 [PATCH v3 net 00/16] net: first round to use dev_net_rcu() Eric Dumazet
` (13 preceding siblings ...)
2025-02-04 13:23 ` [PATCH v3 net 14/16] net: filter: convert to dev_net_rcu() Eric Dumazet
@ 2025-02-04 13:23 ` Eric Dumazet
2025-02-04 13:23 ` [PATCH v3 net 16/16] ipv4: use RCU protection in inet_select_addr() Eric Dumazet
15 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2025-02-04 13:23 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, Kuniyuki Iwashima, Simon Horman, eric.dumazet,
Eric Dumazet
__skb_flow_dissect() can be called from arbitrary contexts.
It must extend its RCU protection section to include
the call to dev_net(), which can become dev_net_rcu().
This makes sure the net structure cannot disappear under us.
Fixes: 9b52e3f267a6 ("flow_dissector: handle no-skb use case")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/core/flow_dissector.c | 21 +++++++++++----------
1 file changed, 11 insertions(+), 10 deletions(-)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 0e638a37aa0961de6281deeed227b3e7ef70e546..5db41bf2ed93e0df721c216ca4557dad16aa5f83 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1108,10 +1108,12 @@ bool __skb_flow_dissect(const struct net *net,
FLOW_DISSECTOR_KEY_BASIC,
target_container);
+ rcu_read_lock();
+
if (skb) {
if (!net) {
if (skb->dev)
- net = dev_net(skb->dev);
+ net = dev_net_rcu(skb->dev);
else if (skb->sk)
net = sock_net(skb->sk);
}
@@ -1122,7 +1124,6 @@ bool __skb_flow_dissect(const struct net *net,
enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
struct bpf_prog_array *run_array;
- rcu_read_lock();
run_array = rcu_dereference(init_net.bpf.run_array[type]);
if (!run_array)
run_array = rcu_dereference(net->bpf.run_array[type]);
@@ -1150,17 +1151,17 @@ bool __skb_flow_dissect(const struct net *net,
prog = READ_ONCE(run_array->items[0].prog);
result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff,
hlen, flags);
- if (result == BPF_FLOW_DISSECTOR_CONTINUE)
- goto dissect_continue;
- __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
- target_container);
- rcu_read_unlock();
- return result == BPF_OK;
+ if (result != BPF_FLOW_DISSECTOR_CONTINUE) {
+ __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
+ target_container);
+ rcu_read_unlock();
+ return result == BPF_OK;
+ }
}
-dissect_continue:
- rcu_read_unlock();
}
+ rcu_read_unlock();
+
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
struct ethhdr *eth = eth_hdr(skb);
--
2.48.1.362.g079036d154-goog
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [PATCH v3 net 16/16] ipv4: use RCU protection in inet_select_addr()
2025-02-04 13:23 [PATCH v3 net 00/16] net: first round to use dev_net_rcu() Eric Dumazet
` (14 preceding siblings ...)
2025-02-04 13:23 ` [PATCH v3 net 15/16] flow_dissector: use rcu protection to fetch dev_net() Eric Dumazet
@ 2025-02-04 13:23 ` Eric Dumazet
15 siblings, 0 replies; 26+ messages in thread
From: Eric Dumazet @ 2025-02-04 13:23 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: netdev, Kuniyuki Iwashima, Simon Horman, eric.dumazet,
Eric Dumazet
inet_select_addr() must use RCU protection to make
sure the net structure it reads does not disappear.
Fixes: c4544c724322 ("[NETNS]: Process inet_select_addr inside a namespace.")
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/ipv4/devinet.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index c8b3cf5fba4c02941b919687a6a657cf68f5f99a..55b8151759bc9f76ebdbfae27544d6ee666a4809 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1371,10 +1371,11 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
__be32 addr = 0;
unsigned char localnet_scope = RT_SCOPE_HOST;
struct in_device *in_dev;
- struct net *net = dev_net(dev);
+ struct net *net;
int master_idx;
rcu_read_lock();
+ net = dev_net_rcu(dev);
in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
goto no_in_dev;
--
2.48.1.362.g079036d154-goog
^ permalink raw reply related [flat|nested] 26+ messages in thread