public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH net] net: add RCU protection to (struct packet_type)->dev
@ 2026-01-31 21:29 Eric Dumazet
  2026-02-02  3:16 ` YinFengwei
  0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2026-01-31 21:29 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Willem de Bruijn, netdev, eric.dumazet,
	Eric Dumazet, Yin Fengwei

Yin Fengwei reported an RCU stall in ptype_seq_show() and provided a patch.

Real issue is that (struct packet_type)->dev needs RCU protection:

ptype_seq_show() runs under rcu_read_lock(), and reads pt->dev
to get device name without any barrier.

At the same time, concurrent writer can remove a packet_type structure
(which is correctly freed after an RCU grace period) _and_ clear pt->dev
without an RCU grace period.

Fix this issue by using proper RCU on pt->dev pointer.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Yin Fengwei <fengwei_yin@linux.alibaba.com>
Closes: https://lore.kernel.org/netdev/CANn89iKRRKPnWjJmb-_3a=sq+9h6DvTQM4DBZHT5ZRGPMzQaiA@mail.gmail.com/T/#m7b80b9fc9b9267f90e0b7aad557595f686f9c50d
---
 drivers/net/ethernet/amd/xgbe/xgbe-selftest.c |  2 +-
 .../ethernet/mellanox/mlx5/core/en_selftest.c |  2 +-
 .../stmicro/stmmac/stmmac_selftests.c         | 12 ++++----
 drivers/scsi/bnx2fc/bnx2fc_fcoe.c             |  4 +--
 drivers/scsi/fcoe/fcoe.c                      |  6 ++--
 include/linux/netdevice.h                     |  2 +-
 net/batman-adv/hard-interface.c               |  2 +-
 net/core/dev.c                                | 30 +++++++++++--------
 net/core/net-procfs.c                         | 18 ++++++-----
 net/core/selftests.c                          |  2 +-
 net/ncsi/ncsi-manage.c                        |  2 +-
 net/packet/af_packet.c                        | 24 ++++++++-------
 net/tipc/bearer.c                             |  6 ++--
 13 files changed, 61 insertions(+), 51 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-selftest.c b/drivers/net/ethernet/amd/xgbe/xgbe-selftest.c
index 55e5e467facd7f546ba208361ec9fdcfd7a627d9..006d80a387431cb7d4acdd35f4f1990c8c1f3366 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-selftest.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-selftest.c
@@ -121,7 +121,7 @@ static int __xgbe_test_loopback(struct xgbe_prv_data *pdata,
 
 	tdata->pt.type = htons(ETH_P_IP);
 	tdata->pt.func = xgbe_test_loopback_validate;
-	tdata->pt.dev = pdata->netdev;
+	RCU_INIT_POINTER(tdata->pt.dev, pdata->netdev);
 	tdata->pt.af_packet_priv = tdata;
 	tdata->packet = attr;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
index fcad464bc4d58af1a7f76cee4cf2088b8889dd0b..d5be21a4c5a3a2635ef69ec60defcb2f665fe205 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
@@ -223,7 +223,7 @@ static int mlx5e_test_loopback_setup(struct mlx5e_priv *priv,
 
 	lbtp->pt.type = htons(ETH_P_IP);
 	lbtp->pt.func = mlx5e_test_loopback_validate;
-	lbtp->pt.dev = priv->netdev;
+	RCU_INIT_POINTER(lbtp->pt.dev, priv->netdev);
 	lbtp->pt.af_packet_priv = lbtp;
 	dev_add_pack(&lbtp->pt);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
index e90a2c469b9a6f576c1b6f99954af08bae69007c..218ff198625e44063e85b717b75b15b1b565ca7b 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_selftests.c
@@ -333,7 +333,7 @@ static int __stmmac_test_loopback(struct stmmac_priv *priv,
 
 	tpriv->pt.type = htons(ETH_P_IP);
 	tpriv->pt.func = stmmac_test_loopback_validate;
-	tpriv->pt.dev = priv->dev;
+	RCU_INIT_POINTER(tpriv->pt.dev, priv->dev);
 	tpriv->pt.af_packet_priv = tpriv;
 	tpriv->packet = attr;
 
@@ -752,7 +752,7 @@ static int stmmac_test_flowctrl(struct stmmac_priv *priv)
 	init_completion(&tpriv->comp);
 	tpriv->pt.type = htons(ETH_P_PAUSE);
 	tpriv->pt.func = stmmac_test_flowctrl_validate;
-	tpriv->pt.dev = priv->dev;
+	RCU_INIT_POINTER(tpriv->pt.dev, priv->dev);
 	tpriv->pt.af_packet_priv = tpriv;
 	dev_add_pack(&tpriv->pt);
 
@@ -907,7 +907,7 @@ static int __stmmac_test_vlanfilt(struct stmmac_priv *priv)
 
 	tpriv->pt.type = htons(ETH_P_IP);
 	tpriv->pt.func = stmmac_test_vlan_validate;
-	tpriv->pt.dev = priv->dev;
+	RCU_INIT_POINTER(tpriv->pt.dev, priv->dev);
 	tpriv->pt.af_packet_priv = tpriv;
 	tpriv->packet = &attr;
 
@@ -1001,7 +1001,7 @@ static int __stmmac_test_dvlanfilt(struct stmmac_priv *priv)
 
 	tpriv->pt.type = htons(ETH_P_8021Q);
 	tpriv->pt.func = stmmac_test_vlan_validate;
-	tpriv->pt.dev = priv->dev;
+	RCU_INIT_POINTER(tpriv->pt.dev, priv->dev);
 	tpriv->pt.af_packet_priv = tpriv;
 	tpriv->packet = &attr;
 
@@ -1278,7 +1278,7 @@ static int stmmac_test_vlanoff_common(struct stmmac_priv *priv, bool svlan)
 
 	tpriv->pt.type = svlan ? htons(ETH_P_8021Q) : htons(ETH_P_IP);
 	tpriv->pt.func = stmmac_test_vlan_validate;
-	tpriv->pt.dev = priv->dev;
+	RCU_INIT_POINTER(tpriv->pt.dev, priv->dev);
 	tpriv->pt.af_packet_priv = tpriv;
 	tpriv->packet = &attr;
 	tpriv->vlan_id = 0x123;
@@ -1637,7 +1637,7 @@ static int stmmac_test_arpoffload(struct stmmac_priv *priv)
 
 	tpriv->pt.type = htons(ETH_P_ARP);
 	tpriv->pt.func = stmmac_test_arp_validate;
-	tpriv->pt.dev = priv->dev;
+	RCU_INIT_POINTER(tpriv->pt.dev, priv->dev);
 	tpriv->pt.af_packet_priv = tpriv;
 	tpriv->packet = &attr;
 	dev_add_pack(&tpriv->pt);
diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
index 0f68739d380a0ae67f18aadb1f0b3c6c5f3ee6e5..22ba17b624626edf1e1631d6f1e2a3ef9898e539 100644
--- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
+++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c
@@ -1257,12 +1257,12 @@ static int bnx2fc_interface_setup(struct bnx2fc_interface *interface)
 
 	interface->fip_packet_type.func = bnx2fc_fip_recv;
 	interface->fip_packet_type.type = htons(ETH_P_FIP);
-	interface->fip_packet_type.dev = netdev;
+	RCU_INIT_POINTER(interface->fip_packet_type.dev, netdev);
 	dev_add_pack(&interface->fip_packet_type);
 
 	interface->fcoe_packet_type.func = bnx2fc_rcv;
 	interface->fcoe_packet_type.type = __constant_htons(ETH_P_FCOE);
-	interface->fcoe_packet_type.dev = netdev;
+	RCU_INIT_POINTER(interface->fcoe_packet_type.dev, netdev);
 	dev_add_pack(&interface->fcoe_packet_type);
 
 	return 0;
diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c
index c8c5dfb3ba9a124439f83afabb8d10e1abe4cf58..ea6617b378a5a051a492d5810ee0abc157261cc5 100644
--- a/drivers/scsi/fcoe/fcoe.c
+++ b/drivers/scsi/fcoe/fcoe.c
@@ -352,18 +352,18 @@ static int fcoe_interface_setup(struct fcoe_interface *fcoe,
 	 */
 	fcoe->fcoe_packet_type.func = fcoe_rcv;
 	fcoe->fcoe_packet_type.type = htons(ETH_P_FCOE);
-	fcoe->fcoe_packet_type.dev = netdev;
+	RCU_INIT_POINTER(fcoe->fcoe_packet_type.dev, netdev);
 	dev_add_pack(&fcoe->fcoe_packet_type);
 
 	fcoe->fip_packet_type.func = fcoe_fip_recv;
 	fcoe->fip_packet_type.type = htons(ETH_P_FIP);
-	fcoe->fip_packet_type.dev = netdev;
+	RCU_INIT_POINTER(fcoe->fip_packet_type.dev, netdev);
 	dev_add_pack(&fcoe->fip_packet_type);
 
 	if (netdev != real_dev) {
 		fcoe->fip_vlan_packet_type.func = fcoe_fip_vlan_recv;
 		fcoe->fip_vlan_packet_type.type = htons(ETH_P_FIP);
-		fcoe->fip_vlan_packet_type.dev = real_dev;
+		RCU_INIT_POINTER(fcoe->fip_vlan_packet_type.dev, real_dev);
 		dev_add_pack(&fcoe->fip_vlan_packet_type);
 	}
 	return 0;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d99b0fbc1942ad1dbbd372cfb9e809e413251f15..c92889d7c0d51bc218c622f4f3b7019534a38dd6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2931,7 +2931,7 @@ void netif_set_affinity_auto(struct net_device *dev);
 struct packet_type {
 	__be16			type;	/* This is really htons(ether_type). */
 	bool			ignore_outgoing;
-	struct net_device	*dev;	/* NULL is wildcarded here	     */
+	struct net_device __rcu	*dev;	/* NULL is wildcarded here	     */
 	netdevice_tracker	dev_tracker;
 	int			(*func) (struct sk_buff *,
 					 struct net_device *,
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 5113f879736b54f0231d0a030dd4bef5a320e9ae..36ce70463ba5ef5dc3549ce9f2a8814b865fc678 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -740,7 +740,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
 	kref_get(&hard_iface->refcount);
 	hard_iface->batman_adv_ptype.type = ethertype;
 	hard_iface->batman_adv_ptype.func = batadv_batman_skb_recv;
-	hard_iface->batman_adv_ptype.dev = hard_iface->net_dev;
+	RCU_INIT_POINTER(hard_iface->batman_adv_ptype.dev, hard_iface->net_dev);
 	dev_add_pack(&hard_iface->batman_adv_ptype);
 
 	batadv_info(hard_iface->mesh_iface, "Adding interface: %s\n",
diff --git a/net/core/dev.c b/net/core/dev.c
index ccef685023c299dbd9fc1ccb7a914a282219a327..11d0c598f7d28e824bbd23a670ba75f4561fe810 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -587,16 +587,19 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 
 static inline struct list_head *ptype_head(const struct packet_type *pt)
 {
+	struct net_device *dev;
+
+	dev = rcu_dereference_protected(pt->dev, lockdep_is_held(&ptype_lock));
+
 	if (pt->type == htons(ETH_P_ALL)) {
-		if (!pt->af_packet_net && !pt->dev)
+		if (!pt->af_packet_net && !dev)
 			return NULL;
 
-		return pt->dev ? &pt->dev->ptype_all :
-				 &pt->af_packet_net->ptype_all;
+		return dev ? &dev->ptype_all : &pt->af_packet_net->ptype_all;
 	}
 
-	if (pt->dev)
-		return &pt->dev->ptype_specific;
+	if (dev)
+		return &dev->ptype_specific;
 
 	return pt->af_packet_net ? &pt->af_packet_net->ptype_specific :
 				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
@@ -617,13 +620,12 @@ static inline struct list_head *ptype_head(const struct packet_type *pt)
 
 void dev_add_pack(struct packet_type *pt)
 {
-	struct list_head *head = ptype_head(pt);
-
-	if (WARN_ON_ONCE(!head))
-		return;
+	struct list_head *head;
 
 	spin_lock(&ptype_lock);
-	list_add_rcu(&pt->list, head);
+	head = ptype_head(pt);
+	if (!WARN_ON_ONCE(!head))
+		list_add_rcu(&pt->list, head);
 	spin_unlock(&ptype_lock);
 }
 EXPORT_SYMBOL(dev_add_pack);
@@ -643,13 +645,15 @@ EXPORT_SYMBOL(dev_add_pack);
  */
 void __dev_remove_pack(struct packet_type *pt)
 {
-	struct list_head *head = ptype_head(pt);
 	struct packet_type *pt1;
+	struct list_head *head;
 
+	spin_lock(&ptype_lock);
+
+	head = ptype_head(pt);
 	if (!head)
-		return;
+		goto out;
 
-	spin_lock(&ptype_lock);
 
 	list_for_each_entry(pt1, head, list) {
 		if (pt == pt1) {
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 70e0e9a3b650c0753f0b865642aa372a956a4bf5..160dd729178fd37a6340148d9e35f95bd92aecdb 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -230,11 +230,11 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 	pt = v;
 	nxt = pt->list.next;
-	if (pt->dev) {
-		if (nxt != &pt->dev->ptype_all)
+	dev = rcu_dereference(pt->dev);
+	if (dev) {
+		if (nxt != &dev->ptype_all)
 			goto found;
 
-		dev = pt->dev;
 		for_each_netdev_continue_rcu(seq_file_net(seq), dev) {
 			if (!list_empty(&dev->ptype_all)) {
 				nxt = dev->ptype_all.next;
@@ -280,18 +280,22 @@ static void ptype_seq_stop(struct seq_file *seq, void *v)
 static int ptype_seq_show(struct seq_file *seq, void *v)
 {
 	struct packet_type *pt = v;
+	struct net_device *dev;
 
-	if (v == SEQ_START_TOKEN)
+	if (v == SEQ_START_TOKEN) {
 		seq_puts(seq, "Type Device      Function\n");
-	else if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) &&
-		 (!pt->dev || net_eq(dev_net(pt->dev), seq_file_net(seq)))) {
+		return 0;
+	}
+	dev = rcu_dereference(pt->dev);
+	if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) &&
+		 (!dev || net_eq(dev_net(dev), seq_file_net(seq)))) {
 		if (pt->type == htons(ETH_P_ALL))
 			seq_puts(seq, "ALL ");
 		else
 			seq_printf(seq, "%04x", ntohs(pt->type));
 
 		seq_printf(seq, " %-8s %ps\n",
-			   pt->dev ? pt->dev->name : "", pt->func);
+			   dev ? dev->name : "", pt->func);
 	}
 
 	return 0;
diff --git a/net/core/selftests.c b/net/core/selftests.c
index 8b81feb82c4ae719b770a5b5480dd07aaae5a54b..e536d998023bb3fb7dc3a8107bc0777fd5ef4eef 100644
--- a/net/core/selftests.c
+++ b/net/core/selftests.c
@@ -246,7 +246,7 @@ static int __net_test_loopback(struct net_device *ndev,
 
 	tpriv->pt.type = htons(ETH_P_IP);
 	tpriv->pt.func = net_test_loopback_validate;
-	tpriv->pt.dev = ndev;
+	rcu_assign_pointer(tpriv->pt.dev, ndev);
 	tpriv->pt.af_packet_priv = tpriv;
 	tpriv->packet = attr;
 	dev_add_pack(&tpriv->pt);
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index 446e4e3b9553a0aea936801f545ebc8ca9cdb736..bf1272f33dc18f3731127e7de727001d587ffc7a 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -1799,7 +1799,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
 	/* Register NCSI packet Rx handler */
 	ndp->ptype.type = cpu_to_be16(ETH_P_NCSI);
 	ndp->ptype.func = ncsi_rcv_rsp;
-	ndp->ptype.dev = dev;
+	RCU_INIT_POINTER(ndp->ptype.dev, dev);
 	dev_add_pack(&ndp->ptype);
 
 	pdev = to_platform_device(dev->dev.parent);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 494d628d10a5105a6a32788b4673993f218ec881..a3130c790d9cf898fe4070fd9bfcd4fe07817b76 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3118,6 +3118,7 @@ static int packet_release(struct socket *sock)
 	struct sock *sk = sock->sk;
 	struct packet_sock *po;
 	struct packet_fanout *f;
+	struct net_device *dev;
 	struct net *net;
 	union tpacket_req_u req_u;
 
@@ -3137,9 +3138,10 @@ static int packet_release(struct socket *sock)
 	unregister_prot_hook(sk, false);
 	packet_cached_dev_reset(po);
 
-	if (po->prot_hook.dev) {
-		netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);
-		po->prot_hook.dev = NULL;
+	dev = rcu_dereference_protected(po->prot_hook.dev, 1);
+	if (dev) {
+		netdev_put(dev, &po->prot_hook.dev_tracker);
+		rcu_assign_pointer(po->prot_hook.dev, NULL);
 	}
 	spin_unlock(&po->bind_lock);
 
@@ -3188,8 +3190,8 @@ static int packet_release(struct socket *sock)
 static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
 			  __be16 proto)
 {
+	struct net_device *odev, *dev = NULL;
 	struct packet_sock *po = pkt_sk(sk);
-	struct net_device *dev = NULL;
 	bool unlisted = false;
 	bool need_rehook;
 	int ret = 0;
@@ -3220,7 +3222,8 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
 		}
 	}
 
-	need_rehook = po->prot_hook.type != proto || po->prot_hook.dev != dev;
+	odev = rcu_dereference_protected(po->prot_hook.dev, 1);
+	need_rehook = po->prot_hook.type != proto || odev != dev;
 
 	if (need_rehook) {
 		dev_hold(dev);
@@ -3241,16 +3244,16 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
 		WRITE_ONCE(po->num, proto);
 		po->prot_hook.type = proto;
 
-		netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);
+		netdev_put(odev, &po->prot_hook.dev_tracker);
 
 		if (unlikely(unlisted)) {
-			po->prot_hook.dev = NULL;
+			RCU_INIT_POINTER(po->prot_hook.dev, NULL);
 			WRITE_ONCE(po->ifindex, -1);
 			packet_cached_dev_reset(po);
 		} else {
 			netdev_hold(dev, &po->prot_hook.dev_tracker,
 				    GFP_ATOMIC);
-			po->prot_hook.dev = dev;
+			rcu_assign_pointer(po->prot_hook.dev, dev);
 			WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0);
 			packet_cached_dev_assign(po, dev);
 		}
@@ -4209,9 +4212,8 @@ static int packet_notifier(struct notifier_block *this,
 				if (msg == NETDEV_UNREGISTER) {
 					packet_cached_dev_reset(po);
 					WRITE_ONCE(po->ifindex, -1);
-					netdev_put(po->prot_hook.dev,
-						   &po->prot_hook.dev_tracker);
-					po->prot_hook.dev = NULL;
+					netdev_put(dev, &po->prot_hook.dev_tracker);
+					rcu_assign_pointer(po->prot_hook.dev, NULL);
 				}
 				spin_unlock(&po->bind_lock);
 			}
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index ae1ddbf71853924cb01c56bf75e40190f48dec45..c8a7ab9ee437f3361f60557e0c7da0639d5beb0f 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -456,7 +456,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
 
 	/* Associate TIPC bearer with L2 bearer */
 	rcu_assign_pointer(b->media_ptr, dev);
-	b->pt.dev = dev;
+	RCU_INIT_POINTER(b->pt.dev, dev);
 	b->pt.type = htons(ETH_P_TIPC);
 	b->pt.func = tipc_l2_rcv_msg;
 	dev_add_pack(&b->pt);
@@ -665,7 +665,7 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev,
 		   (skb->pkt_type <= PACKET_MULTICAST))) {
 		skb_mark_not_on_list(skb);
 		TIPC_SKB_CB(skb)->flags = 0;
-		tipc_rcv(dev_net(b->pt.dev), skb, b);
+		tipc_rcv(dev_net(rcu_dereference(b->pt.dev)), skb, b);
 		rcu_read_unlock();
 		return NET_RX_SUCCESS;
 	}
@@ -804,7 +804,7 @@ int tipc_attach_loopback(struct net *net)
 		return -ENODEV;
 
 	netdev_hold(dev, &tn->loopback_pt.dev_tracker, GFP_KERNEL);
-	tn->loopback_pt.dev = dev;
+	RCU_INIT_POINTER(tn->loopback_pt.dev, dev);
 	tn->loopback_pt.type = htons(ETH_P_TIPC);
 	tn->loopback_pt.func = tipc_loopback_rcv_pkt;
 	dev_add_pack(&tn->loopback_pt);
-- 
2.53.0.rc1.225.gd81095ad13-goog


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH net] net: add RCU protection to (struct packet_type)->dev
  2026-01-31 21:29 [PATCH net] net: add RCU protection to (struct packet_type)->dev Eric Dumazet
@ 2026-02-02  3:16 ` YinFengwei
  2026-02-02  4:19   ` Eric Dumazet
  0 siblings, 1 reply; 10+ messages in thread
From: YinFengwei @ 2026-02-02  3:16 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Willem de Bruijn, netdev, eric.dumazet, dongchenchen2

+ Chenchen as he hit the exact same issue.

Hi Eric,

> Yin Fengwei reported an RCU stall in ptype_seq_show() and provided a patch.
> 
> Real issue is that (struct packet_type)->dev needs RCU protection:
> 
> ptype_seq_show() runs under rcu_read_lock(), and reads pt->dev
> to get device name without any barrier.
> 
> At the same time, concurrent writer can remove a packet_type structure
> (which is correctly freed after an RCU grace period) _and_ clear pt->dev
> without an RCU grace period.
> 
> Fix this issue by using proper RCU on pt->dev pointer.
Still can hit this issue with same backtrace even with this fixing patch.

Look at the __dev_remove_pack(), the pt->list is protected by ptype_lock
while prot_hook.dev is protected by bind_lock. Could it make sure the
procfs interface see either the list element with prot_hook.dev not NULL
or can't see the list element with NULL prot_hook.dev?

Regards
Yin, Fengwei

> 
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Reported-by: Yin Fengwei <fengwei_yin@linux.alibaba.com>
> Closes: https://lore.kernel.org/netdev/CANn89iKRRKPnWjJmb-_3a=sq+9h6DvTQM4DBZHT5ZRGPMzQaiA@mail.gmail.com/T/#m7b80b9fc9b9267f90e0b7aad557595f686f9c50d
> ---
>  drivers/net/ethernet/amd/xgbe/xgbe-selftest.c |  2 +-
>  .../ethernet/mellanox/mlx5/core/en_selftest.c |  2 +-
>  .../stmicro/stmmac/stmmac_selftests.c         | 12 ++++----
>  drivers/scsi/bnx2fc/bnx2fc_fcoe.c             |  4 +--
>  drivers/scsi/fcoe/fcoe.c                      |  6 ++--
>  include/linux/netdevice.h                     |  2 +-
>  net/batman-adv/hard-interface.c               |  2 +-
>  net/core/dev.c                                | 30 +++++++++++--------
>  net/core/net-procfs.c                         | 18 ++++++-----
>  net/core/selftests.c                          |  2 +-
>  net/ncsi/ncsi-manage.c                        |  2 +-
>  net/packet/af_packet.c                        | 24 ++++++++-------
>  net/tipc/bearer.c                             |  6 ++--
>  13 files changed, 61 insertions(+), 51 deletions(-)
> 
> diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-selftest.c b/drivers/net/ethernet/amd/xgbe/xgbe-selftest.c
> index 55e5e467facd7f546ba208361ec9fdcfd7a627d9..006d80a387431cb7d4acdd35f4f1990c8c1f3366 100644
> --- a/drivers/net/ethernet/amd/xgbe/xgbe-selftest.c
> +++ b/drivers/net/ethernet/amd/xgbe/xgbe-selftest.c
> @@ -121,7 +121,7 @@ static int __xgbe_test_loopback(struct xgbe_prv_data *pdata,
>  
>  	tdata->pt.type = htons(ETH_P_IP);
>  	tdata->pt.func = xgbe_test_loopback_validate;
> -	tdata->pt.dev = pdata->netdev;
> +	RCU_INIT_POINTER(tdata->pt.dev, pdata->netdev);
>  	tdata->pt.af_packet_priv = tdata;
>  	tdata->packet = attr;
>  

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net] net: add RCU protection to (struct packet_type)->dev
  2026-02-02  3:16 ` YinFengwei
@ 2026-02-02  4:19   ` Eric Dumazet
  2026-02-02  7:06     ` YinFengwei
  2026-02-02  8:21     ` dongchenchen (A)
  0 siblings, 2 replies; 10+ messages in thread
From: Eric Dumazet @ 2026-02-02  4:19 UTC (permalink / raw)
  To: YinFengwei
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Willem de Bruijn, netdev, eric.dumazet, dongchenchen2

On Mon, Feb 2, 2026 at 4:16 AM YinFengwei <fengwei_yin@linux.alibaba.com> wrote:
>
> + Chenchen as he hit the exact same issue.
>
> Hi Eric,
>
> > Yin Fengwei reported an RCU stall in ptype_seq_show() and provided a patch.
> >
> > Real issue is that (struct packet_type)->dev needs RCU protection:
> >
> > ptype_seq_show() runs under rcu_read_lock(), and reads pt->dev
> > to get device name without any barrier.
> >
> > At the same time, concurrent writer can remove a packet_type structure
> > (which is correctly freed after an RCU grace period) _and_ clear pt->dev
> > without an RCU grace period.
> >
> > Fix this issue by using proper RCU on pt->dev pointer.
> Still can hit this issue with same backtrace even with this fixing patch.
>
> Look at the __dev_remove_pack(), the pt->list is protected by ptype_lock
> while prot_hook.dev is protected by bind_lock. Could it make sure the
> procfs interface see either the list element with prot_hook.dev not NULL
> or can't see the list element with NULL prot_hook.dev?
>
> Regards
> Yin, Fengwei

Please share the new stack trace (with the symbols), or the repro, thanks !

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net] net: add RCU protection to (struct packet_type)->dev
  2026-02-02  4:19   ` Eric Dumazet
@ 2026-02-02  7:06     ` YinFengwei
  2026-02-02  8:21     ` dongchenchen (A)
  1 sibling, 0 replies; 10+ messages in thread
From: YinFengwei @ 2026-02-02  7:06 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Willem de Bruijn, netdev, eric.dumazet, dongchenchen2

Hi Eric,

> On Mon, Feb 2, 2026 at 4:16 AM YinFengwei <fengwei_yin@linux.alibaba.com> wrote:
> >
> > + Chenchen as he hit the exact same issue.
> >
> > Hi Eric,
> >
> > > Yin Fengwei reported an RCU stall in ptype_seq_show() and provided a patch.
> > >
> > > Real issue is that (struct packet_type)->dev needs RCU protection:
> > >
> > > ptype_seq_show() runs under rcu_read_lock(), and reads pt->dev
> > > to get device name without any barrier.
> > >
> > > At the same time, concurrent writer can remove a packet_type structure
> > > (which is correctly freed after an RCU grace period) _and_ clear pt->dev
> > > without an RCU grace period.
> > >
> > > Fix this issue by using proper RCU on pt->dev pointer.
> > Still can hit this issue with same backtrace even with this fixing patch.
> >
> > Look at the __dev_remove_pack(), the pt->list is protected by ptype_lock
> > while prot_hook.dev is protected by bind_lock. Could it make sure the
> > procfs interface see either the list element with prot_hook.dev not NULL
> > or can't see the list element with NULL prot_hook.dev?
> >
> > Regards
> > Yin, Fengwei
> 
> Please share the new stack trace (with the symbols), or the repro, thanks !

About the reproducing environment:
1. We can reproduce it on a bare metal arm64 server and in a kvm guest on an arm64 server.
   So I use virtualization env to reproduce it now.
2. With debian cloud disk image, it took very long to hit it (like days).
3. With a buildroot based disk image from our QA team, it took less than 1 hour
   to hit it. Unfortunately, I can't share the buildroot based disk image.
4. I didn't reproduce it on x86 env.


The base commit on top of which I applied your fix (so that your copy of the code
matches the stack trace decode output):
commit 18f7fcd5e69a04df57b563360b88be72471d6b62 (HEAD -> master, tag:
v6.19-rc8, origin/master)
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Sun Feb 1 14:01:13 2026 -0800

    Linux 6.19-rc8

Let me know if you need further information. Thanks.

Regards
Yin, Fengwei

---------------------------------------------------------------------------------
The stack trace is like:
[ 1248.009152] watchdog: BUG: soft lockup - CPU#0 stuck for 149s!
[b2ac940_rcu_sta:12914]
[ 1248.009167] Modules linked in:
[ 1248.009171] CPU: 0 UID: 0 PID: 12914 Comm: b2ac940_rcu_sta Tainted: G	L      6.19.0-rc8-00001-gbcedc33b3f13 #8 VOLUNTARY
[ 1248.009241] Tainted: [L]=SOFTLOCKUP
[ 1248.009248] Hardware name: linux,dummy-virt (DT)
[ 1248.009250] pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
[ 1248.009251] pc : ptype_seq_next+0x1c/0x180
[ 1248.009257] lr : traverse.part.0+0x88/0x198
[ 1248.009260] sp : ffff800087a13b20
[ 1248.009260] x29: ffff800087a13b20 x28: ffff000004d35900 x27:0000000000000000
[ 1248.009262] x26: 0000000000000000 x25: ffff0000040ddad0 x24:ffff0000040ddac0
[ 1248.009264] x23: 00000000000080db x22: 0000000000000033 x21:0000000000000000
[ 1248.009266] x20: ffff000003bbc158 x19: ffff0000040dda98 x18:0000000000000000
[ 1248.009268] x17: 0000000000000000 x16: 0000000000000000 x15:0000000000000000
[ 1248.009270] x14: 0000000000000000 x13: 0a6e6f6974636e75 x12:4620202020202065
[ 1248.009272] x11: 0000000000000000 x10: 0000000000000001 x9 :ffff8000804d5ac8
[ 1248.009274] x8 : 000000000000000a x7 : ffff8000818cb9e2 x6 :000000000000000a
[ 1248.009289] x5 : 0000000000000000 x4 : ffff8000827d2e40 x3 :0000000772a30439
[ 1248.009290] x2 : ffff0000040ddac0 x1 : ffff000003bbc158 x0 :ffff0000040dda98
[ 1248.009292] Call trace:
[ 1248.009293]  ptype_seq_next+0x1c/0x180 (P)
[ 1248.009296]  seq_read_iter+0x300/0x500
[ 1248.009297]  seq_read+0xe8/0x128
[ 1248.009298]  proc_reg_read+0xb8/0x108
[ 1248.009300]  do_loop_readv_writev.part.0+0xc0/0x128
[ 1248.009302]  vfs_readv+0x178/0x1e0
[ 1248.009303]  do_preadv+0x98/0x100
[ 1248.009305]  __arm64_sys_preadv+0x28/0x40
[ 1248.009306]  invoke_syscall+0x50/0x120
[ 1248.009307]  el0_svc_common.constprop.0+0x48/0xf0
[ 1248.009309]  do_el0_svc+0x24/0x38
[ 1248.009310]  el0_svc+0x38/0x168
[ 1248.009312]  el0t_64_sync_handler+0xa0/0xe8
[ 1248.009314]  el0t_64_sync+0x1ac/0x1b0


stack trace decode:
[ 1248.009152] watchdog: BUG: soft lockup - CPU#0 stuck for 149s!	[b2ac940_rcu_sta:12914]
[ 1248.009167] Modules linked in:
[ 1248.009241] Tainted: [L]=SOFTLOCKUP
[ 1248.009248] Hardware name: linux,dummy-virt (DT)
[ 1248.009250] pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
[ 1248.009251] pc : ptype_seq_next (/mnt/ssd/fyin/linux/net/core/net-procfs.c:228)
[ 1248.009257] lr : traverse.part.0 (/mnt/ssd/fyin/linux/fs/seq_file.c:120)
[ 1248.009260] sp : ffff800087a13b20
[ 1248.009260] x29: ffff800087a13b20 x28: ffff000004d35900 x27:0000000000000000
[ 1248.009262] x26: 0000000000000000 x25: ffff0000040ddad0 x24:ffff0000040ddac0
[ 1248.009264] x23: 00000000000080db x22: 0000000000000033 x21:0000000000000000
[ 1248.009266] x20: ffff000003bbc158 x19: ffff0000040dda98 x18:0000000000000000
[ 1248.009268] x17: 0000000000000000 x16: 0000000000000000 x15:0000000000000000
[ 1248.009270] x14: 0000000000000000 x13: 0a6e6f6974636e75 x12:4620202020202065
[ 1248.009272] x11: 0000000000000000 x10: 0000000000000001 x9 :ffff8000804d5ac8
[ 1248.009274] x8 : 000000000000000a x7 : ffff8000818cb9e2 x6 :000000000000000a
[ 1248.009289] x5 : 0000000000000000 x4 : ffff8000827d2e40 x3 :0000000772a30439
[ 1248.009290] x2 : ffff0000040ddac0 x1 : ffff000003bbc158 x0 :ffff0000040dda98
[ 1248.009292] Call trace:
[ 1248.009293]  ptype_seq_next (/mnt/ssd/fyin/linux/net/core/net-procfs.c:228) (P)
[ 1248.009296]  seq_read_iter (/mnt/ssd/fyin/linux/fs/seq_file.c:101 /mnt/ssd/fyin/linux/fs/seq_file.c:195)
[ 1248.009297]  seq_read (/mnt/ssd/fyin/linux/fs/seq_file.c:163)
[ 1248.009298]  proc_reg_read (/mnt/ssd/fyin/linux/fs/proc/inode.c:308 /mnt/ssd/fyin/linux/fs/proc/inode.c:320)
[ 1248.009300]  do_loop_readv_writev.part.0 (/mnt/ssd/fyin/linux/fs/read_write.c:850)
[ 1248.009302]  vfs_readv (/mnt/ssd/fyin/linux/fs/read_write.c:840 /mnt/ssd/fyin/linux/fs/read_write.c:1020)
[ 1248.009303]  do_preadv (/mnt/ssd/fyin/linux/fs/read_write.c:1132)
[ 1248.009305]  __arm64_sys_preadv (/mnt/ssd/fyin/linux/fs/read_write.c:1174)
[ 1248.009306]  invoke_syscall (/mnt/ssd/fyin/linux/./arch/arm64/include/asm/current.h:19 /mnt/ssd/fyin/linux/arch/arm64/kernel/syscall.c:54)                               
[ 1248.009307]  el0_svc_common.constprop.0 (/mnt/ssd/fyin/linux/./include/linux/thread_info.h:140 /mnt/ssd/fyin/linux/arch/arm64/kernel/syscall.c:140)
[ 1248.009309]  do_el0_svc (/mnt/ssd/fyin/linux/arch/arm64/kernel/syscall.c:152)
[ 1248.009310]  el0_svc (/mnt/ssd/fyin/linux/./arch/arm64/include/asm/alternative-macros.h:254 /mnt/ssd/fyin/linux/./arch/arm64/include/asm/cpufeature.h:809 /mnt/ssd/fyin/linux/./arch/arm64/include/asm/irqflags.h:73 /mnt/ssd/fyin/linux/arch/arm64/kernel/entry-common.c:80 /mnt/ssd/fyin/linux/arch/arm64/kernel/entry-common.c:725)
[ 1248.009312]  el0t_64_sync_handler (/mnt/ssd/fyin/linux/arch/arm64/kernel/entry-common.c:744)                                                                             
[ 1248.009314]  el0t_64_sync (/mnt/ssd/fyin/linux/arch/arm64/kernel/entry.S:596)


The C reproducer:
// autogenerated by syzkaller (https://github.com/google/syzkaller)

#define _GNU_SOURCE

#include <dirent.h>
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#include <linux/futex.h>

#ifndef __NR_bind
#define __NR_bind 200
#endif
#ifndef __NR_close
#define __NR_close 57
#endif
#ifndef __NR_ioctl
#define __NR_ioctl 29
#endif
#ifndef __NR_mmap
#define __NR_mmap 222
#endif
#ifndef __NR_openat
#define __NR_openat 56
#endif
#ifndef __NR_preadv
#define __NR_preadv 69
#endif
#ifndef __NR_socket
#define __NR_socket 198
#endif

// Suspend the calling thread for roughly `ms` milliseconds by handing the
// equivalent number of microseconds to usleep().
static void sleep_ms(uint64_t ms)
{
  const uint64_t usec_per_ms = 1000;
  uint64_t delay_us = ms * usec_per_ms;
  usleep(delay_us);
}

// Return the current CLOCK_MONOTONIC reading in whole milliseconds.
// Terminates the process with status 1 if the clock cannot be read.
static uint64_t current_time_ms(void)
{
  struct timespec now;
  if (clock_gettime(CLOCK_MONOTONIC, &now) != 0)
    exit(1);
  uint64_t ms_from_sec = (uint64_t)now.tv_sec * 1000;
  uint64_t ms_from_nsec = (uint64_t)now.tv_nsec / 1000000;
  return ms_from_sec + ms_from_nsec;
}

// Start fn(arg) on a new thread that is given a 128 KiB stack.
//
// Thread creation is attempted up to 100 times: when pthread_create() fails
// with EAGAIN the call is retried after a 50 microsecond pause; any other
// failure, or running out of attempts, terminates the process with status 1.
// The thread handle is not returned, so the caller cannot join the thread.
static void thread_start(void* (*fn)(void*), void* arg)
{
  pthread_t th;
  pthread_attr_t attr;
  pthread_attr_init(&attr);
  pthread_attr_setstacksize(&attr, 128 << 10);
  for (int i = 0; i < 100; i++) {
    // pthread_create() reports failure through its return value; it is not
    // required to set errno, so the retry decision must use the return code.
    int err = pthread_create(&th, &attr, fn, arg);
    if (err == 0) {
      pthread_attr_destroy(&attr);
      return;
    }
    if (err != EAGAIN)
      break;
    usleep(50);
  }
  exit(1);
}

// Minimal event object: a single int used as a flag and as the futex word
// that the event_* helpers below wait on and wake.
typedef struct {
  int state; // 0 after event_init/event_reset, 1 after event_set
} event_t;

// Initialise an event to the "not set" state (state == 0).
// Uses a plain, non-atomic store.
static void event_init(event_t* ev)
{
  ev->state = 0;
}

// Return an event to the "not set" state (state == 0) so it can be set again.
// Uses a plain, non-atomic store and does not wake or notify any waiter.
static void event_reset(event_t* ev)
{
  ev->state = 0;
}

// Mark the event as set and wake threads blocked on it.
// Setting an event that is already set terminates the process with status 1.
static void event_set(event_t* ev)
{
  int* flag = &ev->state;
  if (*flag != 0)
    exit(1);
  // Release store pairs with the acquire loads in the wait/isset helpers.
  __atomic_store_n(flag, 1, __ATOMIC_RELEASE);
  // Ask the kernel to wake up to 1000000 waiters on the futex word.
  syscall(SYS_futex, flag, FUTEX_WAKE | FUTEX_PRIVATE_FLAG, 1000000);
}

// Block until the event has been set.
// Each iteration re-checks the flag, then sleeps in FUTEX_WAIT, which only
// blocks while the futex word still holds 0.
static void event_wait(event_t* ev)
{
  for (;;) {
    if (__atomic_load_n(&ev->state, __ATOMIC_ACQUIRE))
      return;
    syscall(SYS_futex, &ev->state, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, 0);
  }
}

// Returns the current value of the event flag (non-zero when set), read with
// acquire ordering.  Never blocks.
static int event_isset(event_t* ev)
{
  int snapshot = __atomic_load_n(&ev->state, __ATOMIC_ACQUIRE);
  return snapshot;
}

// Waits for the event for at most `timeout` milliseconds.
// Returns 1 as soon as the flag is observed set, or 0 once more than
// `timeout` milliseconds have elapsed without that happening.
static int event_timedwait(event_t* ev, uint64_t timeout)
{
  const uint64_t began = current_time_ms();
  uint64_t elapsed = 0;
  do {
    // Sleep on the futex for whatever is left of the time budget.
    const uint64_t left = timeout - elapsed;
    struct timespec ts;
    ts.tv_sec = left / 1000;
    ts.tv_nsec = (left % 1000) * 1000 * 1000;
    syscall(SYS_futex, &ev->state, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, &ts);
    if (__atomic_load_n(&ev->state, __ATOMIC_ACQUIRE))
      return 1;
    elapsed = current_time_ms() - began;
  } while (elapsed <= timeout);
  return 0;
}

// Formats `what` printf-style into a 1024-byte buffer (longer output is
// truncated) and writes the result to `file`, which is opened write-only
// without O_CREAT.
// Returns true only if a single write() transferred the whole string.
// Returns false if the open fails or the write is short or fails; in the
// latter case errno as it was right after the write() is restored after the
// descriptor is closed.
static bool write_file(const char* file, const char* what, ...)
{
  char text[1024];
  va_list ap;
  va_start(ap, what);
  vsnprintf(text, sizeof(text), what, ap);
  va_end(ap);
  text[sizeof(text) - 1] = 0;
  int text_len = strlen(text);

  int fd = open(file, O_WRONLY | O_CLOEXEC);
  if (fd == -1)
    return false;
  bool ok = write(fd, text, text_len) == text_len;
  if (!ok) {
    int saved = errno;
    close(fd);
    errno = saved;
  } else {
    close(fd);
  }
  return ok;
}

// Opens a file below /proc for the current process.
//   a0 == 0   -> /proc/self/<name>
//   a0 == -1  -> /proc/thread-self/<name>
//   otherwise -> /proc/self/task/<a0>/<name>
// a1 carries the address of the NUL-terminated <name>.  The path is built in
// a 128-byte buffer.  The file is opened read-write first, then read-only if
// that fails.  Returns the file descriptor, or -1 if both opens fail.
static long syz_open_procfs(volatile long a0, volatile long a1)
{
  const long which = a0;
  const char* name = (char*)a1;
  char path[128];
  memset(path, 0, sizeof(path));
  switch (which) {
  case 0:
    snprintf(path, sizeof(path), "/proc/self/%s", name);
    break;
  case -1:
    snprintf(path, sizeof(path), "/proc/thread-self/%s", name);
    break;
  default:
    snprintf(path, sizeof(path), "/proc/self/task/%d/%s", (int)which, name);
    break;
  }
  int fd = open(path, O_RDWR);
  return fd != -1 ? fd : open(path, O_RDONLY);
}

// Forcefully terminates child `pid` and reaps it, storing the wait status in
// *status.  SIGKILL is sent to process group -pid and to pid itself, then the
// child is polled for up to 100 rounds of 1 ms.  If it has still not been
// reaped, one byte is written to the `abort` file of every entry under
// /sys/fs/fuse/connections (presumably to release a child stuck on a FUSE
// request -- confirm), and waitpid() is then repeated, blocking, until it
// returns pid.
static void kill_and_wait(int pid, int* status)
{
  kill(-pid, SIGKILL);
  kill(pid, SIGKILL);

  int attempt = 0;
  while (attempt < 100) {
    if (waitpid(-1, status, WNOHANG | __WALL) == pid)
      return;
    usleep(1000);
    attempt++;
  }

  DIR* conns = opendir("/sys/fs/fuse/connections");
  if (conns != NULL) {
    struct dirent* entry;
    while ((entry = readdir(conns)) != NULL) {
      const char* name = entry->d_name;
      if (!strcmp(name, ".") || !strcmp(name, ".."))
        continue;
      char abort_path[300];
      snprintf(abort_path, sizeof(abort_path),
               "/sys/fs/fuse/connections/%s/abort", name);
      int fd = open(abort_path, O_WRONLY);
      if (fd == -1)
        continue;
      // The outcome of the write is not acted upon.
      if (write(fd, abort_path, 1) < 0) {
      }
      close(fd);
    }
    closedir(conns);
  }

  while (waitpid(-1, status, __WALL) != pid) {
  }
}

// Per-child setup performed before the program is executed: requests SIGKILL
// via PR_SET_PDEATHSIG, calls setpgrp(), and writes "1000" to
// /proc/self/oom_score_adj.  None of the results are checked.
static void setup_test()
{
  (void)prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
  (void)setpgrp();
  (void)write_file("/proc/self/oom_score_adj", "1000");
}

// Bookkeeping for one worker thread of the executor.
struct thread_t {
  // created: non-zero once execute_one() has started the worker for this slot.
  // call: index of the call the worker passes to execute_call() next.
  int created, call;
  // ready: set by execute_one() to hand `call` over to the worker.
  // done: set by the worker (thr) after execute_call() has returned.
  event_t ready, done;
};

// Pool of worker slots; execute_one() starts the workers lazily.
static struct thread_t threads[16];
static void execute_call(int call);
// Incremented by execute_one() for each dispatched call and decremented by
// the worker once the call has returned.
static int running;

static void* thr(void* arg)
{
  struct thread_t* th = (struct thread_t*)arg;
  for (;;) {
    event_wait(&th->ready);
    event_reset(&th->ready);
    execute_call(th->call);
    __atomic_fetch_sub(&running, 1, __ATOMIC_RELAXED);
    event_set(&th->done);
  }
  return 0;
}

static void execute_one(void)
{
  if (write(1, "executing program\n", sizeof("executing program\n") - 1)) {
  }
  int i, call, thread;
  for (call = 0; call < 9; call++) {
    for (thread = 0; thread < (int)(sizeof(threads) / sizeof(threads[0]));
         thread++) {
      struct thread_t* th = &threads[thread];
      if (!th->created) {
        th->created = 1;
        event_init(&th->ready);
        event_init(&th->done);
        event_set(&th->done);
        thread_start(thr, th);
      }
      if (!event_isset(&th->done))
        continue;
      event_reset(&th->done);
      th->call = call;
      __atomic_fetch_add(&running, 1, __ATOMIC_RELAXED);
      event_set(&th->ready);
      if (call == 6)
        break;
      event_timedwait(&th->done, 50);
      break;
    }
  }
  for (i = 0; i < 100 && __atomic_load_n(&running, __ATOMIC_RELAXED); i++)
    sleep_ms(1);
}

// Forward declaration; redundant here because execute_one() is already
// defined earlier in this file.
static void execute_one(void);

// Extra flags OR-ed into the non-blocking waitpid() call in loop().
#define WAIT_FLAGS __WALL

// Runs the program repeatedly, forever, each time in a freshly forked child.
// The parent polls every 10 ms for the child to exit.  Once 5000 ms have
// passed without that happening, the child is killed and reaped via
// kill_and_wait().  A failed fork() terminates the process with status 1.
static void loop(void)
{
  int iter = 0;
  while (1) {
    int pid = fork();
    if (pid < 0)
      exit(1);
    if (pid == 0) {
      // Child: run the program once and leave.
      setup_test();
      execute_one();
      exit(0);
    }

    int status = 0;
    const uint64_t began = current_time_ms();
    for (;;) {
      sleep_ms(10);
      if (waitpid(-1, &status, WNOHANG | WAIT_FLAGS) == pid)
        break;
      if (current_time_ms() - began >= 5000) {
        kill_and_wait(pid, &status);
        break;
      }
    }
    iter++;
  }
}

// Resource slots shared between the steps of execute_call():
//   r[0]  packet socket fd (stored by call 1, used by call 4)
//   r[1]  netlink socket fd (stored by call 2, used by call 3)
//   r[2]  interface index read back for "lo" (stored by call 3, used by call 4)
//   r[3]  fd returned for "net/ptype" (stored by call 5, used by call 6)
// The three descriptor slots start out with all bits set.
uint64_t r[4] = {0xffffffffffffffff, 0xffffffffffffffff, 0x0,
                 0xffffffffffffffff};

// Executes step `call` (0..8) of the reproducer; any other value does nothing.
// In order the steps are: close fd 3, open a packet socket, open a netlink
// socket, look up the ifindex of "lo", bind the packet socket to that
// ifindex, open "net/ptype" under /proc/self, preadv() from it, close fd 3
// again, and a final openat().  Results needed by later steps are kept in
// r[].  Syscall arguments are staged at fixed addresses inside the region
// that main() maps at 0x20000000.
void execute_call(int call)
{
  intptr_t res = 0;
  switch (call) {
  case 0:
    //  close arguments: [
    //    fd: fd (resource)
    //  ]
    syscall(__NR_close, /*fd=*/3);
    break;
  case 1:
    //  socket$packet arguments: [
    //    domain: const = 0x11 (8 bytes)
    //    type: packet_socket_type = 0x3 (8 bytes)
    //    proto: const = 0x300 (4 bytes)
    //  ]
    //  returns sock_packet
    res = syscall(__NR_socket, /*domain=*/0x11ul, /*type=SOCK_RAW*/ 3ul,
                  /*proto=*/0x300);
    // Keep the packet socket for the bind in call 4.
    if (res != -1)
      r[0] = res;
    break;
  case 2:
    //  socket$nl_audit arguments: [
    //    domain: const = 0x10 (8 bytes)
    //    type: const = 0x3 (8 bytes)
    //    proto: const = 0x9 (4 bytes)
    //  ]
    //  returns sock_nl_audit
    res = syscall(__NR_socket, /*domain=*/0x10ul, /*type=*/3ul, /*proto=*/9);
    // Keep the netlink socket; call 3 issues its ioctl on it.
    if (res != -1)
      r[1] = res;
    break;
  case 3:
    //  ioctl$sock_SIOCGIFINDEX arguments: [
    //    fd: sock (resource)
    //    cmd: const = 0x8933 (4 bytes)
    //    arg: ptr[out, ifreq_dev_t[devnames, ifindex]] {
    //      ifreq_dev_t[devnames, ifindex] {
    //        ifr_ifrn: buffer: {6c 6f 00 00 00 00 00 00 00 00 00 00 00 00 00
    //        00} (length 0x10) elem: ifindex (resource) pad = 0x0 (20 bytes)
    //      }
    //    }
    //  ]
    // Write the 16-byte name field ("lo", zero padded) at 0x20000100.
    memcpy((void*)0x20000100,
           "lo\000\000\000\000\000\000\000\000\000\000\000\000\000\000", 16);
    res =
        syscall(__NR_ioctl, /*fd=*/r[1], /*cmd=*/0x8933, /*arg=*/0x20000100ul);
    // On success the index is read from the 4 bytes that follow the name.
    if (res != -1)
      r[2] = *(uint32_t*)0x20000110;
    break;
  case 4:
    //  bind$packet arguments: [
    //    fd: sock_packet (resource)
    //    addr: ptr[in, sockaddr_ll] {
    //      sockaddr_ll {
    //        sll_family: const = 0x11 (2 bytes)
    //        sll_protocol: packet_protocols = 0x3 (2 bytes)
    //        sll_ifindex: ifindex (resource)
    //        sll_hatype: const = 0x1 (2 bytes)
    //        sll_pkttype: int8 = 0xc1 (1 bytes)
    //        sll_halen: const = 0x6 (1 bytes)
    //        sll_addr: union mac_addr {
    //          local: mac_addr_t[const[0xaa, int8]] {
    //            a0: buffer: {aa aa aa aa aa} (length 0x5)
    //            a1: const = 0xaa (1 bytes)
    //          }
    //        }
    //        pad: buffer: {00 00} (length 0x2)
    //      }
    //    }
    //    addrlen: len = 0x14 (8 bytes)
    //  ]
    // Build the 20-byte sockaddr_ll field by field at 0x20001340, using the
    // ifindex obtained in call 3.
    *(uint16_t*)0x20001340 = 0x11;
    *(uint16_t*)0x20001342 = htobe16(3);
    *(uint32_t*)0x20001344 = r[2];
    *(uint16_t*)0x20001348 = 1;
    *(uint8_t*)0x2000134a = 0xc1;
    *(uint8_t*)0x2000134b = 6;
    memset((void*)0x2000134c, 170, 5);
    *(uint8_t*)0x20001351 = 0xaa;
    memset((void*)0x20001352, 0, 2);
    syscall(__NR_bind, /*fd=*/r[0], /*addr=*/0x20001340ul, /*addrlen=*/0x14ul);
    break;
  case 5:
    //  syz_open_procfs arguments: [
    //    pid: pid (resource)
    //    file: ptr[in, buffer] {
    //      buffer: {6e 65 74 2f 70 74 79 70 65 00} (length 0xa)
    //    }
    //  ]
    //  returns fd
    memcpy((void*)0x20000180, "net/ptype\000", 10);
    // This initial -1 is overwritten by the assignment on the next line.
    res = -1;
    res = syz_open_procfs(/*pid=*/0, /*file=*/0x20000180);
    // Keep the procfs descriptor for the preadv in call 6.
    if (res != -1)
      r[3] = res;
    break;
  case 6:
    //  preadv arguments: [
    //    fd: fd (resource)
    //    vec: ptr[in, array[iovec[out, array[int8]]]] {
    //      array[iovec[out, array[int8]]] {
    //        iovec[out, array[int8]] {
    //          addr: ptr[out, buffer] {
    //            buffer: (DirOut)
    //          }
    //          len: len = 0xe8 (8 bytes)
    //        }
    //      }
    //    }
    //    vlen: len = 0x1 (8 bytes)
    //    off_low: int32 = 0x80db (4 bytes)
    //    off_high: int32 = 0x2 (4 bytes)
    //  ]
    // Single iovec at 0x20001840: base 0x20000280, length 0xe8.
    // execute_one() does not wait for this call to finish before it
    // dispatches call 7.
    *(uint64_t*)0x20001840 = 0x20000280;
    *(uint64_t*)0x20001848 = 0xe8;
    syscall(__NR_preadv, /*fd=*/r[3], /*vec=*/0x20001840ul, /*vlen=*/1ul,
            /*off_low=*/0x80db, /*off_high=*/2);
    break;
  case 7:
    //  close arguments: [
    //    fd: fd (resource)
    //  ]
    syscall(__NR_close, /*fd=*/3);
    break;
  case 8:
    //  openat arguments: [
    //    fd: fd_dir (resource)
    //    file: nil
    //    flags: open_flags = 0x42 (4 bytes)
    //    mode: open_mode = 0x17f (2 bytes)
    //  ]
    //  returns fd
    // The path argument is a null pointer and the result is not used.
    syscall(
        __NR_openat, /*fd=*/0xffffff9c, /*file=*/0ul,
        /*flags=O_CREAT|O_RDWR*/ 0x42,
        /*mode=S_IXOTH|S_IWOTH|S_IROTH|S_IXGRP|S_IWGRP|S_IRGRP|S_IXUSR|0x100*/
        0x17f);
    break;
  }
}
// Entry point.  Maps three fixed, anonymous, private regions -- a page with
// no access permissions at 0x1ffff000, a 16 MiB read/write/execute area at
// 0x20000000 (where execute_call() stages its syscall arguments), and another
// no-access page at 0x21000000 -- and then enters loop().  The mmap results
// are not checked.
int main(void)
{
  const unsigned long fixed_anon_private = 0x32ul; // MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE
  const unsigned long no_access = 0ul;
  const unsigned long rwx = 7ul; // PROT_WRITE|PROT_READ|PROT_EXEC
  const unsigned long page = 0x1000ul;
  const unsigned long data_base = 0x20000000ul;
  const unsigned long data_len = 0x1000000ul;
  const intptr_t no_fd = (intptr_t)-1;

  // Page directly below the data area.
  syscall(__NR_mmap, data_base - page, page, no_access, fixed_anon_private,
          no_fd, 0ul);
  // The data area itself.
  syscall(__NR_mmap, data_base, data_len, rwx, fixed_anon_private, no_fd,
          0ul);
  // Page directly above the data area.
  syscall(__NR_mmap, data_base + data_len, page, no_access,
          fixed_anon_private, no_fd, 0ul);
  const char* reason;
  (void)reason;
  loop();
  return 0;
}


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net] net: add RCU protection to (struct packet_type)->dev
  2026-02-02  4:19   ` Eric Dumazet
  2026-02-02  7:06     ` YinFengwei
@ 2026-02-02  8:21     ` dongchenchen (A)
  2026-02-02  8:22       ` Eric Dumazet
  2026-02-02  8:47       ` Eric Dumazet
  1 sibling, 2 replies; 10+ messages in thread
From: dongchenchen (A) @ 2026-02-02  8:21 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Willem de Bruijn, netdev, eric.dumazet, fengwei_yin,
	zhangchangzhong


> On Mon, Feb 2, 2026 at 4:16 AM YinFengwei <fengwei_yin@linux.alibaba.com> wrote:
>> + Chenchen as he hit the exact same issue.
>>
>> Hi Eric,
>>
>>> Yin Fengwei reported an RCU stall in ptype_seq_show() and provided a patch.
>>>
>>> Real issue is that (struct packet_type)->dev needs RCU protection:
>>>
>>> ptype_seq_show() runs under rcu_read_lock(), and reads pt->dev
>>> to get device name without any barrier.
>>>
>>> At the same time, concurrent writer can remove a packet_type structure
>>> (which is correctly freed after an RCU grace period) _and_ clear pt->dev
>>> without an RCU grace period.
>>>
>>> Fix this issue by using proper RCU on pt->dev pointer.
>> Still can hit this issue with same backtrace even with this fixing patch.
>>
>> Look at the __dev_remove_pack(), the pt->list is protected by ptype_lock
>> while prot_hook.dev is protected by bind_lock. Could it make sure the
>> procfs interface see either the list element with prot_hook.dev not NULL
>> or can't see the list element with NULL prot_hook.dev?
>>
>> Regards
>> Yin, Fengwei
> Please share the new stack trace (with the symbols), or the repro, thanks !

Hi, Eric.
I encountered a similar issue.
https://lore.kernel.org/all/20260128112348.3950437-1-dongchenchen2@huawei.com/
we can reproduce it using the following method[1][2][3].

I think the reason why protecting only dev with RCU cannot fix this problem
is that pt->list.next is not protected by RCU.

list_del_rcu
__list_del
     next->prev = prev;
     WRITE_ONCE(prev->next, next);

While traversing the ptype sequence, once the nxt pointer is obtained,
pt can be concurrently deleted or modified. When pt->dev does not correspond
to the list that pt is currently in, the list head detection will no longer
work correctly.

CPU1				CPU2
ptype_seq_next
     nxt = pt->list.next;
     //nxt = ptype_head(pt) = dev->ptype_all
				packet_release/packet_notifier
				    unregister_prot_hook(sk, false);
				    //no sync wait, pt->list.next not change
				    rcu_assign_pointer(po->prot_hook.dev, NULL);
     dev = rcu_dereference(pt->dev);
     if (pt->dev) //check fail
         if (nxt != &pt->dev->ptype_all)
             goto found;
     if (nxt != &ptype_all) //check success
         goto found;
found:
     return list_entry(nxt, struct packet_type, list);
     //return list head to seq traversal

-----
Best Regards,
Dong Chenchen

[1] gcc packet_type.c -o packet_type_test
#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <netinet/in.h>
#include <linux/if_packet.h>
#include <linux/sockios.h>
#include <linux/if_ether.h>
#include <linux/if_arp.h>
#include <linux/if_tun.h>
#include <fcntl.h>
#include <sys/uio.h>
#include <sched.h>

#define TUN_NAME "tun0"

/*
 * Opens /dev/net/tun and issues TUNSETIFF on it with ifr_flags = flags and,
 * when dev_name is non-NULL, ifr_name = dev_name (truncated to fit).
 *
 * Returns the open descriptor on success.  Returns -1 if the open fails, or
 * the negative ioctl() result if TUNSETIFF fails; in both cases the error has
 * already been reported with perror() and no descriptor is left open.
 */
int tun_create(char *dev_name, int flags) {
     struct ifreq ifr;
     int fd, err;

     if ((fd = open("/dev/net/tun", O_RDWR)) < 0) {
         perror("open /dev/net/tun");
         return -1;
     }
     memset(&ifr, 0, sizeof(ifr));
     ifr.ifr_flags = flags;

     if (dev_name) {
         /*
          * snprintf() always NUL-terminates.  strncpy() with the full buffer
          * size would leave ifr_name unterminated for a name of IFNAMSIZ
          * bytes or more.
          */
         snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s", dev_name);
     }
     if ((err = ioctl(fd, TUNSETIFF, (void *)&ifr)) < 0) {
         perror("ioctl TUNSETIFF");
         close(fd);
         return err;
     }
     return fd;
}

/*
 * ORs `flag` into the interface flags of dev_name: reads the current flags
 * with SIOCGIFFLAGS and writes them back with SIOCSIFFLAGS, using a
 * temporary AF_INET datagram socket.  Failures are reported with perror();
 * nothing is returned to the caller.
 */
void tun_set(const char *dev_name, short flag) {
     int sockfd;
     struct ifreq ifr;

     sockfd = socket(AF_INET, SOCK_DGRAM, 0);
     if (sockfd < 0) {
         perror("socket");
         return;
     }
     memset(&ifr, 0, sizeof(ifr));
     /*
      * snprintf() always NUL-terminates.  strncpy() with the full buffer size
      * would leave ifr_name unterminated for a name of IFNAMSIZ bytes or more.
      */
     snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s", dev_name);
     if (ioctl(sockfd, SIOCGIFFLAGS, &ifr) < 0) {
         perror("ioctl SIOCGIFFLAGS");
         close(sockfd);
         return;
     }
     ifr.ifr_flags |= flag;
     if (ioctl(sockfd, SIOCSIFFLAGS, &ifr) < 0) {
         perror("ioctl SIOCSIFFLAGS");
         close(sockfd);
         return;
     }
     close(sockfd);
}

/*
 * Reproducer driver: moves into a new network namespace, creates and brings
 * up the TUN_NAME device, opens an AF_PACKET/ETH_P_ALL socket and binds it to
 * that device, sleeps for one second, then closes the socket and the tun
 * descriptor.  Exits with status 1 as soon as any setup step fails.
 */
int main() {
     int tunfd, sockfd;
     struct ifreq ifr;
     struct sockaddr_ll sll;

     /* Without a private namespace the device would be created in the
      * caller's namespace, so do not continue if this fails. */
     if (unshare(CLONE_NEWNET) < 0) {
         perror("unshare");
         exit(1);
     }
     tunfd = tun_create(TUN_NAME, IFF_TUN | IFF_NO_PI);
     if (tunfd < 0) {
         /* tun_create() has already reported the error. */
         exit(1);
     }
     tun_set(TUN_NAME, IFF_UP);
     sockfd = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
     if (sockfd < 0) {
         /* Report before close() so that errno still belongs to socket(). */
         perror("socket");
         close(tunfd);
         exit(1);
     }

     memset(&ifr, 0, sizeof(ifr));
     /* snprintf() always NUL-terminates ifr_name, unlike strncpy() with the
      * full buffer size. */
     snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s", TUN_NAME);

     if (ioctl(sockfd, SIOCGIFINDEX, &ifr) < 0) {
         perror("ioctl SIOCGIFINDEX");
         close(sockfd);
         close(tunfd);
         exit(1);
     }

     memset(&sll, 0, sizeof(sll));
     sll.sll_family = AF_PACKET;
     sll.sll_protocol = htons(ETH_P_ALL);
     sll.sll_ifindex = ifr.ifr_ifindex;
     sll.sll_pkttype = PACKET_HOST;
     sll.sll_hatype = ARPHRD_ETHER;

     if (bind(sockfd, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
         perror("bind");
         close(sockfd);
         close(tunfd);
         exit(1);
     }
     printf("begin sleep\n");
     sleep(1);

     close(sockfd);
     close(tunfd);
     return 0;
}

[2] add delay to ptype_seq_next
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 160dd729178f..73f5a20ef57c 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -230,11 +230,14 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
  
         pt = v;
         nxt = pt->list.next;
+       if (pt->dev)
+               mdelay(5000);
         dev = rcu_dereference(pt->dev);

[3] run_test.sh
# Launch the reproducer in the background.
./packet_type_test &
# Also in the background, read net/ptype through the /proc entry of the
# reproducer process (looked up by exact name with pgrep -x).
cat /proc/$(pgrep -x "packet_type_test")/net/ptype &


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH net] net: add RCU protection to (struct packet_type)->dev
  2026-02-02  8:21     ` dongchenchen (A)
@ 2026-02-02  8:22       ` Eric Dumazet
  2026-02-02  8:47       ` Eric Dumazet
  1 sibling, 0 replies; 10+ messages in thread
From: Eric Dumazet @ 2026-02-02  8:22 UTC (permalink / raw)
  To: dongchenchen (A)
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Willem de Bruijn, netdev, eric.dumazet, fengwei_yin,
	zhangchangzhong

On Mon, Feb 2, 2026 at 9:21 AM dongchenchen (A)
<dongchenchen2@huawei.com> wrote:
>
>
> > On Mon, Feb 2, 2026 at 4:16 AM YinFengwei <fengwei_yin@linux.alibaba.com> wrote:
> >> + Chenchen as he hit the exact same issue.
> >>
> >> Hi Eric,
> >>
> >>> Yin Fengwei reported an RCU stall in ptype_seq_show() and provided a patch.
> >>>
> >>> Real issue is that (struct packet_type)->dev needs RCU protection:
> >>>
> >>> ptype_seq_show() runs under rcu_read_lock(), and reads pt->dev
> >>> to get device name without any barrier.
> >>>
> >>> At the same time, concurrent writer can remove a packet_type structure
> >>> (which is correctly freed after an RCU grace period) _and_ clear pt->dev
> >>> without an RCU grace period.
> >>>
> >>> Fix this issue by using proper RCU on pt->dev pointer.
> >> Still can hit this issue with same backtrace even with this fixing patch.
> >>
> >> Look at the __dev_remove_pack(), the pt->list is protected by ptype_lock
> >> while prot_hook.dev is protected by bind_lock. Could it make sure the
> >> procfs interface see either the list element with prot_hook.dev not NULL
> >> or can't see the list element with NULL prot_hook.dev?
> >>
> >> Regards
> >> Yin, Fengwei
> > Please share the new stack trace (with the symbols), or the repro, thanks !
>
> Hi, Eric.
> I encountered a similar issue.
> https://lore.kernel.org/all/20260128112348.3950437-1-dongchenchen2@huawei.com/
> we can reproduce it using the following method[1][2][3].
>
> I think the reason why using only rcu to protect dev cannot fix this problem
> is that pt->head.nxt is not protected by rcu.
>
> list_del_rcu
> __list_del
>      next->prev = prev;
>      WRITE_ONCE(prev->next, next);
>
> While traversing the ptype sequence, once the nxt pointer is obtained,
> pt can be concurrently deleted or modified. When pt->dev does not correspond
> to the list that pt is currently in, the list head detection will no longer
> work correctly.
>
> CPU1                            CPU2
> ptype_seq_next
>      nxt = pt->list.next;
>      //nxt = ptype_head(pt) = dev->ptype_all
>                                 packet_release/packet_notifier
>                                     unregister_prot_hook(sk, false);
>                                     //no sync wait, pt->list.next not change
>                                     rcu_assign_pointer(po->prot_hook.dev, NULL);
>      dev = rcu_dereference(pt->dev);
>      if (pt->dev) //check fail
>          if (nxt != &pt->dev->ptype_all)
>              goto found;
>      if (nxt != &ptype_all) //check success
>          goto found;
> found:
>      return list_entry(nxt, struct packet_type, list);
>      //return list head to seq traversal
>
> -----
> Best Regards,
> Dong Chenchen

Thanks a lot, I will take a look.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net] net: add RCU protection to (struct packet_type)->dev
  2026-02-02  8:21     ` dongchenchen (A)
  2026-02-02  8:22       ` Eric Dumazet
@ 2026-02-02  8:47       ` Eric Dumazet
  2026-02-02  9:14         ` Eric Dumazet
  1 sibling, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2026-02-02  8:47 UTC (permalink / raw)
  To: dongchenchen (A)
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Willem de Bruijn, netdev, eric.dumazet, fengwei_yin,
	zhangchangzhong

On Mon, Feb 2, 2026 at 9:21 AM dongchenchen (A)
<dongchenchen2@huawei.com> wrote:
>
>
> }
>
> [2] add delay to pype_seq_next
> diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
> index 160dd729178f..73f5a20ef57c 100644
> --- a/net/core/net-procfs.c
> +++ b/net/core/net-procfs.c
> @@ -230,11 +230,14 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>
>          pt = v;
>          nxt = pt->list.next;
> +       if (pt->dev)
> +               mdelay(5000);

Waiting 5 seconds while holding RCU is going to fire the soft lockup
detection after ~two calls to ptype_seq_next()

What is the actual trace you got, and what happens if you reduce to
mdelay(1000) ?

>          dev = rcu_dereference(pt->dev);
>
> [3] run_test.sh
> ./packet_type_test &
> cat /proc/$(pgrep -x "packet_type_test")/net/ptype &
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net] net: add RCU protection to (struct packet_type)->dev
  2026-02-02  8:47       ` Eric Dumazet
@ 2026-02-02  9:14         ` Eric Dumazet
  2026-02-02 10:10           ` Eric Dumazet
  0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2026-02-02  9:14 UTC (permalink / raw)
  To: dongchenchen (A)
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Willem de Bruijn, netdev, eric.dumazet, fengwei_yin,
	zhangchangzhong

On Mon, Feb 2, 2026 at 9:47 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Mon, Feb 2, 2026 at 9:21 AM dongchenchen (A)
> <dongchenchen2@huawei.com> wrote:
> >
> >
> > }
> >
> > [2] add delay to pype_seq_next
> > diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
> > index 160dd729178f..73f5a20ef57c 100644
> > --- a/net/core/net-procfs.c
> > +++ b/net/core/net-procfs.c
> > @@ -230,11 +230,14 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> >
> >          pt = v;
> >          nxt = pt->list.next;
> > +       if (pt->dev)
> > +               mdelay(5000);
>
> Waiting 5 seconds while holding RCU is going to fire the soft lockup
> detection after ~two calls to ptype_seq_next()
>
> What is the actual trace you got, and what happens if you reduce to
> mdelay(1000) ?

OK, I took a look, and ptype_get_idx() needs to return information
about what list the returned pt is in.

I will send a V2, thanks.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net] net: add RCU protection to (struct packet_type)->dev
  2026-02-02  9:14         ` Eric Dumazet
@ 2026-02-02 10:10           ` Eric Dumazet
  2026-02-02 11:27             ` dongchenchen (A)
  0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2026-02-02 10:10 UTC (permalink / raw)
  To: dongchenchen (A)
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Willem de Bruijn, netdev, eric.dumazet, fengwei_yin,
	zhangchangzhong

On Mon, Feb 2, 2026 at 10:14 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Mon, Feb 2, 2026 at 9:47 AM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Mon, Feb 2, 2026 at 9:21 AM dongchenchen (A)
> > <dongchenchen2@huawei.com> wrote:
> > >
> > >
> > > }
> > >
> > > [2] add delay to pype_seq_next
> > > diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
> > > index 160dd729178f..73f5a20ef57c 100644
> > > --- a/net/core/net-procfs.c
> > > +++ b/net/core/net-procfs.c
> > > @@ -230,11 +230,14 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
> > >
> > >          pt = v;
> > >          nxt = pt->list.next;
> > > +       if (pt->dev)
> > > +               mdelay(5000);
> >
> > Waiting 5 seconds while holding RCU is going to fire the soft lockup
> > detection after ~two calls to ptype_seq_next()
> >
> > What is the actual trace you got, and what happens if you reduce to
> > mdelay(1000) ?
>
> OK, I took a look, and ptype_get_idx() needs to return information
> about what list the returned pt is in.
>
> I will send a V2 ,thanks.

Here is what I will squash, the diff against V1 is :

diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 160dd729178fd37a6340148d9e35f95bd92aecdb..ad63556c9e0abd15cbfac7777c31894d2eef037b
100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -170,8 +170,15 @@ static const struct seq_operations softnet_seq_ops = {
        .show  = softnet_seq_show,
 };

+
+struct ptype_iter_state {
+       struct seq_net_private  p;
+       struct net_device       *dev;
+};
+
 static void *ptype_get_idx(struct seq_file *seq, loff_t pos)
 {
+       struct ptype_iter_state *iter = seq->private;
        struct list_head *ptype_list = NULL;
        struct packet_type *pt = NULL;
        struct net_device *dev;
@@ -181,12 +188,16 @@ static void *ptype_get_idx(struct seq_file *seq,
loff_t pos)
        for_each_netdev_rcu(seq_file_net(seq), dev) {
                ptype_list = &dev->ptype_all;
                list_for_each_entry_rcu(pt, ptype_list, list) {
-                       if (i == pos)
+                       if (i == pos) {
+                               iter->dev = dev;
                                return pt;
+                       }
                        ++i;
                }
        }

+       iter->dev = NULL;
+
        list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_all, list) {
                if (i == pos)
                        return pt;
@@ -218,6 +229,7 @@ static void *ptype_seq_start(struct seq_file *seq,
loff_t *pos)

 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
+       struct ptype_iter_state *iter = seq->private;
        struct net *net = seq_file_net(seq);
        struct net_device *dev;
        struct packet_type *pt;
@@ -229,19 +241,21 @@ static void *ptype_seq_next(struct seq_file
*seq, void *v, loff_t *pos)
                return ptype_get_idx(seq, 0);

        pt = v;
-       nxt = pt->list.next;
-       dev = rcu_dereference(pt->dev);
+       nxt = READ_ONCE(pt->list.next);
+       dev = iter->dev;
        if (dev) {
                if (nxt != &dev->ptype_all)
                        goto found;

                for_each_netdev_continue_rcu(seq_file_net(seq), dev) {
-                       if (!list_empty(&dev->ptype_all)) {
-                               nxt = dev->ptype_all.next;
+                       nxt = READ_ONCE(dev->ptype_all.next);
+                       if (nxt != &dev->ptype_all) {
+                               iter->dev = NULL;
                                goto found;
                        }
                }
-               nxt = net->ptype_all.next;
+               iter->dev = NULL;
+               nxt = READ_ONCE(net->ptype_all.next);
                goto net_ptype_all;
        }

@@ -252,20 +266,20 @@ static void *ptype_seq_next(struct seq_file
*seq, void *v, loff_t *pos)

                if (nxt == &net->ptype_all) {
                        /* continue with ->ptype_specific if it's not empty */
-                       nxt = net->ptype_specific.next;
+                       nxt = READ_ONCE(net->ptype_specific.next);
                        if (nxt != &net->ptype_specific)
                                goto found;
                }

                hash = 0;
-               nxt = ptype_base[0].next;
+               nxt = READ_ONCE(ptype_base[0].next);
        } else
                hash = ntohs(pt->type) & PTYPE_HASH_MASK;

        while (nxt == &ptype_base[hash]) {
                if (++hash >= PTYPE_HASH_SIZE)
                        return NULL;
-               nxt = ptype_base[hash].next;
+               nxt = READ_ONCE(ptype_base[hash].next);
        }
 found:
        return list_entry(nxt, struct packet_type, list);
@@ -279,6 +293,7 @@ static void ptype_seq_stop(struct seq_file *seq, void *v)

 static int ptype_seq_show(struct seq_file *seq, void *v)
 {
+       struct ptype_iter_state *iter = seq->private;
        struct packet_type *pt = v;
        struct net_device *dev;

@@ -286,7 +301,7 @@ static int ptype_seq_show(struct seq_file *seq, void *v)
                seq_puts(seq, "Type Device      Function\n");
                return 0;
        }
-       dev = rcu_dereference(pt->dev);
+       dev = iter->dev;
        if ((!pt->af_packet_net || net_eq(pt->af_packet_net,
seq_file_net(seq))) &&
                 (!dev || net_eq(dev_net(dev), seq_file_net(seq)))) {
                if (pt->type == htons(ETH_P_ALL))
@@ -319,7 +334,7 @@ static int __net_init dev_proc_net_init(struct net *net)
                         &softnet_seq_ops))
                goto out_dev;
        if (!proc_create_net("ptype", 0444, net->proc_net, &ptype_seq_ops,
-                       sizeof(struct seq_net_private)))
+                       sizeof(struct ptype_iter_state)))
                goto out_softnet;

        if (wext_proc_init(net))

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH net] net: add RCU protection to (struct packet_type)->dev
  2026-02-02 10:10           ` Eric Dumazet
@ 2026-02-02 11:27             ` dongchenchen (A)
  0 siblings, 0 replies; 10+ messages in thread
From: dongchenchen (A) @ 2026-02-02 11:27 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Willem de Bruijn, netdev, eric.dumazet, fengwei_yin,
	zhangchangzhong


> On Mon, Feb 2, 2026 at 10:14 AM Eric Dumazet <edumazet@google.com> wrote:
>> On Mon, Feb 2, 2026 at 9:47 AM Eric Dumazet <edumazet@google.com> wrote:
>>> On Mon, Feb 2, 2026 at 9:21 AM dongchenchen (A)
>>> <dongchenchen2@huawei.com> wrote:
>>>>
>>>> }
>>>>
>>>> [2] add delay to ptype_seq_next
>>>> diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
>>>> index 160dd729178f..73f5a20ef57c 100644
>>>> --- a/net/core/net-procfs.c
>>>> +++ b/net/core/net-procfs.c
>>>> @@ -230,11 +230,14 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>>>>
>>>>           pt = v;
>>>>           nxt = pt->list.next;
>>>> +       if (pt->dev)
>>>> +               mdelay(5000);
>>> Waiting 5 seconds while holding RCU is going to fire the soft lockup
>>> detection after ~two calls to ptype_seq_next()
>>>
>>> What is the actual trace you got, and what happens if you reduce to
>>> mdelay(1000) ?
>> OK, I took a look, and ptype_get_idx() needs to return information
>> about what list the returned pt is in.
>>
>> I will send a V2, thanks.
> Here is what I will squash; the diff against V1 is:
>
> diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
> index 160dd729178fd37a6340148d9e35f95bd92aecdb..ad63556c9e0abd15cbfac7777c31894d2eef037b
> 100644
> --- a/net/core/net-procfs.c
> +++ b/net/core/net-procfs.c
> @@ -170,8 +170,15 @@ static const struct seq_operations softnet_seq_ops = {
>          .show  = softnet_seq_show,
>   };
>
> +
> +struct ptype_iter_state {
> +       struct seq_net_private  p;
> +       struct net_device       *dev;
> +};
> +
>   static void *ptype_get_idx(struct seq_file *seq, loff_t pos)
>   {
> +       struct ptype_iter_state *iter = seq->private;
>          struct list_head *ptype_list = NULL;
>          struct packet_type *pt = NULL;
>          struct net_device *dev;
> @@ -181,12 +188,16 @@ static void *ptype_get_idx(struct seq_file *seq,
> loff_t pos)
>          for_each_netdev_rcu(seq_file_net(seq), dev) {
>                  ptype_list = &dev->ptype_all;
>                  list_for_each_entry_rcu(pt, ptype_list, list) {
> -                       if (i == pos)
> +                       if (i == pos) {
> +                               iter->dev = dev;
>                                  return pt;
> +                       }
>                          ++i;
>                  }
>          }
>
> +       iter->dev = NULL;
> +
>          list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_all, list) {
>                  if (i == pos)
>                          return pt;
> @@ -218,6 +229,7 @@ static void *ptype_seq_start(struct seq_file *seq,
> loff_t *pos)
>
>   static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
>   {
> +       struct ptype_iter_state *iter = seq->private;
>          struct net *net = seq_file_net(seq);
>          struct net_device *dev;
>          struct packet_type *pt;
> @@ -229,19 +241,21 @@ static void *ptype_seq_next(struct seq_file
> *seq, void *v, loff_t *pos)
>                  return ptype_get_idx(seq, 0);
>
>          pt = v;
> -       nxt = pt->list.next;
> -       dev = rcu_dereference(pt->dev);
> +       nxt = READ_ONCE(pt->list.next);
> +       dev = iter->dev;
>          if (dev) {
>                  if (nxt != &dev->ptype_all)
>                          goto found;
>
>                  for_each_netdev_continue_rcu(seq_file_net(seq), dev) {
> -                       if (!list_empty(&dev->ptype_all)) {
> -                               nxt = dev->ptype_all.next;
> +                       nxt = READ_ONCE(dev->ptype_all.next);
> +                       if (nxt != &dev->ptype_all) {
> +                               iter->dev = NULL;
>                                  goto found;
>                          }
>                  }
> -               nxt = net->ptype_all.next;
> +               iter->dev = NULL;
> +               nxt = READ_ONCE(net->ptype_all.next);
>                  goto net_ptype_all;
>          }
>
> @@ -252,20 +266,20 @@ static void *ptype_seq_next(struct seq_file
> *seq, void *v, loff_t *pos)
>
>                  if (nxt == &net->ptype_all) {
>                          /* continue with ->ptype_specific if it's not empty */
> -                       nxt = net->ptype_specific.next;
> +                       nxt = READ_ONCE(net->ptype_specific.next);
>                          if (nxt != &net->ptype_specific)
>                                  goto found;
>                  }
>
>                  hash = 0;
> -               nxt = ptype_base[0].next;
> +               nxt = READ_ONCE(ptype_base[0].next);
>          } else
>                  hash = ntohs(pt->type) & PTYPE_HASH_MASK;
>
>          while (nxt == &ptype_base[hash]) {
>                  if (++hash >= PTYPE_HASH_SIZE)
>                          return NULL;
> -               nxt = ptype_base[hash].next;
> +               nxt = READ_ONCE(ptype_base[hash].next);
>          }
>   found:
>          return list_entry(nxt, struct packet_type, list);
> @@ -279,6 +293,7 @@ static void ptype_seq_stop(struct seq_file *seq, void *v)
>
>   static int ptype_seq_show(struct seq_file *seq, void *v)
>   {
> +       struct ptype_iter_state *iter = seq->private;
>          struct packet_type *pt = v;
>          struct net_device *dev;
>
> @@ -286,7 +301,7 @@ static int ptype_seq_show(struct seq_file *seq, void *v)
>                  seq_puts(seq, "Type Device      Function\n");
>                  return 0;
>          }
> -       dev = rcu_dereference(pt->dev);
> +       dev = iter->dev;
>          if ((!pt->af_packet_net || net_eq(pt->af_packet_net,
> seq_file_net(seq))) &&
>                   (!dev || net_eq(dev_net(dev), seq_file_net(seq)))) {
>                  if (pt->type == htons(ETH_P_ALL))
> @@ -319,7 +334,7 @@ static int __net_init dev_proc_net_init(struct net *net)
>                           &softnet_seq_ops))
>                  goto out_dev;
>          if (!proc_create_net("ptype", 0444, net->proc_net, &ptype_seq_ops,
> -                       sizeof(struct seq_net_private)))
> +                       sizeof(struct ptype_iter_state)))
>                  goto out_softnet;
>
>          if (wext_proc_init(net))

I verified that this issue can be fixed with this patch.

Thanks a lot!

-----

Best Regards,
Dong Chenchen


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2026-02-02 11:27 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-01-31 21:29 [PATCH net] net: add RCU protection to (struct packet_type)->dev Eric Dumazet
2026-02-02  3:16 ` YinFengwei
2026-02-02  4:19   ` Eric Dumazet
2026-02-02  7:06     ` YinFengwei
2026-02-02  8:21     ` dongchenchen (A)
2026-02-02  8:22       ` Eric Dumazet
2026-02-02  8:47       ` Eric Dumazet
2026-02-02  9:14         ` Eric Dumazet
2026-02-02 10:10           ` Eric Dumazet
2026-02-02 11:27             ` dongchenchen (A)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox