All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 net-next 0/4] inet: frags: fully use RCU
@ 2025-03-12  8:22 Eric Dumazet
  2025-03-12  8:22 ` [PATCH v2 net-next 1/4] inet: frags: add inet_frag_putn() helper Eric Dumazet
                   ` (4 more replies)
  0 siblings, 5 replies; 7+ messages in thread
From: Eric Dumazet @ 2025-03-12  8:22 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: David Ahern, Simon Horman, netdev, eric.dumazet, Eric Dumazet

While inet reassembly uses RCU, it is acquiring/releasing
a refcount on struct inet_frag_queue in fast path,
for no good reason.

This was mentioned in one patch changelog seven years ago :/

This series is removing these refcount changes, by extending
RCU sections.

v2: fix a compile error in net/ieee802154/6lowpan/reassembly.c (kbot)

Eric Dumazet (4):
  inet: frags: add inet_frag_putn() helper
  ipv4: frags: remove ipq_put()
  inet: frags: change inet_frag_kill() to defer refcount updates
  inet: frags: save a pair of atomic operations in reassembly

 include/net/inet_frag.h                 |  6 ++--
 include/net/ipv6_frag.h                 |  5 +--
 net/ieee802154/6lowpan/reassembly.c     | 27 ++++++++------
 net/ipv4/inet_fragment.c                | 31 ++++++++--------
 net/ipv4/ip_fragment.c                  | 48 ++++++++++---------------
 net/ipv6/netfilter/nf_conntrack_reasm.c | 27 ++++++++------
 net/ipv6/reassembly.c                   | 29 +++++++--------
 7 files changed, 89 insertions(+), 84 deletions(-)

-- 
2.49.0.rc0.332.g42c0ae87b1-goog


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v2 net-next 1/4] inet: frags: add inet_frag_putn() helper
  2025-03-12  8:22 [PATCH v2 net-next 0/4] inet: frags: fully use RCU Eric Dumazet
@ 2025-03-12  8:22 ` Eric Dumazet
  2025-03-12  8:22 ` [PATCH v2 net-next 2/4] ipv4: frags: remove ipq_put() Eric Dumazet
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Eric Dumazet @ 2025-03-12  8:22 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: David Ahern, Simon Horman, netdev, eric.dumazet, Eric Dumazet

inet_frag_putn() can release multiple references
in one step.

Use it in inet_frags_free_cb().

Replace inet_frag_put(X) with inet_frag_putn(X, 1)

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/inet_frag.h                 | 4 ++--
 include/net/ipv6_frag.h                 | 3 ++-
 net/ieee802154/6lowpan/reassembly.c     | 7 ++++---
 net/ipv4/inet_fragment.c                | 3 +--
 net/ipv4/ip_fragment.c                  | 2 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c | 3 ++-
 net/ipv6/reassembly.c                   | 4 ++--
 7 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 5af6eb14c5db1533d559f565deb607934626219d..26687ad0b14142e904b66acee4c98537fa8077c3 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -145,9 +145,9 @@ struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key);
 unsigned int inet_frag_rbtree_purge(struct rb_root *root,
 				    enum skb_drop_reason reason);
 
-static inline void inet_frag_put(struct inet_frag_queue *q)
+static inline void inet_frag_putn(struct inet_frag_queue *q, int refs)
 {
-	if (refcount_dec_and_test(&q->refcnt))
+	if (refs && refcount_sub_and_test(refs, &q->refcnt))
 		inet_frag_destroy(q);
 }
 
diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h
index 7321ffe3a108c159490ae358b8c2cfca958055a4..9d968d7d9fa402a4dd5e0f3d458bacbdcd49da77 100644
--- a/include/net/ipv6_frag.h
+++ b/include/net/ipv6_frag.h
@@ -66,6 +66,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
 {
 	struct net_device *dev = NULL;
 	struct sk_buff *head;
+	int refs = 1;
 
 	rcu_read_lock();
 	/* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */
@@ -109,7 +110,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
 	spin_unlock(&fq->q.lock);
 out_rcu_unlock:
 	rcu_read_unlock();
-	inet_frag_put(&fq->q);
+	inet_frag_putn(&fq->q, refs);
 }
 
 /* Check if the upper layer header is truncated in the first fragment. */
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 867d637d86f08887b8312ebb2c6fc64da42d7df4..2df1a027e68a1a839388ba088d2fb49a10914f91 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -45,6 +45,7 @@ static void lowpan_frag_expire(struct timer_list *t)
 {
 	struct inet_frag_queue *frag = from_timer(frag, t, timer);
 	struct frag_queue *fq;
+	int refs = 1;
 
 	fq = container_of(frag, struct frag_queue, q);
 
@@ -56,7 +57,7 @@ static void lowpan_frag_expire(struct timer_list *t)
 	inet_frag_kill(&fq->q);
 out:
 	spin_unlock(&fq->q.lock);
-	inet_frag_put(&fq->q);
+	inet_frag_putn(&fq->q, refs);
 }
 
 static inline struct lowpan_frag_queue *
@@ -302,13 +303,13 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
 
 	fq = fq_find(net, cb, &hdr.source, &hdr.dest);
 	if (fq != NULL) {
-		int ret;
+		int ret, refs = 1;
 
 		spin_lock(&fq->q.lock);
 		ret = lowpan_frag_queue(fq, skb, frag_type);
 		spin_unlock(&fq->q.lock);
 
-		inet_frag_put(&fq->q);
+		inet_frag_putn(&fq->q, refs);
 		return ret;
 	}
 
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index d179a2c8422276f99d6e8be599a36b50e1163380..efc4cbee04c272b1afaf5f2d63b11040c22c11e5 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -145,8 +145,7 @@ static void inet_frags_free_cb(void *ptr, void *arg)
 	}
 	spin_unlock_bh(&fq->lock);
 
-	if (refcount_sub_and_test(count, &fq->refcnt))
-		inet_frag_destroy(fq);
+	inet_frag_putn(fq, count);
 }
 
 static LLIST_HEAD(fqdir_free_list);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 7a435746a22dee9f11c0dc732a8b5a7724f4eea3..474a26191c89802b7a71212968d89ad2085ae476 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -112,7 +112,7 @@ static void ip4_frag_free(struct inet_frag_queue *q)
 
 static void ipq_put(struct ipq *ipq)
 {
-	inet_frag_put(&ipq->q);
+	inet_frag_putn(&ipq->q, 1);
 }
 
 /* Kill ipq entry. It is not destroyed immediately,
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 4120e67a8ce6bb44e3396ec6f66e564aae97aeec..fcf969f9fe2ab57b9bdb30434b933b863e3d3369 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -447,6 +447,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 	struct frag_hdr *fhdr;
 	struct frag_queue *fq;
 	struct ipv6hdr *hdr;
+	int refs = 1;
 	u8 prevhdr;
 
 	/* Jumbo payload inhibits frag. header */
@@ -489,7 +490,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 	}
 
 	spin_unlock_bh(&fq->q.lock);
-	inet_frag_put(&fq->q);
+	inet_frag_putn(&fq->q, refs);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index a48be617a8ab5e0ece9b01b427abe410cafef8bc..5d56a8e5166bcb3a5a169a95029bc14180ef9323 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -380,7 +380,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 	fq = fq_find(net, fhdr->identification, hdr, iif);
 	if (fq) {
 		u32 prob_offset = 0;
-		int ret;
+		int ret, refs = 1;
 
 		spin_lock(&fq->q.lock);
 
@@ -389,7 +389,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 				     &prob_offset);
 
 		spin_unlock(&fq->q.lock);
-		inet_frag_put(&fq->q);
+		inet_frag_putn(&fq->q, refs);
 		if (prob_offset) {
 			__IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
 					IPSTATS_MIB_INHDRERRORS);
-- 
2.49.0.rc0.332.g42c0ae87b1-goog


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v2 net-next 2/4] ipv4: frags: remove ipq_put()
  2025-03-12  8:22 [PATCH v2 net-next 0/4] inet: frags: fully use RCU Eric Dumazet
  2025-03-12  8:22 ` [PATCH v2 net-next 1/4] inet: frags: add inet_frag_putn() helper Eric Dumazet
@ 2025-03-12  8:22 ` Eric Dumazet
  2025-03-12  8:22 ` [PATCH v2 net-next 3/4] inet: frags: change inet_frag_kill() to defer refcount updates Eric Dumazet
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Eric Dumazet @ 2025-03-12  8:22 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: David Ahern, Simon Horman, netdev, eric.dumazet, Eric Dumazet

Replace ipq_put() with inet_frag_putn()

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/ip_fragment.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 474a26191c89802b7a71212968d89ad2085ae476..ee953be49b34dd63443e5621e7c2f2b61cf7d914 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -107,14 +107,6 @@ static void ip4_frag_free(struct inet_frag_queue *q)
 		inet_putpeer(qp->peer);
 }
 
-
-/* Destruction primitives. */
-
-static void ipq_put(struct ipq *ipq)
-{
-	inet_frag_putn(&ipq->q, 1);
-}
-
 /* Kill ipq entry. It is not destroyed immediately,
  * because caller (and someone more) holds reference count.
  */
@@ -143,6 +135,7 @@ static void ip_expire(struct timer_list *t)
 	struct sk_buff *head = NULL;
 	struct net *net;
 	struct ipq *qp;
+	int refs = 1;
 
 	qp = container_of(frag, struct ipq, q);
 	net = qp->q.fqdir->net;
@@ -202,7 +195,7 @@ static void ip_expire(struct timer_list *t)
 out_rcu_unlock:
 	rcu_read_unlock();
 	kfree_skb_reason(head, reason);
-	ipq_put(qp);
+	inet_frag_putn(&qp->q, refs);
 }
 
 /* Find the correct entry in the "incomplete datagrams" queue for
@@ -498,14 +491,14 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
 	/* Lookup (or create) queue header */
 	qp = ip_find(net, ip_hdr(skb), user, vif);
 	if (qp) {
-		int ret;
+		int ret, refs = 1;
 
 		spin_lock(&qp->q.lock);
 
 		ret = ip_frag_queue(qp, skb);
 
 		spin_unlock(&qp->q.lock);
-		ipq_put(qp);
+		inet_frag_putn(&qp->q, refs);
 		return ret;
 	}
 
-- 
2.49.0.rc0.332.g42c0ae87b1-goog


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v2 net-next 3/4] inet: frags: change inet_frag_kill() to defer refcount updates
  2025-03-12  8:22 [PATCH v2 net-next 0/4] inet: frags: fully use RCU Eric Dumazet
  2025-03-12  8:22 ` [PATCH v2 net-next 1/4] inet: frags: add inet_frag_putn() helper Eric Dumazet
  2025-03-12  8:22 ` [PATCH v2 net-next 2/4] ipv4: frags: remove ipq_put() Eric Dumazet
@ 2025-03-12  8:22 ` Eric Dumazet
  2025-03-12  8:22 ` [PATCH v2 net-next 4/4] inet: frags: save a pair of atomic operations in reassembly Eric Dumazet
  2025-03-18 12:30 ` [PATCH v2 net-next 0/4] inet: frags: fully use RCU patchwork-bot+netdevbpf
  4 siblings, 0 replies; 7+ messages in thread
From: Eric Dumazet @ 2025-03-12  8:22 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: David Ahern, Simon Horman, netdev, eric.dumazet, Eric Dumazet

In the following patch, we no longer assume inet_frag_kill()
callers own a reference.

Consuming two refcounts from inet_frag_kill() would lead in UAF.

Propagate the pointer to the refs that will be consumed later
by the final inet_frag_putn() call.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/inet_frag.h                 |  2 +-
 include/net/ipv6_frag.h                 |  2 +-
 net/ieee802154/6lowpan/reassembly.c     | 17 ++++++++------
 net/ipv4/inet_fragment.c                | 12 +++++-----
 net/ipv4/ip_fragment.c                  | 30 ++++++++++---------------
 net/ipv6/netfilter/nf_conntrack_reasm.c | 21 +++++++++--------
 net/ipv6/reassembly.c                   | 18 ++++++++-------
 7 files changed, 53 insertions(+), 49 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 26687ad0b14142e904b66acee4c98537fa8077c3..0eccd9c3a883fbceb9a0a740237b7245f13c5d26 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -137,7 +137,7 @@ static inline void fqdir_pre_exit(struct fqdir *fqdir)
 }
 void fqdir_exit(struct fqdir *fqdir);
 
-void inet_frag_kill(struct inet_frag_queue *q);
+void inet_frag_kill(struct inet_frag_queue *q, int *refs);
 void inet_frag_destroy(struct inet_frag_queue *q);
 struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key);
 
diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h
index 9d968d7d9fa402a4dd5e0f3d458bacbdcd49da77..38ef66826939ee561e409f528292ca4af1e29a3b 100644
--- a/include/net/ipv6_frag.h
+++ b/include/net/ipv6_frag.h
@@ -78,7 +78,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
 		goto out;
 
 	fq->q.flags |= INET_FRAG_DROP;
-	inet_frag_kill(&fq->q);
+	inet_frag_kill(&fq->q, &refs);
 
 	dev = dev_get_by_index_rcu(net, fq->iif);
 	if (!dev)
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 2df1a027e68a1a839388ba088d2fb49a10914f91..c5f86cff06ee7f95556955f994b859c86a315646 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -31,7 +31,8 @@ static const char lowpan_frags_cache_name[] = "lowpan-frags";
 static struct inet_frags lowpan_frags;
 
 static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb,
-			     struct sk_buff *prev,  struct net_device *ldev);
+			     struct sk_buff *prev, struct net_device *ldev,
+			     int *refs);
 
 static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
 {
@@ -54,7 +55,7 @@ static void lowpan_frag_expire(struct timer_list *t)
 	if (fq->q.flags & INET_FRAG_COMPLETE)
 		goto out;
 
-	inet_frag_kill(&fq->q);
+	inet_frag_kill(&fq->q, &refs);
 out:
 	spin_unlock(&fq->q.lock);
 	inet_frag_putn(&fq->q, refs);
@@ -83,7 +84,8 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
 }
 
 static int lowpan_frag_queue(struct lowpan_frag_queue *fq,
-			     struct sk_buff *skb, u8 frag_type)
+			     struct sk_buff *skb, u8 frag_type,
+			     int *refs)
 {
 	struct sk_buff *prev_tail;
 	struct net_device *ldev;
@@ -144,7 +146,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq,
 		unsigned long orefdst = skb->_skb_refdst;
 
 		skb->_skb_refdst = 0UL;
-		res = lowpan_frag_reasm(fq, skb, prev_tail, ldev);
+		res = lowpan_frag_reasm(fq, skb, prev_tail, ldev, refs);
 		skb->_skb_refdst = orefdst;
 		return res;
 	}
@@ -163,11 +165,12 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq,
  *	the last and the first frames arrived and all the bits are here.
  */
 static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb,
-			     struct sk_buff *prev_tail, struct net_device *ldev)
+			     struct sk_buff *prev_tail, struct net_device *ldev,
+			     int *refs)
 {
 	void *reasm_data;
 
-	inet_frag_kill(&fq->q);
+	inet_frag_kill(&fq->q, refs);
 
 	reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail);
 	if (!reasm_data)
@@ -306,7 +309,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
 		int ret, refs = 1;
 
 		spin_lock(&fq->q.lock);
-		ret = lowpan_frag_queue(fq, skb, frag_type);
+		ret = lowpan_frag_queue(fq, skb, frag_type, &refs);
 		spin_unlock(&fq->q.lock);
 
 		inet_frag_putn(&fq->q, refs);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index efc4cbee04c272b1afaf5f2d63b11040c22c11e5..5eb18605001387e7f23b8661dc9f24a533ab1600 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -225,10 +225,10 @@ void fqdir_exit(struct fqdir *fqdir)
 }
 EXPORT_SYMBOL(fqdir_exit);
 
-void inet_frag_kill(struct inet_frag_queue *fq)
+void inet_frag_kill(struct inet_frag_queue *fq, int *refs)
 {
 	if (del_timer(&fq->timer))
-		refcount_dec(&fq->refcnt);
+		(*refs)++;
 
 	if (!(fq->flags & INET_FRAG_COMPLETE)) {
 		struct fqdir *fqdir = fq->fqdir;
@@ -243,7 +243,7 @@ void inet_frag_kill(struct inet_frag_queue *fq)
 		if (!READ_ONCE(fqdir->dead)) {
 			rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
 					       fqdir->f->rhash_params);
-			refcount_dec(&fq->refcnt);
+			(*refs)++;
 		} else {
 			fq->flags |= INET_FRAG_HASH_DEAD;
 		}
@@ -349,9 +349,11 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
 	*prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
 						 &q->node, f->rhash_params);
 	if (*prev) {
+		int refs = 2;
+
 		q->flags |= INET_FRAG_COMPLETE;
-		inet_frag_kill(q);
-		inet_frag_destroy(q);
+		inet_frag_kill(q, &refs);
+		inet_frag_putn(q, refs);
 		return NULL;
 	}
 	return q;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index ee953be49b34dd63443e5621e7c2f2b61cf7d914..c5f3c810706fb328c3d8a4d8501424df0dceaa8e 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -76,7 +76,8 @@ static u8 ip4_frag_ecn(u8 tos)
 static struct inet_frags ip4_frags;
 
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
-			 struct sk_buff *prev_tail, struct net_device *dev);
+			 struct sk_buff *prev_tail, struct net_device *dev,
+			 int *refs);
 
 
 static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
@@ -107,14 +108,6 @@ static void ip4_frag_free(struct inet_frag_queue *q)
 		inet_putpeer(qp->peer);
 }
 
-/* Kill ipq entry. It is not destroyed immediately,
- * because caller (and someone more) holds reference count.
- */
-static void ipq_kill(struct ipq *ipq)
-{
-	inet_frag_kill(&ipq->q);
-}
-
 static bool frag_expire_skip_icmp(u32 user)
 {
 	return user == IP_DEFRAG_AF_PACKET ||
@@ -152,7 +145,7 @@ static void ip_expire(struct timer_list *t)
 		goto out;
 
 	qp->q.flags |= INET_FRAG_DROP;
-	ipq_kill(qp);
+	inet_frag_kill(&qp->q, &refs);
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
 
@@ -271,7 +264,7 @@ static int ip_frag_reinit(struct ipq *qp)
 }
 
 /* Add new segment to existing queue. */
-static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb, int *refs)
 {
 	struct net *net = qp->q.fqdir->net;
 	int ihl, end, flags, offset;
@@ -291,7 +284,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
 	    unlikely(ip_frag_too_far(qp)) &&
 	    unlikely(err = ip_frag_reinit(qp))) {
-		ipq_kill(qp);
+		inet_frag_kill(&qp->q, refs);
 		goto err;
 	}
 
@@ -375,10 +368,10 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 		unsigned long orefdst = skb->_skb_refdst;
 
 		skb->_skb_refdst = 0UL;
-		err = ip_frag_reasm(qp, skb, prev_tail, dev);
+		err = ip_frag_reasm(qp, skb, prev_tail, dev, refs);
 		skb->_skb_refdst = orefdst;
 		if (err)
-			inet_frag_kill(&qp->q);
+			inet_frag_kill(&qp->q, refs);
 		return err;
 	}
 
@@ -395,7 +388,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	err = -EINVAL;
 	__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
 discard_qp:
-	inet_frag_kill(&qp->q);
+	inet_frag_kill(&qp->q, refs);
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
 err:
 	kfree_skb_reason(skb, reason);
@@ -409,7 +402,8 @@ static bool ip_frag_coalesce_ok(const struct ipq *qp)
 
 /* Build a new IP datagram from all its fragments. */
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
-			 struct sk_buff *prev_tail, struct net_device *dev)
+			 struct sk_buff *prev_tail, struct net_device *dev,
+			 int *refs)
 {
 	struct net *net = qp->q.fqdir->net;
 	struct iphdr *iph;
@@ -417,7 +411,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 	int len, err;
 	u8 ecn;
 
-	ipq_kill(qp);
+	inet_frag_kill(&qp->q, refs);
 
 	ecn = ip_frag_ecn_table[qp->ecn];
 	if (unlikely(ecn == 0xff)) {
@@ -495,7 +489,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
 
 		spin_lock(&qp->q.lock);
 
-		ret = ip_frag_queue(qp, skb);
+		ret = ip_frag_queue(qp, skb, &refs);
 
 		spin_unlock(&qp->q.lock);
 		inet_frag_putn(&qp->q, refs);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index fcf969f9fe2ab57b9bdb30434b933b863e3d3369..f33acb730dc5807205811c2675efd27a9ee99222 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -123,7 +123,8 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
 #endif
 
 static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
-			     struct sk_buff *prev_tail, struct net_device *dev);
+			     struct sk_buff *prev_tail, struct net_device *dev,
+			     int *refs);
 
 static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
 {
@@ -167,7 +168,8 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
 
 
 static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
-			     const struct frag_hdr *fhdr, int nhoff)
+			     const struct frag_hdr *fhdr, int nhoff,
+			     int *refs)
 {
 	unsigned int payload_len;
 	struct net_device *dev;
@@ -221,7 +223,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 			 * this case. -DaveM
 			 */
 			pr_debug("end of fragment not rounded to 8 bytes.\n");
-			inet_frag_kill(&fq->q);
+			inet_frag_kill(&fq->q, refs);
 			return -EPROTO;
 		}
 		if (end > fq->q.len) {
@@ -287,7 +289,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 		unsigned long orefdst = skb->_skb_refdst;
 
 		skb->_skb_refdst = 0UL;
-		err = nf_ct_frag6_reasm(fq, skb, prev, dev);
+		err = nf_ct_frag6_reasm(fq, skb, prev, dev, refs);
 		skb->_skb_refdst = orefdst;
 
 		/* After queue has assumed skb ownership, only 0 or
@@ -301,7 +303,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 	return -EINPROGRESS;
 
 insert_error:
-	inet_frag_kill(&fq->q);
+	inet_frag_kill(&fq->q, refs);
 err:
 	skb_dst_drop(skb);
 	return -EINVAL;
@@ -315,13 +317,14 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
  *	the last and the first frames arrived and all the bits are here.
  */
 static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
-			     struct sk_buff *prev_tail, struct net_device *dev)
+			     struct sk_buff *prev_tail, struct net_device *dev,
+			     int *refs)
 {
 	void *reasm_data;
 	int payload_len;
 	u8 ecn;
 
-	inet_frag_kill(&fq->q);
+	inet_frag_kill(&fq->q, refs);
 
 	ecn = ip_frag_ecn_table[fq->ecn];
 	if (unlikely(ecn == 0xff))
@@ -372,7 +375,7 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
 	return 0;
 
 err:
-	inet_frag_kill(&fq->q);
+	inet_frag_kill(&fq->q, refs);
 	return -EINVAL;
 }
 
@@ -483,7 +486,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 
 	spin_lock_bh(&fq->q.lock);
 
-	ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff);
+	ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff, &refs);
 	if (ret == -EPROTO) {
 		skb->transport_header = savethdr;
 		ret = 0;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 5d56a8e5166bcb3a5a169a95029bc14180ef9323..7560380bd5871217d476f2e0e39332926c458bc1 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -68,7 +68,8 @@ static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
 static struct inet_frags ip6_frags;
 
 static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
-			  struct sk_buff *prev_tail, struct net_device *dev);
+			  struct sk_buff *prev_tail, struct net_device *dev,
+			  int *refs);
 
 static void ip6_frag_expire(struct timer_list *t)
 {
@@ -105,7 +106,7 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
 
 static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 			  struct frag_hdr *fhdr, int nhoff,
-			  u32 *prob_offset)
+			  u32 *prob_offset, int *refs)
 {
 	struct net *net = dev_net(skb_dst(skb)->dev);
 	int offset, end, fragsize;
@@ -220,7 +221,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 		unsigned long orefdst = skb->_skb_refdst;
 
 		skb->_skb_refdst = 0UL;
-		err = ip6_frag_reasm(fq, skb, prev_tail, dev);
+		err = ip6_frag_reasm(fq, skb, prev_tail, dev, refs);
 		skb->_skb_refdst = orefdst;
 		return err;
 	}
@@ -238,7 +239,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 	__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 			IPSTATS_MIB_REASM_OVERLAPS);
 discard_fq:
-	inet_frag_kill(&fq->q);
+	inet_frag_kill(&fq->q, refs);
 	__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 			IPSTATS_MIB_REASMFAILS);
 err:
@@ -254,7 +255,8 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
  *	the last and the first frames arrived and all the bits are here.
  */
 static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
-			  struct sk_buff *prev_tail, struct net_device *dev)
+			  struct sk_buff *prev_tail, struct net_device *dev,
+			  int *refs)
 {
 	struct net *net = fq->q.fqdir->net;
 	unsigned int nhoff;
@@ -262,7 +264,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
 	int payload_len;
 	u8 ecn;
 
-	inet_frag_kill(&fq->q);
+	inet_frag_kill(&fq->q, refs);
 
 	ecn = ip_frag_ecn_table[fq->ecn];
 	if (unlikely(ecn == 0xff))
@@ -320,7 +322,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
 	rcu_read_lock();
 	__IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMFAILS);
 	rcu_read_unlock();
-	inet_frag_kill(&fq->q);
+	inet_frag_kill(&fq->q, refs);
 	return -1;
 }
 
@@ -386,7 +388,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 
 		fq->iif = iif;
 		ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff,
-				     &prob_offset);
+				     &prob_offset, &refs);
 
 		spin_unlock(&fq->q.lock);
 		inet_frag_putn(&fq->q, refs);
-- 
2.49.0.rc0.332.g42c0ae87b1-goog


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH v2 net-next 4/4] inet: frags: save a pair of atomic operations in reassembly
  2025-03-12  8:22 [PATCH v2 net-next 0/4] inet: frags: fully use RCU Eric Dumazet
                   ` (2 preceding siblings ...)
  2025-03-12  8:22 ` [PATCH v2 net-next 3/4] inet: frags: change inet_frag_kill() to defer refcount updates Eric Dumazet
@ 2025-03-12  8:22 ` Eric Dumazet
  2025-03-13 22:53   ` Jacob Keller
  2025-03-18 12:30 ` [PATCH v2 net-next 0/4] inet: frags: fully use RCU patchwork-bot+netdevbpf
  4 siblings, 1 reply; 7+ messages in thread
From: Eric Dumazet @ 2025-03-12  8:22 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: David Ahern, Simon Horman, netdev, eric.dumazet, Eric Dumazet

As mentioned in commit 648700f76b03 ("inet: frags:
use rhashtables for reassembly units"):

  A followup patch will even remove the refcount hold/release
  left from prior implementation and save a couple of atomic
  operations.

This patch implements this idea, seven years later.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ieee802154/6lowpan/reassembly.c     |  5 ++++-
 net/ipv4/inet_fragment.c                | 18 ++++++++----------
 net/ipv4/ip_fragment.c                  |  5 ++++-
 net/ipv6/netfilter/nf_conntrack_reasm.c |  5 ++++-
 net/ipv6/reassembly.c                   |  9 ++++-----
 5 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index c5f86cff06ee7f95556955f994b859c86a315646..d4b983d170382e616ea58821b197da89885a6bb2 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -304,17 +304,20 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
 		goto err;
 	}
 
+	rcu_read_lock();
 	fq = fq_find(net, cb, &hdr.source, &hdr.dest);
 	if (fq != NULL) {
-		int ret, refs = 1;
+		int ret, refs = 0;
 
 		spin_lock(&fq->q.lock);
 		ret = lowpan_frag_queue(fq, skb, frag_type, &refs);
 		spin_unlock(&fq->q.lock);
 
+		rcu_read_unlock();
 		inet_frag_putn(&fq->q, refs);
 		return ret;
 	}
+	rcu_read_unlock();
 
 err:
 	kfree_skb(skb);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 5eb18605001387e7f23b8661dc9f24a533ab1600..19fae4811ab2803bed2faa4900869f883cb3073c 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -327,7 +327,8 @@ static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
 
 	timer_setup(&q->timer, f->frag_expire, 0);
 	spin_lock_init(&q->lock);
-	refcount_set(&q->refcnt, 3);
+	/* One reference for the timer, one for the hash table. */
+	refcount_set(&q->refcnt, 2);
 
 	return q;
 }
@@ -349,7 +350,11 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
 	*prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
 						 &q->node, f->rhash_params);
 	if (*prev) {
-		int refs = 2;
+		/* We could not insert in the hash table,
+		 * we need to cancel what inet_frag_alloc()
+		 * anticipated.
+		 */
+		int refs = 1;
 
 		q->flags |= INET_FRAG_COMPLETE;
 		inet_frag_kill(q, &refs);
@@ -359,7 +364,6 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
 	return q;
 }
 
-/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
 struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
 {
 	/* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */
@@ -369,17 +373,11 @@ struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
 	if (!high_thresh || frag_mem_limit(fqdir) > high_thresh)
 		return NULL;
 
-	rcu_read_lock();
-
 	prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
 	if (!prev)
 		fq = inet_frag_create(fqdir, key, &prev);
-	if (!IS_ERR_OR_NULL(prev)) {
+	if (!IS_ERR_OR_NULL(prev))
 		fq = prev;
-		if (!refcount_inc_not_zero(&fq->refcnt))
-			fq = NULL;
-	}
-	rcu_read_unlock();
 	return fq;
 }
 EXPORT_SYMBOL(inet_frag_find);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index c5f3c810706fb328c3d8a4d8501424df0dceaa8e..77f395b28ec748bcd85b8dfa0a8c0b8a74684103 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -483,18 +483,21 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
 
 	/* Lookup (or create) queue header */
+	rcu_read_lock();
 	qp = ip_find(net, ip_hdr(skb), user, vif);
 	if (qp) {
-		int ret, refs = 1;
+		int ret, refs = 0;
 
 		spin_lock(&qp->q.lock);
 
 		ret = ip_frag_queue(qp, skb, &refs);
 
 		spin_unlock(&qp->q.lock);
+		rcu_read_unlock();
 		inet_frag_putn(&qp->q, refs);
 		return ret;
 	}
+	rcu_read_unlock();
 
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
 	kfree_skb(skb);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index f33acb730dc5807205811c2675efd27a9ee99222..d6bd8f7079bb74ec99030201163332ed5c6d2eec 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -450,7 +450,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 	struct frag_hdr *fhdr;
 	struct frag_queue *fq;
 	struct ipv6hdr *hdr;
-	int refs = 1;
+	int refs = 0;
 	u8 prevhdr;
 
 	/* Jumbo payload inhibits frag. header */
@@ -477,9 +477,11 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 	hdr = ipv6_hdr(skb);
 	fhdr = (struct frag_hdr *)skb_transport_header(skb);
 
+	rcu_read_lock();
 	fq = fq_find(net, fhdr->identification, user, hdr,
 		     skb->dev ? skb->dev->ifindex : 0);
 	if (fq == NULL) {
+		rcu_read_unlock();
 		pr_debug("Can't find and can't create new queue\n");
 		return -ENOMEM;
 	}
@@ -493,6 +495,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 	}
 
 	spin_unlock_bh(&fq->q.lock);
+	rcu_read_unlock();
 	inet_frag_putn(&fq->q, refs);
 	return ret;
 }
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 7560380bd5871217d476f2e0e39332926c458bc1..49740898bc1370ff0ca89928750c6de85a45303f 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -305,9 +305,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
 	skb_postpush_rcsum(skb, skb_network_header(skb),
 			   skb_network_header_len(skb));
 
-	rcu_read_lock();
 	__IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMOKS);
-	rcu_read_unlock();
 	fq->q.rb_fragments = RB_ROOT;
 	fq->q.fragments_tail = NULL;
 	fq->q.last_run_head = NULL;
@@ -319,9 +317,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
 out_oom:
 	net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n");
 out_fail:
-	rcu_read_lock();
 	__IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMFAILS);
-	rcu_read_unlock();
 	inet_frag_kill(&fq->q, refs);
 	return -1;
 }
@@ -379,10 +375,11 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 	}
 
 	iif = skb->dev ? skb->dev->ifindex : 0;
+	rcu_read_lock();
 	fq = fq_find(net, fhdr->identification, hdr, iif);
 	if (fq) {
 		u32 prob_offset = 0;
-		int ret, refs = 1;
+		int ret, refs = 0;
 
 		spin_lock(&fq->q.lock);
 
@@ -391,6 +388,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 				     &prob_offset, &refs);
 
 		spin_unlock(&fq->q.lock);
+		rcu_read_unlock();
 		inet_frag_putn(&fq->q, refs);
 		if (prob_offset) {
 			__IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
@@ -400,6 +398,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 		}
 		return ret;
 	}
+	rcu_read_unlock();
 
 	__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS);
 	kfree_skb(skb);
-- 
2.49.0.rc0.332.g42c0ae87b1-goog


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v2 net-next 4/4] inet: frags: save a pair of atomic operations in reassembly
  2025-03-12  8:22 ` [PATCH v2 net-next 4/4] inet: frags: save a pair of atomic operations in reassembly Eric Dumazet
@ 2025-03-13 22:53   ` Jacob Keller
  0 siblings, 0 replies; 7+ messages in thread
From: Jacob Keller @ 2025-03-13 22:53 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: David Ahern, Simon Horman, netdev, eric.dumazet



On 3/12/2025 1:22 AM, Eric Dumazet wrote:
> As mentioned in commit 648700f76b03 ("inet: frags:
> use rhashtables for reassembly units"):
> 
>   A followup patch will even remove the refcount hold/release
>   left from prior implementation and save a couple of atomic
>   operations.
> 
> This patch implements this idea, seven years later.
> 

Easy for things to slip through the cracks and get forgotten.

Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>

> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>  net/ieee802154/6lowpan/reassembly.c     |  5 ++++-
>  net/ipv4/inet_fragment.c                | 18 ++++++++----------
>  net/ipv4/ip_fragment.c                  |  5 ++++-
>  net/ipv6/netfilter/nf_conntrack_reasm.c |  5 ++++-
>  net/ipv6/reassembly.c                   |  9 ++++-----
>  5 files changed, 24 insertions(+), 18 deletions(-)
> 
> diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
> index c5f86cff06ee7f95556955f994b859c86a315646..d4b983d170382e616ea58821b197da89885a6bb2 100644
> --- a/net/ieee802154/6lowpan/reassembly.c
> +++ b/net/ieee802154/6lowpan/reassembly.c
> @@ -304,17 +304,20 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
>  		goto err;
>  	}
>  
> +	rcu_read_lock();
>  	fq = fq_find(net, cb, &hdr.source, &hdr.dest);
>  	if (fq != NULL) {
> -		int ret, refs = 1;
> +		int ret, refs = 0;
>  
>  		spin_lock(&fq->q.lock);
>  		ret = lowpan_frag_queue(fq, skb, frag_type, &refs);
>  		spin_unlock(&fq->q.lock);
>  
> +		rcu_read_unlock();
>  		inet_frag_putn(&fq->q, refs);
>  		return ret;
>  	}
> +	rcu_read_unlock();
>  
>  err:
>  	kfree_skb(skb);
> diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
> index 5eb18605001387e7f23b8661dc9f24a533ab1600..19fae4811ab2803bed2faa4900869f883cb3073c 100644
> --- a/net/ipv4/inet_fragment.c
> +++ b/net/ipv4/inet_fragment.c
> @@ -327,7 +327,8 @@ static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
>  
>  	timer_setup(&q->timer, f->frag_expire, 0);
>  	spin_lock_init(&q->lock);
> -	refcount_set(&q->refcnt, 3);
> +	/* One reference for the timer, one for the hash table. */
> +	refcount_set(&q->refcnt, 2);
>  
>  	return q;
>  }
> @@ -349,7 +350,11 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
>  	*prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
>  						 &q->node, f->rhash_params);
>  	if (*prev) {
> -		int refs = 2;
> +		/* We could not insert in the hash table,
> +		 * we need to cancel what inet_frag_alloc()
> +		 * anticipated.
> +		 */
> +		int refs = 1;
>  
>  		q->flags |= INET_FRAG_COMPLETE;
>  		inet_frag_kill(q, &refs);
> @@ -359,7 +364,6 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
>  	return q;
>  }
>  
> -/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
>  struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
>  {
>  	/* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */
> @@ -369,17 +373,11 @@ struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
>  	if (!high_thresh || frag_mem_limit(fqdir) > high_thresh)
>  		return NULL;
>  
> -	rcu_read_lock();
> -
>  	prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
>  	if (!prev)
>  		fq = inet_frag_create(fqdir, key, &prev);
> -	if (!IS_ERR_OR_NULL(prev)) {
> +	if (!IS_ERR_OR_NULL(prev))
>  		fq = prev;
> -		if (!refcount_inc_not_zero(&fq->refcnt))
> -			fq = NULL;
> -	}
> -	rcu_read_unlock();
>  	return fq;
>  }
>  EXPORT_SYMBOL(inet_frag_find);
> diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
> index c5f3c810706fb328c3d8a4d8501424df0dceaa8e..77f395b28ec748bcd85b8dfa0a8c0b8a74684103 100644
> --- a/net/ipv4/ip_fragment.c
> +++ b/net/ipv4/ip_fragment.c
> @@ -483,18 +483,21 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
>  	__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
>  
>  	/* Lookup (or create) queue header */
> +	rcu_read_lock();
>  	qp = ip_find(net, ip_hdr(skb), user, vif);
>  	if (qp) {
> -		int ret, refs = 1;
> +		int ret, refs = 0;
>  
>  		spin_lock(&qp->q.lock);
>  
>  		ret = ip_frag_queue(qp, skb, &refs);
>  
>  		spin_unlock(&qp->q.lock);
> +		rcu_read_unlock();
>  		inet_frag_putn(&qp->q, refs);
>  		return ret;
>  	}
> +	rcu_read_unlock();
>  
>  	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
>  	kfree_skb(skb);
> diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
> index f33acb730dc5807205811c2675efd27a9ee99222..d6bd8f7079bb74ec99030201163332ed5c6d2eec 100644
> --- a/net/ipv6/netfilter/nf_conntrack_reasm.c
> +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
> @@ -450,7 +450,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
>  	struct frag_hdr *fhdr;
>  	struct frag_queue *fq;
>  	struct ipv6hdr *hdr;
> -	int refs = 1;
> +	int refs = 0;
>  	u8 prevhdr;
>  
>  	/* Jumbo payload inhibits frag. header */
> @@ -477,9 +477,11 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
>  	hdr = ipv6_hdr(skb);
>  	fhdr = (struct frag_hdr *)skb_transport_header(skb);
>  
> +	rcu_read_lock();
>  	fq = fq_find(net, fhdr->identification, user, hdr,
>  		     skb->dev ? skb->dev->ifindex : 0);
>  	if (fq == NULL) {
> +		rcu_read_unlock();
>  		pr_debug("Can't find and can't create new queue\n");
>  		return -ENOMEM;
>  	}
> @@ -493,6 +495,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
>  	}
>  
>  	spin_unlock_bh(&fq->q.lock);
> +	rcu_read_unlock();
>  	inet_frag_putn(&fq->q, refs);
>  	return ret;
>  }
> diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
> index 7560380bd5871217d476f2e0e39332926c458bc1..49740898bc1370ff0ca89928750c6de85a45303f 100644
> --- a/net/ipv6/reassembly.c
> +++ b/net/ipv6/reassembly.c
> @@ -305,9 +305,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
>  	skb_postpush_rcsum(skb, skb_network_header(skb),
>  			   skb_network_header_len(skb));
>  
> -	rcu_read_lock();
>  	__IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMOKS);
> -	rcu_read_unlock();
>  	fq->q.rb_fragments = RB_ROOT;
>  	fq->q.fragments_tail = NULL;
>  	fq->q.last_run_head = NULL;
> @@ -319,9 +317,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
>  out_oom:
>  	net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n");
>  out_fail:
> -	rcu_read_lock();
>  	__IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMFAILS);
> -	rcu_read_unlock();
>  	inet_frag_kill(&fq->q, refs);
>  	return -1;
>  }
> @@ -379,10 +375,11 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
>  	}
>  
>  	iif = skb->dev ? skb->dev->ifindex : 0;
> +	rcu_read_lock();
>  	fq = fq_find(net, fhdr->identification, hdr, iif);
>  	if (fq) {
>  		u32 prob_offset = 0;
> -		int ret, refs = 1;
> +		int ret, refs = 0;
>  
>  		spin_lock(&fq->q.lock);
>  
> @@ -391,6 +388,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
>  				     &prob_offset, &refs);
>  
>  		spin_unlock(&fq->q.lock);
> +		rcu_read_unlock();
>  		inet_frag_putn(&fq->q, refs);
>  		if (prob_offset) {
>  			__IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
> @@ -400,6 +398,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
>  		}
>  		return ret;
>  	}
> +	rcu_read_unlock();
>  
>  	__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS);
>  	kfree_skb(skb);


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2 net-next 0/4] inet: frags: fully use RCU
  2025-03-12  8:22 [PATCH v2 net-next 0/4] inet: frags: fully use RCU Eric Dumazet
                   ` (3 preceding siblings ...)
  2025-03-12  8:22 ` [PATCH v2 net-next 4/4] inet: frags: save a pair of atomic operations in reassembly Eric Dumazet
@ 2025-03-18 12:30 ` patchwork-bot+netdevbpf
  4 siblings, 0 replies; 7+ messages in thread
From: patchwork-bot+netdevbpf @ 2025-03-18 12:30 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: davem, kuba, pabeni, dsahern, horms, netdev, eric.dumazet

Hello:

This series was applied to netdev/net-next.git (main)
by Paolo Abeni <pabeni@redhat.com>:

On Wed, 12 Mar 2025 08:22:46 +0000 you wrote:
> While inet reassembly uses RCU, it is acquiring/releasing
> a refcount on struct inet_frag_queue in fast path,
> for no good reason.
> 
> This was mentioned in one patch changelog seven years ago :/
> 
> This series is removing these refcount changes, by extending
> RCU sections.
> 
> [...]

Here is the summary with links:
  - [v2,net-next,1/4] inet: frags: add inet_frag_putn() helper
    https://git.kernel.org/netdev/net-next/c/ae2d90355aa5
  - [v2,net-next,2/4] ipv4: frags: remove ipq_put()
    https://git.kernel.org/netdev/net-next/c/a2fb987c0ecf
  - [v2,net-next,3/4] inet: frags: change inet_frag_kill() to defer refcount updates
    https://git.kernel.org/netdev/net-next/c/eb0dfc0ef195
  - [v2,net-next,4/4] inet: frags: save a pair of atomic operations in reassembly
    https://git.kernel.org/netdev/net-next/c/ca0359df45a5

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2025-03-18 12:30 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-03-12  8:22 [PATCH v2 net-next 0/4] inet: frags: fully use RCU Eric Dumazet
2025-03-12  8:22 ` [PATCH v2 net-next 1/4] inet: frags: add inet_frag_putn() helper Eric Dumazet
2025-03-12  8:22 ` [PATCH v2 net-next 2/4] ipv4: frags: remove ipq_put() Eric Dumazet
2025-03-12  8:22 ` [PATCH v2 net-next 3/4] inet: frags: change inet_frag_kill() to defer refcount updates Eric Dumazet
2025-03-12  8:22 ` [PATCH v2 net-next 4/4] inet: frags: save a pair of atomic operations in reassembly Eric Dumazet
2025-03-13 22:53   ` Jacob Keller
2025-03-18 12:30 ` [PATCH v2 net-next 0/4] inet: frags: fully use RCU patchwork-bot+netdevbpf

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.