public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
From: Florian Westphal <fw@strlen.de>
To: <netdev@vger.kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>,
	"David S. Miller" <davem@davemloft.net>,
	Eric Dumazet <edumazet@google.com>,
	Jakub Kicinski <kuba@kernel.org>,
	<netfilter-devel@vger.kernel.org>,
	pablo@netfilter.org
Subject: [PATCH v2 net-next 01/11] netfilter: nft_set_rbtree: don't gc elements on insert
Date: Fri,  6 Feb 2026 16:30:38 +0100	[thread overview]
Message-ID: <20260206153048.17570-2-fw@strlen.de> (raw)
In-Reply-To: <20260206153048.17570-1-fw@strlen.de>

During insertion we can queue up expired elements for garbage
collection.

In case of later abort, the commit hook will never be called.
Packet path and 'get' requests will find free'd elements in the
binary search blob:

 nft_set_ext_key include/net/netfilter/nf_tables.h:800 [inline]
 nft_array_get_cmp+0x1f6/0x2a0 net/netfilter/nft_set_rbtree.c:133
 __inline_bsearch include/linux/bsearch.h:15 [inline]
 bsearch+0x50/0xc0 lib/bsearch.c:33
 nft_rbtree_get+0x16b/0x400 net/netfilter/nft_set_rbtree.c:169
 nft_setelem_get net/netfilter/nf_tables_api.c:6495 [inline]
 nft_get_set_elem+0x420/0xaa0 net/netfilter/nf_tables_api.c:6543
 nf_tables_getsetelem+0x448/0x5e0 net/netfilter/nf_tables_api.c:6632
 nfnetlink_rcv_msg+0x8ae/0x12c0 net/netfilter/nfnetlink.c:290

Also, when we insert an element that triggers -EEXIST, and that insertion
happens to also zap a timed-out entry, we end up with same issue:
Neither commit nor abort hook is called.

Fix this by removing gc api usage during insertion.

The blamed commit also removes concurrency of the rbtree with the
packet path, so we can now safely rb_erase() the element and move
it to a new expired list that can be reaped in the commit hook
before building the next blob iteration.

This also avoids the need to rebuild the blob in the abort path:
Expired elements seen during insertion attempts are kept around
until a transaction passes.

Reported-by: syzbot+d417922a3e7935517ef6@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=d417922a3e7935517ef6
Fixes: 7e43e0a1141d ("netfilter: nft_set_rbtree: translate rbtree to array for binary search")
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nft_set_rbtree.c | 136 ++++++++++++++++-----------------
 1 file changed, 68 insertions(+), 68 deletions(-)

diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 7598c368c4e5..0efaa8c3f31b 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -34,11 +34,15 @@ struct nft_rbtree {
 	struct nft_array __rcu	*array;
 	struct nft_array	*array_next;
 	unsigned long		last_gc;
+	struct list_head	expired;
 };
 
 struct nft_rbtree_elem {
 	struct nft_elem_priv	priv;
-	struct rb_node		node;
+	union {
+		struct rb_node	node;
+		struct list_head list;
+	};
 	struct nft_set_ext	ext;
 };
 
@@ -179,13 +183,16 @@ nft_rbtree_get(const struct net *net, const struct nft_set *set,
 	return &rbe->priv;
 }
 
-static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set,
-				      struct nft_rbtree *priv,
-				      struct nft_rbtree_elem *rbe)
+static void nft_rbtree_gc_elem_move(struct net *net, struct nft_set *set,
+				    struct nft_rbtree *priv,
+				    struct nft_rbtree_elem *rbe)
 {
 	lockdep_assert_held_write(&priv->lock);
 	nft_setelem_data_deactivate(net, set, &rbe->priv);
 	rb_erase(&rbe->node, &priv->root);
+
+	/* collected later on in commit callback */
+	list_add(&rbe->list, &priv->expired);
 }
 
 static const struct nft_rbtree_elem *
@@ -196,11 +203,6 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
 	struct rb_node *prev = rb_prev(&rbe->node);
 	struct net *net = read_pnet(&set->net);
 	struct nft_rbtree_elem *rbe_prev;
-	struct nft_trans_gc *gc;
-
-	gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
-	if (!gc)
-		return ERR_PTR(-ENOMEM);
 
 	/* search for end interval coming before this element.
 	 * end intervals don't carry a timeout extension, they
@@ -218,28 +220,10 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
 	rbe_prev = NULL;
 	if (prev) {
 		rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
-		nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev);
-
-		/* There is always room in this trans gc for this element,
-		 * memory allocation never actually happens, hence, the warning
-		 * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT,
-		 * this is synchronous gc which never fails.
-		 */
-		gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
-		if (WARN_ON_ONCE(!gc))
-			return ERR_PTR(-ENOMEM);
-
-		nft_trans_gc_elem_add(gc, rbe_prev);
+		nft_rbtree_gc_elem_move(net, set, priv, rbe_prev);
 	}
 
-	nft_rbtree_gc_elem_remove(net, set, priv, rbe);
-	gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
-	if (WARN_ON_ONCE(!gc))
-		return ERR_PTR(-ENOMEM);
-
-	nft_trans_gc_elem_add(gc, rbe);
-
-	nft_trans_gc_queue_sync_done(gc);
+	nft_rbtree_gc_elem_move(net, set, priv, rbe);
 
 	return rbe_prev;
 }
@@ -675,29 +659,13 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
 	}
 }
 
-static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
-				 struct nft_rbtree *priv,
-				 struct nft_rbtree_elem *rbe)
-{
-	nft_setelem_data_deactivate(net, set, &rbe->priv);
-	nft_rbtree_erase(priv, rbe);
-}
-
-static void nft_rbtree_gc(struct nft_set *set)
+static void nft_rbtree_gc_scan(struct nft_set *set)
 {
 	struct nft_rbtree *priv = nft_set_priv(set);
 	struct nft_rbtree_elem *rbe, *rbe_end = NULL;
 	struct net *net = read_pnet(&set->net);
 	u64 tstamp = nft_net_tstamp(net);
 	struct rb_node *node, *next;
-	struct nft_trans_gc *gc;
-
-	set  = nft_set_container_of(priv);
-	net  = read_pnet(&set->net);
-
-	gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
-	if (!gc)
-		return;
 
 	for (node = rb_first(&priv->root); node ; node = next) {
 		next = rb_next(node);
@@ -715,34 +683,46 @@ static void nft_rbtree_gc(struct nft_set *set)
 		if (!__nft_set_elem_expired(&rbe->ext, tstamp))
 			continue;
 
-		gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
-		if (!gc)
-			goto try_later;
-
 		/* end element needs to be removed first, it has
 		 * no timeout extension.
 		 */
+		write_lock_bh(&priv->lock);
 		if (rbe_end) {
-			nft_rbtree_gc_remove(net, set, priv, rbe_end);
-			nft_trans_gc_elem_add(gc, rbe_end);
+			nft_rbtree_gc_elem_move(net, set, priv, rbe_end);
 			rbe_end = NULL;
 		}
 
-		gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
-		if (!gc)
-			goto try_later;
-
-		nft_rbtree_gc_remove(net, set, priv, rbe);
-		nft_trans_gc_elem_add(gc, rbe);
+		nft_rbtree_gc_elem_move(net, set, priv, rbe);
+		write_unlock_bh(&priv->lock);
 	}
 
-try_later:
+	priv->last_gc = jiffies;
+}
+
+static void nft_rbtree_gc_queue(struct nft_set *set)
+{
+	struct nft_rbtree *priv = nft_set_priv(set);
+	struct nft_rbtree_elem *rbe, *rbe_end;
+	struct nft_trans_gc *gc;
+
+	if (list_empty(&priv->expired))
+		return;
 
-	if (gc) {
-		gc = nft_trans_gc_catchall_sync(gc);
-		nft_trans_gc_queue_sync_done(gc);
-		priv->last_gc = jiffies;
+	gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
+	if (!gc)
+		return;
+
+	list_for_each_entry_safe(rbe, rbe_end, &priv->expired, list) {
+		list_del(&rbe->list);
+		nft_trans_gc_elem_add(gc, rbe);
+
+		gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
+		if (!gc)
+			return;
 	}
+
+	gc = nft_trans_gc_catchall_sync(gc);
+	nft_trans_gc_queue_sync_done(gc);
 }
 
 static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
@@ -761,6 +741,7 @@ static int nft_rbtree_init(const struct nft_set *set,
 
 	rwlock_init(&priv->lock);
 	priv->root = RB_ROOT;
+	INIT_LIST_HEAD(&priv->expired);
 
 	priv->array = NULL;
 	priv->array_next = NULL;
@@ -778,10 +759,15 @@ static void nft_rbtree_destroy(const struct nft_ctx *ctx,
 			       const struct nft_set *set)
 {
 	struct nft_rbtree *priv = nft_set_priv(set);
-	struct nft_rbtree_elem *rbe;
+	struct nft_rbtree_elem *rbe, *next;
 	struct nft_array *array;
 	struct rb_node *node;
 
+	list_for_each_entry_safe(rbe, next, &priv->expired, list) {
+		list_del(&rbe->list);
+		nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
+	}
+
 	while ((node = priv->root.rb_node) != NULL) {
 		rb_erase(node, &priv->root);
 		rbe = rb_entry(node, struct nft_rbtree_elem, node);
@@ -828,13 +814,21 @@ static void nft_rbtree_commit(struct nft_set *set)
 	u32 num_intervals = 0;
 	struct rb_node *node;
 
-	if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
-		nft_rbtree_gc(set);
-
 	/* No changes, skip, eg. elements updates only. */
 	if (!priv->array_next)
 		return;
 
+	/* GC can be performed if the binary search blob is going
+	 * to be rebuilt.  It has to be done in two phases: first
+	 * scan tree and move all expired elements to the expired
+	 * list.
+	 *
+	 * Then, after blob has been re-built and published to other
+	 * CPUs, queue collected entries for freeing.
+	 */
+	if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
+		nft_rbtree_gc_scan(set);
+
 	/* Reverse walk to create an array from smaller to largest interval. */
 	node = rb_last(&priv->root);
 	if (node)
@@ -881,10 +875,16 @@ static void nft_rbtree_commit(struct nft_set *set)
 		num_intervals++;
 err_out:
 	priv->array_next->num_intervals = num_intervals;
-	old = rcu_replace_pointer(priv->array, priv->array_next, true);
+	old = rcu_replace_pointer(priv->array, priv->array_next,
+				  lockdep_is_held(&nft_pernet(read_pnet(&set->net))->commit_mutex));
 	priv->array_next = NULL;
 	if (old)
 		call_rcu(&old->rcu_head, nft_array_free_rcu);
+
+	/* New blob is public, queue collected entries for freeing.
+	 * call_rcu ensures elements stay around until readers are done.
+	 */
+	nft_rbtree_gc_queue(set);
 }
 
 static void nft_rbtree_abort(const struct nft_set *set)
-- 
2.52.0


  reply	other threads:[~2026-02-06 15:31 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-06 15:30 [PATCH v2 net-next 00/11] netfilter: updates for net-next Florian Westphal
2026-02-06 15:30 ` Florian Westphal [this message]
2026-02-11  5:00   ` [PATCH v2 net-next 01/11] netfilter: nft_set_rbtree: don't gc elements on insert patchwork-bot+netdevbpf
2026-02-06 15:30 ` [PATCH v2 net-next 02/11] netfilter: nfnetlink_queue: do shared-unconfirmed check before segmentation Florian Westphal
2026-02-06 15:30 ` [PATCH v2 net-next 03/11] selftests: netfilter: nft_queue.sh: add udp fraglist gro test case Florian Westphal
2026-02-19  2:41   ` [TEST] nft_queue / test_udp_gro_ct flakes Jakub Kicinski
2026-02-19 15:11     ` Florian Westphal
2026-02-06 15:30 ` [PATCH v2 net-next 04/11] netfilter: flowtable: dedicated slab for flow entry Florian Westphal
2026-02-06 15:30 ` [PATCH v2 net-next 05/11] selftests: netfilter: add IPV6_TUNNEL to config Florian Westphal
2026-02-06 15:30 ` [PATCH v2 net-next 06/11] netfilter: nft_set_hash: fix get operation on big endian Florian Westphal
2026-02-06 15:30 ` [PATCH v2 net-next 07/11] netfilter: nft_counter: fix reset of counters on 32bit archs Florian Westphal
2026-02-06 15:30 ` [PATCH v2 net-next 08/11] netfilter: nft_set_rbtree: fix bogus EEXIST with NLM_F_CREATE with null interval Florian Westphal
2026-02-06 15:30 ` [PATCH v2 net-next 09/11] netfilter: nft_set_rbtree: check for partial overlaps in anonymous sets Florian Westphal
2026-02-06 15:30 ` [PATCH v2 net-next 10/11] netfilter: nft_set_rbtree: validate element belonging to interval Florian Westphal
2026-02-06 15:30 ` [PATCH v2 net-next 11/11] netfilter: nft_set_rbtree: validate open interval overlap Florian Westphal
2026-02-10 11:49   ` Paolo Abeni
2026-02-10 15:29     ` Florian Westphal
2026-02-11  3:56       ` Jakub Kicinski

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260206153048.17570-2-fw@strlen.de \
    --to=fw@strlen.de \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=kuba@kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=netfilter-devel@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=pablo@netfilter.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox