public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH net v3] netfilter: nft_set_rbtree: fix use count leak on transaction abort
@ 2026-04-12 22:28 Marko Jevtic
  2026-04-12 22:31 ` Florian Westphal
  0 siblings, 1 reply; 2+ messages in thread
From: Marko Jevtic @ 2026-04-12 22:28 UTC (permalink / raw)
  To: pablo, fw, netfilter-devel
  Cc: phil, coreteam, davem, edumazet, kuba, pabeni, horms, netdev,
	linux-kernel

nft_rbtree_abort() does not handle elements moved to the expired list
by inline GC during __nft_rbtree_insert(). When inline GC encounters
expired elements during overlap detection, it calls
nft_rbtree_gc_elem_move() which deactivates element data (decrementing
chain/object use counts), removes the element from the rbtree, and
queues it for deferred freeing. On commit, these elements are freed
via nft_rbtree_gc_queue(). On abort, however, the expired list is
ignored entirely.

This leaves chain/object use counts permanently decremented after abort.

Restore transactional semantics by fully rolling back inline GC side
effects on abort:

- Introduce a separate tx_gc list for elements collected during insert
  (transaction-scoped), distinct from the existing expired list used
  by commit-time gc_scan (commit-scoped). This prevents abort from
  touching committed expired elements left over from a prior gc_queue
  OOM.

- On commit: splice tx_gc into expired after publishing the new binary
  search blob, then drain via gc_queue as before.

- On abort: iterate tx_gc, re-activate element data (restoring use
  counts), and re-insert into the rbtree. Elements remain expired and
  will be properly collected on the next successful commit.

- Extract nft_rbtree_node_insert() helper from __nft_rbtree_insert()
  to share the tree insertion logic with the abort restore path.

- Add WARN_ON_ONCE in commit early-return path to catch any violation
  of the invariant that tx_gc is empty when no tree changes occurred.

- Reset start_rbe_cookie on abort so insertion state from a failed
  transaction does not persist.

Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API")
Signed-off-by: Marko Jevtic <marko.jevtic@codereflect.io>
---
v3:
- add Fixes tag
- narrow the changelog to the abort-side use-count accounting bug

v2:
- introduce a transaction-scoped tx_gc list for insert-time GC
- restore tx_gc entries on abort and splice them to expired on commit
- export nft_setelem_data_activate() and factor out nft_rbtree_node_insert()

 include/net/netfilter/nf_tables.h |  3 +
 net/netfilter/nf_tables_api.c     |  4 +-
 net/netfilter/nft_set_rbtree.c    | 96 ++++++++++++++++++++++---------
 3 files changed, 74 insertions(+), 29 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index ec8a8ec9c..f8c912332 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1910,6 +1910,9 @@ struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc,
 						 unsigned int gc_seq);
 struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc);
 
+void nft_setelem_data_activate(const struct net *net,
+				 const struct nft_set *set,
+				 struct nft_elem_priv *elem_priv);
 void nft_setelem_data_deactivate(const struct net *net,
 				 const struct nft_set *set,
 				 struct nft_elem_priv *elem_priv);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 8c42247a1..8e783db3f 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5837,7 +5837,7 @@ static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
 	}
 }
 
-static void nft_setelem_data_activate(const struct net *net,
-				      const struct nft_set *set,
-				      struct nft_elem_priv *elem_priv);
+void nft_setelem_data_activate(const struct net *net,
+			       const struct nft_set *set,
+			       struct nft_elem_priv *elem_priv);
 
@@ -7656,7 +7656,7 @@ static int nft_setelem_active_next(const struct net *net,
 	return nft_set_elem_active(ext, genmask);
 }
 
-static void nft_setelem_data_activate(const struct net *net,
-				      const struct nft_set *set,
-				      struct nft_elem_priv *elem_priv)
+void nft_setelem_data_activate(const struct net *net,
+			       const struct nft_set *set,
+			       struct nft_elem_priv *elem_priv)
 {
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 737c339de..e1f76d6ef 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -36,6 +36,7 @@ struct nft_rbtree {
 	unsigned long		start_rbe_cookie;
 	unsigned long		last_gc;
 	struct list_head	expired;
+	struct list_head	tx_gc;
 	u64			last_tstamp;
 };
 
@@ -194,14 +195,14 @@ nft_rbtree_get(const struct net *net, const struct nft_set *set,
 
 static void nft_rbtree_gc_elem_move(struct net *net, struct nft_set *set,
 				    struct nft_rbtree *priv,
-				    struct nft_rbtree_elem *rbe)
+				    struct nft_rbtree_elem *rbe,
+				    struct list_head *target_list)
 {
 	lockdep_assert_held_write(&priv->lock);
 	nft_setelem_data_deactivate(net, set, &rbe->priv);
 	rb_erase(&rbe->node, &priv->root);
 
-	/* collected later on in commit callback */
-	list_add(&rbe->list, &priv->expired);
+	list_add(&rbe->list, target_list);
 }
 
 static const struct nft_rbtree_elem *
@@ -229,10 +230,10 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
 	rbe_prev = NULL;
 	if (prev) {
 		rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
-		nft_rbtree_gc_elem_move(net, set, priv, rbe_prev);
+		nft_rbtree_gc_elem_move(net, set, priv, rbe_prev, &priv->tx_gc);
 	}
 
-	nft_rbtree_gc_elem_move(net, set, priv, rbe);
+	nft_rbtree_gc_elem_move(net, set, priv, rbe, &priv->tx_gc);
 
 	return rbe_prev;
 }
@@ -335,6 +336,35 @@ static bool nft_rbtree_insert_same_interval(const struct net *net,
 	return false;
 }
 
+static void nft_rbtree_node_insert(const struct nft_set *set,
+				   struct nft_rbtree *priv,
+				   struct nft_rbtree_elem *new)
+{
+	struct nft_rbtree_elem *rbe;
+	struct rb_node *parent, **p;
+	int d;
+
+	lockdep_assert_held_write(&priv->lock);
+
+	parent = NULL;
+	p = &priv->root.rb_node;
+	while (*p) {
+		parent = *p;
+		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+		d = nft_rbtree_cmp(set, rbe, new);
+		if (d < 0)
+			p = &parent->rb_left;
+		else if (d > 0)
+			p = &parent->rb_right;
+		else if (nft_rbtree_interval_end(rbe))
+			p = &parent->rb_left;
+		else
+			p = &parent->rb_right;
+	}
+	rb_link_node_rcu(&new->node, parent, p);
+	rb_insert_color(&new->node, &priv->root);
+}
+
 static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 			       struct nft_rbtree_elem *new,
 			       struct nft_elem_priv **elem_priv, u64 tstamp)
@@ -516,25 +546,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 		return -ENOTEMPTY;
 
 	/* Accepted element: pick insertion point depending on key value */
-	parent = NULL;
-	p = &priv->root.rb_node;
-	while (*p != NULL) {
-		parent = *p;
-		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
-		d = nft_rbtree_cmp(set, rbe, new);
-
-		if (d < 0)
-			p = &parent->rb_left;
-		else if (d > 0)
-			p = &parent->rb_right;
-		else if (nft_rbtree_interval_end(rbe))
-			p = &parent->rb_left;
-		else
-			p = &parent->rb_right;
-	}
-
-	rb_link_node_rcu(&new->node, parent, p);
-	rb_insert_color(&new->node, &priv->root);
+	nft_rbtree_node_insert(set, priv, new);
 	return 0;
 }
 
@@ -920,11 +932,11 @@ static void nft_rbtree_gc_scan(struct nft_set *set)
 		 */
 		write_lock_bh(&priv->lock);
 		if (rbe_end) {
-			nft_rbtree_gc_elem_move(net, set, priv, rbe_end);
+			nft_rbtree_gc_elem_move(net, set, priv, rbe_end, &priv->expired);
 			rbe_end = NULL;
 		}
 
-		nft_rbtree_gc_elem_move(net, set, priv, rbe);
+		nft_rbtree_gc_elem_move(net, set, priv, rbe, &priv->expired);
 		write_unlock_bh(&priv->lock);
 	}
 
@@ -974,6 +986,7 @@ static int nft_rbtree_init(const struct nft_set *set,
 	rwlock_init(&priv->lock);
 	priv->root = RB_ROOT;
 	INIT_LIST_HEAD(&priv->expired);
+	INIT_LIST_HEAD(&priv->tx_gc);
 
 	priv->array = NULL;
 	priv->array_next = NULL;
@@ -1000,6 +1013,11 @@ static void nft_rbtree_destroy(const struct nft_ctx *ctx,
 		nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
 	}
 
+	list_for_each_entry_safe(rbe, next, &priv->tx_gc, list) {
+		list_del(&rbe->list);
+		nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
+	}
+
 	while ((node = priv->root.rb_node) != NULL) {
 		rb_erase(node, &priv->root);
 		rbe = rb_entry(node, struct nft_rbtree_elem, node);
@@ -1047,8 +1065,10 @@ static void nft_rbtree_commit(struct nft_set *set)
 	struct rb_node *node;
 
 	/* No changes, skip, eg. elements updates only. */
-	if (!priv->array_next)
+	if (!priv->array_next) {
+		WARN_ON_ONCE(!list_empty(&priv->tx_gc));
 		return;
+	}
 
 	/* GC can be performed if the binary search blob is going
 	 * to be rebuilt.  It has to be done in two phases: first
@@ -1116,13 +1136,35 @@ static void nft_rbtree_commit(struct nft_set *set)
 	/* New blob is public, queue collected entries for freeing.
 	 * call_rcu ensures elements stay around until readers are done.
 	 */
+	list_splice_tail_init(&priv->tx_gc, &priv->expired);
 	nft_rbtree_gc_queue(set);
 }
 
 static void nft_rbtree_abort(const struct nft_set *set)
 {
 	struct nft_rbtree *priv = nft_set_priv(set);
+	struct nft_rbtree_elem *rbe, *tmp;
 	struct nft_array *array_next;
+	struct net *net;
+
+	/* Restore elements that inline GC moved to the tx_gc list during
+	 * insert: their data was deactivated (use counts decremented) but
+	 * the transaction was aborted, so re-activate and re-insert to
+	 * undo GC side effects and restore transactional rollback semantics.
+	 */
+	if (!list_empty(&priv->tx_gc)) {
+		net = read_pnet(&set->net);
+
+		write_lock_bh(&priv->lock);
+		list_for_each_entry_safe(rbe, tmp, &priv->tx_gc, list) {
+			list_del_init(&rbe->list);
+			nft_setelem_data_activate(net, set, &rbe->priv);
+			nft_rbtree_node_insert(set, priv, rbe);
+		}
+		write_unlock_bh(&priv->lock);
+	}
+
+	priv->start_rbe_cookie = 0;
 
 	if (!priv->array_next)
 		return;
-- 
2.43.0

^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2026-04-12 22:31 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-04-12 22:28 [PATCH net v3] netfilter: nft_set_rbtree: fix use count leak on transaction abort Marko Jevtic
2026-04-12 22:31 ` Florian Westphal

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox