public inbox for netfilter-devel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH nf,v2] netfilter: nft_set_rbtree: revisit array resize logic
@ 2026-03-16 11:48 Pablo Neira Ayuso
  2026-03-16 12:35 ` Florian Westphal
  0 siblings, 1 reply; 2+ messages in thread
From: Pablo Neira Ayuso @ 2026-03-16 11:48 UTC (permalink / raw)
  To: netfilter-devel; +Cc: carges, fw

Chris Arges reports high memory consumption with thousands of
containers, this patch revisits the array allocation logic.

Start with 1024 slots in the array (which takes 16 Kbytes initially on
x86_64), and expand it by a factor of x1.5.

Use set->ndeact to subtract deactivated elements when calculating the
number of slots in the array, otherwise the array size gets increased
artificially. Add special-case shrink logic to deal with set flush too.

Use check_add_overflow() to calculate the new array size.

Add a WARN_ON_ONCE check to make sure elements fit into the new array
size.

Reported-by: Chris Arges <carges@cloudflare.com>
Fixes: 7e43e0a1141d ("netfilter: nft_set_rbtree: translate rbtree to array for binary search")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
v2: - increase/decrease by x1.5 factor per Florian
    - shrink array on flush
    - add bugtrap in case number of elements is less than array size
    - consolidate array (re)allocation
    - use check_add_overflow()
 
Hi, this is v2, slightly larger than previous version.

 net/netfilter/nft_set_rbtree.c | 72 +++++++++++++++++++++++++---------
 1 file changed, 53 insertions(+), 19 deletions(-)

diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index bdcea649467f..c80f903e0546 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -616,14 +616,12 @@ static struct nft_array *nft_array_alloc(u32 max_intervals)
 	return array;
 }
 
-#define NFT_ARRAY_EXTRA_SIZE	10240
-
 /* Similar to nft_rbtree_{u,k}size to hide details to userspace, but consider
  * packed representation coming from userspace for anonymous sets too.
  */
 static u32 nft_array_elems(const struct nft_set *set)
 {
-	u32 nelems = atomic_read(&set->nelems);
+	u32 nelems = atomic_read(&set->nelems) - set->ndeact;
 
 	/* Adjacent intervals are represented with a single start element in
 	 * anonymous sets, use the current element counter as is.
@@ -639,18 +637,61 @@ static u32 nft_array_elems(const struct nft_set *set)
 	return (nelems / 2) + 2;
 }
 
-static int nft_array_may_resize(const struct nft_set *set)
+#define NFT_ARRAY_INITIAL_SIZE	1024
+
+static int nft_array_may_resize(const struct nft_set *set, bool flush)
 {
-	u32 nelems = nft_array_elems(set), new_max_intervals;
+	u32 max_intervals, new_max_intervals, shrinked_max_intervals;
 	struct nft_rbtree *priv = nft_set_priv(set);
+	u32 nelems = nft_array_elems(set), delta;
 	struct nft_array *array;
 
-	if (!priv->array_next) {
-		if (priv->array)
+	if (priv->array_next) {
+		max_intervals = priv->array_next->max_intervals;
+		new_max_intervals = priv->array_next->max_intervals;
+	} else {
+		if (priv->array) {
+			max_intervals = priv->array->max_intervals;
 			new_max_intervals = priv->array->max_intervals;
-		else
-			new_max_intervals = NFT_ARRAY_EXTRA_SIZE;
+		} else {
+			max_intervals = 0;
+			new_max_intervals = NFT_ARRAY_INITIAL_SIZE;
+		}
+	}
+
+	if (flush) {
+		nelems = 0;
+		new_max_intervals = NFT_ARRAY_INITIAL_SIZE;
+		goto realloc_array;
+	}
+
+	if (check_add_overflow(new_max_intervals, new_max_intervals, &shrinked_max_intervals))
+		return -EOVERFLOW;
+
+	shrinked_max_intervals = DIV_ROUND_UP(shrinked_max_intervals, 3);
+	if (shrinked_max_intervals > NFT_ARRAY_INITIAL_SIZE &&
+	    nelems < shrinked_max_intervals) {
+		new_max_intervals = shrinked_max_intervals;
+		goto realloc_array;
+	}
 
+	if (nelems > new_max_intervals) {
+		delta = new_max_intervals >> 1;
+		if (check_add_overflow(new_max_intervals, delta, &new_max_intervals))
+			return -EOVERFLOW;
+	}
+
+realloc_array:
+	if (WARN_ON_ONCE(nelems > new_max_intervals))
+		return -ENOMEM;
+
+	if (priv->array_next) {
+		if (max_intervals == new_max_intervals)
+			return 0;
+
+		if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0)
+			return -ENOMEM;
+	} else {
 		array = nft_array_alloc(new_max_intervals);
 		if (!array)
 			return -ENOMEM;
@@ -658,13 +699,6 @@ static int nft_array_may_resize(const struct nft_set *set)
 		priv->array_next = array;
 	}
 
-	if (nelems < priv->array_next->max_intervals)
-		return 0;
-
-	new_max_intervals = priv->array_next->max_intervals + NFT_ARRAY_EXTRA_SIZE;
-	if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0)
-		return -ENOMEM;
-
 	return 0;
 }
 
@@ -680,7 +714,7 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
 
 	nft_rbtree_maybe_reset_start_cookie(priv, tstamp);
 
-	if (nft_array_may_resize(set) < 0)
+	if (nft_array_may_resize(set, false) < 0)
 		return -ENOMEM;
 
 	do {
@@ -796,7 +830,7 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
 	    nft_rbtree_interval_null(set, this))
 		priv->start_rbe_cookie = 0;
 
-	if (nft_array_may_resize(set) < 0)
+	if (nft_array_may_resize(set, false) < 0)
 		return NULL;
 
 	while (parent != NULL) {
@@ -867,7 +901,7 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
 
 	switch (iter->type) {
 	case NFT_ITER_UPDATE_CLONE:
-		if (nft_array_may_resize(set) < 0) {
+		if (nft_array_may_resize(set, true) < 0) {
 			iter->err = -ENOMEM;
 			break;
 		}
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* Re: [PATCH nf,v2] netfilter: nft_set_rbtree: revisit array resize logic
  2026-03-16 11:48 [PATCH nf,v2] netfilter: nft_set_rbtree: revisit array resize logic Pablo Neira Ayuso
@ 2026-03-16 12:35 ` Florian Westphal
  0 siblings, 0 replies; 2+ messages in thread
From: Florian Westphal @ 2026-03-16 12:35 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: netfilter-devel, carges

Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> Chris Arges reports high memory consumption with thousands of
> containers, this patch revisits the array allocation logic.
> 
> Start by 1024 slots in the array (which takes 16 Kbytes initially on
> x86_64), and expand it by x1.5.

Do you think it makes sense to start with set->size for anonymous sets?
I suspect most anon sets are pretty small, 16kb / rule is a lot.

No need to send a v3, this could be done in a followup patch.

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2026-03-16 12:35 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-16 11:48 [PATCH nf,v2] netfilter: nft_set_rbtree: revisit array resize logic Pablo Neira Ayuso
2026-03-16 12:35 ` Florian Westphal

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox