From: Florian Westphal <fw@strlen.de>
To: <netdev@vger.kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>,
"David S. Miller" <davem@davemloft.net>,
Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>,
<netfilter-devel@vger.kernel.org>,
pablo@netfilter.org
Subject: [PATCH net 07/14] netfilter: nft_set_rbtree: revisit array resize logic
Date: Wed, 25 Mar 2026 14:11:01 +0100 [thread overview]
Message-ID: <20260325131108.23045-8-fw@strlen.de> (raw)
In-Reply-To: <20260325131108.23045-1-fw@strlen.de>
From: Pablo Neira Ayuso <pablo@netfilter.org>
Chris Arges reports high memory consumption with thousands of
containers, this patch revisits the array allocation logic.
For anonymous sets, start by 16 slots (which takes 256 bytes on x86_64).
Expand it by x2 until threshold of 512 slots is reached, over that
threshold, expand it by x1.5.
For non-anonymous set, start by 1024 slots in the array (which takes 16
Kbytes initially on x86_64). Expand it by x1.5.
Use set->ndeact to subtract deactivated elements when calculating the
number of the slots in the array, otherwise the array size array gets
increased artifically. Add special case shrink logic to deal with flush
set too.
The shrink logic is skipped by anonymous sets.
Use check_add_overflow() to calculate the new array size.
Add a WARN_ON_ONCE check to make sure elements fit into the new array
size.
Reported-by: Chris Arges <carges@cloudflare.com>
Fixes: 7e43e0a1141d ("netfilter: nft_set_rbtree: translate rbtree to array for binary search")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/netfilter/nft_set_rbtree.c | 92 +++++++++++++++++++++++++++-------
1 file changed, 75 insertions(+), 17 deletions(-)
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index fe8bd497d74a..737c339decd0 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -572,14 +572,12 @@ static struct nft_array *nft_array_alloc(u32 max_intervals)
return array;
}
-#define NFT_ARRAY_EXTRA_SIZE 10240
-
/* Similar to nft_rbtree_{u,k}size to hide details to userspace, but consider
* packed representation coming from userspace for anonymous sets too.
*/
static u32 nft_array_elems(const struct nft_set *set)
{
- u32 nelems = atomic_read(&set->nelems);
+ u32 nelems = atomic_read(&set->nelems) - set->ndeact;
/* Adjacent intervals are represented with a single start element in
* anonymous sets, use the current element counter as is.
@@ -595,27 +593,87 @@ static u32 nft_array_elems(const struct nft_set *set)
return (nelems / 2) + 2;
}
-static int nft_array_may_resize(const struct nft_set *set)
+#define NFT_ARRAY_INITIAL_SIZE 1024
+#define NFT_ARRAY_INITIAL_ANON_SIZE 16
+#define NFT_ARRAY_INITIAL_ANON_THRESH (8192U / sizeof(struct nft_array_interval))
+
+static int nft_array_may_resize(const struct nft_set *set, bool flush)
{
- u32 nelems = nft_array_elems(set), new_max_intervals;
+ u32 initial_intervals, max_intervals, new_max_intervals, delta;
+ u32 shrinked_max_intervals, nelems = nft_array_elems(set);
struct nft_rbtree *priv = nft_set_priv(set);
struct nft_array *array;
- if (!priv->array_next) {
- array = nft_array_alloc(nelems + NFT_ARRAY_EXTRA_SIZE);
- if (!array)
- return -ENOMEM;
+ if (nft_set_is_anonymous(set))
+ initial_intervals = NFT_ARRAY_INITIAL_ANON_SIZE;
+ else
+ initial_intervals = NFT_ARRAY_INITIAL_SIZE;
+
+ if (priv->array_next) {
+ max_intervals = priv->array_next->max_intervals;
+ new_max_intervals = priv->array_next->max_intervals;
+ } else {
+ if (priv->array) {
+ max_intervals = priv->array->max_intervals;
+ new_max_intervals = priv->array->max_intervals;
+ } else {
+ max_intervals = 0;
+ new_max_intervals = initial_intervals;
+ }
+ }
- priv->array_next = array;
+ if (nft_set_is_anonymous(set))
+ goto maybe_grow;
+
+ if (flush) {
+ /* Set flush just started, nelems still report elements.*/
+ nelems = 0;
+ new_max_intervals = NFT_ARRAY_INITIAL_SIZE;
+ goto realloc_array;
}
- if (nelems < priv->array_next->max_intervals)
- return 0;
+ if (check_add_overflow(new_max_intervals, new_max_intervals,
+ &shrinked_max_intervals))
+ return -EOVERFLOW;
+
+ shrinked_max_intervals = DIV_ROUND_UP(shrinked_max_intervals, 3);
- new_max_intervals = priv->array_next->max_intervals + NFT_ARRAY_EXTRA_SIZE;
- if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0)
+ if (shrinked_max_intervals > NFT_ARRAY_INITIAL_SIZE &&
+ nelems < shrinked_max_intervals) {
+ new_max_intervals = shrinked_max_intervals;
+ goto realloc_array;
+ }
+maybe_grow:
+ if (nelems > new_max_intervals) {
+ if (nft_set_is_anonymous(set) &&
+ new_max_intervals < NFT_ARRAY_INITIAL_ANON_THRESH) {
+ new_max_intervals <<= 1;
+ } else {
+ delta = new_max_intervals >> 1;
+ if (check_add_overflow(new_max_intervals, delta,
+ &new_max_intervals))
+ return -EOVERFLOW;
+ }
+ }
+
+realloc_array:
+ if (WARN_ON_ONCE(nelems > new_max_intervals))
return -ENOMEM;
+ if (priv->array_next) {
+ if (max_intervals == new_max_intervals)
+ return 0;
+
+ if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0)
+ return -ENOMEM;
+ } else {
+ array = nft_array_alloc(new_max_intervals);
+ if (!array)
+ return -ENOMEM;
+
+ priv->array_next = array;
+ }
+
return 0;
}
@@ -630,7 +688,7 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
nft_rbtree_maybe_reset_start_cookie(priv, tstamp);
- if (nft_array_may_resize(set) < 0)
+ if (nft_array_may_resize(set, false) < 0)
return -ENOMEM;
do {
@@ -741,7 +799,7 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
nft_rbtree_interval_null(set, this))
priv->start_rbe_cookie = 0;
- if (nft_array_may_resize(set) < 0)
+ if (nft_array_may_resize(set, false) < 0)
return NULL;
while (parent != NULL) {
@@ -811,7 +869,7 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
switch (iter->type) {
case NFT_ITER_UPDATE_CLONE:
- if (nft_array_may_resize(set) < 0) {
+ if (nft_array_may_resize(set, true) < 0) {
iter->err = -ENOMEM;
break;
}
--
2.52.0
next prev parent reply other threads:[~2026-03-25 13:12 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-25 13:10 [PATCH net 00/14] netfilter: updates for net Florian Westphal
2026-03-25 13:10 ` [PATCH net 01/14] netfilter: nft_set_pipapo_avx2: don't return non-matching entry on expiry Florian Westphal
2026-03-25 13:10 ` [PATCH net 02/14] selftests: netfilter: nft_concat_range.sh: add check for flush+reload bug Florian Westphal
2026-03-25 13:10 ` [PATCH net 03/14] netfilter: ipset: Fix data race between add and list header in all hash types Florian Westphal
2026-03-25 13:10 ` [PATCH net 04/14] netfilter: nfnetlink_log: fix uninitialized padding leak in NFULA_PAYLOAD Florian Westphal
2026-03-25 13:10 ` [PATCH net 05/14] netfilter: x_tables: reject unsupported families in xt_check_match/xt_check_target Florian Westphal
2026-03-25 13:11 ` [PATCH net 06/14] netfilter: ip6t_rt: reject oversized addrnr in rt_mt6_check() Florian Westphal
2026-03-25 13:11 ` Florian Westphal [this message]
2026-03-25 13:11 ` [PATCH net 08/14] netfilter: nf_conntrack_expect: honor expectation helper field Florian Westphal
2026-03-25 13:11 ` [PATCH net 09/14] netfilter: nf_conntrack_expect: use expect->helper Florian Westphal
2026-03-25 13:11 ` [PATCH net 10/14] netfilter: ctnetlink: ensure safe access to master conntrack Florian Westphal
2026-03-25 17:26 ` Pablo Neira Ayuso
2026-03-25 17:28 ` Pablo Neira Ayuso
2026-03-25 17:28 ` Florian Westphal
2026-03-25 17:38 ` Pablo Neira Ayuso
2026-03-25 13:11 ` [PATCH net 11/14] netfilter: nf_conntrack_expect: store netns and zone in expectation Florian Westphal
2026-03-25 13:11 ` [PATCH net 12/14] netfilter: nf_conntrack_expect: skip expectations in other netns via proc Florian Westphal
2026-03-25 13:11 ` [PATCH net 13/14] netfilter: nf_conntrack_sip: fix use of uninitialized rtp_addr in process_sdp Florian Westphal
2026-03-25 13:11 ` [PATCH net 14/14] netfilter: ctnetlink: use netlink policy range checks Florian Westphal
2026-03-25 17:42 ` [PATCH net 00/14] netfilter: updates for net Florian Westphal
2026-03-25 17:48 ` Pablo Neira Ayuso
2026-03-25 17:51 ` Florian Westphal
2026-03-25 17:59 ` Pablo Neira Ayuso
-- strict thread matches above, loose matches on Subject: below --
2026-03-25 22:26 [PATCH net,v2 00/14] Netfilter fixes " Pablo Neira Ayuso
2026-03-25 22:26 ` [PATCH net 07/14] netfilter: nft_set_rbtree: revisit array resize logic Pablo Neira Ayuso
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260325131108.23045-8-fw@strlen.de \
--to=fw@strlen.de \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=kuba@kernel.org \
--cc=netdev@vger.kernel.org \
--cc=netfilter-devel@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=pablo@netfilter.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox