From: Pablo Neira Ayuso <pablo@netfilter.org>
To: netfilter-devel@vger.kernel.org
Cc: davem@davemloft.net, netdev@vger.kernel.org, kuba@kernel.org,
pabeni@redhat.com, edumazet@google.com, fw@strlen.de,
horms@kernel.org
Subject: [PATCH net-next 04/11] netfilter: nf_conncount: use per nf_conncount_data spinlocks
Date: Sun, 14 Jun 2026 13:45:58 +0200 [thread overview]
Message-ID: <20260614114605.474783-5-pablo@netfilter.org> (raw)
In-Reply-To: <20260614114605.474783-1-pablo@netfilter.org>
From: Florian Westphal <fw@strlen.de>
This change replaces the bare rb_root with a new container structure, struct nf_conncount_root, which holds the rb_root together with a spinlock.
Instead of an array of locks shared by all nf_conncount_data objects,
each tree gains its own dedicated lock.
Downside: nf_conncount_data increases in size. Before this change:
struct nf_conncount_data {
[..]
/* --- cacheline 33 boundary (2112 bytes) was 16 bytes ago --- */
unsigned int gc_tree; /* 2128 4 */
/* size: 2136, cachelines: 34, members: 7 */
/* padding: 4 */
After:
/* size: 4184, cachelines: 66, members: 7 */
/* padding: 4 */
On LOCKDEP-enabled kernels, this is even worse:
/* size: 18560, cachelines: 290, members: 7 */
(due to the lockdep map embedded in each spinlock).
For this reason also switch to kvzalloc. The zeroing variant is needed
so that the ->pending_trees bitmap does not start out with random heap
memory content.
A follow-up patch will add and use a sequence counter.
Assisted-by: Claude:claude-sonnet-4-6
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_conncount.c | 63 +++++++++++++++++++-----------------
1 file changed, 34 insertions(+), 29 deletions(-)
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 81e4a4e20df5..faecc05d34d4 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -54,12 +54,15 @@ struct nf_conncount_rb {
struct rcu_head rcu_head;
};
-static spinlock_t nf_conncount_locks[CONNCOUNT_SLOTS] __cacheline_aligned_in_smp;
+struct nf_conncount_root {
+ struct rb_root root;
+ spinlock_t lock;
+};
struct nf_conncount_data {
unsigned int keylen;
u32 initval;
- struct rb_root root[CONNCOUNT_SLOTS];
+ struct nf_conncount_root root[CONNCOUNT_SLOTS];
struct net *net;
struct work_struct gc_work;
unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)];
@@ -367,18 +370,19 @@ static void __tree_nodes_free(struct rcu_head *h)
kmem_cache_free(conncount_rb_cachep, rbconn);
}
-/* caller must hold tree nf_conncount_locks[] lock */
-static void tree_nodes_free(struct rb_root *root,
+static void tree_nodes_free(struct nf_conncount_root *root,
struct nf_conncount_rb *gc_nodes[],
unsigned int gc_count)
{
struct nf_conncount_rb *rbconn;
+ lockdep_assert_held(&root->lock);
+
while (gc_count) {
rbconn = gc_nodes[--gc_count];
spin_lock(&rbconn->list.list_lock);
if (!rbconn->list.count) {
- rb_erase(&rbconn->node, root);
+ rb_erase(&rbconn->node, &root->root);
call_rcu(&rbconn->rcu_head, __tree_nodes_free);
}
spin_unlock(&rbconn->list.list_lock);
@@ -396,10 +400,10 @@ insert_tree(struct net *net,
const struct sk_buff *skb,
u16 l3num,
struct nf_conncount_data *data,
- struct rb_root *root,
unsigned int hash,
const u32 *key)
{
+ struct nf_conncount_root *root = &data->root[hash];
struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
bool do_gc = true, refcounted = false;
@@ -410,10 +414,10 @@ insert_tree(struct net *net,
struct nf_conncount_rb *rbconn;
struct nf_conn *ct = NULL;
- spin_lock_bh(&nf_conncount_locks[hash]);
+ spin_lock_bh(&root->lock);
restart:
parent = NULL;
- rbnode = &(root->rb_node);
+ rbnode = &root->root.rb_node;
while (*rbnode) {
int diff;
rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);
@@ -475,12 +479,12 @@ insert_tree(struct net *net,
rbconn->list.count = count;
rb_link_node_rcu(&rbconn->node, parent, rbnode);
- rb_insert_color(&rbconn->node, root);
+ rb_insert_color(&rbconn->node, &root->root);
}
out_unlock:
if (refcounted)
nf_ct_put(ct);
- spin_unlock_bh(&nf_conncount_locks[hash]);
+ spin_unlock_bh(&root->lock);
return count;
}
@@ -491,7 +495,7 @@ count_tree(struct net *net,
struct nf_conncount_data *data,
const u32 *key)
{
- struct rb_root *root;
+ struct nf_conncount_root *root;
struct rb_node *parent;
struct nf_conncount_rb *rbconn;
unsigned int hash;
@@ -499,7 +503,7 @@ count_tree(struct net *net,
hash = jhash2(key, data->keylen, data->initval) % CONNCOUNT_SLOTS;
root = &data->root[hash];
- parent = rcu_dereference(root->rb_node);
+ parent = rcu_dereference(root->root.rb_node);
while (parent) {
int diff;
@@ -544,14 +548,14 @@ count_tree(struct net *net,
if (!skb)
return 0;
- return insert_tree(net, skb, l3num, data, root, hash, key);
+ return insert_tree(net, skb, l3num, data, hash, key);
}
static void tree_gc_worker(struct work_struct *work)
{
struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work);
struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn;
- struct rb_root *root;
+ struct nf_conncount_root *root;
struct rb_node *node;
unsigned int tree, next_tree, gc_count = 0;
@@ -560,7 +564,7 @@ static void tree_gc_worker(struct work_struct *work)
local_bh_disable();
rcu_read_lock();
- for (node = rb_first(root); node != NULL; node = rb_next(node)) {
+ for (node = rb_first(&root->root); node ; node = rb_next(node)) {
rbconn = rb_entry(node, struct nf_conncount_rb, node);
if (nf_conncount_gc_list(data->net, &rbconn->list))
gc_count++;
@@ -570,12 +574,12 @@ static void tree_gc_worker(struct work_struct *work)
cond_resched();
- spin_lock_bh(&nf_conncount_locks[tree]);
+ spin_lock_bh(&root->lock);
if (gc_count < ARRAY_SIZE(gc_nodes))
goto next; /* do not bother */
gc_count = 0;
- node = rb_first(root);
+ node = rb_first(&root->root);
while (node != NULL) {
rbconn = rb_entry(node, struct nf_conncount_rb, node);
node = rb_next(node);
@@ -602,7 +606,7 @@ static void tree_gc_worker(struct work_struct *work)
schedule_work(work);
}
- spin_unlock_bh(&nf_conncount_locks[tree]);
+ spin_unlock_bh(&root->lock);
}
/* Count and return number of conntrack entries in 'net' with particular 'key'.
@@ -620,6 +624,12 @@ unsigned int nf_conncount_count_skb(struct net *net,
}
EXPORT_SYMBOL_GPL(nf_conncount_count_skb);
+static void nf_conncount_root_init(struct nf_conncount_root *r)
+{
+ r->root = RB_ROOT;
+ spin_lock_init(&r->lock);
+}
+
struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen)
{
struct nf_conncount_data *data;
@@ -630,12 +640,12 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen
keylen == 0)
return ERR_PTR(-EINVAL);
- data = kmalloc_obj(*data);
+ data = kvzalloc_obj(*data);
if (!data)
return ERR_PTR(-ENOMEM);
for (i = 0; i < ARRAY_SIZE(data->root); ++i)
- data->root[i] = RB_ROOT;
+ nf_conncount_root_init(&data->root[i]);
data->keylen = keylen / sizeof(u32);
data->net = net;
@@ -655,15 +665,15 @@ void nf_conncount_cache_free(struct nf_conncount_list *list)
}
EXPORT_SYMBOL_GPL(nf_conncount_cache_free);
-static void destroy_tree(struct rb_root *r)
+static void destroy_tree(struct nf_conncount_root *r)
{
struct nf_conncount_rb *rbconn;
struct rb_node *node;
- while ((node = rb_first(r)) != NULL) {
+ while ((node = rb_first(&r->root)) != NULL) {
rbconn = rb_entry(node, struct nf_conncount_rb, node);
- rb_erase(node, r);
+ rb_erase(node, &r->root);
nf_conncount_cache_free(&rbconn->list);
@@ -680,17 +690,12 @@ void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data)
for (i = 0; i < ARRAY_SIZE(data->root); ++i)
destroy_tree(&data->root[i]);
- kfree(data);
+ kvfree(data);
}
EXPORT_SYMBOL_GPL(nf_conncount_destroy);
static int __init nf_conncount_modinit(void)
{
- int i;
-
- for (i = 0; i < CONNCOUNT_SLOTS; ++i)
- spin_lock_init(&nf_conncount_locks[i]);
-
conncount_conn_cachep = KMEM_CACHE(nf_conncount_tuple, 0);
if (!conncount_conn_cachep)
return -ENOMEM;
--
2.47.3
next prev parent reply other threads:[~2026-06-14 11:46 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-06-14 11:45 [PATCH net-next 00/11] Netfilter/IPVS updates for net-next Pablo Neira Ayuso
2026-06-14 11:45 ` [PATCH net-next 01/11] ipvs: Replace use of system_unbound_wq with system_dfl_long_wq Pablo Neira Ayuso
2026-06-14 11:45 ` [PATCH net-next 02/11] netfilter: nf_tables: use DEBUG_NET_WARN_ON_ONCE in packet and control paths Pablo Neira Ayuso
2026-06-14 11:45 ` [PATCH net-next 03/11] netfilter: nf_conncount: callers must hold rcu read lock Pablo Neira Ayuso
2026-06-14 11:45 ` Pablo Neira Ayuso [this message]
2026-06-14 11:45 ` [PATCH net-next 05/11] netfilter: nf_conncount: split count_tree_node rbtree walk into helper Pablo Neira Ayuso
2026-06-14 11:46 ` [PATCH net-next 06/11] netfilter: nf_conncount: add sequence counter to detect tree modifications Pablo Neira Ayuso
2026-06-14 11:46 ` [PATCH net-next 07/11] netfilter: nf_conncount: gc and rcu fixes Pablo Neira Ayuso
2026-06-14 11:46 ` [PATCH net-next 08/11] netfilter: conntrack: check NULL when retrieving ct extension Pablo Neira Ayuso
2026-06-14 11:46 ` [PATCH net-next 09/11] netfilter: flowtable: bail out if forward path cannot be discovered Pablo Neira Ayuso
2026-06-14 11:46 ` [PATCH net-next 10/11] ipvs: fix doc syntax for conn_max sysctl Pablo Neira Ayuso
2026-06-14 11:46 ` [PATCH net-next 11/11] netfilter: nf_dup_netdev: add nf_dev_xmit_recursion*() helpers and use them Pablo Neira Ayuso
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260614114605.474783-5-pablo@netfilter.org \
--to=pablo@netfilter.org \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=fw@strlen.de \
--cc=horms@kernel.org \
--cc=kuba@kernel.org \
--cc=netdev@vger.kernel.org \
--cc=netfilter-devel@vger.kernel.org \
--cc=pabeni@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox