* [PATCH nf-next v4 1/5] netfilter: nf_conncount: callers must hold rcu read lock
2026-06-05 13:11 [PATCH nf-next v4 0/5] netfilter: nf_conncount: fix gc and rbtree bugs Florian Westphal
@ 2026-06-05 13:11 ` Florian Westphal
2026-06-05 13:11 ` [PATCH nf-next v4 2/5] netfilter: nf_conncount: use per nf_conncount_data spinlocks Florian Westphal
` (3 subsequent siblings)
4 siblings, 0 replies; 7+ messages in thread
From: Florian Westphal @ 2026-06-05 13:11 UTC (permalink / raw)
To: netfilter-devel; +Cc: Florian Westphal
rcu_derefence_raw() should not have been used here, it concealed this bug.
Its used because struct rb_node lacks __rcu annotated pointers, so plain
rcu_derefence causes sparse warnings.
The major tradeoff is that rcu_derefence_raw() doesn't warn when the caller
isn't in a rcu read section.
Extend the rcu read lock scope accordingly and cause sparse warnings,
those warnings are the lesser evil.
Fixes: 11efd5cb04a1 ("openvswitch: Support conntrack zone limit")
Closes: https://sashiko.dev/#/patchset/20260603230610.7900-1-fw%40strlen.de
Signed-off-by: Florian Westphal <fw@strlen.de>
---
v4: no changes.
net/netfilter/nf_conncount.c | 6 +++---
net/openvswitch/conntrack.c | 2 +-
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index ab28b47395bd..81e4a4e20df5 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -499,7 +499,7 @@ count_tree(struct net *net,
hash = jhash2(key, data->keylen, data->initval) % CONNCOUNT_SLOTS;
root = &data->root[hash];
- parent = rcu_dereference_raw(root->rb_node);
+ parent = rcu_dereference(root->rb_node);
while (parent) {
int diff;
@@ -507,9 +507,9 @@ count_tree(struct net *net,
diff = key_diff(key, rbconn->key, data->keylen);
if (diff < 0) {
- parent = rcu_dereference_raw(parent->rb_left);
+ parent = rcu_dereference(parent->rb_left);
} else if (diff > 0) {
- parent = rcu_dereference_raw(parent->rb_right);
+ parent = rcu_dereference(parent->rb_right);
} else {
int ret;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 7c9256572284..c6fd9c424e8f 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1797,10 +1797,10 @@ static int ovs_ct_limit_get_zone_limit(struct net *net,
} else {
rcu_read_lock();
limit = ct_limit_get(info, zone);
- rcu_read_unlock();
err = __ovs_ct_limit_get_zone_limit(
net, info->data, zone, limit, reply);
+ rcu_read_unlock();
if (err)
return err;
}
--
2.53.0
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH nf-next v4 2/5] netfilter: nf_conncount: use per nf_conncount_data spinlocks
2026-06-05 13:11 [PATCH nf-next v4 0/5] netfilter: nf_conncount: fix gc and rbtree bugs Florian Westphal
2026-06-05 13:11 ` [PATCH nf-next v4 1/5] netfilter: nf_conncount: callers must hold rcu read lock Florian Westphal
@ 2026-06-05 13:11 ` Florian Westphal
2026-06-05 14:40 ` Florian Westphal
2026-06-05 13:11 ` [PATCH nf-next v4 3/5] netfilter: nf_conncount: split count_tree_node rbtree walk into helper Florian Westphal
` (2 subsequent siblings)
4 siblings, 1 reply; 7+ messages in thread
From: Florian Westphal @ 2026-06-05 13:11 UTC (permalink / raw)
To: netfilter-devel; +Cc: Florian Westphal
This change replaces the rb_root with a new container structure.
Instead of an array of locks shared by all nf_conncount_data objects,
each tree gains its own dedicated lock.
Downside: nf_conncount_data increases in size. Before this change:
struct nf_conncount_data {
[..]
/* --- cacheline 33 boundary (2112 bytes) was 16 bytes ago --- */
unsigned int gc_tree; /* 2128 4 */
/* size: 2136, cachelines: 34, members: 7 */
/* padding: 4 */
After:
/* size: 4184, cachelines: 66, members: 7 */
/* padding: 4 */
On LOCKDEP enabled kernels, this is even worse:
/* size: 18560, cachelines: 290, members: 7 */
(due to lockdep map in each spinlock).
For this reason also switch to kvzalloc. The zeroing variant is needed
to not start with random (heap memory content) in the ->pending_trees
bitmap.
Followup patch will add and use a sequence counter.
Assisted-by: Claude:claude-sonnet-4-6
Signed-off-by: Florian Westphal <fw@strlen.de>
---
v4: remove global lock array and use per-tree lock.
net/netfilter/nf_conncount.c | 63 +++++++++++++++++++-----------------
1 file changed, 34 insertions(+), 29 deletions(-)
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 81e4a4e20df5..faecc05d34d4 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -54,12 +54,15 @@ struct nf_conncount_rb {
struct rcu_head rcu_head;
};
-static spinlock_t nf_conncount_locks[CONNCOUNT_SLOTS] __cacheline_aligned_in_smp;
+struct nf_conncount_root {
+ struct rb_root root;
+ spinlock_t lock;
+};
struct nf_conncount_data {
unsigned int keylen;
u32 initval;
- struct rb_root root[CONNCOUNT_SLOTS];
+ struct nf_conncount_root root[CONNCOUNT_SLOTS];
struct net *net;
struct work_struct gc_work;
unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)];
@@ -367,18 +370,19 @@ static void __tree_nodes_free(struct rcu_head *h)
kmem_cache_free(conncount_rb_cachep, rbconn);
}
-/* caller must hold tree nf_conncount_locks[] lock */
-static void tree_nodes_free(struct rb_root *root,
+static void tree_nodes_free(struct nf_conncount_root *root,
struct nf_conncount_rb *gc_nodes[],
unsigned int gc_count)
{
struct nf_conncount_rb *rbconn;
+ lockdep_assert_held(&root->lock);
+
while (gc_count) {
rbconn = gc_nodes[--gc_count];
spin_lock(&rbconn->list.list_lock);
if (!rbconn->list.count) {
- rb_erase(&rbconn->node, root);
+ rb_erase(&rbconn->node, &root->root);
call_rcu(&rbconn->rcu_head, __tree_nodes_free);
}
spin_unlock(&rbconn->list.list_lock);
@@ -396,10 +400,10 @@ insert_tree(struct net *net,
const struct sk_buff *skb,
u16 l3num,
struct nf_conncount_data *data,
- struct rb_root *root,
unsigned int hash,
const u32 *key)
{
+ struct nf_conncount_root *root = &data->root[hash];
struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
bool do_gc = true, refcounted = false;
@@ -410,10 +414,10 @@ insert_tree(struct net *net,
struct nf_conncount_rb *rbconn;
struct nf_conn *ct = NULL;
- spin_lock_bh(&nf_conncount_locks[hash]);
+ spin_lock_bh(&root->lock);
restart:
parent = NULL;
- rbnode = &(root->rb_node);
+ rbnode = &root->root.rb_node;
while (*rbnode) {
int diff;
rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);
@@ -475,12 +479,12 @@ insert_tree(struct net *net,
rbconn->list.count = count;
rb_link_node_rcu(&rbconn->node, parent, rbnode);
- rb_insert_color(&rbconn->node, root);
+ rb_insert_color(&rbconn->node, &root->root);
}
out_unlock:
if (refcounted)
nf_ct_put(ct);
- spin_unlock_bh(&nf_conncount_locks[hash]);
+ spin_unlock_bh(&root->lock);
return count;
}
@@ -491,7 +495,7 @@ count_tree(struct net *net,
struct nf_conncount_data *data,
const u32 *key)
{
- struct rb_root *root;
+ struct nf_conncount_root *root;
struct rb_node *parent;
struct nf_conncount_rb *rbconn;
unsigned int hash;
@@ -499,7 +503,7 @@ count_tree(struct net *net,
hash = jhash2(key, data->keylen, data->initval) % CONNCOUNT_SLOTS;
root = &data->root[hash];
- parent = rcu_dereference(root->rb_node);
+ parent = rcu_dereference(root->root.rb_node);
while (parent) {
int diff;
@@ -544,14 +548,14 @@ count_tree(struct net *net,
if (!skb)
return 0;
- return insert_tree(net, skb, l3num, data, root, hash, key);
+ return insert_tree(net, skb, l3num, data, hash, key);
}
static void tree_gc_worker(struct work_struct *work)
{
struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work);
struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn;
- struct rb_root *root;
+ struct nf_conncount_root *root;
struct rb_node *node;
unsigned int tree, next_tree, gc_count = 0;
@@ -560,7 +564,7 @@ static void tree_gc_worker(struct work_struct *work)
local_bh_disable();
rcu_read_lock();
- for (node = rb_first(root); node != NULL; node = rb_next(node)) {
+ for (node = rb_first(&root->root); node ; node = rb_next(node)) {
rbconn = rb_entry(node, struct nf_conncount_rb, node);
if (nf_conncount_gc_list(data->net, &rbconn->list))
gc_count++;
@@ -570,12 +574,12 @@ static void tree_gc_worker(struct work_struct *work)
cond_resched();
- spin_lock_bh(&nf_conncount_locks[tree]);
+ spin_lock_bh(&root->lock);
if (gc_count < ARRAY_SIZE(gc_nodes))
goto next; /* do not bother */
gc_count = 0;
- node = rb_first(root);
+ node = rb_first(&root->root);
while (node != NULL) {
rbconn = rb_entry(node, struct nf_conncount_rb, node);
node = rb_next(node);
@@ -602,7 +606,7 @@ static void tree_gc_worker(struct work_struct *work)
schedule_work(work);
}
- spin_unlock_bh(&nf_conncount_locks[tree]);
+ spin_unlock_bh(&root->lock);
}
/* Count and return number of conntrack entries in 'net' with particular 'key'.
@@ -620,6 +624,12 @@ unsigned int nf_conncount_count_skb(struct net *net,
}
EXPORT_SYMBOL_GPL(nf_conncount_count_skb);
+static void nf_conncount_root_init(struct nf_conncount_root *r)
+{
+ r->root = RB_ROOT;
+ spin_lock_init(&r->lock);
+}
+
struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen)
{
struct nf_conncount_data *data;
@@ -630,12 +640,12 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen
keylen == 0)
return ERR_PTR(-EINVAL);
- data = kmalloc_obj(*data);
+ data = kvzalloc_obj(*data);
if (!data)
return ERR_PTR(-ENOMEM);
for (i = 0; i < ARRAY_SIZE(data->root); ++i)
- data->root[i] = RB_ROOT;
+ nf_conncount_root_init(&data->root[i]);
data->keylen = keylen / sizeof(u32);
data->net = net;
@@ -655,15 +665,15 @@ void nf_conncount_cache_free(struct nf_conncount_list *list)
}
EXPORT_SYMBOL_GPL(nf_conncount_cache_free);
-static void destroy_tree(struct rb_root *r)
+static void destroy_tree(struct nf_conncount_root *r)
{
struct nf_conncount_rb *rbconn;
struct rb_node *node;
- while ((node = rb_first(r)) != NULL) {
+ while ((node = rb_first(&r->root)) != NULL) {
rbconn = rb_entry(node, struct nf_conncount_rb, node);
- rb_erase(node, r);
+ rb_erase(node, &r->root);
nf_conncount_cache_free(&rbconn->list);
@@ -680,17 +690,12 @@ void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data)
for (i = 0; i < ARRAY_SIZE(data->root); ++i)
destroy_tree(&data->root[i]);
- kfree(data);
+ kvfree(data);
}
EXPORT_SYMBOL_GPL(nf_conncount_destroy);
static int __init nf_conncount_modinit(void)
{
- int i;
-
- for (i = 0; i < CONNCOUNT_SLOTS; ++i)
- spin_lock_init(&nf_conncount_locks[i]);
-
conncount_conn_cachep = KMEM_CACHE(nf_conncount_tuple, 0);
if (!conncount_conn_cachep)
return -ENOMEM;
--
2.53.0
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH nf-next v4 3/5] netfilter: nf_conncount: split count_tree_node rbtree walk into helper
2026-06-05 13:11 [PATCH nf-next v4 0/5] netfilter: nf_conncount: fix gc and rbtree bugs Florian Westphal
2026-06-05 13:11 ` [PATCH nf-next v4 1/5] netfilter: nf_conncount: callers must hold rcu read lock Florian Westphal
2026-06-05 13:11 ` [PATCH nf-next v4 2/5] netfilter: nf_conncount: use per nf_conncount_data spinlocks Florian Westphal
@ 2026-06-05 13:11 ` Florian Westphal
2026-06-05 13:11 ` [PATCH nf-next v4 4/5] netfilter: nf_conncount: add sequence counter to detect tree modifications Florian Westphal
2026-06-05 13:11 ` [PATCH nf-next v4 5/5] netfilter: nf_conncount: gc and rcu fixes Florian Westphal
4 siblings, 0 replies; 7+ messages in thread
From: Florian Westphal @ 2026-06-05 13:11 UTC (permalink / raw)
To: netfilter-devel; +Cc: Florian Westphal
Add find_tree_node() helper that fetches a matching rbtree node.
This is used by followup patch to optionally search the tree again while
preventing concurrent updates via tree lock.
Signed-off-by: Florian Westphal <fw@strlen.de>
---
v4: prefer direct lockdep_is_held() use.
net/netfilter/nf_conncount.c | 102 +++++++++++++++++++++--------------
1 file changed, 62 insertions(+), 40 deletions(-)
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index faecc05d34d4..56ac64ecfb75 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -488,6 +488,34 @@ insert_tree(struct net *net,
return count;
}
+static struct nf_conncount_rb *
+find_tree_node(struct nf_conncount_root *root, struct nf_conncount_data *data,
+ const u32 *key)
+{
+ struct rb_node *parent;
+
+ parent = rcu_dereference_check(root->root.rb_node,
+ lockdep_is_held(&root->lock));
+ while (parent) {
+ struct nf_conncount_rb *rbconn;
+ int diff;
+
+ rbconn = rb_entry(parent, struct nf_conncount_rb, node);
+
+ diff = key_diff(key, rbconn->key, data->keylen);
+ if (diff < 0)
+ parent = rcu_dereference_check(parent->rb_left,
+ lockdep_is_held(&root->lock));
+ else if (diff > 0)
+ parent = rcu_dereference_check(parent->rb_right,
+ lockdep_is_held(&root->lock));
+ else
+ return rbconn;
+ }
+
+ return ERR_PTR(-ENOENT);
+}
+
static unsigned int
count_tree(struct net *net,
const struct sk_buff *skb,
@@ -496,59 +524,53 @@ count_tree(struct net *net,
const u32 *key)
{
struct nf_conncount_root *root;
- struct rb_node *parent;
struct nf_conncount_rb *rbconn;
unsigned int hash;
+ int ret;
hash = jhash2(key, data->keylen, data->initval) % CONNCOUNT_SLOTS;
root = &data->root[hash];
- parent = rcu_dereference(root->root.rb_node);
- while (parent) {
- int diff;
-
- rbconn = rb_entry(parent, struct nf_conncount_rb, node);
+ rbconn = find_tree_node(root, data, key);
+ if (IS_ERR(rbconn)) {
+ if (PTR_ERR(rbconn) == -ENOENT) {
+ if (!skb)
+ return 0;
- diff = key_diff(key, rbconn->key, data->keylen);
- if (diff < 0) {
- parent = rcu_dereference(parent->rb_left);
- } else if (diff > 0) {
- parent = rcu_dereference(parent->rb_right);
- } else {
- int ret;
+ return insert_tree(net, skb, l3num, data, hash, key);
+ }
+ DEBUG_NET_WARN_ON_ONCE(IS_ERR(rbconn));
+ }
- if (!skb) {
- nf_conncount_gc_list(net, &rbconn->list);
- return rbconn->list.count;
- }
+ DEBUG_NET_WARN_ON_ONCE(IS_ERR_OR_NULL(rbconn));
+ if (IS_ERR_OR_NULL(rbconn))
+ return 0;
- spin_lock_bh(&rbconn->list.list_lock);
- /* Node might be about to be free'd.
- * We need to defer to insert_tree() in this case.
- */
- if (rbconn->list.count == 0) {
- spin_unlock_bh(&rbconn->list.list_lock);
- break;
- }
+ if (!skb) {
+ nf_conncount_gc_list(net, &rbconn->list);
+ return rbconn->list.count;
+ }
- /* same source network -> be counted! */
- ret = __nf_conncount_add(net, skb, l3num, &rbconn->list);
- spin_unlock_bh(&rbconn->list.list_lock);
- if (ret && ret != -EEXIST) {
- return 0; /* hotdrop */
- } else {
- /* -EEXIST means add was skipped, update the list */
- if (ret == -EEXIST)
- nf_conncount_gc_list(net, &rbconn->list);
- return rbconn->list.count;
- }
- }
+ spin_lock_bh(&rbconn->list.list_lock);
+ /* Node might be about to be free'd.
+ * We need to defer to insert_tree() in this case.
+ */
+ if (rbconn->list.count == 0) {
+ spin_unlock_bh(&rbconn->list.list_lock);
+ return insert_tree(net, skb, l3num, data, hash, key);
}
- if (!skb)
- return 0;
+ /* same source network -> be counted! */
+ ret = __nf_conncount_add(net, skb, l3num, &rbconn->list);
+ spin_unlock_bh(&rbconn->list.list_lock);
+
+ if (ret && ret != -EEXIST)
+ return 0; /* hotdrop */
+ /* -EEXIST means add was skipped, update the list */
+ if (ret == -EEXIST)
+ nf_conncount_gc_list(net, &rbconn->list);
- return insert_tree(net, skb, l3num, data, hash, key);
+ return rbconn->list.count;
}
static void tree_gc_worker(struct work_struct *work)
--
2.53.0
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH nf-next v4 4/5] netfilter: nf_conncount: add sequence counter to detect tree modifications
2026-06-05 13:11 [PATCH nf-next v4 0/5] netfilter: nf_conncount: fix gc and rbtree bugs Florian Westphal
` (2 preceding siblings ...)
2026-06-05 13:11 ` [PATCH nf-next v4 3/5] netfilter: nf_conncount: split count_tree_node rbtree walk into helper Florian Westphal
@ 2026-06-05 13:11 ` Florian Westphal
2026-06-05 13:11 ` [PATCH nf-next v4 5/5] netfilter: nf_conncount: gc and rcu fixes Florian Westphal
4 siblings, 0 replies; 7+ messages in thread
From: Florian Westphal @ 2026-06-05 13:11 UTC (permalink / raw)
To: netfilter-devel; +Cc: Florian Westphal
There a two issues with traversal:
1. Key lookup (tree search) cannot detect concurrent modifications and may
not find a result in case of parallel modification.
2. Worker does a lockless iteration. This is never safe.
Add a sequence counter and re-do the lookup under lock in case the
tree was modified / seqcount changed.
gc_worker bugs are addressed in the next patch.
Signed-off-by: Florian Westphal <fw@strlen.de>
---
v4: use seqcount_spinlock_t.
net/netfilter/nf_conncount.c | 19 +++++++++++++++++++
1 file changed, 19 insertions(+)
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 56ac64ecfb75..1247cbe77740 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -57,6 +57,7 @@ struct nf_conncount_rb {
struct nf_conncount_root {
struct rb_root root;
spinlock_t lock;
+ seqcount_spinlock_t count;
};
struct nf_conncount_data {
@@ -382,8 +383,10 @@ static void tree_nodes_free(struct nf_conncount_root *root,
rbconn = gc_nodes[--gc_count];
spin_lock(&rbconn->list.list_lock);
if (!rbconn->list.count) {
+ write_seqcount_begin(&root->count);
rb_erase(&rbconn->node, &root->root);
call_rcu(&rbconn->rcu_head, __tree_nodes_free);
+ write_seqcount_end(&root->count);
}
spin_unlock(&rbconn->list.list_lock);
}
@@ -478,8 +481,10 @@ insert_tree(struct net *net,
count = 1;
rbconn->list.count = count;
+ write_seqcount_begin(&root->count);
rb_link_node_rcu(&rbconn->node, parent, rbnode);
rb_insert_color(&rbconn->node, &root->root);
+ write_seqcount_end(&root->count);
}
out_unlock:
if (refcounted)
@@ -492,6 +497,7 @@ static struct nf_conncount_rb *
find_tree_node(struct nf_conncount_root *root, struct nf_conncount_data *data,
const u32 *key)
{
+ unsigned int seq = read_seqcount_begin(&root->count);
struct rb_node *parent;
parent = rcu_dereference_check(root->root.rb_node,
@@ -511,8 +517,14 @@ find_tree_node(struct nf_conncount_root *root, struct nf_conncount_data *data,
lockdep_is_held(&root->lock));
else
return rbconn;
+
+ if (read_seqcount_retry(&root->count, seq))
+ return ERR_PTR(-EAGAIN);
}
+ if (read_seqcount_retry(&root->count, seq))
+ return ERR_PTR(-EAGAIN);
+
return ERR_PTR(-ENOENT);
}
@@ -533,6 +545,12 @@ count_tree(struct net *net,
rbconn = find_tree_node(root, data, key);
if (IS_ERR(rbconn)) {
+ if (PTR_ERR(rbconn) == -EAGAIN) {
+ spin_lock_bh(&root->lock);
+ rbconn = find_tree_node(root, data, key);
+ spin_unlock_bh(&root->lock);
+ }
+
if (PTR_ERR(rbconn) == -ENOENT) {
if (!skb)
return 0;
@@ -650,6 +668,7 @@ static void nf_conncount_root_init(struct nf_conncount_root *r)
{
r->root = RB_ROOT;
spin_lock_init(&r->lock);
+ seqcount_spinlock_init(&r->count, &r->lock);
}
struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen)
--
2.53.0
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH nf-next v4 5/5] netfilter: nf_conncount: gc and rcu fixes
2026-06-05 13:11 [PATCH nf-next v4 0/5] netfilter: nf_conncount: fix gc and rbtree bugs Florian Westphal
` (3 preceding siblings ...)
2026-06-05 13:11 ` [PATCH nf-next v4 4/5] netfilter: nf_conncount: add sequence counter to detect tree modifications Florian Westphal
@ 2026-06-05 13:11 ` Florian Westphal
4 siblings, 0 replies; 7+ messages in thread
From: Florian Westphal @ 2026-06-05 13:11 UTC (permalink / raw)
To: netfilter-devel; +Cc: Florian Westphal
Another drive-by AI review:
1) tree_gc_worker fails to wrap around after it can't find more pending
work. Update data->gc_tree unconditionally. If its 0, start from
the first pending tree (which can be 0).
2) tree_gc_worker() iterates the rbtree without lock. This is never
safe. Move iteration under the spinlock. If this takes too long
(resched needed), save key of next node, drop lock, resched, re-lock,
then search for the key (node). In very rare cases this node might
no longer exist, in that case we can just wait for next gc.
3) use disable_work_sync(), we don't want any restarts.
4) module exit function needs rcu_barrier before we zap the kmem cache.
Fixes: 5c789e131cbb ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search")
Closes: https://sashiko.dev/#/patchset/20260525182924.28456-1-fw%40strlen.de
Assisted-by: Claude:claude-sonnet-4-6
Signed-off-by: Florian Westphal <fw@strlen.de>
---
v4: minor adjustments due to earlier lock array removal.
net/netfilter/nf_conncount.c | 54 +++++++++++++++++++++---------------
1 file changed, 32 insertions(+), 22 deletions(-)
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 1247cbe77740..dd67004a5cc0 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -595,47 +595,54 @@ static void tree_gc_worker(struct work_struct *work)
{
struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work);
struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn;
+ unsigned int tree, next_tree, gc_count = 0;
struct nf_conncount_root *root;
struct rb_node *node;
- unsigned int tree, next_tree, gc_count = 0;
+
+ if (data->gc_tree == 0)
+ data->gc_tree = find_first_bit(data->pending_trees, CONNCOUNT_SLOTS);
tree = data->gc_tree % CONNCOUNT_SLOTS;
root = &data->root[tree];
- local_bh_disable();
- rcu_read_lock();
- for (node = rb_first(&root->root); node ; node = rb_next(node)) {
- rbconn = rb_entry(node, struct nf_conncount_rb, node);
- if (nf_conncount_gc_list(data->net, &rbconn->list))
- gc_count++;
- }
- rcu_read_unlock();
- local_bh_enable();
-
- cond_resched();
-
spin_lock_bh(&root->lock);
- if (gc_count < ARRAY_SIZE(gc_nodes))
- goto next; /* do not bother */
-
gc_count = 0;
node = rb_first(&root->root);
while (node != NULL) {
+ u32 key[MAX_KEYLEN];
+ bool drop_lock;
+
rbconn = rb_entry(node, struct nf_conncount_rb, node);
node = rb_next(node);
- if (rbconn->list.count > 0)
- continue;
+ if (nf_conncount_gc_list(data->net, &rbconn->list))
+ gc_nodes[gc_count++] = rbconn;
+
+ drop_lock = need_resched();
- gc_nodes[gc_count++] = rbconn;
- if (gc_count >= ARRAY_SIZE(gc_nodes)) {
+ if (drop_lock || gc_count >= ARRAY_SIZE(gc_nodes)) {
tree_nodes_free(root, gc_nodes, gc_count);
gc_count = 0;
}
+
+ if (!drop_lock || !node)
+ continue;
+
+ rbconn = rb_entry(node, struct nf_conncount_rb, node);
+ memcpy(key, rbconn->key, sizeof(key));
+ spin_unlock_bh(&root->lock);
+
+ cond_resched();
+
+ spin_lock_bh(&root->lock);
+ rbconn = find_tree_node(root, data, key);
+ if (IS_ERR_OR_NULL(rbconn)) /* rbconn was reaped */
+ break;
+
+ node = &rbconn->node;
}
tree_nodes_free(root, gc_nodes, gc_count);
-next:
clear_bit(tree, data->pending_trees);
next_tree = (tree + 1) % CONNCOUNT_SLOTS;
@@ -644,6 +651,8 @@ static void tree_gc_worker(struct work_struct *work)
if (next_tree < CONNCOUNT_SLOTS) {
data->gc_tree = next_tree;
schedule_work(work);
+ } else {
+ data->gc_tree = 0;
}
spin_unlock_bh(&root->lock);
@@ -726,7 +735,7 @@ void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data)
{
unsigned int i;
- cancel_work_sync(&data->gc_work);
+ disable_work_sync(&data->gc_work);
for (i = 0; i < ARRAY_SIZE(data->root); ++i)
destroy_tree(&data->root[i]);
@@ -752,6 +761,7 @@ static int __init nf_conncount_modinit(void)
static void __exit nf_conncount_modexit(void)
{
+ rcu_barrier();
kmem_cache_destroy(conncount_conn_cachep);
kmem_cache_destroy(conncount_rb_cachep);
}
--
2.53.0
^ permalink raw reply related [flat|nested] 7+ messages in thread