* [PATCH net 1/3] netfilter: nft_ct: Use __refcount_inc() for per-CPU nft_ct_pcpu_template.
2025-03-06 15:34 [PATCH net 0/3] Netfilter fixes for net Pablo Neira Ayuso
@ 2025-03-06 15:34 ` Pablo Neira Ayuso
2025-03-07 2:30 ` patchwork-bot+netdevbpf
2025-03-06 15:34 ` [PATCH net 2/3] netfilter: nf_conncount: garbage collection is not skipped when jiffies wrap around Pablo Neira Ayuso
2025-03-06 15:34 ` [PATCH net 3/3] netfilter: nf_tables: make destruction work queue pernet Pablo Neira Ayuso
2 siblings, 1 reply; 5+ messages in thread
From: Pablo Neira Ayuso @ 2025-03-06 15:34 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
nft_ct_pcpu_template is a per-CPU variable and relies on disabled BH for its
locking. The refcounter is read and if its value is set to one then the
refcounter is incremented and variable is used - otherwise it is already
in use and left untouched.
Without per-CPU locking in local_bh_disable() on PREEMPT_RT the
read-then-increment operation is not atomic and therefore racy.
This can be avoided by using unconditionally __refcount_inc() which will
increment counter and return the old value as an atomic operation.
In case the returned counter is not one, the variable is in use and we
need to decrement counter. Otherwise we can use it.
Use __refcount_inc() instead of read and a conditional increment.
Fixes: edee4f1e9245 ("netfilter: nft_ct: add zone id set support")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nft_ct.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 2e59aba681a1..d526e69a2a2b 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -230,6 +230,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr,
enum ip_conntrack_info ctinfo;
u16 value = nft_reg_load16(®s->data[priv->sreg]);
struct nf_conn *ct;
+ int oldcnt;
ct = nf_ct_get(skb, &ctinfo);
if (ct) /* already tracked */
@@ -250,10 +251,11 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr,
ct = this_cpu_read(nft_ct_pcpu_template);
- if (likely(refcount_read(&ct->ct_general.use) == 1)) {
- refcount_inc(&ct->ct_general.use);
+ __refcount_inc(&ct->ct_general.use, &oldcnt);
+ if (likely(oldcnt == 1)) {
nf_ct_zone_add(ct, &zone);
} else {
+ refcount_dec(&ct->ct_general.use);
/* previous skb got queued to userspace, allocate temporary
* one until percpu template can be reused.
*/
--
2.30.2
^ permalink raw reply related [flat|nested] 5+ messages in thread* [PATCH net 3/3] netfilter: nf_tables: make destruction work queue pernet
2025-03-06 15:34 [PATCH net 0/3] Netfilter fixes for net Pablo Neira Ayuso
2025-03-06 15:34 ` [PATCH net 1/3] netfilter: nft_ct: Use __refcount_inc() for per-CPU nft_ct_pcpu_template Pablo Neira Ayuso
2025-03-06 15:34 ` [PATCH net 2/3] netfilter: nf_conncount: garbage collection is not skipped when jiffies wrap around Pablo Neira Ayuso
@ 2025-03-06 15:34 ` Pablo Neira Ayuso
2 siblings, 0 replies; 5+ messages in thread
From: Pablo Neira Ayuso @ 2025-03-06 15:34 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw, horms
From: Florian Westphal <fw@strlen.de>
The call to flush_work before tearing down a table from the netlink
notifier was supposed to make sure that all earlier updates (e.g. rule
add) that might reference that table have been processed.
Unfortunately, flush_work() waits for the last queued instance.
This could be an instance that is different from the one that we must
wait for.
This is because transactions are protected with a pernet mutex, but the
work item is global, so holding the transaction mutex doesn't prevent
another netns from queueing more work.
Make the work item pernet so that flush_work() will wait for all
transactions queued from this netns.
A welcome side effect is that we no longer need to wait for transaction
objects from foreign netns.
The gc work queue is still global. This seems to be ok because nft_set
structures are reference counted and each container structure owns a
reference on the net namespace.
The destroy_list is still protected by a global spinlock rather than
pernet one but the hold time is very short anyway.
v2: call cancel_work_sync before reaping the remaining tables (Pablo).
Fixes: 9f6958ba2e90 ("netfilter: nf_tables: unconditionally flush pending work before notifier")
Reported-by: syzbot+5d8c5789c8cb076b2c25@syzkaller.appspotmail.com
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/net/netfilter/nf_tables.h | 4 +++-
net/netfilter/nf_tables_api.c | 24 ++++++++++++++----------
net/netfilter/nft_compat.c | 8 ++++----
3 files changed, 21 insertions(+), 15 deletions(-)
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 60d5dcdb289c..803d5f1601f9 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1891,7 +1891,7 @@ void nft_chain_filter_fini(void);
void __init nft_chain_route_init(void);
void nft_chain_route_fini(void);
-void nf_tables_trans_destroy_flush_work(void);
+void nf_tables_trans_destroy_flush_work(struct net *net);
int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result);
__be64 nf_jiffies64_to_msecs(u64 input);
@@ -1905,6 +1905,7 @@ static inline int nft_request_module(struct net *net, const char *fmt, ...) { re
struct nftables_pernet {
struct list_head tables;
struct list_head commit_list;
+ struct list_head destroy_list;
struct list_head commit_set_list;
struct list_head binding_list;
struct list_head module_list;
@@ -1915,6 +1916,7 @@ struct nftables_pernet {
unsigned int base_seq;
unsigned int gc_seq;
u8 validate_state;
+ struct work_struct destroy_work;
};
extern unsigned int nf_tables_net_id;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index a34de9c17cf1..c2df81b7e950 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -34,7 +34,6 @@ unsigned int nf_tables_net_id __read_mostly;
static LIST_HEAD(nf_tables_expressions);
static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
-static LIST_HEAD(nf_tables_destroy_list);
static LIST_HEAD(nf_tables_gc_list);
static DEFINE_SPINLOCK(nf_tables_destroy_list_lock);
static DEFINE_SPINLOCK(nf_tables_gc_list_lock);
@@ -125,7 +124,6 @@ static void nft_validate_state_update(struct nft_table *table, u8 new_validate_s
table->validate_state = new_validate_state;
}
static void nf_tables_trans_destroy_work(struct work_struct *w);
-static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work);
static void nft_trans_gc_work(struct work_struct *work);
static DECLARE_WORK(trans_gc_work, nft_trans_gc_work);
@@ -10006,11 +10004,12 @@ static void nft_commit_release(struct nft_trans *trans)
static void nf_tables_trans_destroy_work(struct work_struct *w)
{
+ struct nftables_pernet *nft_net = container_of(w, struct nftables_pernet, destroy_work);
struct nft_trans *trans, *next;
LIST_HEAD(head);
spin_lock(&nf_tables_destroy_list_lock);
- list_splice_init(&nf_tables_destroy_list, &head);
+ list_splice_init(&nft_net->destroy_list, &head);
spin_unlock(&nf_tables_destroy_list_lock);
if (list_empty(&head))
@@ -10024,9 +10023,11 @@ static void nf_tables_trans_destroy_work(struct work_struct *w)
}
}
-void nf_tables_trans_destroy_flush_work(void)
+void nf_tables_trans_destroy_flush_work(struct net *net)
{
- flush_work(&trans_destroy_work);
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ flush_work(&nft_net->destroy_work);
}
EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work);
@@ -10484,11 +10485,11 @@ static void nf_tables_commit_release(struct net *net)
trans->put_net = true;
spin_lock(&nf_tables_destroy_list_lock);
- list_splice_tail_init(&nft_net->commit_list, &nf_tables_destroy_list);
+ list_splice_tail_init(&nft_net->commit_list, &nft_net->destroy_list);
spin_unlock(&nf_tables_destroy_list_lock);
nf_tables_module_autoload_cleanup(net);
- schedule_work(&trans_destroy_work);
+ schedule_work(&nft_net->destroy_work);
mutex_unlock(&nft_net->commit_mutex);
}
@@ -11853,7 +11854,7 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event,
gc_seq = nft_gc_seq_begin(nft_net);
- nf_tables_trans_destroy_flush_work();
+ nf_tables_trans_destroy_flush_work(net);
again:
list_for_each_entry(table, &nft_net->tables, list) {
if (nft_table_has_owner(table) &&
@@ -11895,6 +11896,7 @@ static int __net_init nf_tables_init_net(struct net *net)
INIT_LIST_HEAD(&nft_net->tables);
INIT_LIST_HEAD(&nft_net->commit_list);
+ INIT_LIST_HEAD(&nft_net->destroy_list);
INIT_LIST_HEAD(&nft_net->commit_set_list);
INIT_LIST_HEAD(&nft_net->binding_list);
INIT_LIST_HEAD(&nft_net->module_list);
@@ -11903,6 +11905,7 @@ static int __net_init nf_tables_init_net(struct net *net)
nft_net->base_seq = 1;
nft_net->gc_seq = 0;
nft_net->validate_state = NFT_VALIDATE_SKIP;
+ INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work);
return 0;
}
@@ -11931,14 +11934,17 @@ static void __net_exit nf_tables_exit_net(struct net *net)
if (!list_empty(&nft_net->module_list))
nf_tables_module_autoload_cleanup(net);
+ cancel_work_sync(&nft_net->destroy_work);
__nft_release_tables(net);
nft_gc_seq_end(nft_net, gc_seq);
mutex_unlock(&nft_net->commit_mutex);
+
WARN_ON_ONCE(!list_empty(&nft_net->tables));
WARN_ON_ONCE(!list_empty(&nft_net->module_list));
WARN_ON_ONCE(!list_empty(&nft_net->notify_list));
+ WARN_ON_ONCE(!list_empty(&nft_net->destroy_list));
}
static void nf_tables_exit_batch(struct list_head *net_exit_list)
@@ -12029,10 +12035,8 @@ static void __exit nf_tables_module_exit(void)
unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
nft_chain_filter_fini();
nft_chain_route_fini();
- nf_tables_trans_destroy_flush_work();
unregister_pernet_subsys(&nf_tables_net_ops);
cancel_work_sync(&trans_gc_work);
- cancel_work_sync(&trans_destroy_work);
rcu_barrier();
rhltable_destroy(&nft_objname_ht);
nf_tables_core_module_exit();
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 7ca4f0d21fe2..72711d62fddf 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -228,7 +228,7 @@ static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv)
return 0;
}
-static void nft_compat_wait_for_destructors(void)
+static void nft_compat_wait_for_destructors(struct net *net)
{
/* xtables matches or targets can have side effects, e.g.
* creation/destruction of /proc files.
@@ -236,7 +236,7 @@ static void nft_compat_wait_for_destructors(void)
* work queue. If we have pending invocations we thus
* need to wait for those to finish.
*/
- nf_tables_trans_destroy_flush_work();
+ nf_tables_trans_destroy_flush_work(net);
}
static int
@@ -262,7 +262,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
- nft_compat_wait_for_destructors();
+ nft_compat_wait_for_destructors(ctx->net);
ret = xt_check_target(&par, size, proto, inv);
if (ret < 0) {
@@ -515,7 +515,7 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
- nft_compat_wait_for_destructors();
+ nft_compat_wait_for_destructors(ctx->net);
return xt_check_match(&par, size, proto, inv);
}
--
2.30.2
^ permalink raw reply related [flat|nested] 5+ messages in thread