* [PATCH nf 2/5] netfilter: nf_tables: GC transaction API to avoid race with control plane
2023-08-09 13:15 [PATCH nf 1/5] netfilter: nf_tables: don't skip expired elements during walk Pablo Neira Ayuso
@ 2023-08-09 13:15 ` Pablo Neira Ayuso
2023-08-09 13:15 ` [PATCH nf 3/5] netfilter: nf_tables: adapt set backend to use GC transaction API Pablo Neira Ayuso
` (2 subsequent siblings)
3 siblings, 0 replies; 6+ messages in thread
From: Pablo Neira Ayuso @ 2023-08-09 13:15 UTC (permalink / raw)
To: netfilter-devel
The set types 'rhashtable' and 'rbtree' use a GC worker to reclaim memory.
From system work queue, in periodic intervals, a scan of the table is done.
The major caveat here is that the nft transaction mutex is not held.
This causes a race between control plane and GC when they attempt to
delete the same element.
We cannot grab the netlink mutex from the work queue, because the
GC work queue. Else we would run into following deadlock:
cpu 1 cpu2
GC work transaction comes in , lock nft mutex
`acquire nft mutex // BLOCKS
transaction asks to remove the set
set destruction calls cancel_work_sync()
cancel_work_sync will now block forever, because its waiting for the
mutex the caller already owns.
This patch adds a new API that deals with garbage collection in two
steps:
1) Lockless GC of expired elements sets on the NFT_SET_ELEM_DEAD_BIT
so they are not visible via lookup. Annotate current GC sequence in
the GC transaction. Enqueue GC transaction work as soon as it is
full. If ruleset is updated, then GC is aborted and retried later.
2) When work callback runs then grab the mutex. If GC sequence has
changed then this GC translation lost race with control plane,
abort it as it contains stale references to objects and let GC try
again later. If the ruleset is intact, then this GC transaction
deactivates and removes the elements and it uses call_rcu() to
destroy elements.
Note that no elements are removed from GC lockless path, the _DEAD bit
is set and pointers are collected. GC catchall does not remove the
elements anymore too. There is a new set->dead flag that is set on to
abort the GC transaction to deal with set->ops->destroy() path which
removes the remaining elements in the set from commit_release, where no
mutex is held.
To deal with GC when mutex is held, which allows safe deactivate and
removal, add sync GC API which releases the set element object via
call_rcu(). This is used by rbtree and pipapo backends which also
perform garbage collection from control plane path.
Since element removal from sets can happen from control plane and
element garbage collection/timeout, it is necessary to keep the set
structure alive until all elements have been deactivated and destroyed.
We cannot do a cancel_work_sync or flush_work in nft_set_destroy
because its called with the transaction mutex held, but the
aforementioned async work queue might be blocked on the very mutex
that nft_set_destroy() callchain is sitting on.
This gives us the choice of ABBA deadlock or UaF.
To avoid both, add set->refs refcount_t member. The GC API can then
increment the set refcount and release it once the elements have been
free'd.
Set backends are adapted to use the GC transaction API in a follow up
patch entitled:
("netfilter: nf_tables: use gc transaction API in set backends")
This is joint work with Florian Westphal.
Fixes: cfed7e1b1f8e ("netfilter: nf_tables: add set garbage collection helpers")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/net/netfilter/nf_tables.h | 64 +++++++-
net/netfilter/nf_tables_api.c | 248 ++++++++++++++++++++++++++++--
2 files changed, 300 insertions(+), 12 deletions(-)
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 640441a2f926..7256e9c80477 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -512,6 +512,7 @@ struct nft_set_elem_expr {
*
* @list: table set list node
* @bindings: list of set bindings
+ * @refs: internal refcounting for async set destruction
* @table: table this set belongs to
* @net: netnamespace this set belongs to
* @name: name of the set
@@ -541,6 +542,7 @@ struct nft_set_elem_expr {
struct nft_set {
struct list_head list;
struct list_head bindings;
+ refcount_t refs;
struct nft_table *table;
possible_net_t net;
char *name;
@@ -562,7 +564,8 @@ struct nft_set {
struct list_head pending_update;
/* runtime data below here */
const struct nft_set_ops *ops ____cacheline_aligned;
- u16 flags:14,
+ u16 flags:13,
+ dead:1,
genmask:2;
u8 klen;
u8 dlen;
@@ -1592,6 +1595,32 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext)
clear_bit(NFT_SET_ELEM_BUSY_BIT, word);
}
+#define NFT_SET_ELEM_DEAD_MASK (1 << 3)
+
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+#define NFT_SET_ELEM_DEAD_BIT 3
+#elif defined(__BIG_ENDIAN_BITFIELD)
+#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 3)
+#else
+#error
+#endif
+
+static inline void nft_set_elem_dead(struct nft_set_ext *ext)
+{
+ unsigned long *word = (unsigned long *)ext;
+
+ BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
+ set_bit(NFT_SET_ELEM_DEAD_BIT, word);
+}
+
+static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext)
+{
+ unsigned long *word = (unsigned long *)ext;
+
+ BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
+ return test_bit(NFT_SET_ELEM_DEAD_BIT, word);
+}
+
/**
* struct nft_trans - nf_tables object update in transaction
*
@@ -1732,6 +1761,38 @@ struct nft_trans_flowtable {
#define nft_trans_flowtable_flags(trans) \
(((struct nft_trans_flowtable *)trans->data)->flags)
+#define NFT_TRANS_GC_BATCHCOUNT 256
+
+struct nft_trans_gc {
+ struct list_head list;
+ struct net *net;
+ struct nft_set *set;
+ u32 seq;
+ u8 count;
+ void *priv[NFT_TRANS_GC_BATCHCOUNT];
+ struct rcu_head rcu;
+};
+
+struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
+ unsigned int gc_seq, gfp_t gfp);
+void nft_trans_gc_destroy(struct nft_trans_gc *trans);
+
+struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
+ unsigned int gc_seq, gfp_t gfp);
+void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc);
+
+struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp);
+void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans);
+
+void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv);
+
+struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc,
+ unsigned int gc_seq);
+
+void nft_setelem_data_deactivate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_set_elem *elem);
+
int __init nft_chain_filter_init(void);
void nft_chain_filter_fini(void);
@@ -1758,6 +1819,7 @@ struct nftables_pernet {
struct mutex commit_mutex;
u64 table_handle;
unsigned int base_seq;
+ unsigned int gc_seq;
};
extern unsigned int nf_tables_net_id;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index b4321869e5c6..c28bacb9479b 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -31,7 +31,9 @@ static LIST_HEAD(nf_tables_expressions);
static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
static LIST_HEAD(nf_tables_destroy_list);
+static LIST_HEAD(nf_tables_gc_list);
static DEFINE_SPINLOCK(nf_tables_destroy_list_lock);
+static DEFINE_SPINLOCK(nf_tables_gc_list_lock);
enum {
NFT_VALIDATE_SKIP = 0,
@@ -120,6 +122,9 @@ static void nft_validate_state_update(struct nft_table *table, u8 new_validate_s
static void nf_tables_trans_destroy_work(struct work_struct *w);
static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work);
+static void nft_trans_gc_work(struct work_struct *work);
+static DECLARE_WORK(trans_gc_work, nft_trans_gc_work);
+
static void nft_ctx_init(struct nft_ctx *ctx,
struct net *net,
const struct sk_buff *skb,
@@ -582,10 +587,6 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
return __nft_trans_set_add(ctx, msg_type, set, NULL);
}
-static void nft_setelem_data_deactivate(const struct net *net,
- const struct nft_set *set,
- struct nft_set_elem *elem);
-
static int nft_mapelem_deactivate(const struct nft_ctx *ctx,
struct nft_set *set,
const struct nft_set_iter *iter,
@@ -5055,6 +5056,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
INIT_LIST_HEAD(&set->bindings);
INIT_LIST_HEAD(&set->catchall_list);
+ refcount_set(&set->refs, 1);
set->table = table;
write_pnet(&set->net, net);
set->ops = ops;
@@ -5122,6 +5124,14 @@ static void nft_set_catchall_destroy(const struct nft_ctx *ctx,
}
}
+static void nft_set_put(struct nft_set *set)
+{
+ if (refcount_dec_and_test(&set->refs)) {
+ kfree(set->name);
+ kvfree(set);
+ }
+}
+
static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
{
int i;
@@ -5134,8 +5144,7 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
set->ops->destroy(ctx, set);
nft_set_catchall_destroy(ctx, set);
- kfree(set->name);
- kvfree(set);
+ nft_set_put(set);
}
static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info,
@@ -6278,7 +6287,8 @@ struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
ext = nft_set_elem_ext(set, catchall->elem);
if (nft_set_elem_active(ext, genmask) &&
- !nft_set_elem_expired(ext))
+ !nft_set_elem_expired(ext) &&
+ !nft_set_elem_is_dead(ext))
return ext;
}
@@ -6933,9 +6943,9 @@ static void nft_setelem_data_activate(const struct net *net,
nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use);
}
-static void nft_setelem_data_deactivate(const struct net *net,
- const struct nft_set *set,
- struct nft_set_elem *elem)
+void nft_setelem_data_deactivate(const struct net *net,
+ const struct nft_set *set,
+ struct nft_set_elem *elem)
{
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
@@ -9418,6 +9428,207 @@ void nft_chain_del(struct nft_chain *chain)
list_del_rcu(&chain->list);
}
+static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx,
+ struct nft_trans_gc *trans)
+{
+ void **priv = trans->priv;
+ unsigned int i;
+
+ for (i = 0; i < trans->count; i++) {
+ struct nft_set_elem elem = {
+ .priv = priv[i],
+ };
+
+ nft_setelem_data_deactivate(ctx->net, trans->set, &elem);
+ nft_setelem_remove(ctx->net, trans->set, &elem);
+ }
+}
+
+void nft_trans_gc_destroy(struct nft_trans_gc *trans)
+{
+ nft_set_put(trans->set);
+ put_net(trans->net);
+ kfree(trans);
+}
+
+static void nft_trans_gc_trans_free(struct rcu_head *rcu)
+{
+ struct nft_set_elem elem = {};
+ struct nft_trans_gc *trans;
+ struct nft_ctx ctx = {};
+ unsigned int i;
+
+ trans = container_of(rcu, struct nft_trans_gc, rcu);
+ ctx.net = read_pnet(&trans->set->net);
+
+ for (i = 0; i < trans->count; i++) {
+ elem.priv = trans->priv[i];
+ if (!nft_setelem_is_catchall(trans->set, &elem))
+ atomic_dec(&trans->set->nelems);
+
+ nf_tables_set_elem_destroy(&ctx, trans->set, elem.priv);
+ }
+
+ nft_trans_gc_destroy(trans);
+}
+
+static bool nft_trans_gc_work_done(struct nft_trans_gc *trans)
+{
+ struct nftables_pernet *nft_net;
+ struct nft_ctx ctx = {};
+
+ nft_net = nft_pernet(trans->net);
+
+ mutex_lock(&nft_net->commit_mutex);
+
+ /* Check for race with transaction, otherwise this batch refers to
+ * stale objects that might not be there anymore. Skip transaction if
+ * set has been destroyed from control plane transaction in case gc
+ * worker loses race.
+ */
+ if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) {
+ mutex_unlock(&nft_net->commit_mutex);
+ return false;
+ }
+
+ ctx.net = trans->net;
+ ctx.table = trans->set->table;
+
+ nft_trans_gc_setelem_remove(&ctx, trans);
+ mutex_unlock(&nft_net->commit_mutex);
+
+ return true;
+}
+
+static void nft_trans_gc_work(struct work_struct *work)
+{
+ struct nft_trans_gc *trans, *next;
+ LIST_HEAD(trans_gc_list);
+
+ spin_lock(&nf_tables_destroy_list_lock);
+ list_splice_init(&nf_tables_gc_list, &trans_gc_list);
+ spin_unlock(&nf_tables_destroy_list_lock);
+
+ list_for_each_entry_safe(trans, next, &trans_gc_list, list) {
+ list_del(&trans->list);
+ if (!nft_trans_gc_work_done(trans)) {
+ nft_trans_gc_destroy(trans);
+ continue;
+ }
+ call_rcu(&trans->rcu, nft_trans_gc_trans_free);
+ }
+}
+
+struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
+ unsigned int gc_seq, gfp_t gfp)
+{
+ struct net *net = read_pnet(&set->net);
+ struct nft_trans_gc *trans;
+
+ trans = kzalloc(sizeof(*trans), gfp);
+ if (!trans)
+ return NULL;
+
+ refcount_inc(&set->refs);
+ trans->set = set;
+ trans->net = get_net(net);
+ trans->seq = gc_seq;
+
+ return trans;
+}
+
+void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv)
+{
+ trans->priv[trans->count++] = priv;
+}
+
+static void nft_trans_gc_queue_work(struct nft_trans_gc *trans)
+{
+ spin_lock(&nf_tables_gc_list_lock);
+ list_add_tail(&trans->list, &nf_tables_gc_list);
+ spin_unlock(&nf_tables_gc_list_lock);
+
+ schedule_work(&trans_gc_work);
+}
+
+static int nft_trans_gc_space(struct nft_trans_gc *trans)
+{
+ return NFT_TRANS_GC_BATCHCOUNT - trans->count;
+}
+
+struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
+ unsigned int gc_seq, gfp_t gfp)
+{
+ if (nft_trans_gc_space(gc))
+ return gc;
+
+ nft_trans_gc_queue_work(gc);
+
+ return nft_trans_gc_alloc(gc->set, gc_seq, gfp);
+}
+
+void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans)
+{
+ if (trans->count == 0) {
+ nft_trans_gc_destroy(trans);
+ return;
+ }
+
+ nft_trans_gc_queue_work(trans);
+}
+
+struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp)
+{
+ if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)))
+ return NULL;
+
+ if (nft_trans_gc_space(gc))
+ return gc;
+
+ call_rcu(&gc->rcu, nft_trans_gc_trans_free);
+
+ return nft_trans_gc_alloc(gc->set, 0, gfp);
+}
+
+void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans)
+{
+ WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net));
+
+ if (trans->count == 0) {
+ nft_trans_gc_destroy(trans);
+ return;
+ }
+
+ call_rcu(&trans->rcu, nft_trans_gc_trans_free);
+}
+
+struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc,
+ unsigned int gc_seq)
+{
+ struct nft_set_elem_catchall *catchall;
+ const struct nft_set *set = gc->set;
+ struct nft_set_ext *ext;
+
+ list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
+ ext = nft_set_elem_ext(set, catchall->elem);
+
+ if (!nft_set_elem_expired(ext))
+ continue;
+ if (nft_set_elem_is_dead(ext))
+ goto dead_elem;
+
+ nft_set_elem_dead(ext);
+dead_elem:
+ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+ if (!gc)
+ return NULL;
+
+ nft_trans_gc_elem_add(gc, catchall->elem);
+ }
+
+ return gc;
+}
+
static void nf_tables_module_autoload_cleanup(struct net *net)
{
struct nftables_pernet *nft_net = nft_pernet(net);
@@ -9580,11 +9791,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
{
struct nftables_pernet *nft_net = nft_pernet(net);
struct nft_trans *trans, *next;
+ unsigned int base_seq, gc_seq;
LIST_HEAD(set_update_list);
struct nft_trans_elem *te;
struct nft_chain *chain;
struct nft_table *table;
- unsigned int base_seq;
LIST_HEAD(adl);
int err;
@@ -9661,6 +9872,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
WRITE_ONCE(nft_net->base_seq, base_seq);
+ /* Bump gc counter, it becomes odd, this is the busy mark. */
+ gc_seq = READ_ONCE(nft_net->gc_seq);
+ WRITE_ONCE(nft_net->gc_seq, ++gc_seq);
+
/* step 3. Start new generation, rules_gen_X now in use. */
net->nft.gencursor = nft_gencursor_next(net);
@@ -9768,6 +9983,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
break;
case NFT_MSG_DELSET:
case NFT_MSG_DESTROYSET:
+ nft_trans_set(trans)->dead = 1;
list_del_rcu(&nft_trans_set(trans)->list);
nf_tables_set_notify(&trans->ctx, nft_trans_set(trans),
trans->msg_type, GFP_KERNEL);
@@ -9870,6 +10086,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nft_commit_notify(net, NETLINK_CB(skb).portid);
nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
nf_tables_commit_audit_log(&adl, nft_net->base_seq);
+
+ WRITE_ONCE(nft_net->gc_seq, ++gc_seq);
nf_tables_commit_release(net);
return 0;
@@ -10919,6 +11137,7 @@ static int __net_init nf_tables_init_net(struct net *net)
INIT_LIST_HEAD(&nft_net->notify_list);
mutex_init(&nft_net->commit_mutex);
nft_net->base_seq = 1;
+ nft_net->gc_seq = 0;
return 0;
}
@@ -10947,10 +11166,16 @@ static void __net_exit nf_tables_exit_net(struct net *net)
WARN_ON_ONCE(!list_empty(&nft_net->notify_list));
}
+static void nf_tables_exit_batch(struct list_head *net_exit_list)
+{
+ flush_work(&trans_gc_work);
+}
+
static struct pernet_operations nf_tables_net_ops = {
.init = nf_tables_init_net,
.pre_exit = nf_tables_pre_exit_net,
.exit = nf_tables_exit_net,
+ .exit_batch = nf_tables_exit_batch,
.id = &nf_tables_net_id,
.size = sizeof(struct nftables_pernet),
};
@@ -11022,6 +11247,7 @@ static void __exit nf_tables_module_exit(void)
nft_chain_filter_fini();
nft_chain_route_fini();
unregister_pernet_subsys(&nf_tables_net_ops);
+ cancel_work_sync(&trans_gc_work);
cancel_work_sync(&trans_destroy_work);
rcu_barrier();
rhltable_destroy(&nft_objname_ht);
--
2.30.2
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH nf 3/5] netfilter: nf_tables: adapt set backend to use GC transaction API
2023-08-09 13:15 [PATCH nf 1/5] netfilter: nf_tables: don't skip expired elements during walk Pablo Neira Ayuso
2023-08-09 13:15 ` [PATCH nf 2/5] netfilter: nf_tables: GC transaction API to avoid race with control plane Pablo Neira Ayuso
@ 2023-08-09 13:15 ` Pablo Neira Ayuso
2023-08-09 15:32 ` kernel test robot
2023-08-09 13:15 ` [PATCH nf 4/5] netfilter: nft_set_hash: mark set element as dead when deleting from packet path Pablo Neira Ayuso
2023-08-09 13:15 ` [PATCH nf 5/5] netfilter: nf_tables: remove busy mark and gc batch API Pablo Neira Ayuso
3 siblings, 1 reply; 6+ messages in thread
From: Pablo Neira Ayuso @ 2023-08-09 13:15 UTC (permalink / raw)
To: netfilter-devel
Use the GC transaction API to replace the old and buggy gc API and the
busy mark approach.
No set elements are removed from async garbage collection anymore,
instead the _DEAD bit is set on so the set element is not visible from
lookup path anymore. Async GC enqueues transaction work that might be
aborted and retried later.
rbtree and pipapo set backends does not set on the _DEAD bit from the
sync GC path since this runs in control plane path where mutex is held.
In this case, set elements are deactivated, removed and then released
via RCU callback, sync GC never fails.
Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges")
Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support")
Fixes: 9d0982927e79 ("netfilter: nft_hash: add support for timeouts")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_tables_api.c | 7 +-
net/netfilter/nft_set_hash.c | 77 +++++++++++-------
net/netfilter/nft_set_pipapo.c | 48 ++++++++---
net/netfilter/nft_set_rbtree.c | 142 ++++++++++++++++++++-------------
4 files changed, 173 insertions(+), 101 deletions(-)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index c28bacb9479b..fd4b5da7ac3c 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -6380,7 +6380,6 @@ static void nft_setelem_activate(struct net *net, struct nft_set *set,
if (nft_setelem_is_catchall(set, elem)) {
nft_set_elem_change_active(net, set, ext);
- nft_set_elem_clear_busy(ext);
} else {
set->ops->activate(net, set, elem);
}
@@ -6395,8 +6394,7 @@ static int nft_setelem_catchall_deactivate(const struct net *net,
list_for_each_entry(catchall, &set->catchall_list, list) {
ext = nft_set_elem_ext(set, catchall->elem);
- if (!nft_is_active(net, ext) ||
- nft_set_elem_mark_busy(ext))
+ if (!nft_is_active(net, ext))
continue;
kfree(elem->priv);
@@ -7109,8 +7107,7 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx,
list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
ext = nft_set_elem_ext(set, catchall->elem);
- if (!nft_set_elem_active(ext, genmask) ||
- nft_set_elem_mark_busy(ext))
+ if (!nft_set_elem_active(ext, genmask))
continue;
elem.priv = catchall->elem;
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 24caa31fa231..2f067e4596b0 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -59,6 +59,8 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg,
if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen))
return 1;
+ if (nft_set_elem_is_dead(&he->ext))
+ return 1;
if (nft_set_elem_expired(&he->ext))
return 1;
if (!nft_set_elem_active(&he->ext, x->genmask))
@@ -188,7 +190,6 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set,
struct nft_rhash_elem *he = elem->priv;
nft_set_elem_change_active(net, set, &he->ext);
- nft_set_elem_clear_busy(&he->ext);
}
static bool nft_rhash_flush(const struct net *net,
@@ -196,12 +197,9 @@ static bool nft_rhash_flush(const struct net *net,
{
struct nft_rhash_elem *he = priv;
- if (!nft_set_elem_mark_busy(&he->ext) ||
- !nft_is_active(net, &he->ext)) {
- nft_set_elem_change_active(net, set, &he->ext);
- return true;
- }
- return false;
+ nft_set_elem_change_active(net, set, &he->ext);
+
+ return true;
}
static void *nft_rhash_deactivate(const struct net *net,
@@ -218,9 +216,8 @@ static void *nft_rhash_deactivate(const struct net *net,
rcu_read_lock();
he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
- if (he != NULL &&
- !nft_rhash_flush(net, set, he))
- he = NULL;
+ if (he)
+ nft_set_elem_change_active(net, set, &he->ext);
rcu_read_unlock();
@@ -312,25 +309,48 @@ static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set,
static void nft_rhash_gc(struct work_struct *work)
{
+ struct nftables_pernet *nft_net;
struct nft_set *set;
struct nft_rhash_elem *he;
struct nft_rhash *priv;
- struct nft_set_gc_batch *gcb = NULL;
struct rhashtable_iter hti;
+ struct nft_trans_gc *gc;
+ struct net *net;
+ u32 gc_seq;
priv = container_of(work, struct nft_rhash, gc_work.work);
set = nft_set_container_of(priv);
+ net = read_pnet(&set->net);
+ nft_net = nft_pernet(net);
+ gc_seq = READ_ONCE(nft_net->gc_seq);
+
+ gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
+ if (!gc)
+ goto done;
rhashtable_walk_enter(&priv->ht, &hti);
rhashtable_walk_start(&hti);
while ((he = rhashtable_walk_next(&hti))) {
if (IS_ERR(he)) {
- if (PTR_ERR(he) != -EAGAIN)
- break;
+ if (PTR_ERR(he) != -EAGAIN) {
+ nft_trans_gc_destroy(gc);
+ gc = NULL;
+ goto try_later;
+ }
continue;
}
+ /* Ruleset has been updated, try later. */
+ if (READ_ONCE(nft_net->gc_seq) != gc_seq) {
+ nft_trans_gc_destroy(gc);
+ gc = NULL;
+ goto try_later;
+ }
+
+ if (nft_set_elem_is_dead(&he->ext))
+ goto dead_elem;
+
if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPRESSIONS) &&
nft_rhash_expr_needs_gc_run(set, &he->ext))
goto needs_gc_run;
@@ -338,26 +358,26 @@ static void nft_rhash_gc(struct work_struct *work)
if (!nft_set_elem_expired(&he->ext))
continue;
needs_gc_run:
- if (nft_set_elem_mark_busy(&he->ext))
- continue;
+ nft_set_elem_dead(&he->ext);
+dead_elem:
+ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+ if (!gc)
+ goto try_later;
- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
- if (gcb == NULL)
- break;
- rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params);
- atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, he);
+ nft_trans_gc_elem_add(gc, he);
}
+
+ gc = nft_trans_gc_catchall(gc, gc_seq);
+
+try_later:
+ /* catchall list iteration requires rcu read side lock. */
rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti);
- he = nft_set_catchall_gc(set);
- if (he) {
- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
- if (gcb)
- nft_set_gc_batch_add(gcb, he);
- }
- nft_set_gc_batch_complete(gcb);
+ if (gc)
+ nft_trans_gc_queue_async_done(gc);
+
+done:
queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
nft_set_gc_interval(set));
}
@@ -420,7 +440,6 @@ static void nft_rhash_destroy(const struct nft_ctx *ctx,
};
cancel_delayed_work_sync(&priv->gc_work);
- rcu_barrier();
rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy,
(void *)&rhash_ctx);
}
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index d54784ea465b..a5b8301afe4a 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -1536,16 +1536,34 @@ static void pipapo_drop(struct nft_pipapo_match *m,
}
}
+static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set,
+ struct nft_pipapo_elem *e)
+
+{
+ struct nft_set_elem elem = {
+ .priv = e,
+ };
+
+ nft_setelem_data_deactivate(net, set, &elem);
+}
+
/**
* pipapo_gc() - Drop expired entries from set, destroy start and end elements
* @set: nftables API set representation
* @m: Matching data
*/
-static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
+static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m)
{
+ struct nft_set *set = (struct nft_set *) _set;
struct nft_pipapo *priv = nft_set_priv(set);
+ struct net *net = read_pnet(&set->net);
int rules_f0, first_rule = 0;
struct nft_pipapo_elem *e;
+ struct nft_trans_gc *gc;
+
+ gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
+ if (!gc)
+ return;
while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
@@ -1569,13 +1587,20 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
f--;
i--;
e = f->mt[rulemap[i].to].e;
- if (nft_set_elem_expired(&e->ext) &&
- !nft_set_elem_mark_busy(&e->ext)) {
+
+ /* synchronous gc never fails, there is no need to set on
+ * NFT_SET_ELEM_DEAD_BIT.
+ */
+ if (nft_set_elem_expired(&e->ext)) {
priv->dirty = true;
- pipapo_drop(m, rulemap);
- rcu_barrier();
- nft_set_elem_destroy(set, e, true);
+ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
+ if (!gc)
+ break;
+
+ nft_pipapo_gc_deactivate(net, set, e);
+ pipapo_drop(m, rulemap);
+ nft_trans_gc_elem_add(gc, e);
/* And check again current first rule, which is now the
* first we haven't checked.
@@ -1585,11 +1610,11 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
}
}
- e = nft_set_catchall_gc(set);
- if (e)
- nft_set_elem_destroy(set, e, true);
-
- priv->last_gc = jiffies;
+ gc = nft_trans_gc_catchall(gc, 0);
+ if (gc) {
+ nft_trans_gc_queue_sync_done(gc);
+ priv->last_gc = jiffies;
+ }
}
/**
@@ -1714,7 +1739,6 @@ static void nft_pipapo_activate(const struct net *net,
return;
nft_set_elem_change_active(net, set, &e->ext);
- nft_set_elem_clear_busy(&e->ext);
}
/**
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 39956e5341c9..6424b69e3baf 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -46,6 +46,12 @@ static int nft_rbtree_cmp(const struct nft_set *set,
set->klen);
}
+static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe)
+{
+ return nft_set_elem_expired(&rbe->ext) ||
+ nft_set_elem_is_dead(&rbe->ext);
+}
+
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
const u32 *key, const struct nft_set_ext **ext,
unsigned int seq)
@@ -80,7 +86,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
continue;
}
- if (nft_set_elem_expired(&rbe->ext))
+ if (nft_rbtree_elem_expired(rbe))
return false;
if (nft_rbtree_interval_end(rbe)) {
@@ -98,7 +104,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
nft_set_elem_active(&interval->ext, genmask) &&
- !nft_set_elem_expired(&interval->ext) &&
+ !nft_rbtree_elem_expired(interval) &&
nft_rbtree_interval_start(interval)) {
*ext = &interval->ext;
return true;
@@ -215,6 +221,18 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
return rbe;
}
+static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
+ struct nft_rbtree *priv,
+ struct nft_rbtree_elem *rbe)
+{
+ struct nft_set_elem elem = {
+ .priv = rbe,
+ };
+
+ nft_setelem_data_deactivate(net, set, &elem);
+ rb_erase(&rbe->node, &priv->root);
+}
+
static int nft_rbtree_gc_elem(const struct nft_set *__set,
struct nft_rbtree *priv,
struct nft_rbtree_elem *rbe,
@@ -222,11 +240,12 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
{
struct nft_set *set = (struct nft_set *)__set;
struct rb_node *prev = rb_prev(&rbe->node);
+ struct net *net = read_pnet(&set->net);
struct nft_rbtree_elem *rbe_prev;
- struct nft_set_gc_batch *gcb;
+ struct nft_trans_gc *gc;
- gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC);
- if (!gcb)
+ gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
+ if (!gc)
return -ENOMEM;
/* search for end interval coming before this element.
@@ -244,17 +263,28 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
if (prev) {
rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
+ nft_rbtree_gc_remove(net, set, priv, rbe_prev);
- rb_erase(&rbe_prev->node, &priv->root);
- atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, rbe_prev);
+ /* There is always room in this trans gc for this element,
+ * memory allocation never actually happens, hence, the warning
+ * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT,
+ * this is synchronous gc which never fails.
+ */
+ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
+ if (WARN_ON_ONCE(!gc))
+ return -ENOMEM;
+
+ nft_trans_gc_elem_add(gc, rbe_prev);
}
- rb_erase(&rbe->node, &priv->root);
- atomic_dec(&set->nelems);
+ nft_rbtree_gc_remove(net, set, priv, rbe);
+ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
+ if (WARN_ON_ONCE(!gc))
+ return -ENOMEM;
+
+ nft_trans_gc_elem_add(gc, rbe);
- nft_set_gc_batch_add(gcb, rbe);
- nft_set_gc_batch_complete(gcb);
+ nft_trans_gc_queue_sync_done(gc);
return 0;
}
@@ -482,7 +512,6 @@ static void nft_rbtree_activate(const struct net *net,
struct nft_rbtree_elem *rbe = elem->priv;
nft_set_elem_change_active(net, set, &rbe->ext);
- nft_set_elem_clear_busy(&rbe->ext);
}
static bool nft_rbtree_flush(const struct net *net,
@@ -490,12 +519,9 @@ static bool nft_rbtree_flush(const struct net *net,
{
struct nft_rbtree_elem *rbe = priv;
- if (!nft_set_elem_mark_busy(&rbe->ext) ||
- !nft_is_active(net, &rbe->ext)) {
- nft_set_elem_change_active(net, set, &rbe->ext);
- return true;
- }
- return false;
+ nft_set_elem_change_active(net, set, &rbe->ext);
+
+ return true;
}
static void *nft_rbtree_deactivate(const struct net *net,
@@ -570,11 +596,13 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
static void nft_rbtree_gc(struct work_struct *work)
{
- struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
- struct nft_set_gc_batch *gcb = NULL;
+ struct nft_rbtree_elem *rbe, *rbe_end = NULL;
+ struct nftables_pernet *nft_net;
struct nft_rbtree *priv;
+ struct nft_trans_gc *gc;
struct rb_node *node;
struct nft_set *set;
+ unsigned int gc_seq;
struct net *net;
u8 genmask;
@@ -582,14 +610,28 @@ static void nft_rbtree_gc(struct work_struct *work)
set = nft_set_container_of(priv);
net = read_pnet(&set->net);
genmask = nft_genmask_cur(net);
+ nft_net = nft_pernet(net);
+ gc_seq = READ_ONCE(nft_net->gc_seq);
+
+ gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
+ if (!gc)
+ goto done;
write_lock_bh(&priv->lock);
write_seqcount_begin(&priv->count);
for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
+
+ /* Ruleset has been updated, try later. */
+ if (READ_ONCE(nft_net->gc_seq) != gc_seq) {
+ nft_trans_gc_destroy(gc);
+ gc = NULL;
+ goto try_later;
+ }
+
rbe = rb_entry(node, struct nft_rbtree_elem, node);
- if (!nft_set_elem_active(&rbe->ext, genmask))
- continue;
+ if (nft_set_elem_is_dead(&rbe->ext))
+ goto dead_elem;
/* elements are reversed in the rbtree for historical reasons,
* from highest to lowest value, that is why end element is
@@ -602,46 +644,36 @@ static void nft_rbtree_gc(struct work_struct *work)
if (!nft_set_elem_expired(&rbe->ext))
continue;
- if (nft_set_elem_mark_busy(&rbe->ext)) {
- rbe_end = NULL;
+ nft_set_elem_dead(&rbe->ext);
+
+ if (!rbe_end)
continue;
- }
- if (rbe_prev) {
- rb_erase(&rbe_prev->node, &priv->root);
- rbe_prev = NULL;
- }
- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
- if (!gcb)
- break;
+ nft_set_elem_dead(&rbe_end->ext);
- atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, rbe);
- rbe_prev = rbe;
+ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+ if (!gc)
+ goto try_later;
- if (rbe_end) {
- atomic_dec(&set->nelems);
- nft_set_gc_batch_add(gcb, rbe_end);
- rb_erase(&rbe_end->node, &priv->root);
- rbe_end = NULL;
- }
- node = rb_next(node);
- if (!node)
- break;
+ nft_trans_gc_elem_add(gc, rbe_end);
+ rbe_end = NULL;
+dead_elem:
+ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
+ if (!gc)
+ goto try_later;
+
+ nft_trans_gc_elem_add(gc, rbe);
}
- if (rbe_prev)
- rb_erase(&rbe_prev->node, &priv->root);
+
+ gc = nft_trans_gc_catchall(gc, gc_seq);
+
+try_later:
write_seqcount_end(&priv->count);
write_unlock_bh(&priv->lock);
- rbe = nft_set_catchall_gc(set);
- if (rbe) {
- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
- if (gcb)
- nft_set_gc_batch_add(gcb, rbe);
- }
- nft_set_gc_batch_complete(gcb);
-
+ if (gc)
+ nft_trans_gc_queue_async_done(gc);
+done:
queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
nft_set_gc_interval(set));
}
--
2.30.2
^ permalink raw reply related [flat|nested] 6+ messages in thread* Re: [PATCH nf 3/5] netfilter: nf_tables: adapt set backend to use GC transaction API
2023-08-09 13:15 ` [PATCH nf 3/5] netfilter: nf_tables: adapt set backend to use GC transaction API Pablo Neira Ayuso
@ 2023-08-09 15:32 ` kernel test robot
0 siblings, 0 replies; 6+ messages in thread
From: kernel test robot @ 2023-08-09 15:32 UTC (permalink / raw)
To: Pablo Neira Ayuso, netfilter-devel; +Cc: oe-kbuild-all
Hi Pablo,
kernel test robot noticed the following build warnings:
[auto build test WARNING on linus/master]
[also build test WARNING on v6.5-rc5 next-20230809]
[cannot apply to nf/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Pablo-Neira-Ayuso/netfilter-nf_tables-GC-transaction-API-to-avoid-race-with-control-plane/20230809-211751
base: linus/master
patch link: https://lore.kernel.org/r/20230809131546.8598-3-pablo%40netfilter.org
patch subject: [PATCH nf 3/5] netfilter: nf_tables: adapt set backend to use GC transaction API
config: loongarch-allyesconfig (https://download.01.org/0day-ci/archive/20230809/202308092313.jSo4bmAR-lkp@intel.com/config)
compiler: loongarch64-linux-gcc (GCC) 12.3.0
reproduce: (https://download.01.org/0day-ci/archive/20230809/202308092313.jSo4bmAR-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202308092313.jSo4bmAR-lkp@intel.com/
All warnings (new ones prefixed by >>):
net/netfilter/nft_set_rbtree.c: In function 'nft_rbtree_gc':
>> net/netfilter/nft_set_rbtree.c:607:12: warning: variable 'genmask' set but not used [-Wunused-but-set-variable]
607 | u8 genmask;
| ^~~~~~~
vim +/genmask +607 net/netfilter/nft_set_rbtree.c
20a69341f2d00c net/netfilter/nft_rbtree.c Patrick McHardy 2013-10-11 596
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 597 static void nft_rbtree_gc(struct work_struct *work)
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 598 {
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 599 struct nft_rbtree_elem *rbe, *rbe_end = NULL;
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 600 struct nftables_pernet *nft_net;
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 601 struct nft_rbtree *priv;
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 602 struct nft_trans_gc *gc;
a13f814a67b12a net/netfilter/nft_set_rbtree.c Taehee Yoo 2018-08-30 603 struct rb_node *node;
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 604 struct nft_set *set;
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 605 unsigned int gc_seq;
5d235d6ce75c12 net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-01-14 606 struct net *net;
5d235d6ce75c12 net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-01-14 @607 u8 genmask;
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 608
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 609 priv = container_of(work, struct nft_rbtree, gc_work.work);
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 610 set = nft_set_container_of(priv);
5d235d6ce75c12 net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-01-14 611 net = read_pnet(&set->net);
5d235d6ce75c12 net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-01-14 612 genmask = nft_genmask_cur(net);
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 613 nft_net = nft_pernet(net);
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 614 gc_seq = READ_ONCE(nft_net->gc_seq);
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 615
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 616 gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 617 if (!gc)
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 618 goto done;
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 619
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 620 write_lock_bh(&priv->lock);
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 621 write_seqcount_begin(&priv->count);
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 622 for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 623
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 624 /* Ruleset has been updated, try later. */
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 625 if (READ_ONCE(nft_net->gc_seq) != gc_seq) {
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 626 nft_trans_gc_destroy(gc);
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 627 gc = NULL;
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 628 goto try_later;
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 629 }
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 630
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 631 rbe = rb_entry(node, struct nft_rbtree_elem, node);
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 632
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 633 if (nft_set_elem_is_dead(&rbe->ext))
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 634 goto dead_elem;
5d235d6ce75c12 net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-01-14 635
5d235d6ce75c12 net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-01-14 636 /* elements are reversed in the rbtree for historical reasons,
5d235d6ce75c12 net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-01-14 637 * from highest to lowest value, that is why end element is
5d235d6ce75c12 net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-01-14 638 * always visited before the start element.
5d235d6ce75c12 net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-01-14 639 */
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 640 if (nft_rbtree_interval_end(rbe)) {
a13f814a67b12a net/netfilter/nft_set_rbtree.c Taehee Yoo 2018-08-30 641 rbe_end = rbe;
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 642 continue;
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 643 }
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 644 if (!nft_set_elem_expired(&rbe->ext))
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 645 continue;
5d235d6ce75c12 net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-01-14 646
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 647 nft_set_elem_dead(&rbe->ext);
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 648
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 649 if (!rbe_end)
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 650 continue;
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 651
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 652 nft_set_elem_dead(&rbe_end->ext);
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 653
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 654 gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 655 if (!gc)
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 656 goto try_later;
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 657
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 658 nft_trans_gc_elem_add(gc, rbe_end);
a13f814a67b12a net/netfilter/nft_set_rbtree.c Taehee Yoo 2018-08-30 659 rbe_end = NULL;
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 660 dead_elem:
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 661 gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 662 if (!gc)
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 663 goto try_later;
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 664
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 665 nft_trans_gc_elem_add(gc, rbe);
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 666 }
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 667
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 668 gc = nft_trans_gc_catchall(gc, gc_seq);
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 669
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 670 try_later:
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 671 write_seqcount_end(&priv->count);
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 672 write_unlock_bh(&priv->lock);
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 673
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 674 if (gc)
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 675 nft_trans_gc_queue_async_done(gc);
9cbd4eae38ba6c net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2023-08-09 676 done:
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 677 queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 678 nft_set_gc_interval(set));
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 679 }
8d8540c4f5e03d net/netfilter/nft_set_rbtree.c Pablo Neira Ayuso 2018-05-16 680
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 6+ messages in thread
* [PATCH nf 4/5] netfilter: nft_set_hash: mark set element as dead when deleting from packet path
2023-08-09 13:15 [PATCH nf 1/5] netfilter: nf_tables: don't skip expired elements during walk Pablo Neira Ayuso
2023-08-09 13:15 ` [PATCH nf 2/5] netfilter: nf_tables: GC transaction API to avoid race with control plane Pablo Neira Ayuso
2023-08-09 13:15 ` [PATCH nf 3/5] netfilter: nf_tables: adapt set backend to use GC transaction API Pablo Neira Ayuso
@ 2023-08-09 13:15 ` Pablo Neira Ayuso
2023-08-09 13:15 ` [PATCH nf 5/5] netfilter: nf_tables: remove busy mark and gc batch API Pablo Neira Ayuso
3 siblings, 0 replies; 6+ messages in thread
From: Pablo Neira Ayuso @ 2023-08-09 13:15 UTC (permalink / raw)
To: netfilter-devel
Set on the NFT_SET_ELEM_DEAD_BIT flag on this element, instead of
performing element removal which might race with an ongoing transaction.
Enable gc when dynamic flag is set on since dynset deletion requires
garbage collection after this patch.
Fixes: d0a8d877da97 ("netfilter: nft_dynset: support for element deletion")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nft_set_hash.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 2f067e4596b0..cef5df846000 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -249,7 +249,9 @@ static bool nft_rhash_delete(const struct nft_set *set,
if (he == NULL)
return false;
- return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0;
+ nft_set_elem_dead(&he->ext);
+
+ return true;
}
static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
@@ -412,7 +414,7 @@ static int nft_rhash_init(const struct nft_set *set,
return err;
INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc);
- if (set->flags & NFT_SET_TIMEOUT)
+ if (set->flags & (NFT_SET_TIMEOUT | NFT_SET_EVAL))
nft_rhash_gc_init(set);
return 0;
--
2.30.2
^ permalink raw reply related [flat|nested] 6+ messages in thread* [PATCH nf 5/5] netfilter: nf_tables: remove busy mark and gc batch API
2023-08-09 13:15 [PATCH nf 1/5] netfilter: nf_tables: don't skip expired elements during walk Pablo Neira Ayuso
` (2 preceding siblings ...)
2023-08-09 13:15 ` [PATCH nf 4/5] netfilter: nft_set_hash: mark set element as dead when deleting from packet path Pablo Neira Ayuso
@ 2023-08-09 13:15 ` Pablo Neira Ayuso
3 siblings, 0 replies; 6+ messages in thread
From: Pablo Neira Ayuso @ 2023-08-09 13:15 UTC (permalink / raw)
To: netfilter-devel
Ditch it, it has been replace it by the GC transaction API and it has no
clients anymore.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/net/netfilter/nf_tables.h | 98 +------------------------------
net/netfilter/nf_tables_api.c | 48 +--------------
2 files changed, 4 insertions(+), 142 deletions(-)
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 7256e9c80477..35870858ddf2 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -599,7 +599,6 @@ struct nft_set *nft_set_lookup_global(const struct net *net,
struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
const struct nft_set *set);
-void *nft_set_catchall_gc(const struct nft_set *set);
static inline unsigned long nft_set_gc_interval(const struct nft_set *set)
{
@@ -816,62 +815,6 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
const struct nft_set *set, void *elem);
-/**
- * struct nft_set_gc_batch_head - nf_tables set garbage collection batch
- *
- * @rcu: rcu head
- * @set: set the elements belong to
- * @cnt: count of elements
- */
-struct nft_set_gc_batch_head {
- struct rcu_head rcu;
- const struct nft_set *set;
- unsigned int cnt;
-};
-
-#define NFT_SET_GC_BATCH_SIZE ((PAGE_SIZE - \
- sizeof(struct nft_set_gc_batch_head)) / \
- sizeof(void *))
-
-/**
- * struct nft_set_gc_batch - nf_tables set garbage collection batch
- *
- * @head: GC batch head
- * @elems: garbage collection elements
- */
-struct nft_set_gc_batch {
- struct nft_set_gc_batch_head head;
- void *elems[NFT_SET_GC_BATCH_SIZE];
-};
-
-struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
- gfp_t gfp);
-void nft_set_gc_batch_release(struct rcu_head *rcu);
-
-static inline void nft_set_gc_batch_complete(struct nft_set_gc_batch *gcb)
-{
- if (gcb != NULL)
- call_rcu(&gcb->head.rcu, nft_set_gc_batch_release);
-}
-
-static inline struct nft_set_gc_batch *
-nft_set_gc_batch_check(const struct nft_set *set, struct nft_set_gc_batch *gcb,
- gfp_t gfp)
-{
- if (gcb != NULL) {
- if (gcb->head.cnt + 1 < ARRAY_SIZE(gcb->elems))
- return gcb;
- nft_set_gc_batch_complete(gcb);
- }
- return nft_set_gc_batch_alloc(set, gfp);
-}
-
-static inline void nft_set_gc_batch_add(struct nft_set_gc_batch *gcb,
- void *elem)
-{
- gcb->elems[gcb->head.cnt++] = elem;
-}
-
struct nft_expr_ops;
/**
* struct nft_expr_type - nf_tables expression type
@@ -1560,47 +1503,12 @@ static inline void nft_set_elem_change_active(const struct net *net,
#endif /* IS_ENABLED(CONFIG_NF_TABLES) */
-/*
- * We use a free bit in the genmask field to indicate the element
- * is busy, meaning it is currently being processed either by
- * the netlink API or GC.
- *
- * Even though the genmask is only a single byte wide, this works
- * because the extension structure if fully constant once initialized,
- * so there are no non-atomic write accesses unless it is already
- * marked busy.
- */
-#define NFT_SET_ELEM_BUSY_MASK (1 << 2)
-
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-#define NFT_SET_ELEM_BUSY_BIT 2
-#elif defined(__BIG_ENDIAN_BITFIELD)
-#define NFT_SET_ELEM_BUSY_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2)
-#else
-#error
-#endif
-
-static inline int nft_set_elem_mark_busy(struct nft_set_ext *ext)
-{
- unsigned long *word = (unsigned long *)ext;
-
- BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
- return test_and_set_bit(NFT_SET_ELEM_BUSY_BIT, word);
-}
-
-static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext)
-{
- unsigned long *word = (unsigned long *)ext;
-
- clear_bit(NFT_SET_ELEM_BUSY_BIT, word);
-}
-
-#define NFT_SET_ELEM_DEAD_MASK (1 << 3)
+#define NFT_SET_ELEM_DEAD_MASK (1 << 2)
#if defined(__LITTLE_ENDIAN_BITFIELD)
-#define NFT_SET_ELEM_DEAD_BIT 3
+#define NFT_SET_ELEM_DEAD_BIT 2
#elif defined(__BIG_ENDIAN_BITFIELD)
-#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 3)
+#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2)
#else
#error
#endif
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index fd4b5da7ac3c..c62227ae7746 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -6296,29 +6296,6 @@ struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
}
EXPORT_SYMBOL_GPL(nft_set_catchall_lookup);
-void *nft_set_catchall_gc(const struct nft_set *set)
-{
- struct nft_set_elem_catchall *catchall, *next;
- struct nft_set_ext *ext;
- void *elem = NULL;
-
- list_for_each_entry_safe(catchall, next, &set->catchall_list, list) {
- ext = nft_set_elem_ext(set, catchall->elem);
-
- if (!nft_set_elem_expired(ext) ||
- nft_set_elem_mark_busy(ext))
- continue;
-
- elem = catchall->elem;
- list_del_rcu(&catchall->list);
- kfree_rcu(catchall, rcu);
- break;
- }
-
- return elem;
-}
-EXPORT_SYMBOL_GPL(nft_set_catchall_gc);
-
static int nft_setelem_catchall_insert(const struct net *net,
struct nft_set *set,
const struct nft_set_elem *elem,
@@ -6789,7 +6766,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
goto err_elem_free;
}
- ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
+ ext->genmask = nft_genmask_cur(ctx->net);
err = nft_setelem_insert(ctx->net, set, &elem, &ext2, flags);
if (err) {
@@ -7181,29 +7158,6 @@ static int nf_tables_delsetelem(struct sk_buff *skb,
return err;
}
-void nft_set_gc_batch_release(struct rcu_head *rcu)
-{
- struct nft_set_gc_batch *gcb;
- unsigned int i;
-
- gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu);
- for (i = 0; i < gcb->head.cnt; i++)
- nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true);
- kfree(gcb);
-}
-
-struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
- gfp_t gfp)
-{
- struct nft_set_gc_batch *gcb;
-
- gcb = kzalloc(sizeof(*gcb), gfp);
- if (gcb == NULL)
- return gcb;
- gcb->head.set = set;
- return gcb;
-}
-
/*
* Stateful objects
*/
--
2.30.2
^ permalink raw reply related [flat|nested] 6+ messages in thread