* [PATCH 0/1] ipset performance regression in swap fix
@ 2024-01-16 16:29 Jozsef Kadlecsik
2024-01-16 16:29 ` [PATCH 1/1] netfilter: ipset: fix performance regression in swap operation Jozsef Kadlecsik
0 siblings, 1 reply; 4+ messages in thread
From: Jozsef Kadlecsik @ 2024-01-16 16:29 UTC (permalink / raw)
To: netfilter-devel; +Cc: Pablo Neira Ayuso, Ale Crismani, David Wang
Hi Pablo,
Please consider to apply the next patch to your nf tree. It should be applied
to all stable branches to which the patch "netfilter: ipset: fix race condition
between swap/destroy and kernel side add/del/test", commit 28628fa9 was added.
* The synchronize_rcu() call added to the swap function to prevent the race
condition makes it too slow. The race can be prevented by using call_rcu()
in the destroy function instead.
Best regards,
Jozsef
The following changes since commit ac631873c9e7a50d2a8de457cfc4b9f86666403e:
net: ethernet: cortina: Drop TSO support (2024-01-07 16:05:00 +0000)
are available in the Git repository at:
git://blackhole.kfki.hu/nf 9833c7d18f7065c2
for you to fetch changes up to 9833c7d18f7065c2ef31aae67973bcb198d761bc:
netfilter: ipset: fix performance regression in swap operation (2024-01-16 17:19:19 +0100)
----------------------------------------------------------------
Jozsef Kadlecsik (1):
netfilter: ipset: fix performance regression in swap operation
include/linux/netfilter/ipset/ip_set.h | 2 ++
net/netfilter/ipset/ip_set_core.c | 31 +++++++++++++++++++++++--------
2 files changed, 25 insertions(+), 8 deletions(-)
^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH 1/1] netfilter: ipset: fix performance regression in swap operation
2024-01-16 16:29 [PATCH 0/1] ipset performance regression in swap fix Jozsef Kadlecsik
@ 2024-01-16 16:29 ` Jozsef Kadlecsik
2024-01-17 12:44 ` Pablo Neira Ayuso
0 siblings, 1 reply; 4+ messages in thread
From: Jozsef Kadlecsik @ 2024-01-16 16:29 UTC (permalink / raw)
To: netfilter-devel; +Cc: Pablo Neira Ayuso, Ale Crismani, David Wang
The patch "netfilter: ipset: fix race condition between swap/destroy
and kernel side add/del/test", commit 28628fa9 fixes a race condition.
But the synchronize_rcu() added to the swap function unnecessarily slows
it down: it can safely be moved to destroy and use call_rcu() instead.
Thus we can get back the same performance and preventing the race condition
at the same time.
Link: https://lore.kernel.org/lkml/C0829B10-EAA6-4809-874E-E1E9C05A8D84@automattic.com/
Reported-by: Ale Crismani <ale.crismani@automattic.com>
Reported-by: David Wang <00107082@163.com
Tested-by: Ale Crismani <ale.crismani@automattic.com>
Tested-by: David Wang <00107082@163.com
Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
---
include/linux/netfilter/ipset/ip_set.h | 2 ++
net/netfilter/ipset/ip_set_core.c | 31 +++++++++++++++++++-------
2 files changed, 25 insertions(+), 8 deletions(-)
diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index e8c350a3ade1..912f750d0bea 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -242,6 +242,8 @@ extern void ip_set_type_unregister(struct ip_set_type *set_type);
/* A generic IP set */
struct ip_set {
+ /* For call_cru in destroy */
+ struct rcu_head rcu;
/* The name of the set */
char name[IPSET_MAXNAMELEN];
/* Lock protecting the set data */
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 4c133e06be1d..3bf9bb345809 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1182,6 +1182,14 @@ ip_set_destroy_set(struct ip_set *set)
kfree(set);
}
+static void
+ip_set_destroy_set_rcu(struct rcu_head *head)
+{
+ struct ip_set *set = container_of(head, struct ip_set, rcu);
+
+ ip_set_destroy_set(set);
+}
+
static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
const struct nlattr * const attr[])
{
@@ -1193,8 +1201,6 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
if (unlikely(protocol_min_failed(attr)))
return -IPSET_ERR_PROTOCOL;
- /* Must wait for flush to be really finished in list:set */
- rcu_barrier();
/* Commands are serialized and references are
* protected by the ip_set_ref_lock.
@@ -1206,8 +1212,10 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
* counter, so if it's already zero, we can proceed
* without holding the lock.
*/
- read_lock_bh(&ip_set_ref_lock);
if (!attr[IPSET_ATTR_SETNAME]) {
+ /* Must wait for flush to be really finished in list:set */
+ rcu_barrier();
+ read_lock_bh(&ip_set_ref_lock);
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
if (s && (s->ref || s->ref_netlink)) {
@@ -1228,6 +1236,9 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
inst->is_destroyed = false;
} else {
u32 flags = flag_exist(info->nlh);
+ u16 features = 0;
+
+ read_lock_bh(&ip_set_ref_lock);
s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
&i);
if (!s) {
@@ -1238,10 +1249,14 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
ret = -IPSET_ERR_BUSY;
goto out;
}
+ features = s->type->features;
ip_set(inst, i) = NULL;
read_unlock_bh(&ip_set_ref_lock);
-
- ip_set_destroy_set(s);
+ if (features & IPSET_TYPE_NAME) {
+ /* Must wait for flush to be really finished */
+ rcu_barrier();
+ }
+ call_rcu(&s->rcu, ip_set_destroy_set_rcu);
}
return 0;
out:
@@ -1394,9 +1409,6 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info,
ip_set(inst, to_id) = from;
write_unlock_bh(&ip_set_ref_lock);
- /* Make sure all readers of the old set pointers are completed. */
- synchronize_rcu();
-
return 0;
}
@@ -2357,6 +2369,9 @@ ip_set_net_exit(struct net *net)
inst->is_deleted = true; /* flag for ip_set_nfnl_put */
+ /* Wait for call_rcu() in destroy */
+ rcu_barrier();
+
nfnl_lock(NFNL_SUBSYS_IPSET);
for (i = 0; i < inst->ip_set_max; i++) {
set = ip_set(inst, i);
--
2.39.2
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH 1/1] netfilter: ipset: fix performance regression in swap operation
2024-01-16 16:29 ` [PATCH 1/1] netfilter: ipset: fix performance regression in swap operation Jozsef Kadlecsik
@ 2024-01-17 12:44 ` Pablo Neira Ayuso
0 siblings, 0 replies; 4+ messages in thread
From: Pablo Neira Ayuso @ 2024-01-17 12:44 UTC (permalink / raw)
To: Jozsef Kadlecsik; +Cc: netfilter-devel, Ale Crismani, David Wang
On Tue, Jan 16, 2024 at 05:29:56PM +0100, Jozsef Kadlecsik wrote:
> The patch "netfilter: ipset: fix race condition between swap/destroy
> and kernel side add/del/test", commit 28628fa9 fixes a race condition.
> But the synchronize_rcu() added to the swap function unnecessarily slows
> it down: it can safely be moved to destroy and use call_rcu() instead.
> Thus we can get back the same performance and preventing the race condition
> at the same time.
Manually applied this to nf.git, thanks!
^ permalink raw reply [flat|nested] 4+ messages in thread
* [PATCH 1/1] netfilter: ipset: fix performance regression in swap operation
2024-01-29 9:57 [PATCH 0/1] ipset performance regression in swap fix Jozsef Kadlecsik
@ 2024-01-29 9:57 ` Jozsef Kadlecsik
0 siblings, 0 replies; 4+ messages in thread
From: Jozsef Kadlecsik @ 2024-01-29 9:57 UTC (permalink / raw)
To: netfilter-devel; +Cc: Pablo Neira Ayuso, Ale Crismani, David Wang, Eric Dumazet
The patch "netfilter: ipset: fix race condition between swap/destroy
and kernel side add/del/test", commit 28628fa9 fixes a race condition.
But the synchronize_rcu() added to the swap function unnecessarily slows
it down: it can safely be moved to destroy and use call_rcu() instead.
Eric Dumazet pointed out that simply calling the destroy functions as
rcu callback does not work: sets with timeout use garbage collectors
which need cancelling at destroy which can wait. Therefore the destroy
functions are split into two: cancelling garbage collectors safely at
executing the command received by netlink and moving the remaining
part only into the rcu callback.
Link: https://lore.kernel.org/lkml/C0829B10-EAA6-4809-874E-E1E9C05A8D84@automattic.com/
Fixes: 28628fa952fe ("netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test")
Reported-by: Ale Crismani <ale.crismani@automattic.com>
Reported-by: David Wang <00107082@163.com>
Tested-by: David Wang <00107082@163.com>
Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
---
include/linux/netfilter/ipset/ip_set.h | 4 +++
net/netfilter/ipset/ip_set_bitmap_gen.h | 14 ++++++++--
net/netfilter/ipset/ip_set_core.c | 37 +++++++++++++++++++------
net/netfilter/ipset/ip_set_hash_gen.h | 15 ++++++++--
net/netfilter/ipset/ip_set_list_set.c | 13 +++++++--
5 files changed, 65 insertions(+), 18 deletions(-)
diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index e8c350a3ade1..e9f4f845d760 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -186,6 +186,8 @@ struct ip_set_type_variant {
/* Return true if "b" set is the same as "a"
* according to the create set parameters */
bool (*same_set)(const struct ip_set *a, const struct ip_set *b);
+ /* Cancel ongoing garbage collectors before destroying the set*/
+ void (*cancel_gc)(struct ip_set *set);
/* Region-locking is used */
bool region_lock;
};
@@ -242,6 +244,8 @@ extern void ip_set_type_unregister(struct ip_set_type *set_type);
/* A generic IP set */
struct ip_set {
+ /* For call_cru in destroy */
+ struct rcu_head rcu;
/* The name of the set */
char name[IPSET_MAXNAMELEN];
/* Lock protecting the set data */
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 26ab0e9612d8..9523104a90da 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -28,6 +28,7 @@
#define mtype_del IPSET_TOKEN(MTYPE, _del)
#define mtype_list IPSET_TOKEN(MTYPE, _list)
#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
+#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc)
#define mtype MTYPE
#define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id)))
@@ -57,9 +58,6 @@ mtype_destroy(struct ip_set *set)
{
struct mtype *map = set->data;
- if (SET_WITH_TIMEOUT(set))
- del_timer_sync(&map->gc);
-
if (set->dsize && set->extensions & IPSET_EXT_DESTROY)
mtype_ext_cleanup(set);
ip_set_free(map->members);
@@ -288,6 +286,15 @@ mtype_gc(struct timer_list *t)
add_timer(&map->gc);
}
+static void
+mtype_cancel_gc(struct ip_set *set)
+{
+ struct mtype *map = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ del_timer_sync(&map->gc);
+}
+
static const struct ip_set_type_variant mtype = {
.kadt = mtype_kadt,
.uadt = mtype_uadt,
@@ -301,6 +308,7 @@ static const struct ip_set_type_variant mtype = {
.head = mtype_head,
.list = mtype_list,
.same_set = mtype_same_set,
+ .cancel_gc = mtype_cancel_gc,
};
#endif /* __IP_SET_BITMAP_IP_GEN_H */
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 4c133e06be1d..bcaad9c009fe 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1182,6 +1182,14 @@ ip_set_destroy_set(struct ip_set *set)
kfree(set);
}
+static void
+ip_set_destroy_set_rcu(struct rcu_head *head)
+{
+ struct ip_set *set = container_of(head, struct ip_set, rcu);
+
+ ip_set_destroy_set(set);
+}
+
static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
const struct nlattr * const attr[])
{
@@ -1193,8 +1201,6 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
if (unlikely(protocol_min_failed(attr)))
return -IPSET_ERR_PROTOCOL;
- /* Must wait for flush to be really finished in list:set */
- rcu_barrier();
/* Commands are serialized and references are
* protected by the ip_set_ref_lock.
@@ -1206,8 +1212,10 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
* counter, so if it's already zero, we can proceed
* without holding the lock.
*/
- read_lock_bh(&ip_set_ref_lock);
if (!attr[IPSET_ATTR_SETNAME]) {
+ /* Must wait for flush to be really finished in list:set */
+ rcu_barrier();
+ read_lock_bh(&ip_set_ref_lock);
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
if (s && (s->ref || s->ref_netlink)) {
@@ -1221,6 +1229,8 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
s = ip_set(inst, i);
if (s) {
ip_set(inst, i) = NULL;
+ /* Must cancel garbage collectors */
+ s->variant->cancel_gc(s);
ip_set_destroy_set(s);
}
}
@@ -1228,6 +1238,9 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
inst->is_destroyed = false;
} else {
u32 flags = flag_exist(info->nlh);
+ u16 features = 0;
+
+ read_lock_bh(&ip_set_ref_lock);
s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
&i);
if (!s) {
@@ -1238,10 +1251,16 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
ret = -IPSET_ERR_BUSY;
goto out;
}
+ features = s->type->features;
ip_set(inst, i) = NULL;
read_unlock_bh(&ip_set_ref_lock);
-
- ip_set_destroy_set(s);
+ if (features & IPSET_TYPE_NAME) {
+ /* Must wait for flush to be really finished */
+ rcu_barrier();
+ }
+ /* Must cancel garbage collectors */
+ s->variant->cancel_gc(s);
+ call_rcu(&s->rcu, ip_set_destroy_set_rcu);
}
return 0;
out:
@@ -1394,9 +1413,6 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info,
ip_set(inst, to_id) = from;
write_unlock_bh(&ip_set_ref_lock);
- /* Make sure all readers of the old set pointers are completed. */
- synchronize_rcu();
-
return 0;
}
@@ -2409,8 +2425,11 @@ ip_set_fini(void)
{
nf_unregister_sockopt(&so_set);
nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
-
unregister_pernet_subsys(&ip_set_net_ops);
+
+ /* Wait for call_rcu() in destroy */
+ rcu_barrier();
+
pr_debug("these are the famous last words\n");
}
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 7c2399541771..c62998b46f00 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -221,6 +221,7 @@ static const union nf_inet_addr zeromask = {};
#undef mtype_gc_do
#undef mtype_gc
#undef mtype_gc_init
+#undef mtype_cancel_gc
#undef mtype_variant
#undef mtype_data_match
@@ -265,6 +266,7 @@ static const union nf_inet_addr zeromask = {};
#define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do)
#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
+#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc)
#define mtype_variant IPSET_TOKEN(MTYPE, _variant)
#define mtype_data_match IPSET_TOKEN(MTYPE, _data_match)
@@ -449,9 +451,6 @@ mtype_destroy(struct ip_set *set)
struct htype *h = set->data;
struct list_head *l, *lt;
- if (SET_WITH_TIMEOUT(set))
- cancel_delayed_work_sync(&h->gc.dwork);
-
mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true);
list_for_each_safe(l, lt, &h->ad) {
list_del(l);
@@ -598,6 +597,15 @@ mtype_gc_init(struct htable_gc *gc)
queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ);
}
+static void
+mtype_cancel_gc(struct ip_set *set)
+{
+ struct htype *h = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ cancel_delayed_work_sync(&h->gc.dwork);
+}
+
static int
mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
struct ip_set_ext *mext, u32 flags);
@@ -1440,6 +1448,7 @@ static const struct ip_set_type_variant mtype_variant = {
.uref = mtype_uref,
.resize = mtype_resize,
.same_set = mtype_same_set,
+ .cancel_gc = mtype_cancel_gc,
.region_lock = true,
};
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index e162636525cf..6c3f28bc59b3 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -426,9 +426,6 @@ list_set_destroy(struct ip_set *set)
struct list_set *map = set->data;
struct set_elem *e, *n;
- if (SET_WITH_TIMEOUT(set))
- timer_shutdown_sync(&map->gc);
-
list_for_each_entry_safe(e, n, &map->members, list) {
list_del(&e->list);
ip_set_put_byindex(map->net, e->id);
@@ -545,6 +542,15 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b)
a->extensions == b->extensions;
}
+static void
+list_set_cancel_gc(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ timer_shutdown_sync(&map->gc);
+}
+
static const struct ip_set_type_variant set_variant = {
.kadt = list_set_kadt,
.uadt = list_set_uadt,
@@ -558,6 +564,7 @@ static const struct ip_set_type_variant set_variant = {
.head = list_set_head,
.list = list_set_list,
.same_set = list_set_same_set,
+ .cancel_gc = list_set_cancel_gc,
};
static void
--
2.39.2
^ permalink raw reply related [flat|nested] 4+ messages in thread
end of thread, other threads:[~2024-01-29 9:57 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-01-16 16:29 [PATCH 0/1] ipset performance regression in swap fix Jozsef Kadlecsik
2024-01-16 16:29 ` [PATCH 1/1] netfilter: ipset: fix performance regression in swap operation Jozsef Kadlecsik
2024-01-17 12:44 ` Pablo Neira Ayuso
-- strict thread matches above, loose matches on Subject: below --
2024-01-29 9:57 [PATCH 0/1] ipset performance regression in swap fix Jozsef Kadlecsik
2024-01-29 9:57 ` [PATCH 1/1] netfilter: ipset: fix performance regression in swap operation Jozsef Kadlecsik
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).