* [PATCH net 01/14] netfilter: nf_tables: reject invalid set policy
2024-01-17 16:00 [PATCH net 00/14] Netfilter fixes for net Pablo Neira Ayuso
@ 2024-01-17 16:00 ` Pablo Neira Ayuso
2024-01-17 16:00 ` [PATCH net 02/14] netfilter: nf_tables: validate .maxattr at expression registration Pablo Neira Ayuso
` (12 subsequent siblings)
13 siblings, 0 replies; 32+ messages in thread
From: Pablo Neira Ayuso @ 2024-01-17 16:00 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw
Report -EINVAL in case userspace provides a unsupported set backend
policy.
Fixes: c50b960ccc59 ("netfilter: nf_tables: implement proper set selection")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_tables_api.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 8438a8922e4a..a90a364f5be5 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5048,8 +5048,16 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
}
desc.policy = NFT_SET_POL_PERFORMANCE;
- if (nla[NFTA_SET_POLICY] != NULL)
+ if (nla[NFTA_SET_POLICY] != NULL) {
desc.policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY]));
+ switch (desc.policy) {
+ case NFT_SET_POL_PERFORMANCE:
+ case NFT_SET_POL_MEMORY:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ }
if (nla[NFTA_SET_DESC] != NULL) {
err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]);
--
2.30.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net 02/14] netfilter: nf_tables: validate .maxattr at expression registration
2024-01-17 16:00 [PATCH net 00/14] Netfilter fixes for net Pablo Neira Ayuso
2024-01-17 16:00 ` [PATCH net 01/14] netfilter: nf_tables: reject invalid set policy Pablo Neira Ayuso
@ 2024-01-17 16:00 ` Pablo Neira Ayuso
2024-01-17 16:00 ` [PATCH net 03/14] netfilter: nf_tables: bail out if stateful expression provides no .clone Pablo Neira Ayuso
` (11 subsequent siblings)
13 siblings, 0 replies; 32+ messages in thread
From: Pablo Neira Ayuso @ 2024-01-17 16:00 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw
struct nft_expr_info allows to store up to NFT_EXPR_MAXATTR (16)
attributes when parsing netlink attributes.
Rise a warning in case there is ever a nft expression whose .maxattr
goes beyond this number of expressions, in such case, struct nft_expr_info
needs to be updated.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_tables_api.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index a90a364f5be5..2548d7d56408 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2977,6 +2977,9 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info,
*/
int nft_register_expr(struct nft_expr_type *type)
{
+ if (WARN_ON_ONCE(type->maxattr > NFT_EXPR_MAXATTR))
+ return -ENOMEM;
+
nfnl_lock(NFNL_SUBSYS_NFTABLES);
if (type->family == NFPROTO_UNSPEC)
list_add_tail_rcu(&type->list, &nf_tables_expressions);
--
2.30.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net 03/14] netfilter: nf_tables: bail out if stateful expression provides no .clone
2024-01-17 16:00 [PATCH net 00/14] Netfilter fixes for net Pablo Neira Ayuso
2024-01-17 16:00 ` [PATCH net 01/14] netfilter: nf_tables: reject invalid set policy Pablo Neira Ayuso
2024-01-17 16:00 ` [PATCH net 02/14] netfilter: nf_tables: validate .maxattr at expression registration Pablo Neira Ayuso
@ 2024-01-17 16:00 ` Pablo Neira Ayuso
2024-01-17 16:00 ` [PATCH net 04/14] netfilter: nft_limit: do not ignore unsupported flags Pablo Neira Ayuso
` (10 subsequent siblings)
13 siblings, 0 replies; 32+ messages in thread
From: Pablo Neira Ayuso @ 2024-01-17 16:00 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw
All existing NFT_EXPR_STATEFUL provide a .clone interface, remove
fallback to copy content of stateful expression since this is never
exercised and bail out if .clone interface is not defined.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_tables_api.c | 15 +++++++--------
1 file changed, 7 insertions(+), 8 deletions(-)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 2548d7d56408..b3db97440db1 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3274,14 +3274,13 @@ int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src)
{
int err;
- if (src->ops->clone) {
- dst->ops = src->ops;
- err = src->ops->clone(dst, src);
- if (err < 0)
- return err;
- } else {
- memcpy(dst, src, src->ops->size);
- }
+ if (WARN_ON_ONCE(!src->ops->clone))
+ return -EINVAL;
+
+ dst->ops = src->ops;
+ err = src->ops->clone(dst, src);
+ if (err < 0)
+ return err;
__module_get(src->ops->type->owner);
--
2.30.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net 04/14] netfilter: nft_limit: do not ignore unsupported flags
2024-01-17 16:00 [PATCH net 00/14] Netfilter fixes for net Pablo Neira Ayuso
` (2 preceding siblings ...)
2024-01-17 16:00 ` [PATCH net 03/14] netfilter: nf_tables: bail out if stateful expression provides no .clone Pablo Neira Ayuso
@ 2024-01-17 16:00 ` Pablo Neira Ayuso
2024-01-17 16:00 ` [PATCH net 05/14] netfilter: nfnetlink_log: use proper helper for fetching physinif Pablo Neira Ayuso
` (9 subsequent siblings)
13 siblings, 0 replies; 32+ messages in thread
From: Pablo Neira Ayuso @ 2024-01-17 16:00 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw
Bail out if userspace provides unsupported flags, otherwise future
extensions to the limit expression will be silently ignored by the
kernel.
Fixes: c7862a5f0de5 ("netfilter: nft_limit: allow to invert matching criteria")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nft_limit.c | 19 ++++++++++++-------
1 file changed, 12 insertions(+), 7 deletions(-)
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index 145dc62c6247..79039afde34e 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -58,6 +58,7 @@ static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost)
static int nft_limit_init(struct nft_limit_priv *priv,
const struct nlattr * const tb[], bool pkts)
{
+ bool invert = false;
u64 unit, tokens;
if (tb[NFTA_LIMIT_RATE] == NULL ||
@@ -90,19 +91,23 @@ static int nft_limit_init(struct nft_limit_priv *priv,
priv->rate);
}
+ if (tb[NFTA_LIMIT_FLAGS]) {
+ u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS]));
+
+ if (flags & ~NFT_LIMIT_F_INV)
+ return -EOPNOTSUPP;
+
+ if (flags & NFT_LIMIT_F_INV)
+ invert = true;
+ }
+
priv->limit = kmalloc(sizeof(*priv->limit), GFP_KERNEL_ACCOUNT);
if (!priv->limit)
return -ENOMEM;
priv->limit->tokens = tokens;
priv->tokens_max = priv->limit->tokens;
-
- if (tb[NFTA_LIMIT_FLAGS]) {
- u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS]));
-
- if (flags & NFT_LIMIT_F_INV)
- priv->invert = true;
- }
+ priv->invert = invert;
priv->limit->last = ktime_get_ns();
spin_lock_init(&priv->limit->lock);
--
2.30.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net 05/14] netfilter: nfnetlink_log: use proper helper for fetching physinif
2024-01-17 16:00 [PATCH net 00/14] Netfilter fixes for net Pablo Neira Ayuso
` (3 preceding siblings ...)
2024-01-17 16:00 ` [PATCH net 04/14] netfilter: nft_limit: do not ignore unsupported flags Pablo Neira Ayuso
@ 2024-01-17 16:00 ` Pablo Neira Ayuso
2024-01-17 16:00 ` [PATCH net 06/14] netfilter: nf_queue: remove excess nf_bridge variable Pablo Neira Ayuso
` (8 subsequent siblings)
13 siblings, 0 replies; 32+ messages in thread
From: Pablo Neira Ayuso @ 2024-01-17 16:00 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw
From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
We don't use physindev in __build_packet_message except for getting
physinif from it. So let's switch to nf_bridge_get_physinif to get what
we want directly.
Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nfnetlink_log.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index f03f4d4d7d88..134e05d31061 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -508,7 +508,7 @@ __build_packet_message(struct nfnl_log_net *log,
htonl(br_port_get_rcu(indev)->br->dev->ifindex)))
goto nla_put_failure;
} else {
- struct net_device *physindev;
+ int physinif;
/* Case 2: indev is bridge group, we need to look for
* physical device (when called from ipv4) */
@@ -516,10 +516,10 @@ __build_packet_message(struct nfnl_log_net *log,
htonl(indev->ifindex)))
goto nla_put_failure;
- physindev = nf_bridge_get_physindev(skb);
- if (physindev &&
+ physinif = nf_bridge_get_physinif(skb);
+ if (physinif &&
nla_put_be32(inst->skb, NFULA_IFINDEX_PHYSINDEV,
- htonl(physindev->ifindex)))
+ htonl(physinif)))
goto nla_put_failure;
}
#endif
--
2.30.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net 06/14] netfilter: nf_queue: remove excess nf_bridge variable
2024-01-17 16:00 [PATCH net 00/14] Netfilter fixes for net Pablo Neira Ayuso
` (4 preceding siblings ...)
2024-01-17 16:00 ` [PATCH net 05/14] netfilter: nfnetlink_log: use proper helper for fetching physinif Pablo Neira Ayuso
@ 2024-01-17 16:00 ` Pablo Neira Ayuso
2024-01-17 16:00 ` [PATCH net 07/14] netfilter: propagate net to nf_bridge_get_physindev Pablo Neira Ayuso
` (7 subsequent siblings)
13 siblings, 0 replies; 32+ messages in thread
From: Pablo Neira Ayuso @ 2024-01-17 16:00 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw
From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
We don't really need nf_bridge variable here. And nf_bridge_info_exists
is better replacement for nf_bridge_info_get in case we are only
checking for existence.
Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_queue.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 63d1516816b1..3dfcb3ac5cb4 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -82,10 +82,8 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry)
{
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
const struct sk_buff *skb = entry->skb;
- struct nf_bridge_info *nf_bridge;
- nf_bridge = nf_bridge_info_get(skb);
- if (nf_bridge) {
+ if (nf_bridge_info_exists(skb)) {
entry->physin = nf_bridge_get_physindev(skb);
entry->physout = nf_bridge_get_physoutdev(skb);
} else {
--
2.30.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net 07/14] netfilter: propagate net to nf_bridge_get_physindev
2024-01-17 16:00 [PATCH net 00/14] Netfilter fixes for net Pablo Neira Ayuso
` (5 preceding siblings ...)
2024-01-17 16:00 ` [PATCH net 06/14] netfilter: nf_queue: remove excess nf_bridge variable Pablo Neira Ayuso
@ 2024-01-17 16:00 ` Pablo Neira Ayuso
2024-01-17 16:00 ` [PATCH net 08/14] netfilter: bridge: replace physindev with physinif in nf_bridge_info Pablo Neira Ayuso
` (6 subsequent siblings)
13 siblings, 0 replies; 32+ messages in thread
From: Pablo Neira Ayuso @ 2024-01-17 16:00 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw
From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
This is a preparation patch for replacing physindev with physinif on
nf_bridge_info structure. We will use dev_get_by_index_rcu to resolve
device, when needed, and it requires net to be available.
Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/linux/netfilter_bridge.h | 2 +-
net/ipv4/netfilter/nf_reject_ipv4.c | 2 +-
net/ipv6/netfilter/nf_reject_ipv6.c | 2 +-
net/netfilter/ipset/ip_set_hash_netiface.c | 8 ++++----
net/netfilter/nf_log_syslog.c | 13 +++++++------
net/netfilter/nf_queue.c | 2 +-
net/netfilter/xt_physdev.c | 2 +-
7 files changed, 16 insertions(+), 15 deletions(-)
diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index f980edfdd278..e927b9a15a55 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -56,7 +56,7 @@ static inline int nf_bridge_get_physoutif(const struct sk_buff *skb)
}
static inline struct net_device *
-nf_bridge_get_physindev(const struct sk_buff *skb)
+nf_bridge_get_physindev(const struct sk_buff *skb, struct net *net)
{
const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index f01b038fc1cd..86e7d390671a 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -289,7 +289,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
* build the eth header using the original destination's MAC as the
* source, and send the RST packet directly.
*/
- br_indev = nf_bridge_get_physindev(oldskb);
+ br_indev = nf_bridge_get_physindev(oldskb, net);
if (br_indev) {
struct ethhdr *oeth = eth_hdr(oldskb);
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index d45bc54b7ea5..27b2164f4c43 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -354,7 +354,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
* build the eth header using the original destination's MAC as the
* source, and send the RST packet directly.
*/
- br_indev = nf_bridge_get_physindev(oldskb);
+ br_indev = nf_bridge_get_physindev(oldskb, net);
if (br_indev) {
struct ethhdr *oeth = eth_hdr(oldskb);
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index 95aeb31c60e0..30a655e5c4fd 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -138,9 +138,9 @@ hash_netiface4_data_next(struct hash_netiface4_elem *next,
#include "ip_set_hash_gen.h"
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-static const char *get_physindev_name(const struct sk_buff *skb)
+static const char *get_physindev_name(const struct sk_buff *skb, struct net *net)
{
- struct net_device *dev = nf_bridge_get_physindev(skb);
+ struct net_device *dev = nf_bridge_get_physindev(skb, net);
return dev ? dev->name : NULL;
}
@@ -177,7 +177,7 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- const char *eiface = SRCDIR ? get_physindev_name(skb) :
+ const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) :
get_physoutdev_name(skb);
if (!eiface)
@@ -395,7 +395,7 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- const char *eiface = SRCDIR ? get_physindev_name(skb) :
+ const char *eiface = SRCDIR ? get_physindev_name(skb, xt_net(par)) :
get_physoutdev_name(skb);
if (!eiface)
diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c
index c66689ad2b49..58402226045e 100644
--- a/net/netfilter/nf_log_syslog.c
+++ b/net/netfilter/nf_log_syslog.c
@@ -111,7 +111,8 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf,
unsigned int hooknum, const struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
- const struct nf_loginfo *loginfo, const char *prefix)
+ const struct nf_loginfo *loginfo, const char *prefix,
+ struct net *net)
{
const struct net_device *physoutdev __maybe_unused;
const struct net_device *physindev __maybe_unused;
@@ -121,7 +122,7 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u8 pf,
in ? in->name : "",
out ? out->name : "");
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- physindev = nf_bridge_get_physindev(skb);
+ physindev = nf_bridge_get_physindev(skb, net);
if (physindev && in != physindev)
nf_log_buf_add(m, "PHYSIN=%s ", physindev->name);
physoutdev = nf_bridge_get_physoutdev(skb);
@@ -148,7 +149,7 @@ static void nf_log_arp_packet(struct net *net, u_int8_t pf,
loginfo = &default_loginfo;
nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
- prefix);
+ prefix, net);
dump_arp_packet(m, loginfo, skb, skb_network_offset(skb));
nf_log_buf_close(m);
@@ -845,7 +846,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
loginfo = &default_loginfo;
nf_log_dump_packet_common(m, pf, hooknum, skb, in,
- out, loginfo, prefix);
+ out, loginfo, prefix, net);
if (in)
dump_mac_header(m, loginfo, skb);
@@ -880,7 +881,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
loginfo = &default_loginfo;
nf_log_dump_packet_common(m, pf, hooknum, skb, in, out,
- loginfo, prefix);
+ loginfo, prefix, net);
if (in)
dump_mac_header(m, loginfo, skb);
@@ -916,7 +917,7 @@ static void nf_log_unknown_packet(struct net *net, u_int8_t pf,
loginfo = &default_loginfo;
nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
- prefix);
+ prefix, net);
dump_mac_header(m, loginfo, skb);
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 3dfcb3ac5cb4..e2f334f70281 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -84,7 +84,7 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry)
const struct sk_buff *skb = entry->skb;
if (nf_bridge_info_exists(skb)) {
- entry->physin = nf_bridge_get_physindev(skb);
+ entry->physin = nf_bridge_get_physindev(skb, entry->state.net);
entry->physout = nf_bridge_get_physoutdev(skb);
} else {
entry->physin = NULL;
diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c
index ec6ed6fda96c..343e65f377d4 100644
--- a/net/netfilter/xt_physdev.c
+++ b/net/netfilter/xt_physdev.c
@@ -59,7 +59,7 @@ physdev_mt(const struct sk_buff *skb, struct xt_action_param *par)
(!!outdev ^ !(info->invert & XT_PHYSDEV_OP_BRIDGED)))
return false;
- physdev = nf_bridge_get_physindev(skb);
+ physdev = nf_bridge_get_physindev(skb, xt_net(par));
indev = physdev ? physdev->name : NULL;
if ((info->bitmask & XT_PHYSDEV_OP_ISIN &&
--
2.30.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net 08/14] netfilter: bridge: replace physindev with physinif in nf_bridge_info
2024-01-17 16:00 [PATCH net 00/14] Netfilter fixes for net Pablo Neira Ayuso
` (6 preceding siblings ...)
2024-01-17 16:00 ` [PATCH net 07/14] netfilter: propagate net to nf_bridge_get_physindev Pablo Neira Ayuso
@ 2024-01-17 16:00 ` Pablo Neira Ayuso
2024-01-17 16:00 ` [PATCH net 09/14] netfilter: nf_tables: check if catch-all set element is active in next generation Pablo Neira Ayuso
` (5 subsequent siblings)
13 siblings, 0 replies; 32+ messages in thread
From: Pablo Neira Ayuso @ 2024-01-17 16:00 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw
From: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
An skb can be added to a neigh->arp_queue while waiting for an arp
reply. Where original skb's skb->dev can be different to neigh's
neigh->dev. For instance in case of bridging dnated skb from one veth to
another, the skb would be added to a neigh->arp_queue of the bridge.
As skb->dev can be reset back to nf_bridge->physindev and used, and as
there is no explicit mechanism that prevents this physindev from been
freed under us (for instance neigh_flush_dev doesn't cleanup skbs from
different device's neigh queue) we can crash on e.g. this stack:
arp_process
neigh_update
skb = __skb_dequeue(&neigh->arp_queue)
neigh_resolve_output(..., skb)
...
br_nf_dev_xmit
br_nf_pre_routing_finish_bridge_slow
skb->dev = nf_bridge->physindev
br_handle_frame_finish
Let's use plain ifindex instead of net_device link. To peek into the
original net_device we will use dev_get_by_index_rcu(). Thus either we
get device and are safe to use it or we don't get it and drop skb.
Fixes: c4e70a87d975 ("netfilter: bridge: rename br_netfilter.c to br_netfilter_hooks.c")
Suggested-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/linux/netfilter_bridge.h | 4 +--
include/linux/skbuff.h | 2 +-
net/bridge/br_netfilter_hooks.c | 42 +++++++++++++++++++++++------
net/bridge/br_netfilter_ipv6.c | 14 +++++++---
net/ipv4/netfilter/nf_reject_ipv4.c | 9 ++++---
net/ipv6/netfilter/nf_reject_ipv6.c | 11 +++++---
6 files changed, 61 insertions(+), 21 deletions(-)
diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h
index e927b9a15a55..743475ca7e9d 100644
--- a/include/linux/netfilter_bridge.h
+++ b/include/linux/netfilter_bridge.h
@@ -42,7 +42,7 @@ static inline int nf_bridge_get_physinif(const struct sk_buff *skb)
if (!nf_bridge)
return 0;
- return nf_bridge->physindev ? nf_bridge->physindev->ifindex : 0;
+ return nf_bridge->physinif;
}
static inline int nf_bridge_get_physoutif(const struct sk_buff *skb)
@@ -60,7 +60,7 @@ nf_bridge_get_physindev(const struct sk_buff *skb, struct net *net)
{
const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
- return nf_bridge ? nf_bridge->physindev : NULL;
+ return nf_bridge ? dev_get_by_index_rcu(net, nf_bridge->physinif) : NULL;
}
static inline struct net_device *
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a5ae952454c8..2dde34c29203 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -295,7 +295,7 @@ struct nf_bridge_info {
u8 bridged_dnat:1;
u8 sabotage_in_done:1;
__u16 frag_max_size;
- struct net_device *physindev;
+ int physinif;
/* always valid & non-NULL from FORWARD on, for physdev match */
struct net_device *physoutdev;
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 6adcb45bca75..ed1720890757 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -279,8 +279,17 @@ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_
if ((READ_ONCE(neigh->nud_state) & NUD_CONNECTED) &&
READ_ONCE(neigh->hh.hh_len)) {
+ struct net_device *br_indev;
+
+ br_indev = nf_bridge_get_physindev(skb, net);
+ if (!br_indev) {
+ neigh_release(neigh);
+ goto free_skb;
+ }
+
neigh_hh_bridge(&neigh->hh, skb);
- skb->dev = nf_bridge->physindev;
+ skb->dev = br_indev;
+
ret = br_handle_frame_finish(net, sk, skb);
} else {
/* the neighbour function below overwrites the complete
@@ -352,12 +361,18 @@ br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb,
*/
static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net_device *dev = skb->dev;
+ struct net_device *dev = skb->dev, *br_indev;
struct iphdr *iph = ip_hdr(skb);
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
struct rtable *rt;
int err;
+ br_indev = nf_bridge_get_physindev(skb, net);
+ if (!br_indev) {
+ kfree_skb(skb);
+ return 0;
+ }
+
nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
if (nf_bridge->pkt_otherhost) {
@@ -397,7 +412,7 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_
} else {
if (skb_dst(skb)->dev == dev) {
bridged_dnat:
- skb->dev = nf_bridge->physindev;
+ skb->dev = br_indev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
br_nf_hook_thresh(NF_BR_PRE_ROUTING,
@@ -410,7 +425,7 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_
skb->pkt_type = PACKET_HOST;
}
} else {
- rt = bridge_parent_rtable(nf_bridge->physindev);
+ rt = bridge_parent_rtable(br_indev);
if (!rt) {
kfree_skb(skb);
return 0;
@@ -419,7 +434,7 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_
skb_dst_set_noref(skb, &rt->dst);
}
- skb->dev = nf_bridge->physindev;
+ skb->dev = br_indev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL,
@@ -456,7 +471,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net)
}
nf_bridge->in_prerouting = 1;
- nf_bridge->physindev = skb->dev;
+ nf_bridge->physinif = skb->dev->ifindex;
skb->dev = brnf_get_logical_dev(skb, skb->dev, net);
if (skb->protocol == htons(ETH_P_8021Q))
@@ -553,7 +568,11 @@ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff
if (skb->protocol == htons(ETH_P_IPV6))
nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;
- in = nf_bridge->physindev;
+ in = nf_bridge_get_physindev(skb, net);
+ if (!in) {
+ kfree_skb(skb);
+ return 0;
+ }
if (nf_bridge->pkt_otherhost) {
skb->pkt_type = PACKET_OTHERHOST;
nf_bridge->pkt_otherhost = false;
@@ -899,6 +918,13 @@ static unsigned int ip_sabotage_in(void *priv,
static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
{
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ struct net_device *br_indev;
+
+ br_indev = nf_bridge_get_physindev(skb, dev_net(skb->dev));
+ if (!br_indev) {
+ kfree_skb(skb);
+ return;
+ }
skb_pull(skb, ETH_HLEN);
nf_bridge->bridged_dnat = 0;
@@ -908,7 +934,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN),
nf_bridge->neigh_header,
ETH_HLEN - ETH_ALEN);
- skb->dev = nf_bridge->physindev;
+ skb->dev = br_indev;
nf_bridge->physoutdev = NULL;
br_handle_frame_finish(dev_net(skb->dev), NULL, skb);
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 2e24a743f917..e0421eaa3abc 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -102,9 +102,15 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
{
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
struct rtable *rt;
- struct net_device *dev = skb->dev;
+ struct net_device *dev = skb->dev, *br_indev;
const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
+ br_indev = nf_bridge_get_physindev(skb, net);
+ if (!br_indev) {
+ kfree_skb(skb);
+ return 0;
+ }
+
nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;
if (nf_bridge->pkt_otherhost) {
@@ -122,7 +128,7 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
}
if (skb_dst(skb)->dev == dev) {
- skb->dev = nf_bridge->physindev;
+ skb->dev = br_indev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
br_nf_hook_thresh(NF_BR_PRE_ROUTING,
@@ -133,7 +139,7 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
skb->pkt_type = PACKET_HOST;
} else {
- rt = bridge_parent_rtable(nf_bridge->physindev);
+ rt = bridge_parent_rtable(br_indev);
if (!rt) {
kfree_skb(skb);
return 0;
@@ -142,7 +148,7 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
skb_dst_set_noref(skb, &rt->dst);
}
- skb->dev = nf_bridge->physindev;
+ skb->dev = br_indev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb,
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 86e7d390671a..04504b2b51df 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -239,7 +239,6 @@ static int nf_reject_fill_skb_dst(struct sk_buff *skb_in)
void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
int hook)
{
- struct net_device *br_indev __maybe_unused;
struct sk_buff *nskb;
struct iphdr *niph;
const struct tcphdr *oth;
@@ -289,9 +288,13 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
* build the eth header using the original destination's MAC as the
* source, and send the RST packet directly.
*/
- br_indev = nf_bridge_get_physindev(oldskb, net);
- if (br_indev) {
+ if (nf_bridge_info_exists(oldskb)) {
struct ethhdr *oeth = eth_hdr(oldskb);
+ struct net_device *br_indev;
+
+ br_indev = nf_bridge_get_physindev(oldskb, net);
+ if (!br_indev)
+ goto free_nskb;
nskb->dev = br_indev;
niph->tot_len = htons(nskb->len);
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index 27b2164f4c43..196dd4ecb5e2 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -278,7 +278,6 @@ static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in)
void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
int hook)
{
- struct net_device *br_indev __maybe_unused;
struct sk_buff *nskb;
struct tcphdr _otcph;
const struct tcphdr *otcph;
@@ -354,9 +353,15 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
* build the eth header using the original destination's MAC as the
* source, and send the RST packet directly.
*/
- br_indev = nf_bridge_get_physindev(oldskb, net);
- if (br_indev) {
+ if (nf_bridge_info_exists(oldskb)) {
struct ethhdr *oeth = eth_hdr(oldskb);
+ struct net_device *br_indev;
+
+ br_indev = nf_bridge_get_physindev(oldskb, net);
+ if (!br_indev) {
+ kfree_skb(nskb);
+ return;
+ }
nskb->dev = br_indev;
nskb->protocol = htons(ETH_P_IPV6);
--
2.30.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net 09/14] netfilter: nf_tables: check if catch-all set element is active in next generation
2024-01-17 16:00 [PATCH net 00/14] Netfilter fixes for net Pablo Neira Ayuso
` (7 preceding siblings ...)
2024-01-17 16:00 ` [PATCH net 08/14] netfilter: bridge: replace physindev with physinif in nf_bridge_info Pablo Neira Ayuso
@ 2024-01-17 16:00 ` Pablo Neira Ayuso
2024-01-17 16:00 ` [PATCH net 10/14] netfilter: nf_tables: do not allow mismatch field size and set key length Pablo Neira Ayuso
` (4 subsequent siblings)
13 siblings, 0 replies; 32+ messages in thread
From: Pablo Neira Ayuso @ 2024-01-17 16:00 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw
When deactivating the catch-all set element, check the state in the next
generation that represents this transaction.
This bug uncovered after the recent removal of the element busy mark
a2dd0233cbc4 ("netfilter: nf_tables: remove busy mark and gc batch API").
Fixes: aaa31047a6d2 ("netfilter: nftables: add catch-all set element support")
Cc: stable@vger.kernel.org
Reported-by: lonial con <kongln9170@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_tables_api.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index b3db97440db1..50b595ef6389 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -6578,7 +6578,7 @@ static int nft_setelem_catchall_deactivate(const struct net *net,
list_for_each_entry(catchall, &set->catchall_list, list) {
ext = nft_set_elem_ext(set, catchall->elem);
- if (!nft_is_active(net, ext))
+ if (!nft_is_active_next(net, ext))
continue;
kfree(elem->priv);
--
2.30.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net 10/14] netfilter: nf_tables: do not allow mismatch field size and set key length
2024-01-17 16:00 [PATCH net 00/14] Netfilter fixes for net Pablo Neira Ayuso
` (8 preceding siblings ...)
2024-01-17 16:00 ` [PATCH net 09/14] netfilter: nf_tables: check if catch-all set element is active in next generation Pablo Neira Ayuso
@ 2024-01-17 16:00 ` Pablo Neira Ayuso
2024-01-17 16:00 ` [PATCH net 11/14] netfilter: nf_tables: skip dead set elements in netlink dump Pablo Neira Ayuso
` (3 subsequent siblings)
13 siblings, 0 replies; 32+ messages in thread
From: Pablo Neira Ayuso @ 2024-01-17 16:00 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw
The set description provides the size of each field in the set whose sum
should not mismatch the set key length, bail out otherwise.
I did not manage to crash nft_set_pipapo with mismatch fields and set key
length so far, but this is UB which must be disallowed.
Fixes: f3a2181e16f1 ("netfilter: nf_tables: Support for sets with multiple ranged fields")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_tables_api.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 50b595ef6389..e9fa4a32c093 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4813,8 +4813,8 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr,
static int nft_set_desc_concat(struct nft_set_desc *desc,
const struct nlattr *nla)
{
+ u32 num_regs = 0, key_num_regs = 0;
struct nlattr *attr;
- u32 num_regs = 0;
int rem, err, i;
nla_for_each_nested(attr, nla, rem) {
@@ -4829,6 +4829,10 @@ static int nft_set_desc_concat(struct nft_set_desc *desc,
for (i = 0; i < desc->field_count; i++)
num_regs += DIV_ROUND_UP(desc->field_len[i], sizeof(u32));
+ key_num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32));
+ if (key_num_regs != num_regs)
+ return -EINVAL;
+
if (num_regs > NFT_REG32_COUNT)
return -E2BIG;
--
2.30.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net 11/14] netfilter: nf_tables: skip dead set elements in netlink dump
2024-01-17 16:00 [PATCH net 00/14] Netfilter fixes for net Pablo Neira Ayuso
` (9 preceding siblings ...)
2024-01-17 16:00 ` [PATCH net 10/14] netfilter: nf_tables: do not allow mismatch field size and set key length Pablo Neira Ayuso
@ 2024-01-17 16:00 ` Pablo Neira Ayuso
2024-01-17 16:00 ` [PATCH net 12/14] netfilter: nf_tables: reject NFT_SET_CONCAT with not field length description Pablo Neira Ayuso
` (2 subsequent siblings)
13 siblings, 0 replies; 32+ messages in thread
From: Pablo Neira Ayuso @ 2024-01-17 16:00 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw
Delete from packet path relies on the garbage collector to purge
elements with NFT_SET_ELEM_DEAD_BIT on.
Skip these dead elements from nf_tables_dump_setelem() path, I very
rarely see tests/shell/testcases/maps/typeof_maps_add_delete reports
[DUMP FAILED] showing a mismatch in the expected output with an element
that should not be there.
If the netlink dump happens before GC worker run, it might show dead
elements in the ruleset listing.
nft_rhash_get() already skips dead elements in nft_rhash_cmp(),
therefore, it already does not show the element when getting a single
element via netlink control plane.
Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_tables_api.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index e9fa4a32c093..88a6a858383b 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5718,7 +5718,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
struct nft_set_dump_args *args;
- if (nft_set_elem_expired(ext))
+ if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext))
return 0;
args = container_of(iter, struct nft_set_dump_args, iter);
--
2.30.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net 12/14] netfilter: nf_tables: reject NFT_SET_CONCAT with not field length description
2024-01-17 16:00 [PATCH net 00/14] Netfilter fixes for net Pablo Neira Ayuso
` (10 preceding siblings ...)
2024-01-17 16:00 ` [PATCH net 11/14] netfilter: nf_tables: skip dead set elements in netlink dump Pablo Neira Ayuso
@ 2024-01-17 16:00 ` Pablo Neira Ayuso
2024-01-17 16:00 ` [PATCH net 13/14] ipvs: avoid stat macros calls from preemptible context Pablo Neira Ayuso
2024-01-17 16:00 ` [PATCH net 14/14] netfilter: ipset: fix performance regression in swap operation Pablo Neira Ayuso
13 siblings, 0 replies; 32+ messages in thread
From: Pablo Neira Ayuso @ 2024-01-17 16:00 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw
It is still possible to set on the NFT_SET_CONCAT flag by specifying a
set size and no field description, report EINVAL in such case.
Fixes: 1b6345d4160e ("netfilter: nf_tables: check NFT_SET_CONCAT flag if field_count is specified")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_tables_api.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 88a6a858383b..4b55533ce5ca 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5070,8 +5070,12 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
if (err < 0)
return err;
- if (desc.field_count > 1 && !(flags & NFT_SET_CONCAT))
+ if (desc.field_count > 1) {
+ if (!(flags & NFT_SET_CONCAT))
+ return -EINVAL;
+ } else if (flags & NFT_SET_CONCAT) {
return -EINVAL;
+ }
} else if (flags & NFT_SET_CONCAT) {
return -EINVAL;
}
--
2.30.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net 13/14] ipvs: avoid stat macros calls from preemptible context
2024-01-17 16:00 [PATCH net 00/14] Netfilter fixes for net Pablo Neira Ayuso
` (11 preceding siblings ...)
2024-01-17 16:00 ` [PATCH net 12/14] netfilter: nf_tables: reject NFT_SET_CONCAT with not field length description Pablo Neira Ayuso
@ 2024-01-17 16:00 ` Pablo Neira Ayuso
2024-01-17 16:00 ` [PATCH net 14/14] netfilter: ipset: fix performance regression in swap operation Pablo Neira Ayuso
13 siblings, 0 replies; 32+ messages in thread
From: Pablo Neira Ayuso @ 2024-01-17 16:00 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw
From: Fedor Pchelkin <pchelkin@ispras.ru>
Inside decrement_ttl() upon discovering that the packet ttl has exceeded,
__IP_INC_STATS and __IP6_INC_STATS macros can be called from preemptible
context having the following backtrace:
check_preemption_disabled: 48 callbacks suppressed
BUG: using __this_cpu_add() in preemptible [00000000] code: curl/1177
caller is decrement_ttl+0x217/0x830
CPU: 5 PID: 1177 Comm: curl Not tainted 6.7.0+ #34
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0xbd/0xe0
check_preemption_disabled+0xd1/0xe0
decrement_ttl+0x217/0x830
__ip_vs_get_out_rt+0x4e0/0x1ef0
ip_vs_nat_xmit+0x205/0xcd0
ip_vs_in_hook+0x9b1/0x26a0
nf_hook_slow+0xc2/0x210
nf_hook+0x1fb/0x770
__ip_local_out+0x33b/0x640
ip_local_out+0x2a/0x490
__ip_queue_xmit+0x990/0x1d10
__tcp_transmit_skb+0x288b/0x3d10
tcp_connect+0x3466/0x5180
tcp_v4_connect+0x1535/0x1bb0
__inet_stream_connect+0x40d/0x1040
inet_stream_connect+0x57/0xa0
__sys_connect_file+0x162/0x1a0
__sys_connect+0x137/0x160
__x64_sys_connect+0x72/0xb0
do_syscall_64+0x6f/0x140
entry_SYSCALL_64_after_hwframe+0x6e/0x76
RIP: 0033:0x7fe6dbbc34e0
Use the corresponding preemption-aware variants: IP_INC_STATS and
IP6_INC_STATS.
Found by Linux Verification Center (linuxtesting.org).
Fixes: 8d8e20e2d7bb ("ipvs: Decrement ttl")
Signed-off-by: Fedor Pchelkin <pchelkin@ispras.ru>
Acked-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Simon Horman <horms@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/ipvs/ip_vs_xmit.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 9193e109e6b3..65e0259178da 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -271,7 +271,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_TIME_EXCEED,
ICMPV6_EXC_HOPLIMIT, 0);
- __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
return false;
}
@@ -286,7 +286,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
{
if (ip_hdr(skb)->ttl <= 1) {
/* Tell the sender its packet died... */
- __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
+ IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
return false;
}
--
2.30.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* [PATCH net 14/14] netfilter: ipset: fix performance regression in swap operation
2024-01-17 16:00 [PATCH net 00/14] Netfilter fixes for net Pablo Neira Ayuso
` (12 preceding siblings ...)
2024-01-17 16:00 ` [PATCH net 13/14] ipvs: avoid stat macros calls from preemptible context Pablo Neira Ayuso
@ 2024-01-17 16:00 ` Pablo Neira Ayuso
2024-01-17 16:08 ` Eric Dumazet
2024-01-18 10:08 ` Eric Dumazet
13 siblings, 2 replies; 32+ messages in thread
From: Pablo Neira Ayuso @ 2024-01-17 16:00 UTC (permalink / raw)
To: netfilter-devel; +Cc: davem, netdev, kuba, pabeni, edumazet, fw
From: Jozsef Kadlecsik <kadlec@netfilter.org>
The patch "netfilter: ipset: fix race condition between swap/destroy
and kernel side add/del/test", commit 28628fa9 fixes a race condition.
But the synchronize_rcu() added to the swap function unnecessarily slows
it down: it can safely be moved to destroy and use call_rcu() instead.
Thus we can get back the same performance and preventing the race condition
at the same time.
Fixes: 28628fa952fe ("netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test")
Link: https://lore.kernel.org/lkml/C0829B10-EAA6-4809-874E-E1E9C05A8D84@automattic.com/
Reported-by: Ale Crismani <ale.crismani@automattic.com>
Reported-by: David Wang <00107082@163.com
Tested-by: Ale Crismani <ale.crismani@automattic.com>
Tested-by: David Wang <00107082@163.com
Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/linux/netfilter/ipset/ip_set.h | 2 ++
net/netfilter/ipset/ip_set_core.c | 31 +++++++++++++++++++-------
2 files changed, 25 insertions(+), 8 deletions(-)
diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index e8c350a3ade1..912f750d0bea 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -242,6 +242,8 @@ extern void ip_set_type_unregister(struct ip_set_type *set_type);
/* A generic IP set */
struct ip_set {
+ /* For call_cru in destroy */
+ struct rcu_head rcu;
/* The name of the set */
char name[IPSET_MAXNAMELEN];
/* Lock protecting the set data */
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 4c133e06be1d..3bf9bb345809 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1182,6 +1182,14 @@ ip_set_destroy_set(struct ip_set *set)
kfree(set);
}
+static void
+ip_set_destroy_set_rcu(struct rcu_head *head)
+{
+ struct ip_set *set = container_of(head, struct ip_set, rcu);
+
+ ip_set_destroy_set(set);
+}
+
static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
const struct nlattr * const attr[])
{
@@ -1193,8 +1201,6 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
if (unlikely(protocol_min_failed(attr)))
return -IPSET_ERR_PROTOCOL;
- /* Must wait for flush to be really finished in list:set */
- rcu_barrier();
/* Commands are serialized and references are
* protected by the ip_set_ref_lock.
@@ -1206,8 +1212,10 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
* counter, so if it's already zero, we can proceed
* without holding the lock.
*/
- read_lock_bh(&ip_set_ref_lock);
if (!attr[IPSET_ATTR_SETNAME]) {
+ /* Must wait for flush to be really finished in list:set */
+ rcu_barrier();
+ read_lock_bh(&ip_set_ref_lock);
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
if (s && (s->ref || s->ref_netlink)) {
@@ -1228,6 +1236,9 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
inst->is_destroyed = false;
} else {
u32 flags = flag_exist(info->nlh);
+ u16 features = 0;
+
+ read_lock_bh(&ip_set_ref_lock);
s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
&i);
if (!s) {
@@ -1238,10 +1249,14 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
ret = -IPSET_ERR_BUSY;
goto out;
}
+ features = s->type->features;
ip_set(inst, i) = NULL;
read_unlock_bh(&ip_set_ref_lock);
-
- ip_set_destroy_set(s);
+ if (features & IPSET_TYPE_NAME) {
+ /* Must wait for flush to be really finished */
+ rcu_barrier();
+ }
+ call_rcu(&s->rcu, ip_set_destroy_set_rcu);
}
return 0;
out:
@@ -1394,9 +1409,6 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info,
ip_set(inst, to_id) = from;
write_unlock_bh(&ip_set_ref_lock);
- /* Make sure all readers of the old set pointers are completed. */
- synchronize_rcu();
-
return 0;
}
@@ -2357,6 +2369,9 @@ ip_set_net_exit(struct net *net)
inst->is_deleted = true; /* flag for ip_set_nfnl_put */
+ /* Wait for call_rcu() in destroy */
+ rcu_barrier();
+
nfnl_lock(NFNL_SUBSYS_IPSET);
for (i = 0; i < inst->ip_set_max; i++) {
set = ip_set(inst, i);
--
2.30.2
^ permalink raw reply related [flat|nested] 32+ messages in thread
* Re: [PATCH net 14/14] netfilter: ipset: fix performance regression in swap operation
2024-01-17 16:00 ` [PATCH net 14/14] netfilter: ipset: fix performance regression in swap operation Pablo Neira Ayuso
@ 2024-01-17 16:08 ` Eric Dumazet
2024-01-17 16:23 ` Jozsef Kadlecsik
2024-01-18 10:08 ` Eric Dumazet
1 sibling, 1 reply; 32+ messages in thread
From: Eric Dumazet @ 2024-01-17 16:08 UTC (permalink / raw)
To: Pablo Neira Ayuso; +Cc: netfilter-devel, davem, netdev, kuba, pabeni, fw
On Wed, Jan 17, 2024 at 5:00 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
>
> From: Jozsef Kadlecsik <kadlec@netfilter.org>
>
> The patch "netfilter: ipset: fix race condition between swap/destroy
> and kernel side add/del/test", commit 28628fa9 fixes a race condition.
> But the synchronize_rcu() added to the swap function unnecessarily slows
> it down: it can safely be moved to destroy and use call_rcu() instead.
> Thus we can get back the same performance and preventing the race condition
> at the same time.
...
>
> @@ -2357,6 +2369,9 @@ ip_set_net_exit(struct net *net)
>
> inst->is_deleted = true; /* flag for ip_set_nfnl_put */
>
> + /* Wait for call_rcu() in destroy */
> + rcu_barrier();
> +
> nfnl_lock(NFNL_SUBSYS_IPSET);
> for (i = 0; i < inst->ip_set_max; i++) {
> set = ip_set(inst, i);
> --
> 2.30.2
>
If I am reading this right, time for netns dismantles will increase,
even for netns not using ipset
If there is no other option, please convert "struct pernet_operations
ip_set_net_ops".exit to an exit_batch() handler,
to at least have a factorized rcu_barrier();
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH net 14/14] netfilter: ipset: fix performance regression in swap operation
2024-01-17 16:08 ` Eric Dumazet
@ 2024-01-17 16:23 ` Jozsef Kadlecsik
2024-01-17 16:28 ` Eric Dumazet
0 siblings, 1 reply; 32+ messages in thread
From: Jozsef Kadlecsik @ 2024-01-17 16:23 UTC (permalink / raw)
To: Eric Dumazet
Cc: Pablo Neira Ayuso, netfilter-devel, davem, netdev, kuba, pabeni,
fw
[-- Attachment #1: Type: text/plain, Size: 1705 bytes --]
Hi,
On Wed, 17 Jan 2024, Eric Dumazet wrote:
> On Wed, Jan 17, 2024 at 5:00 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> >
> > From: Jozsef Kadlecsik <kadlec@netfilter.org>
> >
> > The patch "netfilter: ipset: fix race condition between swap/destroy
> > and kernel side add/del/test", commit 28628fa9 fixes a race condition.
> > But the synchronize_rcu() added to the swap function unnecessarily slows
> > it down: it can safely be moved to destroy and use call_rcu() instead.
> > Thus we can get back the same performance and preventing the race condition
> > at the same time.
>
> ...
>
> >
> > @@ -2357,6 +2369,9 @@ ip_set_net_exit(struct net *net)
> >
> > inst->is_deleted = true; /* flag for ip_set_nfnl_put */
> >
> > + /* Wait for call_rcu() in destroy */
> > + rcu_barrier();
> > +
> > nfnl_lock(NFNL_SUBSYS_IPSET);
> > for (i = 0; i < inst->ip_set_max; i++) {
> > set = ip_set(inst, i);
> > --
> > 2.30.2
> >
>
> If I am reading this right, time for netns dismantles will increase,
> even for netns not using ipset
>
> If there is no other option, please convert "struct pernet_operations
> ip_set_net_ops".exit to an exit_batch() handler,
> to at least have a factorized rcu_barrier();
You are right, the call to rcu_barrier() can safely be moved to
ip_set_fini(). I'm going to prepare a new version of the patch.
Thanks for catching it.
Best regards,
Jozsef
--
E-mail : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.hu
PGP key : https://wigner.hu/~kadlec/pgp_public_key.txt
Address : Wigner Research Centre for Physics
H-1525 Budapest 114, POB. 49, Hungary
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH net 14/14] netfilter: ipset: fix performance regression in swap operation
2024-01-17 16:23 ` Jozsef Kadlecsik
@ 2024-01-17 16:28 ` Eric Dumazet
2024-01-18 11:08 ` Paolo Abeni
0 siblings, 1 reply; 32+ messages in thread
From: Eric Dumazet @ 2024-01-17 16:28 UTC (permalink / raw)
To: Jozsef Kadlecsik
Cc: Pablo Neira Ayuso, netfilter-devel, davem, netdev, kuba, pabeni,
fw
On Wed, Jan 17, 2024 at 5:23 PM Jozsef Kadlecsik <kadlec@netfilter.org> wrote:
>
> Hi,
>
> On Wed, 17 Jan 2024, Eric Dumazet wrote:
>
> > On Wed, Jan 17, 2024 at 5:00 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > >
> > > From: Jozsef Kadlecsik <kadlec@netfilter.org>
> > >
> > > The patch "netfilter: ipset: fix race condition between swap/destroy
> > > and kernel side add/del/test", commit 28628fa9 fixes a race condition.
> > > But the synchronize_rcu() added to the swap function unnecessarily slows
> > > it down: it can safely be moved to destroy and use call_rcu() instead.
> > > Thus we can get back the same performance and preventing the race condition
> > > at the same time.
> >
> > ...
> >
> > >
> > > @@ -2357,6 +2369,9 @@ ip_set_net_exit(struct net *net)
> > >
> > > inst->is_deleted = true; /* flag for ip_set_nfnl_put */
> > >
> > > + /* Wait for call_rcu() in destroy */
> > > + rcu_barrier();
> > > +
> > > nfnl_lock(NFNL_SUBSYS_IPSET);
> > > for (i = 0; i < inst->ip_set_max; i++) {
> > > set = ip_set(inst, i);
> > > --
> > > 2.30.2
> > >
> >
> > If I am reading this right, time for netns dismantles will increase,
> > even for netns not using ipset
> >
> > If there is no other option, please convert "struct pernet_operations
> > ip_set_net_ops".exit to an exit_batch() handler,
> > to at least have a factorized rcu_barrier();
>
> You are right, the call to rcu_barrier() can safely be moved to
> ip_set_fini(). I'm going to prepare a new version of the patch.
>
> Thanks for catching it.
I do not want to hold the series, your fix can be built as another
patch on top of this one.
Thanks.
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH net 14/14] netfilter: ipset: fix performance regression in swap operation
2024-01-17 16:28 ` Eric Dumazet
@ 2024-01-18 11:08 ` Paolo Abeni
2024-01-18 12:14 ` Eric Dumazet
0 siblings, 1 reply; 32+ messages in thread
From: Paolo Abeni @ 2024-01-18 11:08 UTC (permalink / raw)
To: Eric Dumazet, Jozsef Kadlecsik
Cc: Pablo Neira Ayuso, netfilter-devel, davem, netdev, kuba, fw
On Wed, 2024-01-17 at 17:28 +0100, Eric Dumazet wrote:
> On Wed, Jan 17, 2024 at 5:23 PM Jozsef Kadlecsik <kadlec@netfilter.org> wrote:
> >
> > Hi,
> >
> > On Wed, 17 Jan 2024, Eric Dumazet wrote:
> >
> > > On Wed, Jan 17, 2024 at 5:00 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > >
> > > > From: Jozsef Kadlecsik <kadlec@netfilter.org>
> > > >
> > > > The patch "netfilter: ipset: fix race condition between swap/destroy
> > > > and kernel side add/del/test", commit 28628fa9 fixes a race condition.
> > > > But the synchronize_rcu() added to the swap function unnecessarily slows
> > > > it down: it can safely be moved to destroy and use call_rcu() instead.
> > > > Thus we can get back the same performance and preventing the race condition
> > > > at the same time.
> > >
> > > ...
> > >
> > > >
> > > > @@ -2357,6 +2369,9 @@ ip_set_net_exit(struct net *net)
> > > >
> > > > inst->is_deleted = true; /* flag for ip_set_nfnl_put */
> > > >
> > > > + /* Wait for call_rcu() in destroy */
> > > > + rcu_barrier();
> > > > +
> > > > nfnl_lock(NFNL_SUBSYS_IPSET);
> > > > for (i = 0; i < inst->ip_set_max; i++) {
> > > > set = ip_set(inst, i);
> > > > --
> > > > 2.30.2
> > > >
> > >
> > > If I am reading this right, time for netns dismantles will increase,
> > > even for netns not using ipset
> > >
> > > If there is no other option, please convert "struct pernet_operations
> > > ip_set_net_ops".exit to an exit_batch() handler,
> > > to at least have a factorized rcu_barrier();
> >
> > You are right, the call to rcu_barrier() can safely be moved to
> > ip_set_fini(). I'm going to prepare a new version of the patch.
> >
> > Thanks for catching it.
>
> I do not want to hold the series, your fix can be built as another
> patch on top of this one.
Given the timing, if we merge this series as is, it could go very soon
into Linus' tree. I think it would be better to avoid introducing known
regressions there.
Any strong opinions vs holding this series until the problems are
fixed? Likely a new PR will be required.
Thanks,
Paolo
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH net 14/14] netfilter: ipset: fix performance regression in swap operation
2024-01-18 11:08 ` Paolo Abeni
@ 2024-01-18 12:14 ` Eric Dumazet
0 siblings, 0 replies; 32+ messages in thread
From: Eric Dumazet @ 2024-01-18 12:14 UTC (permalink / raw)
To: Paolo Abeni
Cc: Jozsef Kadlecsik, Pablo Neira Ayuso, netfilter-devel, davem,
netdev, kuba, fw
On Thu, Jan 18, 2024 at 12:08 PM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Wed, 2024-01-17 at 17:28 +0100, Eric Dumazet wrote:
> > On Wed, Jan 17, 2024 at 5:23 PM Jozsef Kadlecsik <kadlec@netfilter.org> wrote:
> > >
> > > Hi,
> > >
> > > On Wed, 17 Jan 2024, Eric Dumazet wrote:
> > >
> > > > On Wed, Jan 17, 2024 at 5:00 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > > >
> > > > > From: Jozsef Kadlecsik <kadlec@netfilter.org>
> > > > >
> > > > > The patch "netfilter: ipset: fix race condition between swap/destroy
> > > > > and kernel side add/del/test", commit 28628fa9 fixes a race condition.
> > > > > But the synchronize_rcu() added to the swap function unnecessarily slows
> > > > > it down: it can safely be moved to destroy and use call_rcu() instead.
> > > > > Thus we can get back the same performance and preventing the race condition
> > > > > at the same time.
> > > >
> > > > ...
> > > >
> > > > >
> > > > > @@ -2357,6 +2369,9 @@ ip_set_net_exit(struct net *net)
> > > > >
> > > > > inst->is_deleted = true; /* flag for ip_set_nfnl_put */
> > > > >
> > > > > + /* Wait for call_rcu() in destroy */
> > > > > + rcu_barrier();
> > > > > +
> > > > > nfnl_lock(NFNL_SUBSYS_IPSET);
> > > > > for (i = 0; i < inst->ip_set_max; i++) {
> > > > > set = ip_set(inst, i);
> > > > > --
> > > > > 2.30.2
> > > > >
> > > >
> > > > If I am reading this right, time for netns dismantles will increase,
> > > > even for netns not using ipset
> > > >
> > > > If there is no other option, please convert "struct pernet_operations
> > > > ip_set_net_ops".exit to an exit_batch() handler,
> > > > to at least have a factorized rcu_barrier();
> > >
> > > You are right, the call to rcu_barrier() can safely be moved to
> > > ip_set_fini(). I'm going to prepare a new version of the patch.
> > >
> > > Thanks for catching it.
> >
> > I do not want to hold the series, your fix can be built as another
> > patch on top of this one.
>
> Given the timing, if we merge this series as is, it could go very soon
> into Linus' tree. I think it would be better to avoid introducing known
> regressions there.
>
> Any strong opinions vs holding this series until the problems are
> fixed? Likely a new PR will be required.
>
If this helps, here is one splat (using linux-next next-20240118)
BUG: sleeping function called from invalid context at kernel/workqueue.c:3348
in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 22194, name:
syz-executor.0
preempt_count: 101, expected: 0
RCU nest depth: 0, expected: 0
3 locks held by syz-executor.0/22194:
#0: ffff8880162a2420 (sb_writers#5){.+.+}-{0:0}, at:
ksys_write+0x12f/0x260 fs/read_write.c:643
#1: ffff888042b48960 (&sb->s_type->i_mutex_key#12){+.+.}-{3:3}, at:
inode_lock include/linux/fs.h:802 [inline]
#1: ffff888042b48960 (&sb->s_type->i_mutex_key#12){+.+.}-{3:3}, at:
shmem_file_write_iter+0x8c/0x140 mm/shmem.c:2883
#2: ffffffff8d5aeb00 (rcu_callback){....}-{0:0}, at: rcu_lock_acquire
include/linux/rcupdate.h:298 [inline]
#2: ffffffff8d5aeb00 (rcu_callback){....}-{0:0}, at: rcu_do_batch
kernel/rcu/tree.c:2152 [inline]
#2: ffffffff8d5aeb00 (rcu_callback){....}-{0:0}, at:
rcu_core+0x7cc/0x16b0 kernel/rcu/tree.c:2433
Preemption disabled at:
[<ffffffff813c061e>] unwind_next_frame+0xce/0x2390
arch/x86/kernel/unwind_orc.c:479
CPU: 0 PID: 22194 Comm: syz-executor.0 Not tainted
6.7.0-next-20240118-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine,
BIOS Google 11/17/2023
Call Trace:
<IRQ>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x125/0x1b0 lib/dump_stack.c:106
__might_resched+0x3c0/0x5e0 kernel/sched/core.c:10176
start_flush_work kernel/workqueue.c:3348 [inline]
__flush_work+0x11f/0xa20 kernel/workqueue.c:3410
__cancel_work_timer+0x3f3/0x590 kernel/workqueue.c:3498
hash_ipmac6_destroy+0x337/0x420 net/netfilter/ipset/ip_set_hash_gen.h:454
ip_set_destroy_set+0x65/0x100 net/netfilter/ipset/ip_set_core.c:1180
rcu_do_batch kernel/rcu/tree.c:2158 [inline]
rcu_core+0x828/0x16b0 kernel/rcu/tree.c:2433
__do_softirq+0x218/0x8de kernel/softirq.c:553
invoke_softirq kernel/softirq.c:427 [inline]
__irq_exit_rcu kernel/softirq.c:632 [inline]
irq_exit_rcu+0xb9/0x120 kernel/softirq.c:644
sysvec_apic_timer_interrupt+0x95/0xb0 arch/x86/kernel/apic/apic.c:1076
</IRQ>
<TASK>
asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:649
RIP: 0010:on_stack arch/x86/include/asm/stacktrace.h:59 [inline]
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH net 14/14] netfilter: ipset: fix performance regression in swap operation
2024-01-17 16:00 ` [PATCH net 14/14] netfilter: ipset: fix performance regression in swap operation Pablo Neira Ayuso
2024-01-17 16:08 ` Eric Dumazet
@ 2024-01-18 10:08 ` Eric Dumazet
2024-01-18 14:24 ` Jozsef Kadlecsik
1 sibling, 1 reply; 32+ messages in thread
From: Eric Dumazet @ 2024-01-18 10:08 UTC (permalink / raw)
To: Pablo Neira Ayuso; +Cc: netfilter-devel, davem, netdev, kuba, pabeni, fw
On Wed, Jan 17, 2024 at 5:00 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
>
> From: Jozsef Kadlecsik <kadlec@netfilter.org>
>
> The patch "netfilter: ipset: fix race condition between swap/destroy
> and kernel side add/del/test", commit 28628fa9 fixes a race condition.
> But the synchronize_rcu() added to the swap function unnecessarily slows
> it down: it can safely be moved to destroy and use call_rcu() instead.
> Thus we can get back the same performance and preventing the race condition
> at the same time.
>
> Fixes: 28628fa952fe ("netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test")
> Link: https://lore.kernel.org/lkml/C0829B10-EAA6-4809-874E-E1E9C05A8D84@automattic.com/
> Reported-by: Ale Crismani <ale.crismani@automattic.com>
> Reported-by: David Wang <00107082@163.com
> Tested-by: Ale Crismani <ale.crismani@automattic.com>
> Tested-by: David Wang <00107082@163.com
> Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> ---
> include/linux/netfilter/ipset/ip_set.h | 2 ++
> net/netfilter/ipset/ip_set_core.c | 31 +++++++++++++++++++-------
> 2 files changed, 25 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
> index e8c350a3ade1..912f750d0bea 100644
> --- a/include/linux/netfilter/ipset/ip_set.h
> +++ b/include/linux/netfilter/ipset/ip_set.h
> @@ -242,6 +242,8 @@ extern void ip_set_type_unregister(struct ip_set_type *set_type);
>
> /* A generic IP set */
> struct ip_set {
> + /* For call_cru in destroy */
> + struct rcu_head rcu;
> /* The name of the set */
> char name[IPSET_MAXNAMELEN];
> /* Lock protecting the set data */
> diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
> index 4c133e06be1d..3bf9bb345809 100644
> --- a/net/netfilter/ipset/ip_set_core.c
> +++ b/net/netfilter/ipset/ip_set_core.c
> @@ -1182,6 +1182,14 @@ ip_set_destroy_set(struct ip_set *set)
> kfree(set);
> }
>
> +static void
> +ip_set_destroy_set_rcu(struct rcu_head *head)
> +{
> + struct ip_set *set = container_of(head, struct ip_set, rcu);
> +
> + ip_set_destroy_set(set);
Calling ip_set_destroy_set() from BH (rcu callbacks) is not working.
I think you should test your patch with LOCKDEP enabled, and/or
CONFIG_DEBUG_ATOMIC_SLEEP=y
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH net 14/14] netfilter: ipset: fix performance regression in swap operation
2024-01-18 10:08 ` Eric Dumazet
@ 2024-01-18 14:24 ` Jozsef Kadlecsik
2024-01-23 9:03 ` Jozsef Kadlecsik
0 siblings, 1 reply; 32+ messages in thread
From: Jozsef Kadlecsik @ 2024-01-18 14:24 UTC (permalink / raw)
To: Eric Dumazet
Cc: Pablo Neira Ayuso, netfilter-devel, David Miller, netdev, kuba,
pabeni, Florian Westphal
[-- Attachment #1: Type: text/plain, Size: 3077 bytes --]
Hi,
Please drop the patch from the batch, it needs more work (see below).
On Thu, 18 Jan 2024, Eric Dumazet wrote:
> On Wed, Jan 17, 2024 at 5:00 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> >
> > The patch "netfilter: ipset: fix race condition between swap/destroy
> > and kernel side add/del/test", commit 28628fa9 fixes a race condition.
> > But the synchronize_rcu() added to the swap function unnecessarily slows
> > it down: it can safely be moved to destroy and use call_rcu() instead.
> > Thus we can get back the same performance and preventing the race condition
> > at the same time.
> >
> > Fixes: 28628fa952fe ("netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test")
> > Link: https://lore.kernel.org/lkml/C0829B10-EAA6-4809-874E-E1E9C05A8D84@automattic.com/
> > Reported-by: Ale Crismani <ale.crismani@automattic.com>
> > Reported-by: David Wang <00107082@163.com
> > Tested-by: Ale Crismani <ale.crismani@automattic.com>
> > Tested-by: David Wang <00107082@163.com
> > Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
> > Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> > ---
> > include/linux/netfilter/ipset/ip_set.h | 2 ++
> > net/netfilter/ipset/ip_set_core.c | 31 +++++++++++++++++++-------
> > 2 files changed, 25 insertions(+), 8 deletions(-)
> >
> > diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
> > index e8c350a3ade1..912f750d0bea 100644
> > --- a/include/linux/netfilter/ipset/ip_set.h
> > +++ b/include/linux/netfilter/ipset/ip_set.h
> > @@ -242,6 +242,8 @@ extern void ip_set_type_unregister(struct ip_set_type *set_type);
> >
> > /* A generic IP set */
> > struct ip_set {
> > + /* For call_cru in destroy */
> > + struct rcu_head rcu;
> > /* The name of the set */
> > char name[IPSET_MAXNAMELEN];
> > /* Lock protecting the set data */
> > diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
> > index 4c133e06be1d..3bf9bb345809 100644
> > --- a/net/netfilter/ipset/ip_set_core.c
> > +++ b/net/netfilter/ipset/ip_set_core.c
> > @@ -1182,6 +1182,14 @@ ip_set_destroy_set(struct ip_set *set)
> > kfree(set);
> > }
> >
> > +static void
> > +ip_set_destroy_set_rcu(struct rcu_head *head)
> > +{
> > + struct ip_set *set = container_of(head, struct ip_set, rcu);
> > +
> > + ip_set_destroy_set(set);
>
> Calling ip_set_destroy_set() from BH (rcu callbacks) is not working.
Yeah, it calls cancel_delayed_work_sync() to handle the garbage collector
and that can wait. The call can be moved into the main destroy function
and let the rcu callback do just the minimal job, however it needs a
restructuring. So please skip this patch now.
Best regards,
Jozsef
--
E-mail : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.hu
PGP key : https://wigner.hu/~kadlec/pgp_public_key.txt
Address : Wigner Research Centre for Physics
H-1525 Budapest 114, POB. 49, Hungary
^ permalink raw reply [flat|nested] 32+ messages in thread
* Re: [PATCH net 14/14] netfilter: ipset: fix performance regression in swap operation
2024-01-18 14:24 ` Jozsef Kadlecsik
@ 2024-01-23 9:03 ` Jozsef Kadlecsik
2024-01-23 17:20 ` David Wang
0 siblings, 1 reply; 32+ messages in thread
From: Jozsef Kadlecsik @ 2024-01-23 9:03 UTC (permalink / raw)
To: Eric Dumazet
Cc: Pablo Neira Ayuso, netfilter-devel, David Miller, netdev, kuba,
pabeni, Florian Westphal, Ale Crismani, David Wang
[-- Attachment #1: Type: text/plain, Size: 11793 bytes --]
Hi,
On Thu, 18 Jan 2024, Jozsef Kadlecsik wrote:
> On Thu, 18 Jan 2024, Eric Dumazet wrote:
>
> > On Wed, Jan 17, 2024 at 5:00 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > >
> > > The patch "netfilter: ipset: fix race condition between swap/destroy
> > > and kernel side add/del/test", commit 28628fa9 fixes a race condition.
> > > But the synchronize_rcu() added to the swap function unnecessarily slows
> > > it down: it can safely be moved to destroy and use call_rcu() instead.
> > > Thus we can get back the same performance and preventing the race condition
> > > at the same time.
[...]
> > > +static void
> > > +ip_set_destroy_set_rcu(struct rcu_head *head)
> > > +{
> > > + struct ip_set *set = container_of(head, struct ip_set, rcu);
> > > +
> > > + ip_set_destroy_set(set);
> >
> > Calling ip_set_destroy_set() from BH (rcu callbacks) is not working.
>
> Yeah, it calls cancel_delayed_work_sync() to handle the garbage collector
> and that can wait. The call can be moved into the main destroy function
> and let the rcu callback do just the minimal job, however it needs a
> restructuring. So please skip this patch now.
I reworked the patch to work safely with using call_rcu() and still
preventing the race condition. According to my tests the patch works as
intended, but it'd be good to receive feedback that it indeed fixes the
issue properly.
From 23e85f453da573dd4265e7d1fffd2aeab3369e0d Mon Sep 17 00:00:00 2001
From: Jozsef Kadlecsik <kadlec@netfilter.org>
Date: Tue, 16 Jan 2024 17:10:45 +0100
Subject: [PATCH 1/1] netfilter: ipset: fix performance regression in swap
operation
The patch "netfilter: ipset: fix race condition between swap/destroy
and kernel side add/del/test", commit 28628fa9 fixes a race condition.
But the synchronize_rcu() added to the swap function unnecessarily slows
it down: it can safely be moved to destroy and use call_rcu() instead.
Eric Dumazet pointed out that simply calling the destroy functions as
rcu callback does not work: sets with timeout use garbage collectors
which need cancelling at destroy which can wait. Therefore the destroy
functions are split into two: cancelling garbage collectors safely at
executing the command received by netlink and moving the remaining
part only into the rcu callback.
Link: https://lore.kernel.org/lkml/C0829B10-EAA6-4809-874E-E1E9C05A8D84@automattic.com/
Fixes: 28628fa952fe ("netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test")
Reported-by: Ale Crismani <ale.crismani@automattic.com>
Reported-by: David Wang <00107082@163.com
Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
---
include/linux/netfilter/ipset/ip_set.h | 4 +++
net/netfilter/ipset/ip_set_bitmap_gen.h | 14 ++++++++--
net/netfilter/ipset/ip_set_core.c | 37 +++++++++++++++++++------
net/netfilter/ipset/ip_set_hash_gen.h | 15 ++++++++--
net/netfilter/ipset/ip_set_list_set.c | 13 +++++++--
5 files changed, 65 insertions(+), 18 deletions(-)
diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
index e8c350a3ade1..e9f4f845d760 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -186,6 +186,8 @@ struct ip_set_type_variant {
/* Return true if "b" set is the same as "a"
* according to the create set parameters */
bool (*same_set)(const struct ip_set *a, const struct ip_set *b);
+ /* Cancel ongoing garbage collectors before destroying the set*/
+ void (*cancel_gc)(struct ip_set *set);
/* Region-locking is used */
bool region_lock;
};
@@ -242,6 +244,8 @@ extern void ip_set_type_unregister(struct ip_set_type *set_type);
/* A generic IP set */
struct ip_set {
+ /* For call_cru in destroy */
+ struct rcu_head rcu;
/* The name of the set */
char name[IPSET_MAXNAMELEN];
/* Lock protecting the set data */
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index 26ab0e9612d8..9523104a90da 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -28,6 +28,7 @@
#define mtype_del IPSET_TOKEN(MTYPE, _del)
#define mtype_list IPSET_TOKEN(MTYPE, _list)
#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
+#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc)
#define mtype MTYPE
#define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id)))
@@ -57,9 +58,6 @@ mtype_destroy(struct ip_set *set)
{
struct mtype *map = set->data;
- if (SET_WITH_TIMEOUT(set))
- del_timer_sync(&map->gc);
-
if (set->dsize && set->extensions & IPSET_EXT_DESTROY)
mtype_ext_cleanup(set);
ip_set_free(map->members);
@@ -288,6 +286,15 @@ mtype_gc(struct timer_list *t)
add_timer(&map->gc);
}
+static void
+mtype_cancel_gc(struct ip_set *set)
+{
+ struct mtype *map = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ del_timer_sync(&map->gc);
+}
+
static const struct ip_set_type_variant mtype = {
.kadt = mtype_kadt,
.uadt = mtype_uadt,
@@ -301,6 +308,7 @@ static const struct ip_set_type_variant mtype = {
.head = mtype_head,
.list = mtype_list,
.same_set = mtype_same_set,
+ .cancel_gc = mtype_cancel_gc,
};
#endif /* __IP_SET_BITMAP_IP_GEN_H */
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 4c133e06be1d..bcaad9c009fe 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1182,6 +1182,14 @@ ip_set_destroy_set(struct ip_set *set)
kfree(set);
}
+static void
+ip_set_destroy_set_rcu(struct rcu_head *head)
+{
+ struct ip_set *set = container_of(head, struct ip_set, rcu);
+
+ ip_set_destroy_set(set);
+}
+
static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
const struct nlattr * const attr[])
{
@@ -1193,8 +1201,6 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
if (unlikely(protocol_min_failed(attr)))
return -IPSET_ERR_PROTOCOL;
- /* Must wait for flush to be really finished in list:set */
- rcu_barrier();
/* Commands are serialized and references are
* protected by the ip_set_ref_lock.
@@ -1206,8 +1212,10 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
* counter, so if it's already zero, we can proceed
* without holding the lock.
*/
- read_lock_bh(&ip_set_ref_lock);
if (!attr[IPSET_ATTR_SETNAME]) {
+ /* Must wait for flush to be really finished in list:set */
+ rcu_barrier();
+ read_lock_bh(&ip_set_ref_lock);
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
if (s && (s->ref || s->ref_netlink)) {
@@ -1221,6 +1229,8 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
s = ip_set(inst, i);
if (s) {
ip_set(inst, i) = NULL;
+ /* Must cancel garbage collectors */
+ s->variant->cancel_gc(s);
ip_set_destroy_set(s);
}
}
@@ -1228,6 +1238,9 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
inst->is_destroyed = false;
} else {
u32 flags = flag_exist(info->nlh);
+ u16 features = 0;
+
+ read_lock_bh(&ip_set_ref_lock);
s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
&i);
if (!s) {
@@ -1238,10 +1251,16 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
ret = -IPSET_ERR_BUSY;
goto out;
}
+ features = s->type->features;
ip_set(inst, i) = NULL;
read_unlock_bh(&ip_set_ref_lock);
-
- ip_set_destroy_set(s);
+ if (features & IPSET_TYPE_NAME) {
+ /* Must wait for flush to be really finished */
+ rcu_barrier();
+ }
+ /* Must cancel garbage collectors */
+ s->variant->cancel_gc(s);
+ call_rcu(&s->rcu, ip_set_destroy_set_rcu);
}
return 0;
out:
@@ -1394,9 +1413,6 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info,
ip_set(inst, to_id) = from;
write_unlock_bh(&ip_set_ref_lock);
- /* Make sure all readers of the old set pointers are completed. */
- synchronize_rcu();
-
return 0;
}
@@ -2409,8 +2425,11 @@ ip_set_fini(void)
{
nf_unregister_sockopt(&so_set);
nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
-
unregister_pernet_subsys(&ip_set_net_ops);
+
+ /* Wait for call_rcu() in destroy */
+ rcu_barrier();
+
pr_debug("these are the famous last words\n");
}
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 7c2399541771..c62998b46f00 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -221,6 +221,7 @@ static const union nf_inet_addr zeromask = {};
#undef mtype_gc_do
#undef mtype_gc
#undef mtype_gc_init
+#undef mtype_cancel_gc
#undef mtype_variant
#undef mtype_data_match
@@ -265,6 +266,7 @@ static const union nf_inet_addr zeromask = {};
#define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do)
#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
+#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc)
#define mtype_variant IPSET_TOKEN(MTYPE, _variant)
#define mtype_data_match IPSET_TOKEN(MTYPE, _data_match)
@@ -449,9 +451,6 @@ mtype_destroy(struct ip_set *set)
struct htype *h = set->data;
struct list_head *l, *lt;
- if (SET_WITH_TIMEOUT(set))
- cancel_delayed_work_sync(&h->gc.dwork);
-
mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true);
list_for_each_safe(l, lt, &h->ad) {
list_del(l);
@@ -598,6 +597,15 @@ mtype_gc_init(struct htable_gc *gc)
queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ);
}
+static void
+mtype_cancel_gc(struct ip_set *set)
+{
+ struct htype *h = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ cancel_delayed_work_sync(&h->gc.dwork);
+}
+
static int
mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
struct ip_set_ext *mext, u32 flags);
@@ -1440,6 +1448,7 @@ static const struct ip_set_type_variant mtype_variant = {
.uref = mtype_uref,
.resize = mtype_resize,
.same_set = mtype_same_set,
+ .cancel_gc = mtype_cancel_gc,
.region_lock = true,
};
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index e162636525cf..6c3f28bc59b3 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -426,9 +426,6 @@ list_set_destroy(struct ip_set *set)
struct list_set *map = set->data;
struct set_elem *e, *n;
- if (SET_WITH_TIMEOUT(set))
- timer_shutdown_sync(&map->gc);
-
list_for_each_entry_safe(e, n, &map->members, list) {
list_del(&e->list);
ip_set_put_byindex(map->net, e->id);
@@ -545,6 +542,15 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b)
a->extensions == b->extensions;
}
+static void
+list_set_cancel_gc(struct ip_set *set)
+{
+ struct list_set *map = set->data;
+
+ if (SET_WITH_TIMEOUT(set))
+ timer_shutdown_sync(&map->gc);
+}
+
static const struct ip_set_type_variant set_variant = {
.kadt = list_set_kadt,
.uadt = list_set_uadt,
@@ -558,6 +564,7 @@ static const struct ip_set_type_variant set_variant = {
.head = list_set_head,
.list = list_set_list,
.same_set = list_set_same_set,
+ .cancel_gc = list_set_cancel_gc,
};
static void
--
2.39.2
Best regards,
Jozsef
--
E-mail : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.hu
PGP key : https://wigner.hu/~kadlec/pgp_public_key.txt
Address : Wigner Research Centre for Physics
H-1525 Budapest 114, POB. 49, Hungary
^ permalink raw reply related [flat|nested] 32+ messages in thread
* Re: [PATCH net 14/14] netfilter: ipset: fix performance regression in swap operation
2024-01-23 9:03 ` Jozsef Kadlecsik
@ 2024-01-23 17:20 ` David Wang
0 siblings, 0 replies; 32+ messages in thread
From: David Wang @ 2024-01-23 17:20 UTC (permalink / raw)
To: Jozsef Kadlecsik
Cc: Eric Dumazet, Pablo Neira Ayuso, netfilter-devel, David Miller,
netdev, kuba, pabeni, Florian Westphal, Ale Crismani
At 2024-01-23 17:03:42, "Jozsef Kadlecsik" <kadlec@blackhole.kfki.hu> wrote:
>Hi,
>
>On Thu, 18 Jan 2024, Jozsef Kadlecsik wrote:
>
>> On Thu, 18 Jan 2024, Eric Dumazet wrote:
>>
>> > On Wed, Jan 17, 2024 at 5:00 PM Pablo Neira Ayuso <pablo@netfilter.org> wrote:
>> > >
>> > > The patch "netfilter: ipset: fix race condition between swap/destroy
>> > > and kernel side add/del/test", commit 28628fa9 fixes a race condition.
>> > > But the synchronize_rcu() added to the swap function unnecessarily slows
>> > > it down: it can safely be moved to destroy and use call_rcu() instead.
>> > > Thus we can get back the same performance and preventing the race condition
>> > > at the same time.
>[...]
>> > > +static void
>> > > +ip_set_destroy_set_rcu(struct rcu_head *head)
>> > > +{
>> > > + struct ip_set *set = container_of(head, struct ip_set, rcu);
>> > > +
>> > > + ip_set_destroy_set(set);
>> >
>> > Calling ip_set_destroy_set() from BH (rcu callbacks) is not working.
>>
>> Yeah, it calls cancel_delayed_work_sync() to handle the garbage collector
>> and that can wait. The call can be moved into the main destroy function
>> and let the rcu callback do just the minimal job, however it needs a
>> restructuring. So please skip this patch now.
>
>I reworked the patch to work safely with using call_rcu() and still
>preventing the race condition. According to my tests the patch works as
>intended, but it'd be good to receive feedback that it indeed fixes the
>issue properly.
I apply the patch on 6.8.0-rc1, stress create-add-swap-destroy sequence for 1000000 rounds, and no performance regression observed.
$ time sudo ./a.out
real 0m19.717s
user 0m0.849s
sys 0m17.060s
I also test with two processes create-add-swap-destroy disjoint set of ipset parallelly on a 2-Core VM, no abnormality noticed other than some slower, I guess this is expected, right?
$ time sudo ./a.out
real 0m23.625s
user 0m0.827s
sys 0m20.222s
$ time sudo ./a2.out
real 0m23.838s
user 0m0.835s
sys 0m20.401s
No obverse slab memory increment can be confirmed after several round of stress.
cat /proc/meminfo | grep Slab:
39248 kB - 39368 kB -- 39340 kB - 39388 kB ----- 39308 kB
FYI
David
>
>From 23e85f453da573dd4265e7d1fffd2aeab3369e0d Mon Sep 17 00:00:00 2001
>From: Jozsef Kadlecsik <kadlec@netfilter.org>
>Date: Tue, 16 Jan 2024 17:10:45 +0100
>Subject: [PATCH 1/1] netfilter: ipset: fix performance regression in swap
> operation
>
>The patch "netfilter: ipset: fix race condition between swap/destroy
>and kernel side add/del/test", commit 28628fa9 fixes a race condition.
>But the synchronize_rcu() added to the swap function unnecessarily slows
>it down: it can safely be moved to destroy and use call_rcu() instead.
>
>Eric Dumazet pointed out that simply calling the destroy functions as
>rcu callback does not work: sets with timeout use garbage collectors
>which need cancelling at destroy which can wait. Therefore the destroy
>functions are split into two: cancelling garbage collectors safely at
>executing the command received by netlink and moving the remaining
>part only into the rcu callback.
>
>Link: https://lore.kernel.org/lkml/C0829B10-EAA6-4809-874E-E1E9C05A8D84@automattic.com/
>Fixes: 28628fa952fe ("netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test")
>Reported-by: Ale Crismani <ale.crismani@automattic.com>
>Reported-by: David Wang <00107082@163.com
>Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
>---
> include/linux/netfilter/ipset/ip_set.h | 4 +++
> net/netfilter/ipset/ip_set_bitmap_gen.h | 14 ++++++++--
> net/netfilter/ipset/ip_set_core.c | 37 +++++++++++++++++++------
> net/netfilter/ipset/ip_set_hash_gen.h | 15 ++++++++--
> net/netfilter/ipset/ip_set_list_set.c | 13 +++++++--
> 5 files changed, 65 insertions(+), 18 deletions(-)
>
>diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h
>index e8c350a3ade1..e9f4f845d760 100644
>--- a/include/linux/netfilter/ipset/ip_set.h
>+++ b/include/linux/netfilter/ipset/ip_set.h
>@@ -186,6 +186,8 @@ struct ip_set_type_variant {
> /* Return true if "b" set is the same as "a"
> * according to the create set parameters */
> bool (*same_set)(const struct ip_set *a, const struct ip_set *b);
>+ /* Cancel ongoing garbage collectors before destroying the set*/
>+ void (*cancel_gc)(struct ip_set *set);
> /* Region-locking is used */
> bool region_lock;
> };
>@@ -242,6 +244,8 @@ extern void ip_set_type_unregister(struct ip_set_type *set_type);
>
> /* A generic IP set */
> struct ip_set {
>+ /* For call_cru in destroy */
>+ struct rcu_head rcu;
> /* The name of the set */
> char name[IPSET_MAXNAMELEN];
> /* Lock protecting the set data */
>diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
>index 26ab0e9612d8..9523104a90da 100644
>--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
>+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
>@@ -28,6 +28,7 @@
> #define mtype_del IPSET_TOKEN(MTYPE, _del)
> #define mtype_list IPSET_TOKEN(MTYPE, _list)
> #define mtype_gc IPSET_TOKEN(MTYPE, _gc)
>+#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc)
> #define mtype MTYPE
>
> #define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id)))
>@@ -57,9 +58,6 @@ mtype_destroy(struct ip_set *set)
> {
> struct mtype *map = set->data;
>
>- if (SET_WITH_TIMEOUT(set))
>- del_timer_sync(&map->gc);
>-
> if (set->dsize && set->extensions & IPSET_EXT_DESTROY)
> mtype_ext_cleanup(set);
> ip_set_free(map->members);
>@@ -288,6 +286,15 @@ mtype_gc(struct timer_list *t)
> add_timer(&map->gc);
> }
>
>+static void
>+mtype_cancel_gc(struct ip_set *set)
>+{
>+ struct mtype *map = set->data;
>+
>+ if (SET_WITH_TIMEOUT(set))
>+ del_timer_sync(&map->gc);
>+}
>+
> static const struct ip_set_type_variant mtype = {
> .kadt = mtype_kadt,
> .uadt = mtype_uadt,
>@@ -301,6 +308,7 @@ static const struct ip_set_type_variant mtype = {
> .head = mtype_head,
> .list = mtype_list,
> .same_set = mtype_same_set,
>+ .cancel_gc = mtype_cancel_gc,
> };
>
> #endif /* __IP_SET_BITMAP_IP_GEN_H */
>diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
>index 4c133e06be1d..bcaad9c009fe 100644
>--- a/net/netfilter/ipset/ip_set_core.c
>+++ b/net/netfilter/ipset/ip_set_core.c
>@@ -1182,6 +1182,14 @@ ip_set_destroy_set(struct ip_set *set)
> kfree(set);
> }
>
>+static void
>+ip_set_destroy_set_rcu(struct rcu_head *head)
>+{
>+ struct ip_set *set = container_of(head, struct ip_set, rcu);
>+
>+ ip_set_destroy_set(set);
>+}
>+
> static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
> const struct nlattr * const attr[])
> {
>@@ -1193,8 +1201,6 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
> if (unlikely(protocol_min_failed(attr)))
> return -IPSET_ERR_PROTOCOL;
>
>- /* Must wait for flush to be really finished in list:set */
>- rcu_barrier();
>
> /* Commands are serialized and references are
> * protected by the ip_set_ref_lock.
>@@ -1206,8 +1212,10 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
> * counter, so if it's already zero, we can proceed
> * without holding the lock.
> */
>- read_lock_bh(&ip_set_ref_lock);
> if (!attr[IPSET_ATTR_SETNAME]) {
>+ /* Must wait for flush to be really finished in list:set */
>+ rcu_barrier();
>+ read_lock_bh(&ip_set_ref_lock);
> for (i = 0; i < inst->ip_set_max; i++) {
> s = ip_set(inst, i);
> if (s && (s->ref || s->ref_netlink)) {
>@@ -1221,6 +1229,8 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
> s = ip_set(inst, i);
> if (s) {
> ip_set(inst, i) = NULL;
>+ /* Must cancel garbage collectors */
>+ s->variant->cancel_gc(s);
> ip_set_destroy_set(s);
> }
> }
>@@ -1228,6 +1238,9 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
> inst->is_destroyed = false;
> } else {
> u32 flags = flag_exist(info->nlh);
>+ u16 features = 0;
>+
>+ read_lock_bh(&ip_set_ref_lock);
> s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
> &i);
> if (!s) {
>@@ -1238,10 +1251,16 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
> ret = -IPSET_ERR_BUSY;
> goto out;
> }
>+ features = s->type->features;
> ip_set(inst, i) = NULL;
> read_unlock_bh(&ip_set_ref_lock);
>-
>- ip_set_destroy_set(s);
>+ if (features & IPSET_TYPE_NAME) {
>+ /* Must wait for flush to be really finished */
>+ rcu_barrier();
>+ }
>+ /* Must cancel garbage collectors */
>+ s->variant->cancel_gc(s);
>+ call_rcu(&s->rcu, ip_set_destroy_set_rcu);
> }
> return 0;
> out:
>@@ -1394,9 +1413,6 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info,
> ip_set(inst, to_id) = from;
> write_unlock_bh(&ip_set_ref_lock);
>
>- /* Make sure all readers of the old set pointers are completed. */
>- synchronize_rcu();
>-
> return 0;
> }
>
>@@ -2409,8 +2425,11 @@ ip_set_fini(void)
> {
> nf_unregister_sockopt(&so_set);
> nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
>-
> unregister_pernet_subsys(&ip_set_net_ops);
>+
>+ /* Wait for call_rcu() in destroy */
>+ rcu_barrier();
>+
> pr_debug("these are the famous last words\n");
> }
>
>diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
>index 7c2399541771..c62998b46f00 100644
>--- a/net/netfilter/ipset/ip_set_hash_gen.h
>+++ b/net/netfilter/ipset/ip_set_hash_gen.h
>@@ -221,6 +221,7 @@ static const union nf_inet_addr zeromask = {};
> #undef mtype_gc_do
> #undef mtype_gc
> #undef mtype_gc_init
>+#undef mtype_cancel_gc
> #undef mtype_variant
> #undef mtype_data_match
>
>@@ -265,6 +266,7 @@ static const union nf_inet_addr zeromask = {};
> #define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do)
> #define mtype_gc IPSET_TOKEN(MTYPE, _gc)
> #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
>+#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc)
> #define mtype_variant IPSET_TOKEN(MTYPE, _variant)
> #define mtype_data_match IPSET_TOKEN(MTYPE, _data_match)
>
>@@ -449,9 +451,6 @@ mtype_destroy(struct ip_set *set)
> struct htype *h = set->data;
> struct list_head *l, *lt;
>
>- if (SET_WITH_TIMEOUT(set))
>- cancel_delayed_work_sync(&h->gc.dwork);
>-
> mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true);
> list_for_each_safe(l, lt, &h->ad) {
> list_del(l);
>@@ -598,6 +597,15 @@ mtype_gc_init(struct htable_gc *gc)
> queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ);
> }
>
>+static void
>+mtype_cancel_gc(struct ip_set *set)
>+{
>+ struct htype *h = set->data;
>+
>+ if (SET_WITH_TIMEOUT(set))
>+ cancel_delayed_work_sync(&h->gc.dwork);
>+}
>+
> static int
> mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
> struct ip_set_ext *mext, u32 flags);
>@@ -1440,6 +1448,7 @@ static const struct ip_set_type_variant mtype_variant = {
> .uref = mtype_uref,
> .resize = mtype_resize,
> .same_set = mtype_same_set,
>+ .cancel_gc = mtype_cancel_gc,
> .region_lock = true,
> };
>
>diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
>index e162636525cf..6c3f28bc59b3 100644
>--- a/net/netfilter/ipset/ip_set_list_set.c
>+++ b/net/netfilter/ipset/ip_set_list_set.c
>@@ -426,9 +426,6 @@ list_set_destroy(struct ip_set *set)
> struct list_set *map = set->data;
> struct set_elem *e, *n;
>
>- if (SET_WITH_TIMEOUT(set))
>- timer_shutdown_sync(&map->gc);
>-
> list_for_each_entry_safe(e, n, &map->members, list) {
> list_del(&e->list);
> ip_set_put_byindex(map->net, e->id);
>@@ -545,6 +542,15 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b)
> a->extensions == b->extensions;
> }
>
>+static void
>+list_set_cancel_gc(struct ip_set *set)
>+{
>+ struct list_set *map = set->data;
>+
>+ if (SET_WITH_TIMEOUT(set))
>+ timer_shutdown_sync(&map->gc);
>+}
>+
> static const struct ip_set_type_variant set_variant = {
> .kadt = list_set_kadt,
> .uadt = list_set_uadt,
>@@ -558,6 +564,7 @@ static const struct ip_set_type_variant set_variant = {
> .head = list_set_head,
> .list = list_set_list,
> .same_set = list_set_same_set,
>+ .cancel_gc = list_set_cancel_gc,
> };
>
> static void
>--
>2.39.2
>
>Best regards,
>Jozsef
>--
>E-mail : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.hu
>PGP key : https://wigner.hu/~kadlec/pgp_public_key.txt
>Address : Wigner Research Centre for Physics
> H-1525 Budapest 114, POB. 49, Hungary
^ permalink raw reply [flat|nested] 32+ messages in thread