netfilter-devel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [nf PATCH v2 0/8] Introduce locking for reset requests
@ 2023-09-28 16:52 Phil Sutter
  2023-09-28 16:52 ` [nf PATCH v2 1/8] netfilter: nf_tables: Don't allocate nft_rule_dump_ctx Phil Sutter
                   ` (7 more replies)
  0 siblings, 8 replies; 34+ messages in thread
From: Phil Sutter @ 2023-09-28 16:52 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Florian Westphal, netfilter-devel

Next try, this time with:
- commit_mutex instead of dedicated spinlock
- Subroutine creation split into separate patches
- Separate patch adding reset bit to nft_set_dump_ctx
- Improved commit descriptions
- Fixed leak in error path added by patch

Phil Sutter (8):
  netfilter: nf_tables: Don't allocate nft_rule_dump_ctx
  netfilter: nf_tables: Introduce nf_tables_getrule_single()
  netfilter: nf_tables: Add locking for NFT_MSG_GETRULE_RESET requests
  netfilter: nf_tables: Introduce struct nft_obj_dump_ctx
  netfilter: nf_tables: Introduce nf_tables_getobj_single
  netfilter: nf_tables: Add locking for NFT_MSG_GETOBJ_RESET requests
  netfilter: nf_tables: Pass reset bit in nft_set_dump_ctx
  netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET
    requests

 net/netfilter/nf_tables_api.c | 546 +++++++++++++++++++++++-----------
 1 file changed, 371 insertions(+), 175 deletions(-)

-- 
2.41.0


^ permalink raw reply	[flat|nested] 34+ messages in thread

* [nf PATCH v2 1/8] netfilter: nf_tables: Don't allocate nft_rule_dump_ctx
  2023-09-28 16:52 [nf PATCH v2 0/8] Introduce locking for reset requests Phil Sutter
@ 2023-09-28 16:52 ` Phil Sutter
  2023-09-28 18:49   ` Pablo Neira Ayuso
  2023-09-28 19:00   ` Florian Westphal
  2023-09-28 16:52 ` [nf PATCH v2 2/8] netfilter: nf_tables: Introduce nf_tables_getrule_single() Phil Sutter
                   ` (6 subsequent siblings)
  7 siblings, 2 replies; 34+ messages in thread
From: Phil Sutter @ 2023-09-28 16:52 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Florian Westphal, netfilter-devel

Eliminate the direct use of netlink_callback::args when dumping rules by
casting nft_rule_dump_ctx over netlink_callback::ctx as suggested in
the struct's comment.

The value for 's_idx' has to be stored inside nft_rule_dump_ctx now and
make it hold the 'reset' boolean as well.

Note how this patch removes the zeroing of netlink_callback::args[1-5] -
none of the rule dump callbacks seem to make use of them.

Signed-off-by: Phil Sutter <phil@nwl.cc>
---
Changes since v1:
- Fix description: 'idx' is not moved into the struct
---
 net/netfilter/nf_tables_api.c | 81 ++++++++++++++---------------------
 1 file changed, 32 insertions(+), 49 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 4356189360fb8..511508407867d 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3441,20 +3441,21 @@ static void audit_log_rule_reset(const struct nft_table *table,
 }
 
 struct nft_rule_dump_ctx {
+	unsigned int s_idx;
 	char *table;
 	char *chain;
+	bool reset;
 };
 
 static int __nf_tables_dump_rules(struct sk_buff *skb,
 				  unsigned int *idx,
 				  struct netlink_callback *cb,
 				  const struct nft_table *table,
-				  const struct nft_chain *chain,
-				  bool reset)
+				  const struct nft_chain *chain)
 {
+	struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
 	struct net *net = sock_net(skb->sk);
 	const struct nft_rule *rule, *prule;
-	unsigned int s_idx = cb->args[0];
 	unsigned int entries = 0;
 	int ret = 0;
 	u64 handle;
@@ -3463,12 +3464,9 @@ static int __nf_tables_dump_rules(struct sk_buff *skb,
 	list_for_each_entry_rcu(rule, &chain->rules, list) {
 		if (!nft_is_active(net, rule))
 			goto cont_skip;
-		if (*idx < s_idx)
+		if (*idx < ctx->s_idx)
 			goto cont;
-		if (*idx > s_idx) {
-			memset(&cb->args[1], 0,
-					sizeof(cb->args) - sizeof(cb->args[0]));
-		}
+
 		if (prule)
 			handle = prule->handle;
 		else
@@ -3479,7 +3477,7 @@ static int __nf_tables_dump_rules(struct sk_buff *skb,
 					NFT_MSG_NEWRULE,
 					NLM_F_MULTI | NLM_F_APPEND,
 					table->family,
-					table, chain, rule, handle, reset) < 0) {
+					table, chain, rule, handle, ctx->reset) < 0) {
 			ret = 1;
 			break;
 		}
@@ -3491,7 +3489,7 @@ static int __nf_tables_dump_rules(struct sk_buff *skb,
 		(*idx)++;
 	}
 
-	if (reset && entries)
+	if (ctx->reset && entries)
 		audit_log_rule_reset(table, cb->seq, entries);
 
 	return ret;
@@ -3501,17 +3499,13 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 				struct netlink_callback *cb)
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
-	const struct nft_rule_dump_ctx *ctx = cb->data;
+	struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
 	struct nft_table *table;
 	const struct nft_chain *chain;
 	unsigned int idx = 0;
 	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 	struct nftables_pernet *nft_net;
-	bool reset = false;
-
-	if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET)
-		reset = true;
 
 	rcu_read_lock();
 	nft_net = nft_pernet(net);
@@ -3521,10 +3515,10 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 		if (family != NFPROTO_UNSPEC && family != table->family)
 			continue;
 
-		if (ctx && ctx->table && strcmp(ctx->table, table->name) != 0)
+		if (ctx->table && strcmp(ctx->table, table->name) != 0)
 			continue;
 
-		if (ctx && ctx->table && ctx->chain) {
+		if (ctx->table && ctx->chain) {
 			struct rhlist_head *list, *tmp;
 
 			list = rhltable_lookup(&table->chains_ht, ctx->chain,
@@ -3536,7 +3530,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 				if (!nft_is_active(net, chain))
 					continue;
 				__nf_tables_dump_rules(skb, &idx,
-						       cb, table, chain, reset);
+						       cb, table, chain);
 				break;
 			}
 			goto done;
@@ -3544,62 +3538,51 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 
 		list_for_each_entry_rcu(chain, &table->chains, list) {
 			if (__nf_tables_dump_rules(skb, &idx,
-						   cb, table, chain, reset))
+						   cb, table, chain))
 				goto done;
 		}
 
-		if (ctx && ctx->table)
+		if (ctx->table)
 			break;
 	}
 done:
 	rcu_read_unlock();
 
-	cb->args[0] = idx;
+	ctx->s_idx = idx;
 	return skb->len;
 }
 
 static int nf_tables_dump_rules_start(struct netlink_callback *cb)
 {
+	struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
 	const struct nlattr * const *nla = cb->data;
-	struct nft_rule_dump_ctx *ctx = NULL;
 
-	if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) {
-		ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
-		if (!ctx)
-			return -ENOMEM;
+	BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
 
-		if (nla[NFTA_RULE_TABLE]) {
-			ctx->table = nla_strdup(nla[NFTA_RULE_TABLE],
-							GFP_ATOMIC);
-			if (!ctx->table) {
-				kfree(ctx);
-				return -ENOMEM;
-			}
-		}
-		if (nla[NFTA_RULE_CHAIN]) {
-			ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN],
-						GFP_ATOMIC);
-			if (!ctx->chain) {
-				kfree(ctx->table);
-				kfree(ctx);
-				return -ENOMEM;
-			}
+	if (nla[NFTA_RULE_TABLE]) {
+		ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], GFP_ATOMIC);
+		if (!ctx->table)
+			return -ENOMEM;
+	}
+	if (nla[NFTA_RULE_CHAIN]) {
+		ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], GFP_ATOMIC);
+		if (!ctx->chain) {
+			kfree(ctx->table);
+			return -ENOMEM;
 		}
 	}
+	if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET)
+		ctx->reset = true;
 
-	cb->data = ctx;
 	return 0;
 }
 
 static int nf_tables_dump_rules_done(struct netlink_callback *cb)
 {
-	struct nft_rule_dump_ctx *ctx = cb->data;
+	struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
 
-	if (ctx) {
-		kfree(ctx->table);
-		kfree(ctx->chain);
-		kfree(ctx);
-	}
+	kfree(ctx->table);
+	kfree(ctx->chain);
 	return 0;
 }
 
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [nf PATCH v2 2/8] netfilter: nf_tables: Introduce nf_tables_getrule_single()
  2023-09-28 16:52 [nf PATCH v2 0/8] Introduce locking for reset requests Phil Sutter
  2023-09-28 16:52 ` [nf PATCH v2 1/8] netfilter: nf_tables: Don't allocate nft_rule_dump_ctx Phil Sutter
@ 2023-09-28 16:52 ` Phil Sutter
  2023-09-28 16:52 ` [nf PATCH v2 3/8] netfilter: nf_tables: Add locking for NFT_MSG_GETRULE_RESET requests Phil Sutter
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 34+ messages in thread
From: Phil Sutter @ 2023-09-28 16:52 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Florian Westphal, netfilter-devel

Outsource the reply skb preparation for non-dump getrule requests into a
distinct function. Prep work for rule reset locking.

Signed-off-by: Phil Sutter <phil@nwl.cc>
---
Changes since v1:
- New patch
---
 net/netfilter/nf_tables_api.c | 88 +++++++++++++++++++++++------------
 1 file changed, 59 insertions(+), 29 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 511508407867d..641ae9bde46fa 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3587,8 +3587,9 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb)
 }
 
 /* called with rcu_read_lock held */
-static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
-			     const struct nlattr * const nla[])
+static struct sk_buff *
+nf_tables_getrule_single(u32 portid, const struct nfnl_info *info,
+			 const struct nlattr * const nla[], bool reset)
 {
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_cur(info->net);
@@ -3598,60 +3599,89 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
 	struct net *net = info->net;
 	struct nft_table *table;
 	struct sk_buff *skb2;
-	bool reset = false;
 	int err;
 
-	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
-		struct netlink_dump_control c = {
-			.start= nf_tables_dump_rules_start,
-			.dump = nf_tables_dump_rules,
-			.done = nf_tables_dump_rules_done,
-			.module = THIS_MODULE,
-			.data = (void *)nla,
-		};
-
-		return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
-	}
-
 	table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, 0);
 	if (IS_ERR(table)) {
 		NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
-		return PTR_ERR(table);
+		return ERR_CAST(table);
 	}
 
 	chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask);
 	if (IS_ERR(chain)) {
 		NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
-		return PTR_ERR(chain);
+		return ERR_CAST(chain);
 	}
 
 	rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]);
 	if (IS_ERR(rule)) {
 		NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
-		return PTR_ERR(rule);
+		return ERR_CAST(rule);
 	}
 
 	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
 	if (!skb2)
+		return ERR_PTR(-ENOMEM);
+
+	err = nf_tables_fill_rule_info(skb2, net, portid,
+				       info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
+				       family, table, chain, rule, 0, reset);
+	if (err < 0) {
+		kfree_skb(skb2);
+		return ERR_PTR(err);
+	}
+
+	return skb2;
+}
+
+static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
+			     const struct nlattr * const nla[])
+{
+	u8 family = info->nfmsg->nfgen_family;
+	u32 portid = NETLINK_CB(skb).portid;
+	struct net *net = info->net;
+	struct sk_buff *skb2;
+	bool reset = false;
+	char *tablename;
+
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.start= nf_tables_dump_rules_start,
+			.dump = nf_tables_dump_rules,
+			.done = nf_tables_dump_rules_done,
+			.module = THIS_MODULE,
+			.data = (void *)nla,
+		};
+
+		return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+	}
+
+	if (!nla[NFTA_RULE_TABLE])
+		return -EINVAL;
+
+	tablename = nla_strdup(nla[NFTA_RULE_TABLE], GFP_ATOMIC);
+	if (!tablename)
 		return -ENOMEM;
 
 	if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET)
 		reset = true;
 
-	err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid,
-				       info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
-				       family, table, chain, rule, 0, reset);
-	if (err < 0)
-		goto err_fill_rule_info;
+	skb2 = nf_tables_getrule_single(portid, info, nla, reset);
+	if (IS_ERR(skb2)) {
+		kfree(tablename);
+		return PTR_ERR(skb2);
+	}
 
-	if (reset)
-		audit_log_rule_reset(table, nft_pernet(net)->base_seq, 1);
+	if (reset) {
+		char *buf = kasprintf(GFP_ATOMIC, "%s:%u",
+				      tablename, nft_pernet(net)->base_seq);
 
-	return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
+		audit_log_nfcfg(buf, family, 1,
+				AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);
+		kfree(buf);
+	}
 
-err_fill_rule_info:
-	kfree_skb(skb2);
-	return err;
+	return nfnetlink_unicast(skb2, net, portid);
 }
 
 void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule)
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [nf PATCH v2 3/8] netfilter: nf_tables: Add locking for NFT_MSG_GETRULE_RESET requests
  2023-09-28 16:52 [nf PATCH v2 0/8] Introduce locking for reset requests Phil Sutter
  2023-09-28 16:52 ` [nf PATCH v2 1/8] netfilter: nf_tables: Don't allocate nft_rule_dump_ctx Phil Sutter
  2023-09-28 16:52 ` [nf PATCH v2 2/8] netfilter: nf_tables: Introduce nf_tables_getrule_single() Phil Sutter
@ 2023-09-28 16:52 ` Phil Sutter
  2023-09-28 16:52 ` [nf PATCH v2 4/8] netfilter: nf_tables: Introduce struct nft_obj_dump_ctx Phil Sutter
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 34+ messages in thread
From: Phil Sutter @ 2023-09-28 16:52 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Florian Westphal, netfilter-devel

Rule reset is not concurrency-safe per-se, so multiple CPUs may reset
the same rule at the same time. At least counter and quota expressions
will suffer from value underruns in this case.

Prevent this by introducing dedicated locking callbacks for nfnetlink
and the asynchronous dump handling to serialize access.

Signed-off-by: Phil Sutter <phil@nwl.cc>
---
Changes since v1:
- commit_mutex instead of dedicated spinlock
---
 net/netfilter/nf_tables_api.c | 85 ++++++++++++++++++++++++++---------
 1 file changed, 65 insertions(+), 20 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 641ae9bde46fa..2e0402a4078cf 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3552,6 +3552,19 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
 	return skb->len;
 }
 
+static int nf_tables_dumpreset_rules(struct sk_buff *skb,
+				     struct netlink_callback *cb)
+{
+	struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
+	int ret;
+
+	mutex_lock(&nft_net->commit_mutex);
+	ret = nf_tables_dump_rules(skb, cb);
+	mutex_unlock(&nft_net->commit_mutex);
+
+	return ret;
+}
+
 static int nf_tables_dump_rules_start(struct netlink_callback *cb)
 {
 	struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
@@ -3571,12 +3584,18 @@ static int nf_tables_dump_rules_start(struct netlink_callback *cb)
 			return -ENOMEM;
 		}
 	}
-	if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET)
-		ctx->reset = true;
-
 	return 0;
 }
 
+static int nf_tables_dumpreset_rules_start(struct netlink_callback *cb)
+{
+	struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
+
+	ctx->reset = true;
+
+	return nf_tables_dump_rules_start(cb);
+}
+
 static int nf_tables_dump_rules_done(struct netlink_callback *cb)
 {
 	struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
@@ -3637,12 +3656,8 @@ nf_tables_getrule_single(u32 portid, const struct nfnl_info *info,
 static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
 			     const struct nlattr * const nla[])
 {
-	u8 family = info->nfmsg->nfgen_family;
 	u32 portid = NETLINK_CB(skb).portid;
-	struct net *net = info->net;
 	struct sk_buff *skb2;
-	bool reset = false;
-	char *tablename;
 
 	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
@@ -3656,6 +3671,35 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
 		return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
 	}
 
+	skb2 = nf_tables_getrule_single(portid, info, nla, false);
+	if (IS_ERR(skb2))
+		return PTR_ERR(skb2);
+
+	return nfnetlink_unicast(skb2, info->net, portid);
+}
+
+static int nf_tables_getrule_reset(struct sk_buff *skb,
+				   const struct nfnl_info *info,
+				   const struct nlattr * const nla[])
+{
+	struct nftables_pernet *nft_net = nft_pernet(info->net);
+	u8 family = info->nfmsg->nfgen_family;
+	u32 portid = NETLINK_CB(skb).portid;
+	char *tablename, *buf;
+	struct sk_buff *skb2;
+
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.start= nf_tables_dumpreset_rules_start,
+			.dump = nf_tables_dumpreset_rules,
+			.done = nf_tables_dump_rules_done,
+			.module = THIS_MODULE,
+			.data = (void *)nla,
+		};
+
+		return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+	}
+
 	if (!nla[NFTA_RULE_TABLE])
 		return -EINVAL;
 
@@ -3663,25 +3707,26 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
 	if (!tablename)
 		return -ENOMEM;
 
-	if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET)
-		reset = true;
+	if (!try_module_get(THIS_MODULE))
+		return -EINVAL;
+	rcu_read_unlock();
+	mutex_lock(&nft_net->commit_mutex);
+	skb2 = nf_tables_getrule_single(portid, info, nla, true);
+	mutex_unlock(&nft_net->commit_mutex);
+	rcu_read_lock();
+	module_put(THIS_MODULE);
 
-	skb2 = nf_tables_getrule_single(portid, info, nla, reset);
 	if (IS_ERR(skb2)) {
 		kfree(tablename);
 		return PTR_ERR(skb2);
 	}
 
-	if (reset) {
-		char *buf = kasprintf(GFP_ATOMIC, "%s:%u",
-				      tablename, nft_pernet(net)->base_seq);
-
-		audit_log_nfcfg(buf, family, 1,
-				AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);
-		kfree(buf);
-	}
+	buf = kasprintf(GFP_ATOMIC, "%s:%u", tablename, nft_net->base_seq);
+	audit_log_nfcfg(buf, family, 1, AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);
+	kfree(buf);
+	kfree(tablename);
 
-	return nfnetlink_unicast(skb2, net, portid);
+	return nfnetlink_unicast(skb2, info->net, portid);
 }
 
 void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule)
@@ -8980,7 +9025,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.policy		= nft_rule_policy,
 	},
 	[NFT_MSG_GETRULE_RESET] = {
-		.call		= nf_tables_getrule,
+		.call		= nf_tables_getrule_reset,
 		.type		= NFNL_CB_RCU,
 		.attr_count	= NFTA_RULE_MAX,
 		.policy		= nft_rule_policy,
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [nf PATCH v2 4/8] netfilter: nf_tables: Introduce struct nft_obj_dump_ctx
  2023-09-28 16:52 [nf PATCH v2 0/8] Introduce locking for reset requests Phil Sutter
                   ` (2 preceding siblings ...)
  2023-09-28 16:52 ` [nf PATCH v2 3/8] netfilter: nf_tables: Add locking for NFT_MSG_GETRULE_RESET requests Phil Sutter
@ 2023-09-28 16:52 ` Phil Sutter
  2023-09-28 16:52 ` [nf PATCH v2 5/8] netfilter: nf_tables: Introduce nf_tables_getobj_single Phil Sutter
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 34+ messages in thread
From: Phil Sutter @ 2023-09-28 16:52 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Florian Westphal, netfilter-devel

Convert struct nft_obj_filter into a dump context data structure
equivalent to nft_rule_dump_ctx. Make it reside inside struct
netlink_callback::ctx scratch area so there's no need to allocate and
free it. Also carry the 'reset' boolean instead of determining this upon
each call to nf_tables_dump_obj() by inspecting the nlmsg_type.

Signed-off-by: Phil Sutter <phil@nwl.cc>
---
Changes since v1:
- None
---
 net/netfilter/nf_tables_api.c | 66 ++++++++++++++---------------------
 1 file changed, 26 insertions(+), 40 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 2e0402a4078cf..67ee0d09cb844 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -7673,25 +7673,23 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
 	return -1;
 }
 
-struct nft_obj_filter {
+struct nft_obj_dump_ctx {
+	unsigned int	s_idx;
 	char		*table;
 	u32		type;
+	bool		reset;
 };
 
 static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
-	const struct nft_table *table;
-	unsigned int idx = 0, s_idx = cb->args[0];
-	struct nft_obj_filter *filter = cb->data;
+	struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
 	struct net *net = sock_net(skb->sk);
 	int family = nfmsg->nfgen_family;
 	struct nftables_pernet *nft_net;
+	const struct nft_table *table;
 	struct nft_object *obj;
-	bool reset = false;
-
-	if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
-		reset = true;
+	unsigned int idx = 0;
 
 	rcu_read_lock();
 	nft_net = nft_pernet(net);
@@ -7704,19 +7702,14 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
 		list_for_each_entry_rcu(obj, &table->objects, list) {
 			if (!nft_is_active(net, obj))
 				goto cont;
-			if (idx < s_idx)
+			if (idx < ctx->s_idx)
 				goto cont;
-			if (idx > s_idx)
-				memset(&cb->args[1], 0,
-				       sizeof(cb->args) - sizeof(cb->args[0]));
-			if (filter && filter->table &&
-			    strcmp(filter->table, table->name))
+			if (ctx->table && strcmp(ctx->table, table->name))
 				goto cont;
-			if (filter &&
-			    filter->type != NFT_OBJECT_UNSPEC &&
-			    obj->ops->type->type != filter->type)
+			if (ctx->type != NFT_OBJECT_UNSPEC &&
+			    obj->ops->type->type != ctx->type)
 				goto cont;
-			if (reset) {
+			if (ctx->reset) {
 				char *buf = kasprintf(GFP_ATOMIC,
 						      "%s:%u",
 						      table->name,
@@ -7735,7 +7728,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
 						    NFT_MSG_NEWOBJ,
 						    NLM_F_MULTI | NLM_F_APPEND,
 						    table->family, table,
-						    obj, reset) < 0)
+						    obj, ctx->reset) < 0)
 				goto done;
 
 			nl_dump_check_consistent(cb, nlmsg_hdr(skb));
@@ -7746,44 +7739,37 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
 done:
 	rcu_read_unlock();
 
-	cb->args[0] = idx;
+	ctx->s_idx = idx;
 	return skb->len;
 }
 
 static int nf_tables_dump_obj_start(struct netlink_callback *cb)
 {
+	struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
 	const struct nlattr * const *nla = cb->data;
-	struct nft_obj_filter *filter = NULL;
 
-	if (nla[NFTA_OBJ_TABLE] || nla[NFTA_OBJ_TYPE]) {
-		filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
-		if (!filter)
+	BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
+
+	if (nla[NFTA_OBJ_TABLE]) {
+		ctx->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
+		if (!ctx->table)
 			return -ENOMEM;
+	}
 
-		if (nla[NFTA_OBJ_TABLE]) {
-			filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
-			if (!filter->table) {
-				kfree(filter);
-				return -ENOMEM;
-			}
-		}
+	if (nla[NFTA_OBJ_TYPE])
+		ctx->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
 
-		if (nla[NFTA_OBJ_TYPE])
-			filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
-	}
+	if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
+		ctx->reset = true;
 
-	cb->data = filter;
 	return 0;
 }
 
 static int nf_tables_dump_obj_done(struct netlink_callback *cb)
 {
-	struct nft_obj_filter *filter = cb->data;
+	struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
 
-	if (filter) {
-		kfree(filter->table);
-		kfree(filter);
-	}
+	kfree(ctx->table);
 
 	return 0;
 }
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [nf PATCH v2 5/8] netfilter: nf_tables: Introduce nf_tables_getobj_single
  2023-09-28 16:52 [nf PATCH v2 0/8] Introduce locking for reset requests Phil Sutter
                   ` (3 preceding siblings ...)
  2023-09-28 16:52 ` [nf PATCH v2 4/8] netfilter: nf_tables: Introduce struct nft_obj_dump_ctx Phil Sutter
@ 2023-09-28 16:52 ` Phil Sutter
  2023-09-28 16:52 ` [nf PATCH v2 6/8] netfilter: nf_tables: Add locking for NFT_MSG_GETOBJ_RESET requests Phil Sutter
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 34+ messages in thread
From: Phil Sutter @ 2023-09-28 16:52 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Florian Westphal, netfilter-devel

Outsource the reply skb preparation for non-dump getrule requests into a
distinct function. Prep work for object reset locking.

Signed-off-by: Phil Sutter <phil@nwl.cc>
---
Changes since v1:
- New patch
---
 net/netfilter/nf_tables_api.c | 66 +++++++++++++++++++++++++++--------
 1 file changed, 51 insertions(+), 15 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 67ee0d09cb844..eee149ea98b41 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -7775,19 +7775,65 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb)
 }
 
 /* called with rcu_read_lock held */
+static struct sk_buff *
+nf_tables_getobj_single(u32 portid, const struct nfnl_info *info,
+			const struct nlattr * const nla[], bool reset)
+{
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_cur(info->net);
+	u8 family = info->nfmsg->nfgen_family;
+	const struct nft_table *table;
+	struct net *net = info->net;
+	struct nft_object *obj;
+	struct sk_buff *skb2;
+	u32 objtype;
+	int err;
+
+	if (!nla[NFTA_OBJ_NAME] ||
+	    !nla[NFTA_OBJ_TYPE])
+		return ERR_PTR(-EINVAL);
+
+	table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, 0);
+	if (IS_ERR(table)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
+		return ERR_CAST(table);
+	}
+
+	objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+	obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask);
+	if (IS_ERR(obj)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
+		return ERR_CAST(obj);
+	}
+
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
+	if (!skb2)
+		return ERR_PTR(-ENOMEM);
+
+	err = nf_tables_fill_obj_info(skb2, net, portid,
+				      info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
+				      family, table, obj, reset);
+	if (err < 0) {
+		kfree_skb(skb2);
+		return ERR_PTR(err);
+	}
+
+	return skb2;
+}
+
 static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
 			    const struct nlattr * const nla[])
 {
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_cur(info->net);
 	u8 family = info->nfmsg->nfgen_family;
+	u32 portid = NETLINK_CB(skb).portid;
 	const struct nft_table *table;
 	struct net *net = info->net;
 	struct nft_object *obj;
 	struct sk_buff *skb2;
 	bool reset = false;
 	u32 objtype;
-	int err;
 
 	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
@@ -7818,10 +7864,6 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
 		return PTR_ERR(obj);
 	}
 
-	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
-	if (!skb2)
-		return -ENOMEM;
-
 	if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
 		reset = true;
 
@@ -7840,17 +7882,11 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
 		kfree(buf);
 	}
 
-	err = nf_tables_fill_obj_info(skb2, net, NETLINK_CB(skb).portid,
-				      info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
-				      family, table, obj, reset);
-	if (err < 0)
-		goto err_fill_obj_info;
-
-	return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
+	skb2 = nf_tables_getobj_single(portid, info, nla, reset);
+	if (IS_ERR(skb2))
+		return PTR_ERR(skb2);
 
-err_fill_obj_info:
-	kfree_skb(skb2);
-	return err;
+	return nfnetlink_unicast(skb2, net, portid);
 }
 
 static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [nf PATCH v2 6/8] netfilter: nf_tables: Add locking for NFT_MSG_GETOBJ_RESET requests
  2023-09-28 16:52 [nf PATCH v2 0/8] Introduce locking for reset requests Phil Sutter
                   ` (4 preceding siblings ...)
  2023-09-28 16:52 ` [nf PATCH v2 5/8] netfilter: nf_tables: Introduce nf_tables_getobj_single Phil Sutter
@ 2023-09-28 16:52 ` Phil Sutter
  2023-09-28 16:52 ` [nf PATCH v2 7/8] netfilter: nf_tables: Pass reset bit in nft_set_dump_ctx Phil Sutter
  2023-09-28 16:52 ` [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests Phil Sutter
  7 siblings, 0 replies; 34+ messages in thread
From: Phil Sutter @ 2023-09-28 16:52 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Florian Westphal, netfilter-devel

Objects' dump callbacks are not concurrency-safe per-se with reset bit
set. If two CPUs perform a reset at the same time, at least counter and
quota objects suffer from value underrun.

Prevent this by introducing dedicated locking callbacks for nfnetlink
and the asynchronous dump handling to serialize access.

Signed-off-by: Phil Sutter <phil@nwl.cc>
---
Changes since v1:
- Proper commit description
- commit_mutex instead of dedicated spinlock
---
 net/netfilter/nf_tables_api.c | 91 +++++++++++++++++++++++++----------
 1 file changed, 66 insertions(+), 25 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index eee149ea98b41..f154fcc341421 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -7743,6 +7743,19 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
+static int nf_tables_dumpreset_obj(struct sk_buff *skb,
+				   struct netlink_callback *cb)
+{
+	struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
+	int ret;
+
+	mutex_lock(&nft_net->commit_mutex);
+	ret = nf_tables_dump_obj(skb, cb);
+	mutex_unlock(&nft_net->commit_mutex);
+
+	return ret;
+}
+
 static int nf_tables_dump_obj_start(struct netlink_callback *cb)
 {
 	struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
@@ -7759,12 +7772,18 @@ static int nf_tables_dump_obj_start(struct netlink_callback *cb)
 	if (nla[NFTA_OBJ_TYPE])
 		ctx->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
 
-	if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
-		ctx->reset = true;
-
 	return 0;
 }
 
+static int nf_tables_dumpreset_obj_start(struct netlink_callback *cb)
+{
+	struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
+
+	ctx->reset = true;
+
+	return nf_tables_dump_obj_start(cb);
+}
+
 static int nf_tables_dump_obj_done(struct netlink_callback *cb)
 {
 	struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
@@ -7824,6 +7843,33 @@ nf_tables_getobj_single(u32 portid, const struct nfnl_info *info,
 static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
 			    const struct nlattr * const nla[])
 {
+	u32 portid = NETLINK_CB(skb).portid;
+	struct sk_buff *skb2;
+
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.start = nf_tables_dump_obj_start,
+			.dump = nf_tables_dump_obj,
+			.done = nf_tables_dump_obj_done,
+			.module = THIS_MODULE,
+			.data = (void *)nla,
+		};
+
+		return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+	}
+
+	skb2 = nf_tables_getobj_single(portid, info, nla, false);
+	if (IS_ERR(skb2))
+		return PTR_ERR(skb2);
+
+	return nfnetlink_unicast(skb2, info->net, portid);
+}
+
+static int nf_tables_getobj_reset(struct sk_buff *skb,
+				  const struct nfnl_info *info,
+				  const struct nlattr * const nla[])
+{
+	struct nftables_pernet *nft_net = nft_pernet(info->net);
 	struct netlink_ext_ack *extack = info->extack;
 	u8 genmask = nft_genmask_cur(info->net);
 	u8 family = info->nfmsg->nfgen_family;
@@ -7832,13 +7878,13 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
 	struct net *net = info->net;
 	struct nft_object *obj;
 	struct sk_buff *skb2;
-	bool reset = false;
 	u32 objtype;
+	char *buf;
 
 	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
-			.start = nf_tables_dump_obj_start,
-			.dump = nf_tables_dump_obj,
+			.start = nf_tables_dumpreset_obj_start,
+			.dump = nf_tables_dumpreset_obj,
 			.done = nf_tables_dump_obj_done,
 			.module = THIS_MODULE,
 			.data = (void *)nla,
@@ -7864,28 +7910,23 @@ static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
 		return PTR_ERR(obj);
 	}
 
-	if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETOBJ_RESET)
-		reset = true;
-
-	if (reset) {
-		const struct nftables_pernet *nft_net;
-		char *buf;
-
-		nft_net = nft_pernet(net);
-		buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, nft_net->base_seq);
-
-		audit_log_nfcfg(buf,
-				family,
-				obj->handle,
-				AUDIT_NFT_OP_OBJ_RESET,
-				GFP_ATOMIC);
-		kfree(buf);
-	}
+	if (!try_module_get(THIS_MODULE))
+		return -EINVAL;
+	rcu_read_unlock();
+	mutex_lock(&nft_net->commit_mutex);
+	skb2 = nf_tables_getobj_single(portid, info, nla, true);
+	mutex_unlock(&nft_net->commit_mutex);
+	rcu_read_lock();
+	module_put(THIS_MODULE);
 
-	skb2 = nf_tables_getobj_single(portid, info, nla, reset);
 	if (IS_ERR(skb2))
 		return PTR_ERR(skb2);
 
+	buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, nft_net->base_seq);
+	audit_log_nfcfg(buf, family, obj->handle,
+			AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC);
+	kfree(buf);
+
 	return nfnetlink_unicast(skb2, net, portid);
 }
 
@@ -9147,7 +9188,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.policy		= nft_obj_policy,
 	},
 	[NFT_MSG_GETOBJ_RESET] = {
-		.call		= nf_tables_getobj,
+		.call		= nf_tables_getobj_reset,
 		.type		= NFNL_CB_RCU,
 		.attr_count	= NFTA_OBJ_MAX,
 		.policy		= nft_obj_policy,
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [nf PATCH v2 7/8] netfilter: nf_tables: Pass reset bit in nft_set_dump_ctx
  2023-09-28 16:52 [nf PATCH v2 0/8] Introduce locking for reset requests Phil Sutter
                   ` (5 preceding siblings ...)
  2023-09-28 16:52 ` [nf PATCH v2 6/8] netfilter: nf_tables: Add locking for NFT_MSG_GETOBJ_RESET requests Phil Sutter
@ 2023-09-28 16:52 ` Phil Sutter
  2023-09-28 18:53   ` Pablo Neira Ayuso
  2023-09-28 16:52 ` [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests Phil Sutter
  7 siblings, 1 reply; 34+ messages in thread
From: Phil Sutter @ 2023-09-28 16:52 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Florian Westphal, netfilter-devel

Relieve the dump callback from having to check nlmsg_type upon each
call. Prep work for set element reset locking.

Signed-off-by: Phil Sutter <phil@nwl.cc>
---
Changes since v1:
- New patch
---
 net/netfilter/nf_tables_api.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index f154fcc341421..1491d4c65fed9 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5731,6 +5731,7 @@ static void audit_log_nft_set_reset(const struct nft_table *table,
 struct nft_set_dump_ctx {
 	const struct nft_set	*set;
 	struct nft_ctx		ctx;
+	bool			reset;
 };
 
 static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb,
@@ -5770,7 +5771,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 	bool set_found = false;
 	struct nlmsghdr *nlh;
 	struct nlattr *nest;
-	bool reset = false;
 	u32 portid, seq;
 	int event;
 
@@ -5818,12 +5818,9 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 	if (nest == NULL)
 		goto nla_put_failure;
 
-	if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET)
-		reset = true;
-
 	args.cb			= cb;
 	args.skb		= skb;
-	args.reset		= reset;
+	args.reset		= dump_ctx->reset;
 	args.iter.genmask	= nft_genmask_cur(net);
 	args.iter.skip		= cb->args[0];
 	args.iter.count		= 0;
@@ -5833,11 +5830,11 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 
 	if (!args.iter.err && args.iter.count == cb->args[0])
 		args.iter.err = nft_set_catchall_dump(net, skb, set,
-						      reset, cb->seq);
+						      dump_ctx->reset, cb->seq);
 	nla_nest_end(skb, nest);
 	nlmsg_end(skb, nlh);
 
-	if (reset && args.iter.count > args.iter.skip)
+	if (dump_ctx->reset && args.iter.count > args.iter.skip)
 		audit_log_nft_set_reset(table, cb->seq,
 					args.iter.count - args.iter.skip);
 
@@ -6088,6 +6085,9 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
 
 	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
 
+	if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET)
+		reset = true;
+
 	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.start = nf_tables_dump_set_start,
@@ -6098,6 +6098,7 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
 		struct nft_set_dump_ctx dump_ctx = {
 			.set = set,
 			.ctx = ctx,
+			.reset = reset,
 		};
 
 		c.data = &dump_ctx;
@@ -6107,9 +6108,6 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
 	if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
 		return -EINVAL;
 
-	if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET)
-		reset = true;
-
 	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
 		err = nft_get_set_elem(&ctx, set, attr, reset);
 		if (err < 0) {
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests
  2023-09-28 16:52 [nf PATCH v2 0/8] Introduce locking for reset requests Phil Sutter
                   ` (6 preceding siblings ...)
  2023-09-28 16:52 ` [nf PATCH v2 7/8] netfilter: nf_tables: Pass reset bit in nft_set_dump_ctx Phil Sutter
@ 2023-09-28 16:52 ` Phil Sutter
  2023-09-28 17:46   ` Florian Westphal
  2023-09-28 18:51   ` Pablo Neira Ayuso
  7 siblings, 2 replies; 34+ messages in thread
From: Phil Sutter @ 2023-09-28 16:52 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Florian Westphal, netfilter-devel

Set expressions' dump callbacks are not concurrency-safe per-se with
reset bit set. If two CPUs reset the same element at the same time,
values may underrun at least with element-attached counters and quotas.

Prevent this by introducing dedicated callbacks for nfnetlink and the
asynchronous dump handling to serialize access.

Signed-off-by: Phil Sutter <phil@nwl.cc>
---
Changes since v1:
- Improved commit description
- Unrelated chunk moved into a previous patch
---
 net/netfilter/nf_tables_api.c | 105 +++++++++++++++++++++++++++++-----
 1 file changed, 91 insertions(+), 14 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 1491d4c65fed9..a855b43fe72db 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5834,10 +5834,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 	nla_nest_end(skb, nest);
 	nlmsg_end(skb, nlh);
 
-	if (dump_ctx->reset && args.iter.count > args.iter.skip)
-		audit_log_nft_set_reset(table, cb->seq,
-					args.iter.count - args.iter.skip);
-
 	rcu_read_unlock();
 
 	if (args.iter.err && args.iter.err != -EMSGSIZE)
@@ -5853,6 +5849,24 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 	return -ENOSPC;
 }
 
+static int nf_tables_dumpreset_set(struct sk_buff *skb,
+				   struct netlink_callback *cb)
+{
+	struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
+	struct nft_set_dump_ctx *dump_ctx = cb->data;
+	int ret, skip = cb->args[0];
+
+	mutex_lock(&nft_net->commit_mutex);
+	ret = nf_tables_dump_set(skb, cb);
+	mutex_unlock(&nft_net->commit_mutex);
+
+	if (cb->args[0] > skip)
+		audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq,
+					cb->args[0] - skip);
+
+	return ret;
+}
+
 static int nf_tables_dump_set_start(struct netlink_callback *cb)
 {
 	struct nft_set_dump_ctx *dump_ctx = cb->data;
@@ -6070,7 +6084,6 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
 	struct nft_set *set;
 	struct nlattr *attr;
 	struct nft_ctx ctx;
-	bool reset = false;
 
 	table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
 				 genmask, 0);
@@ -6085,9 +6098,6 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
 
 	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
 
-	if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET)
-		reset = true;
-
 	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.start = nf_tables_dump_set_start,
@@ -6098,7 +6108,67 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
 		struct nft_set_dump_ctx dump_ctx = {
 			.set = set,
 			.ctx = ctx,
-			.reset = reset,
+			.reset = false,
+		};
+
+		c.data = &dump_ctx;
+		return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
+	}
+
+	if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
+		return -EINVAL;
+
+	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
+		err = nft_get_set_elem(&ctx, set, attr, false);
+		if (err < 0) {
+			NL_SET_BAD_ATTR(extack, attr);
+			break;
+		}
+		nelems++;
+	}
+
+	return err;
+}
+
+static int nf_tables_getsetelem_reset(struct sk_buff *skb,
+				      const struct nfnl_info *info,
+				      const struct nlattr * const nla[])
+{
+	struct nftables_pernet *nft_net = nft_pernet(info->net);
+	struct netlink_ext_ack *extack = info->extack;
+	u8 genmask = nft_genmask_cur(info->net);
+	u8 family = info->nfmsg->nfgen_family;
+	int rem, err = 0, nelems = 0;
+	struct net *net = info->net;
+	struct nft_table *table;
+	struct nft_set *set;
+	struct nlattr *attr;
+	struct nft_ctx ctx;
+
+	table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
+				 genmask, 0);
+	if (IS_ERR(table)) {
+		NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
+		return PTR_ERR(table);
+	}
+
+	set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
+	if (IS_ERR(set))
+		return PTR_ERR(set);
+
+	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
+
+	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
+		struct netlink_dump_control c = {
+			.start = nf_tables_dump_set_start,
+			.dump = nf_tables_dumpreset_set,
+			.done = nf_tables_dump_set_done,
+			.module = THIS_MODULE,
+		};
+		struct nft_set_dump_ctx dump_ctx = {
+			.set = set,
+			.ctx = ctx,
+			.reset = true,
 		};
 
 		c.data = &dump_ctx;
@@ -6108,18 +6178,25 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
 	if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
 		return -EINVAL;
 
+	if (!try_module_get(THIS_MODULE))
+		return -EINVAL;
+	rcu_read_unlock();
+	mutex_lock(&nft_net->commit_mutex);
+	rcu_read_lock();
 	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
-		err = nft_get_set_elem(&ctx, set, attr, reset);
+		err = nft_get_set_elem(&ctx, set, attr, true);
 		if (err < 0) {
 			NL_SET_BAD_ATTR(extack, attr);
 			break;
 		}
 		nelems++;
 	}
+	rcu_read_unlock();
+	mutex_unlock(&nft_net->commit_mutex);
+	rcu_read_lock();
+	module_put(THIS_MODULE);
 
-	if (reset)
-		audit_log_nft_set_reset(table, nft_pernet(net)->base_seq,
-					nelems);
+	audit_log_nft_set_reset(table, nft_net->base_seq, nelems);
 
 	return err;
 }
@@ -9140,7 +9217,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.policy		= nft_set_elem_list_policy,
 	},
 	[NFT_MSG_GETSETELEM_RESET] = {
-		.call		= nf_tables_getsetelem,
+		.call		= nf_tables_getsetelem_reset,
 		.type		= NFNL_CB_RCU,
 		.attr_count	= NFTA_SET_ELEM_LIST_MAX,
 		.policy		= nft_set_elem_list_policy,
-- 
2.41.0


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests
  2023-09-28 16:52 ` [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests Phil Sutter
@ 2023-09-28 17:46   ` Florian Westphal
  2023-09-28 18:47     ` Pablo Neira Ayuso
  2023-09-29 11:03     ` Phil Sutter
  2023-09-28 18:51   ` Pablo Neira Ayuso
  1 sibling, 2 replies; 34+ messages in thread
From: Florian Westphal @ 2023-09-28 17:46 UTC (permalink / raw)
  To: Phil Sutter; +Cc: Pablo Neira Ayuso, Florian Westphal, netfilter-devel

Phil Sutter <phil@nwl.cc> wrote:
> +static int nf_tables_dumpreset_set(struct sk_buff *skb,
> +				   struct netlink_callback *cb)
> +{
> +	struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
> +	struct nft_set_dump_ctx *dump_ctx = cb->data;
> +	int ret, skip = cb->args[0];
> +
> +	mutex_lock(&nft_net->commit_mutex);
> +	ret = nf_tables_dump_set(skb, cb);
> +	mutex_unlock(&nft_net->commit_mutex);
> +
> +	if (cb->args[0] > skip)
> +		audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq,
> +					cb->args[0] - skip);
> +

Once commit_mutex is dropped, parallel user can
delete table, and ctx.table references garbage.

So I think this needs to be done under mutex.

>  		c.data = &dump_ctx;
> @@ -6108,18 +6178,25 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
>  	if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
>  		return -EINVAL;
>  
> +	if (!try_module_get(THIS_MODULE))
> +		return -EINVAL;
> +	rcu_read_unlock();
> +	mutex_lock(&nft_net->commit_mutex);
> +	rcu_read_lock();

Why do we need to regain the rcu read lock here?
Are we tripping over a now bogus rcu_derefence check or is there
another reason?

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests
  2023-09-28 17:46   ` Florian Westphal
@ 2023-09-28 18:47     ` Pablo Neira Ayuso
  2023-09-28 18:57       ` Florian Westphal
  2023-09-29 11:03     ` Phil Sutter
  1 sibling, 1 reply; 34+ messages in thread
From: Pablo Neira Ayuso @ 2023-09-28 18:47 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Phil Sutter, netfilter-devel

On Thu, Sep 28, 2023 at 07:46:30PM +0200, Florian Westphal wrote:
> Phil Sutter <phil@nwl.cc> wrote:
> > +static int nf_tables_dumpreset_set(struct sk_buff *skb,
> > +				   struct netlink_callback *cb)
> > +{
> > +	struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
> > +	struct nft_set_dump_ctx *dump_ctx = cb->data;
> > +	int ret, skip = cb->args[0];
> > +
> > +	mutex_lock(&nft_net->commit_mutex);
> > +	ret = nf_tables_dump_set(skb, cb);
> > +	mutex_unlock(&nft_net->commit_mutex);
> > +
> > +	if (cb->args[0] > skip)
> > +		audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq,
> > +					cb->args[0] - skip);
> > +
> 
> Once commit_mutex is dropped, parallel user can
> delete table, and ctx.table references garbage.

This path should hold rcu read lock.

> So I think this needs to be done under mutex.

I think spinlock would be better, we would just spin for very little
time here for another thread to complete the reset, and the race is
fixed.

The use of commit_mutex here is confusing is really misleading to the
reader, this is also not the commit path.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 1/8] netfilter: nf_tables: Don't allocate nft_rule_dump_ctx
  2023-09-28 16:52 ` [nf PATCH v2 1/8] netfilter: nf_tables: Don't allocate nft_rule_dump_ctx Phil Sutter
@ 2023-09-28 18:49   ` Pablo Neira Ayuso
  2023-09-29 10:15     ` Phil Sutter
  2023-09-28 19:00   ` Florian Westphal
  1 sibling, 1 reply; 34+ messages in thread
From: Pablo Neira Ayuso @ 2023-09-28 18:49 UTC (permalink / raw)
  To: Phil Sutter; +Cc: Florian Westphal, netfilter-devel

On Thu, Sep 28, 2023 at 06:52:37PM +0200, Phil Sutter wrote:
[...]

This whole chunk below looks like a cleanup to remove one indentation
level? Please add an initial patch for this.

>  static int nf_tables_dump_rules_start(struct netlink_callback *cb)
>  {
> +	struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
>  	const struct nlattr * const *nla = cb->data;
> -	struct nft_rule_dump_ctx *ctx = NULL;
>  
> -	if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) {
> -		ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
> -		if (!ctx)
> -			return -ENOMEM;
> +	BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
>  
> -		if (nla[NFTA_RULE_TABLE]) {
> -			ctx->table = nla_strdup(nla[NFTA_RULE_TABLE],
> -							GFP_ATOMIC);
> -			if (!ctx->table) {
> -				kfree(ctx);
> -				return -ENOMEM;
> -			}
> -		}
> -		if (nla[NFTA_RULE_CHAIN]) {
> -			ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN],
> -						GFP_ATOMIC);
> -			if (!ctx->chain) {
> -				kfree(ctx->table);
> -				kfree(ctx);
> -				return -ENOMEM;
> -			}
> +	if (nla[NFTA_RULE_TABLE]) {
> +		ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], GFP_ATOMIC);
> +		if (!ctx->table)
> +			return -ENOMEM;
> +	}
> +	if (nla[NFTA_RULE_CHAIN]) {
> +		ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], GFP_ATOMIC);
> +		if (!ctx->chain) {
> +			kfree(ctx->table);
> +			return -ENOMEM;
>  		}
>  	}
> +	if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET)
> +		ctx->reset = true;
>  
> -	cb->data = ctx;
>  	return 0;
>  }
>  
>  static int nf_tables_dump_rules_done(struct netlink_callback *cb)
>  {
> -	struct nft_rule_dump_ctx *ctx = cb->data;
> +	struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
>  
> -	if (ctx) {
> -		kfree(ctx->table);
> -		kfree(ctx->chain);
> -		kfree(ctx);
> -	}
> +	kfree(ctx->table);
> +	kfree(ctx->chain);
>  	return 0;
>  }
>  
> -- 
> 2.41.0
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests
  2023-09-28 16:52 ` [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests Phil Sutter
  2023-09-28 17:46   ` Florian Westphal
@ 2023-09-28 18:51   ` Pablo Neira Ayuso
  2023-09-29 10:28     ` Phil Sutter
  1 sibling, 1 reply; 34+ messages in thread
From: Pablo Neira Ayuso @ 2023-09-28 18:51 UTC (permalink / raw)
  To: Phil Sutter; +Cc: Florian Westphal, netfilter-devel

On Thu, Sep 28, 2023 at 06:52:44PM +0200, Phil Sutter wrote:
> Set expressions' dump callbacks are not concurrency-safe per-se with
> reset bit set. If two CPUs reset the same element at the same time,
> values may underrun at least with element-attached counters and quotas.
> 
> Prevent this by introducing dedicated callbacks for nfnetlink and the
> asynchronous dump handling to serialize access.
> 
> Signed-off-by: Phil Sutter <phil@nwl.cc>
> ---
> Changes since v1:
> - Improved commit description
> - Unrelated chunk moved into a previous patch
> ---
>  net/netfilter/nf_tables_api.c | 105 +++++++++++++++++++++++++++++-----
>  1 file changed, 91 insertions(+), 14 deletions(-)
> 
> diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
> index 1491d4c65fed9..a855b43fe72db 100644
> --- a/net/netfilter/nf_tables_api.c
> +++ b/net/netfilter/nf_tables_api.c
> @@ -5834,10 +5834,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
>  	nla_nest_end(skb, nest);
>  	nlmsg_end(skb, nlh);
>  
> -	if (dump_ctx->reset && args.iter.count > args.iter.skip)
> -		audit_log_nft_set_reset(table, cb->seq,
> -					args.iter.count - args.iter.skip);
> -
>  	rcu_read_unlock();
>  
>  	if (args.iter.err && args.iter.err != -EMSGSIZE)
> @@ -5853,6 +5849,24 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
>  	return -ENOSPC;
>  }
>  
> +static int nf_tables_dumpreset_set(struct sk_buff *skb,
> +				   struct netlink_callback *cb)
> +{
> +	struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
> +	struct nft_set_dump_ctx *dump_ctx = cb->data;
> +	int ret, skip = cb->args[0];
> +
> +	mutex_lock(&nft_net->commit_mutex);
> +	ret = nf_tables_dump_set(skb, cb);
> +	mutex_unlock(&nft_net->commit_mutex);
> +
> +	if (cb->args[0] > skip)
> +		audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq,
> +					cb->args[0] - skip);
> +
> +	return ret;
> +}
> +
>  static int nf_tables_dump_set_start(struct netlink_callback *cb)
>  {
>  	struct nft_set_dump_ctx *dump_ctx = cb->data;
> @@ -6070,7 +6084,6 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
>  	struct nft_set *set;
>  	struct nlattr *attr;
>  	struct nft_ctx ctx;
> -	bool reset = false;
>  
>  	table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
>  				 genmask, 0);
> @@ -6085,9 +6098,6 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
>  
>  	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
>  
> -	if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET)
> -		reset = true;
> -
>  	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
>  		struct netlink_dump_control c = {
>  			.start = nf_tables_dump_set_start,
> @@ -6098,7 +6108,67 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
>  		struct nft_set_dump_ctx dump_ctx = {
>  			.set = set,
>  			.ctx = ctx,
> -			.reset = reset,
> +			.reset = false,
> +		};
> +
> +		c.data = &dump_ctx;
> +		return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
> +	}
> +
> +	if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
> +		return -EINVAL;
> +
> +	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
> +		err = nft_get_set_elem(&ctx, set, attr, false);
> +		if (err < 0) {
> +			NL_SET_BAD_ATTR(extack, attr);
> +			break;
> +		}
> +		nelems++;
> +	}
> +
> +	return err;
> +}
> +
> +static int nf_tables_getsetelem_reset(struct sk_buff *skb,
> +				      const struct nfnl_info *info,
> +				      const struct nlattr * const nla[])
> +{
> +	struct nftables_pernet *nft_net = nft_pernet(info->net);
> +	struct netlink_ext_ack *extack = info->extack;
> +	u8 genmask = nft_genmask_cur(info->net);
> +	u8 family = info->nfmsg->nfgen_family;
> +	int rem, err = 0, nelems = 0;
> +	struct net *net = info->net;
> +	struct nft_table *table;
> +	struct nft_set *set;
> +	struct nlattr *attr;
> +	struct nft_ctx ctx;
> +
> +	table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
> +				 genmask, 0);
> +	if (IS_ERR(table)) {
> +		NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
> +		return PTR_ERR(table);
> +	}
> +
> +	set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
> +	if (IS_ERR(set))
> +		return PTR_ERR(set);
> +
> +	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
> +
> +	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
> +		struct netlink_dump_control c = {
> +			.start = nf_tables_dump_set_start,
> +			.dump = nf_tables_dumpreset_set,
> +			.done = nf_tables_dump_set_done,
> +			.module = THIS_MODULE,
> +		};
> +		struct nft_set_dump_ctx dump_ctx = {
> +			.set = set,
> +			.ctx = ctx,
> +			.reset = true,
>  		};
>  
>  		c.data = &dump_ctx;
> @@ -6108,18 +6178,25 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
>  	if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
>  		return -EINVAL;
>  
> +	if (!try_module_get(THIS_MODULE))
> +		return -EINVAL;
> +	rcu_read_unlock();
> +	mutex_lock(&nft_net->commit_mutex);
> +	rcu_read_lock();
>  	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
> -		err = nft_get_set_elem(&ctx, set, attr, reset);
> +		err = nft_get_set_elem(&ctx, set, attr, true);
>  		if (err < 0) {
>  			NL_SET_BAD_ATTR(extack, attr);
>  			break;
>  		}
>  		nelems++;
>  	}
> +	rcu_read_unlock();
> +	mutex_unlock(&nft_net->commit_mutex);
> +	rcu_read_lock();
> +	module_put(THIS_MODULE);
>  
> -	if (reset)
> -		audit_log_nft_set_reset(table, nft_pernet(net)->base_seq,
> -					nelems);
> +	audit_log_nft_set_reset(table, nft_net->base_seq, nelems);
>  
>  	return err;
>  }
> @@ -9140,7 +9217,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
>  		.policy		= nft_set_elem_list_policy,
>  	},
>  	[NFT_MSG_GETSETELEM_RESET] = {
> -		.call		= nf_tables_getsetelem,
> +		.call		= nf_tables_getsetelem_reset,

This diff is weird. This is a complete new function, but it show like
new code in nf_tables_getsetelem(), this is difficult to track at
review.

What did it change for this diff to happen?

>  		.type		= NFNL_CB_RCU,
>  		.attr_count	= NFTA_SET_ELEM_LIST_MAX,
>  		.policy		= nft_set_elem_list_policy,
> -- 
> 2.41.0
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 7/8] netfilter: nf_tables: Pass reset bit in nft_set_dump_ctx
  2023-09-28 16:52 ` [nf PATCH v2 7/8] netfilter: nf_tables: Pass reset bit in nft_set_dump_ctx Phil Sutter
@ 2023-09-28 18:53   ` Pablo Neira Ayuso
  2023-09-29 10:08     ` Phil Sutter
  0 siblings, 1 reply; 34+ messages in thread
From: Pablo Neira Ayuso @ 2023-09-28 18:53 UTC (permalink / raw)
  To: Phil Sutter; +Cc: Florian Westphal, netfilter-devel

On Thu, Sep 28, 2023 at 06:52:43PM +0200, Phil Sutter wrote:
> Relieve the dump callback from having to check nlmsg_type upon each
> call. Prep work for set element reset locking.

Maybe add this as a preparation patch first place in this series,
rather making this cleanup at this late stage of the batch.

> Signed-off-by: Phil Sutter <phil@nwl.cc>
> ---
> Changes since v1:
> - New patch
> ---
>  net/netfilter/nf_tables_api.c | 18 ++++++++----------
>  1 file changed, 8 insertions(+), 10 deletions(-)
> 
> diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
> index f154fcc341421..1491d4c65fed9 100644
> --- a/net/netfilter/nf_tables_api.c
> +++ b/net/netfilter/nf_tables_api.c
> @@ -5731,6 +5731,7 @@ static void audit_log_nft_set_reset(const struct nft_table *table,
>  struct nft_set_dump_ctx {
>  	const struct nft_set	*set;
>  	struct nft_ctx		ctx;
> +	bool			reset;
>  };
>  
>  static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb,
> @@ -5770,7 +5771,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
>  	bool set_found = false;
>  	struct nlmsghdr *nlh;
>  	struct nlattr *nest;
> -	bool reset = false;
>  	u32 portid, seq;
>  	int event;
>  
> @@ -5818,12 +5818,9 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
>  	if (nest == NULL)
>  		goto nla_put_failure;
>  
> -	if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET)
> -		reset = true;
> -
>  	args.cb			= cb;
>  	args.skb		= skb;
> -	args.reset		= reset;
> +	args.reset		= dump_ctx->reset;
>  	args.iter.genmask	= nft_genmask_cur(net);
>  	args.iter.skip		= cb->args[0];
>  	args.iter.count		= 0;
> @@ -5833,11 +5830,11 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
>  
>  	if (!args.iter.err && args.iter.count == cb->args[0])
>  		args.iter.err = nft_set_catchall_dump(net, skb, set,
> -						      reset, cb->seq);
> +						      dump_ctx->reset, cb->seq);
>  	nla_nest_end(skb, nest);
>  	nlmsg_end(skb, nlh);
>  
> -	if (reset && args.iter.count > args.iter.skip)
> +	if (dump_ctx->reset && args.iter.count > args.iter.skip)
>  		audit_log_nft_set_reset(table, cb->seq,
>  					args.iter.count - args.iter.skip);
>  
> @@ -6088,6 +6085,9 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
>  
>  	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
>  
> +	if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET)
> +		reset = true;
> +
>  	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
>  		struct netlink_dump_control c = {
>  			.start = nf_tables_dump_set_start,
> @@ -6098,6 +6098,7 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
>  		struct nft_set_dump_ctx dump_ctx = {
>  			.set = set,
>  			.ctx = ctx,
> +			.reset = reset,
>  		};
>  
>  		c.data = &dump_ctx;
> @@ -6107,9 +6108,6 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
>  	if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
>  		return -EINVAL;
>  
> -	if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET)
> -		reset = true;
> -
>  	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
>  		err = nft_get_set_elem(&ctx, set, attr, reset);
>  		if (err < 0) {
> -- 
> 2.41.0
> 

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests
  2023-09-28 18:47     ` Pablo Neira Ayuso
@ 2023-09-28 18:57       ` Florian Westphal
  2023-09-28 19:04         ` Pablo Neira Ayuso
  0 siblings, 1 reply; 34+ messages in thread
From: Florian Westphal @ 2023-09-28 18:57 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Florian Westphal, Phil Sutter, netfilter-devel

Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> On Thu, Sep 28, 2023 at 07:46:30PM +0200, Florian Westphal wrote:
> > Phil Sutter <phil@nwl.cc> wrote:
> > > +static int nf_tables_dumpreset_set(struct sk_buff *skb,
> > > +				   struct netlink_callback *cb)
> > > +{
> > > +	struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
> > > +	struct nft_set_dump_ctx *dump_ctx = cb->data;
> > > +	int ret, skip = cb->args[0];
> > > +
> > > +	mutex_lock(&nft_net->commit_mutex);
> > > +	ret = nf_tables_dump_set(skb, cb);
> > > +	mutex_unlock(&nft_net->commit_mutex);
> > > +
> > > +	if (cb->args[0] > skip)
> > > +		audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq,
> > > +					cb->args[0] - skip);
> > > +
> > 
> > Once commit_mutex is dropped, parallel user can
> > delete table, and ctx.table references garbage.
> 
> This path should hold rcu read lock.

Then it would splat on first mutex_lock().

> I think spinlock would be better, we would just spin for very little
> time here for another thread to complete the reset, and the race is
> fixed.
> 
> The use of commit_mutex here is confusing is really misleading to the
> reader, this is also not the commit path.

I'd say its semantically the same thing, we alter state.

We even emit audit records to tell userspace that there is state
change.

Also, are you sure spinlock is appropriate here?

For full-ruleset resets we might be doing quite some
traverals.

But, no problem, I'm fine with spinlock as well.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 1/8] netfilter: nf_tables: Don't allocate nft_rule_dump_ctx
  2023-09-28 16:52 ` [nf PATCH v2 1/8] netfilter: nf_tables: Don't allocate nft_rule_dump_ctx Phil Sutter
  2023-09-28 18:49   ` Pablo Neira Ayuso
@ 2023-09-28 19:00   ` Florian Westphal
  2023-09-29 10:13     ` Phil Sutter
  1 sibling, 1 reply; 34+ messages in thread
From: Florian Westphal @ 2023-09-28 19:00 UTC (permalink / raw)
  To: Phil Sutter; +Cc: Pablo Neira Ayuso, Florian Westphal, netfilter-devel

Phil Sutter <phil@nwl.cc> wrote:
> Eliminate the direct use of netlink_callback::args when dumping rules by
> casting nft_rule_dump_ctx over netlink_callback::ctx as suggested in
> the struct's comment.
> 
> The value for 's_idx' has to be stored inside nft_rule_dump_ctx now and
> make it hold the 'reset' boolean as well.
> 
> Note how this patch removes the zeroing of netlink_callback::args[1-5] -
> none of the rule dump callbacks seem to make use of them.

Do you think we can fix the reset race in -next instead of -nf?

If yes, you could detach preparation patches like this one and
split the series in several batches.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests
  2023-09-28 18:57       ` Florian Westphal
@ 2023-09-28 19:04         ` Pablo Neira Ayuso
  2023-09-28 19:21           ` Florian Westphal
  2023-09-28 19:39           ` Jozsef Kadlecsik
  0 siblings, 2 replies; 34+ messages in thread
From: Pablo Neira Ayuso @ 2023-09-28 19:04 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Phil Sutter, netfilter-devel

On Thu, Sep 28, 2023 at 08:57:45PM +0200, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > On Thu, Sep 28, 2023 at 07:46:30PM +0200, Florian Westphal wrote:
> > > Phil Sutter <phil@nwl.cc> wrote:
> > > > +static int nf_tables_dumpreset_set(struct sk_buff *skb,
> > > > +				   struct netlink_callback *cb)
> > > > +{
> > > > +	struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
> > > > +	struct nft_set_dump_ctx *dump_ctx = cb->data;
> > > > +	int ret, skip = cb->args[0];
> > > > +
> > > > +	mutex_lock(&nft_net->commit_mutex);
> > > > +	ret = nf_tables_dump_set(skb, cb);
> > > > +	mutex_unlock(&nft_net->commit_mutex);
> > > > +
> > > > +	if (cb->args[0] > skip)
> > > > +		audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq,
> > > > +					cb->args[0] - skip);
> > > > +
> > > 
> > > Once commit_mutex is dropped, parallel user can
> > > delete table, and ctx.table references garbage.
> > 
> > This path should hold rcu read lock.
> 
> Then it would splat on first mutex_lock().
> 
> > I think spinlock would be better, we would just spin for very little
> > time here for another thread to complete the reset, and the race is
> > fixed.
> > 
> > The use of commit_mutex here is confusing is really misleading to the
> > reader, this is also not the commit path.
> 
> I'd say its semantically the same thing, we alter state.
> 
> We even emit audit records to tell userspace that there is state
> change.

This is a netlink event, how does the mutex help in that regard?

> Also, are you sure spinlock is appropriate here?
> 
> For full-ruleset resets we might be doing quite some
> traverals.

I said several times, we grab and release this for each
netlink_recvmsg(), commit_mutex helps us in no way to fix the "two
concurrent dump-and-reset problem".

One concern might be deadlock due to reordering, but I don't see how
that can happen.

> But, no problem, I'm fine with spinlock as well.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests
  2023-09-28 19:04         ` Pablo Neira Ayuso
@ 2023-09-28 19:21           ` Florian Westphal
  2023-09-28 20:07             ` Florian Westphal
  2023-09-28 19:39           ` Jozsef Kadlecsik
  1 sibling, 1 reply; 34+ messages in thread
From: Florian Westphal @ 2023-09-28 19:21 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Florian Westphal, Phil Sutter, netfilter-devel

Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > I'd say its semantically the same thing, we alter state.
> > 
> > We even emit audit records to tell userspace that there is state
> > change.
> 
> This is a netlink event, how does the mutex help in that regard?

It will prevent two concurrent 'reset dumps' from messing
up internal state of counters, quota, etc.

> > Also, are you sure spinlock is appropriate here?
> > 
> > For full-ruleset resets we might be doing quite some
> > traverals.
> 
> I said several times, we grab and release this for each
> netlink_recvmsg(), commit_mutex helps us in no way to fix the "two
> concurrent dump-and-reset problem".

It does, any lock prevents the 'concurrent reset problem'.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests
  2023-09-28 19:04         ` Pablo Neira Ayuso
  2023-09-28 19:21           ` Florian Westphal
@ 2023-09-28 19:39           ` Jozsef Kadlecsik
  2023-09-28 20:09             ` Florian Westphal
  1 sibling, 1 reply; 34+ messages in thread
From: Jozsef Kadlecsik @ 2023-09-28 19:39 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Florian Westphal, Phil Sutter, netfilter-devel

On Thu, 28 Sep 2023, Pablo Neira Ayuso wrote:

> On Thu, Sep 28, 2023 at 08:57:45PM +0200, Florian Westphal wrote:
> > Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > On Thu, Sep 28, 2023 at 07:46:30PM +0200, Florian Westphal wrote:
> > > > Phil Sutter <phil@nwl.cc> wrote:
> > > > > +static int nf_tables_dumpreset_set(struct sk_buff *skb,
> > > > > +				   struct netlink_callback *cb)
> > > > > +{
> > > > > +	struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
> > > > > +	struct nft_set_dump_ctx *dump_ctx = cb->data;
> > > > > +	int ret, skip = cb->args[0];
> > > > > +
> > > > > +	mutex_lock(&nft_net->commit_mutex);
> > > > > +	ret = nf_tables_dump_set(skb, cb);
> > > > > +	mutex_unlock(&nft_net->commit_mutex);
> > > > > +
> > > > > +	if (cb->args[0] > skip)
> > > > > +		audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq,
> > > > > +					cb->args[0] - skip);
> > > > > +
> > > > 
> > > > Once commit_mutex is dropped, parallel user can
> > > > delete table, and ctx.table references garbage.
> > > 
> > > This path should hold rcu read lock.
> > 
> > Then it would splat on first mutex_lock().
> > 
> > > I think spinlock would be better, we would just spin for very little
> > > time here for another thread to complete the reset, and the race is
> > > fixed.
> > > 
> > > The use of commit_mutex here is confusing is really misleading to the
> > > reader, this is also not the commit path.
> > 
> > I'd say its semantically the same thing, we alter state.
> > 
> > We even emit audit records to tell userspace that there is state
> > change.
> 
> This is a netlink event, how does the mutex help in that regard?
> 
> > Also, are you sure spinlock is appropriate here?
> > 
> > For full-ruleset resets we might be doing quite some
> > traverals.
> 
> I said several times, we grab and release this for each
> netlink_recvmsg(), commit_mutex helps us in no way to fix the "two
> concurrent dump-and-reset problem".
> 
> One concern might be deadlock due to reordering, but I don't see how
> that can happen.

The same problem exists ipset: when a set is listed/saved (dumped), 
concurrent destroy/rename/swap for the same set must be excluded. As 
neither spinlock nor mutex helps, a reference counter is used: the start 
of the dump increases it and by checking it all concurrent events can 
safely be rejected by returning EBUSY.

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.hu
PGP key : https://wigner.hu/~kadlec/pgp_public_key.txt
Address : Wigner Research Centre for Physics
          H-1525 Budapest 114, POB. 49, Hungary

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests
  2023-09-28 19:21           ` Florian Westphal
@ 2023-09-28 20:07             ` Florian Westphal
  2023-09-29 11:25               ` Phil Sutter
  0 siblings, 1 reply; 34+ messages in thread
From: Florian Westphal @ 2023-09-28 20:07 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Pablo Neira Ayuso, Phil Sutter, netfilter-devel

Florian Westphal <fw@strlen.de> wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > I'd say its semantically the same thing, we alter state.
> > > 
> > > We even emit audit records to tell userspace that there is state
> > > change.
> > 
> > This is a netlink event, how does the mutex help in that regard?
> 
> It will prevent two concurrent 'reset dumps' from messing
> up internal state of counters, quota, etc.
> > > Also, are you sure spinlock is appropriate here?
> > > 
> > > For full-ruleset resets we might be doing quite some
> > > traverals.
> > 
> > I said several times, we grab and release this for each
> > netlink_recvmsg(), commit_mutex helps us in no way to fix the "two
> > concurrent dump-and-reset problem".
> 
> It does, any lock prevents the 'concurrent reset problem'.

Reading Jozsefs email:

The locking prevents problems with concurrent resets,
but not concurrent modifcation with one (or more) resets.

Phil, what is the intention with the reset?
If the idea is to atomically reset AND list everything
(old values are shown) then yes, this approach doesn't work
either and something ipset-alike has to be done, i.e.
completely preventing any new transaction while a reset
"dump" is in progress.

Pablo, do you think we should combine this with the
ealier-discussed "perpetual dump restart" problem?

Userspace could request 'stable' dump that locks
out writers for the entire duration, and kernel could
force it automatically for resets.

I don't really like it though because misbehaving userspace
can lock out writers.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests
  2023-09-28 19:39           ` Jozsef Kadlecsik
@ 2023-09-28 20:09             ` Florian Westphal
  2023-09-28 20:25               ` Jozsef Kadlecsik
  0 siblings, 1 reply; 34+ messages in thread
From: Florian Westphal @ 2023-09-28 20:09 UTC (permalink / raw)
  To: Jozsef Kadlecsik
  Cc: Pablo Neira Ayuso, Florian Westphal, Phil Sutter, netfilter-devel

Jozsef Kadlecsik <kadlec@netfilter.org> wrote:
> On Thu, 28 Sep 2023, Pablo Neira Ayuso wrote:
> > One concern might be deadlock due to reordering, but I don't see how
> > that can happen.
> 
> The same problem exists ipset: when a set is listed/saved (dumped), 
> concurrent destroy/rename/swap for the same set must be excluded. As 
> neither spinlock nor mutex helps, a reference counter is used: the start 
> of the dump increases it and by checking it all concurrent events can 
> safely be rejected by returning EBUSY.

Thanks for sharing!

I assume that means that a dumper that starts a dump, and then
goes to sleep before closing the socket/finishing the dump can
block further ipset updates, is that correct?

(I assume so, I don't see a solution that doesn't trade one problem
 for another).

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests
  2023-09-28 20:09             ` Florian Westphal
@ 2023-09-28 20:25               ` Jozsef Kadlecsik
  0 siblings, 0 replies; 34+ messages in thread
From: Jozsef Kadlecsik @ 2023-09-28 20:25 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Pablo Neira Ayuso, Phil Sutter, netfilter-devel

On Thu, 28 Sep 2023, Florian Westphal wrote:

> Jozsef Kadlecsik <kadlec@netfilter.org> wrote:
> > On Thu, 28 Sep 2023, Pablo Neira Ayuso wrote:
> > > One concern might be deadlock due to reordering, but I don't see how
> > > that can happen.
> > 
> > The same problem exists ipset: when a set is listed/saved (dumped), 
> > concurrent destroy/rename/swap for the same set must be excluded. As 
> > neither spinlock nor mutex helps, a reference counter is used: the start 
> > of the dump increases it and by checking it all concurrent events can 
> > safely be rejected by returning EBUSY.
> 
> Thanks for sharing!
> 
> I assume that means that a dumper that starts a dump, and then
> goes to sleep before closing the socket/finishing the dump can
> block further ipset updates, is that correct?

Concurrent updates are supported, both from user and kernel space.

The operations which would lead to corruption are excluded, like 
destroying the set being dumped or swapping the dumped set with another 
one. A relatively new application of the method is when there's a huge 
add-del operation which must be temporarily halted and the scheduler 
called (so spinlock/mutex cannot be held): for that time destroy/swap must 
also be excluded.
 
> (I assume so, I don't see a solution that doesn't trade one problem
>  for another).

Yes, usually that's the case. However, this code segment triggered me:

> > > > +       mutex_lock(&nft_net->commit_mutex);
> > > > +       ret = nf_tables_dump_set(skb, cb);
> > > > +       mutex_unlock(&nft_net->commit_mutex);
> > > > +
> > > > +       if (cb->args[0] > skip)
> > > > +               audit_log_nft_set_reset(dump_ctx->ctx.table, 
cb->seq,
> > > > +                                       cb->args[0] - skip);
> > > > +

That can quite nicely be handled by reference counting the object and 
protecting that way.

Best regards, 
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlecsik.jozsef@wigner.hu
PGP key : https://wigner.hu/~kadlec/pgp_public_key.txt
Address : Wigner Research Centre for Physics
          H-1525 Budapest 114, POB. 49, Hungary

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 7/8] netfilter: nf_tables: Pass reset bit in nft_set_dump_ctx
  2023-09-28 18:53   ` Pablo Neira Ayuso
@ 2023-09-29 10:08     ` Phil Sutter
  2023-09-29 10:15       ` Pablo Neira Ayuso
  0 siblings, 1 reply; 34+ messages in thread
From: Phil Sutter @ 2023-09-29 10:08 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Florian Westphal, netfilter-devel

On Thu, Sep 28, 2023 at 08:53:11PM +0200, Pablo Neira Ayuso wrote:
> On Thu, Sep 28, 2023 at 06:52:43PM +0200, Phil Sutter wrote:
> > Relieve the dump callback from having to check nlmsg_type upon each
> > call. Prep work for set element reset locking.
> 
> Maybe add this as a preparation patch first place in this series,
> rather making this cleanup at this late stage of the batch.

Sure, no problem. I extracted it from v1 of patch 8 and so they are
closely related.

Maybe I should split the series up in per-callback ones? I'd start with
the getsetelem_reset one as that is most cumbersome it seems.

Cheers, Phil

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 1/8] netfilter: nf_tables: Don't allocate nft_rule_dump_ctx
  2023-09-28 19:00   ` Florian Westphal
@ 2023-09-29 10:13     ` Phil Sutter
  0 siblings, 0 replies; 34+ messages in thread
From: Phil Sutter @ 2023-09-29 10:13 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Pablo Neira Ayuso, netfilter-devel

On Thu, Sep 28, 2023 at 09:00:44PM +0200, Florian Westphal wrote:
> Phil Sutter <phil@nwl.cc> wrote:
> > Eliminate the direct use of netlink_callback::args when dumping rules by
> > casting nft_rule_dump_ctx over netlink_callback::ctx as suggested in
> > the struct's comment.
> > 
> > The value for 's_idx' has to be stored inside nft_rule_dump_ctx now and
> > make it hold the 'reset' boolean as well.
> > 
> > Note how this patch removes the zeroing of netlink_callback::args[1-5] -
> > none of the rule dump callbacks seem to make use of them.
> 
> Do you think we can fix the reset race in -next instead of -nf?
> 
> If yes, you could detach preparation patches like this one and
> split the series in several batches.

Yes, I noticed this series is no longer the "add some spinlock to
prevent races" it was in the beginning.

TBH, I chose nf mostly because nf-next lacked a commit I needed. But
it's there now, so v3 will address nf-next.

Thanks, Phil

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 7/8] netfilter: nf_tables: Pass reset bit in nft_set_dump_ctx
  2023-09-29 10:08     ` Phil Sutter
@ 2023-09-29 10:15       ` Pablo Neira Ayuso
  2023-09-29 10:18         ` Phil Sutter
  0 siblings, 1 reply; 34+ messages in thread
From: Pablo Neira Ayuso @ 2023-09-29 10:15 UTC (permalink / raw)
  To: Phil Sutter, Florian Westphal, netfilter-devel

On Fri, Sep 29, 2023 at 12:08:18PM +0200, Phil Sutter wrote:
> On Thu, Sep 28, 2023 at 08:53:11PM +0200, Pablo Neira Ayuso wrote:
> > On Thu, Sep 28, 2023 at 06:52:43PM +0200, Phil Sutter wrote:
> > > Relieve the dump callback from having to check nlmsg_type upon each
> > > call. Prep work for set element reset locking.
> > 
> > Maybe add this as a preparation patch first place in this series,
> > rather making this cleanup at this late stage of the batch.
> 
> Sure, no problem. I extracted it from v1 of patch 8 and so they are
> closely related.
> 
> Maybe I should split the series up in per-callback ones? I'd start with
> the getsetelem_reset one as that is most cumbersome it seems.

Thanks.

Side note: I also read a comment from Florian regarding the use of
ctx.table. You have to be very careful with what you cache in the dump
context area, since such pointer might just go away.

So far this code caches was "careful" to cache only to check if the
table was still there, but iterating over the table list again
(another safer approach could be to use the table handle which is
unique).

All this is also related to the chunked nature of netlink dumps
(in other words, userspace retrieves part of it in every
netlink_recvmsg() call).

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 1/8] netfilter: nf_tables: Don't allocate nft_rule_dump_ctx
  2023-09-28 18:49   ` Pablo Neira Ayuso
@ 2023-09-29 10:15     ` Phil Sutter
  0 siblings, 0 replies; 34+ messages in thread
From: Phil Sutter @ 2023-09-29 10:15 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Florian Westphal, netfilter-devel

On Thu, Sep 28, 2023 at 08:49:33PM +0200, Pablo Neira Ayuso wrote:
> On Thu, Sep 28, 2023 at 06:52:37PM +0200, Phil Sutter wrote:
> [...]
> 
> This whole chunk below looks like a cleanup to remove one indentation
> level? Please add an initial patch for this.

Actually, it changes nft_rule_dump_ctx from being allocated to being
cast over the general purpose part in netlink_callback. But I like your
suggestion, it will reduce this patch to the relevant bits.

Thanks, Phil

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 7/8] netfilter: nf_tables: Pass reset bit in nft_set_dump_ctx
  2023-09-29 10:15       ` Pablo Neira Ayuso
@ 2023-09-29 10:18         ` Phil Sutter
  2023-09-29 10:56           ` Pablo Neira Ayuso
  0 siblings, 1 reply; 34+ messages in thread
From: Phil Sutter @ 2023-09-29 10:18 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Florian Westphal, netfilter-devel

On Fri, Sep 29, 2023 at 12:15:16PM +0200, Pablo Neira Ayuso wrote:
> On Fri, Sep 29, 2023 at 12:08:18PM +0200, Phil Sutter wrote:
> > On Thu, Sep 28, 2023 at 08:53:11PM +0200, Pablo Neira Ayuso wrote:
> > > On Thu, Sep 28, 2023 at 06:52:43PM +0200, Phil Sutter wrote:
> > > > Relieve the dump callback from having to check nlmsg_type upon each
> > > > call. Prep work for set element reset locking.
> > > 
> > > Maybe add this as a preparation patch first place in this series,
> > > rather making this cleanup at this late stage of the batch.
> > 
> > Sure, no problem. I extracted it from v1 of patch 8 and so they are
> > closely related.
> > 
> > Maybe I should split the series up in per-callback ones? I'd start with
> > the getsetelem_reset one as that is most cumbersome it seems.
> 
> Thanks.
> 
> Side note: I also read a comment from Florian regarding the use of
> ctx.table. You have to be very careful with what you cache in the dump
> context area, since such pointer might just go away.
> 
> So far this code caches was "careful" to cache only to check if the
> table was still there, but iterating over the table list again
> (another safer approach could be to use the table handle which is
> unique).
> 
> All this is also related to the chunked nature of netlink dumps
> (in other words, userspace retrieves part of it in every
> netlink_recvmsg() call).

Good point. I think we may reduce all this to 'strdup(table->name)' and
not care what happens in other CPUs. The only requirement is to cache
table->family for audit logging also (IIRC). I'll give this a try.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests
  2023-09-28 18:51   ` Pablo Neira Ayuso
@ 2023-09-29 10:28     ` Phil Sutter
  0 siblings, 0 replies; 34+ messages in thread
From: Phil Sutter @ 2023-09-29 10:28 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Florian Westphal, netfilter-devel

On Thu, Sep 28, 2023 at 08:51:53PM +0200, Pablo Neira Ayuso wrote:
> On Thu, Sep 28, 2023 at 06:52:44PM +0200, Phil Sutter wrote:
> > Set expressions' dump callbacks are not concurrency-safe per-se with
> > reset bit set. If two CPUs reset the same element at the same time,
> > values may underrun at least with element-attached counters and quotas.
> > 
> > Prevent this by introducing dedicated callbacks for nfnetlink and the
> > asynchronous dump handling to serialize access.
> > 
> > Signed-off-by: Phil Sutter <phil@nwl.cc>
> > ---
> > Changes since v1:
> > - Improved commit description
> > - Unrelated chunk moved into a previous patch
> > ---
> >  net/netfilter/nf_tables_api.c | 105 +++++++++++++++++++++++++++++-----
> >  1 file changed, 91 insertions(+), 14 deletions(-)
> > 
> > diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
> > index 1491d4c65fed9..a855b43fe72db 100644
> > --- a/net/netfilter/nf_tables_api.c
> > +++ b/net/netfilter/nf_tables_api.c
> > @@ -5834,10 +5834,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
> >  	nla_nest_end(skb, nest);
> >  	nlmsg_end(skb, nlh);
> >  
> > -	if (dump_ctx->reset && args.iter.count > args.iter.skip)
> > -		audit_log_nft_set_reset(table, cb->seq,
> > -					args.iter.count - args.iter.skip);
> > -
> >  	rcu_read_unlock();
> >  
> >  	if (args.iter.err && args.iter.err != -EMSGSIZE)
> > @@ -5853,6 +5849,24 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
> >  	return -ENOSPC;
> >  }
> >  
> > +static int nf_tables_dumpreset_set(struct sk_buff *skb,
> > +				   struct netlink_callback *cb)
> > +{
> > +	struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
> > +	struct nft_set_dump_ctx *dump_ctx = cb->data;
> > +	int ret, skip = cb->args[0];
> > +
> > +	mutex_lock(&nft_net->commit_mutex);
> > +	ret = nf_tables_dump_set(skb, cb);
> > +	mutex_unlock(&nft_net->commit_mutex);
> > +
> > +	if (cb->args[0] > skip)
> > +		audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq,
> > +					cb->args[0] - skip);
> > +
> > +	return ret;
> > +}
> > +
> >  static int nf_tables_dump_set_start(struct netlink_callback *cb)
> >  {
> >  	struct nft_set_dump_ctx *dump_ctx = cb->data;
> > @@ -6070,7 +6084,6 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
> >  	struct nft_set *set;
> >  	struct nlattr *attr;
> >  	struct nft_ctx ctx;
> > -	bool reset = false;
> >  
> >  	table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
> >  				 genmask, 0);
> > @@ -6085,9 +6098,6 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
> >  
> >  	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
> >  
> > -	if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETSETELEM_RESET)
> > -		reset = true;
> > -
> >  	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
> >  		struct netlink_dump_control c = {
> >  			.start = nf_tables_dump_set_start,
> > @@ -6098,7 +6108,67 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
> >  		struct nft_set_dump_ctx dump_ctx = {
> >  			.set = set,
> >  			.ctx = ctx,
> > -			.reset = reset,
> > +			.reset = false,
> > +		};
> > +
> > +		c.data = &dump_ctx;
> > +		return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
> > +	}
> > +
> > +	if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
> > +		return -EINVAL;
> > +
> > +	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
> > +		err = nft_get_set_elem(&ctx, set, attr, false);
> > +		if (err < 0) {
> > +			NL_SET_BAD_ATTR(extack, attr);
> > +			break;
> > +		}
> > +		nelems++;
> > +	}
> > +
> > +	return err;
> > +}
> > +
> > +static int nf_tables_getsetelem_reset(struct sk_buff *skb,
> > +				      const struct nfnl_info *info,
> > +				      const struct nlattr * const nla[])
> > +{
> > +	struct nftables_pernet *nft_net = nft_pernet(info->net);
> > +	struct netlink_ext_ack *extack = info->extack;
> > +	u8 genmask = nft_genmask_cur(info->net);
> > +	u8 family = info->nfmsg->nfgen_family;
> > +	int rem, err = 0, nelems = 0;
> > +	struct net *net = info->net;
> > +	struct nft_table *table;
> > +	struct nft_set *set;
> > +	struct nlattr *attr;
> > +	struct nft_ctx ctx;
> > +
> > +	table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
> > +				 genmask, 0);
> > +	if (IS_ERR(table)) {
> > +		NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
> > +		return PTR_ERR(table);
> > +	}
> > +
> > +	set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
> > +	if (IS_ERR(set))
> > +		return PTR_ERR(set);
> > +
> > +	nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
> > +
> > +	if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
> > +		struct netlink_dump_control c = {
> > +			.start = nf_tables_dump_set_start,
> > +			.dump = nf_tables_dumpreset_set,
> > +			.done = nf_tables_dump_set_done,
> > +			.module = THIS_MODULE,
> > +		};
> > +		struct nft_set_dump_ctx dump_ctx = {
> > +			.set = set,
> > +			.ctx = ctx,
> > +			.reset = true,
> >  		};
> >  
> >  		c.data = &dump_ctx;
> > @@ -6108,18 +6178,25 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
> >  	if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
> >  		return -EINVAL;
> >  
> > +	if (!try_module_get(THIS_MODULE))
> > +		return -EINVAL;
> > +	rcu_read_unlock();
> > +	mutex_lock(&nft_net->commit_mutex);
> > +	rcu_read_lock();
> >  	nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
> > -		err = nft_get_set_elem(&ctx, set, attr, reset);
> > +		err = nft_get_set_elem(&ctx, set, attr, true);
> >  		if (err < 0) {
> >  			NL_SET_BAD_ATTR(extack, attr);
> >  			break;
> >  		}
> >  		nelems++;
> >  	}
> > +	rcu_read_unlock();
> > +	mutex_unlock(&nft_net->commit_mutex);
> > +	rcu_read_lock();
> > +	module_put(THIS_MODULE);
> >  
> > -	if (reset)
> > -		audit_log_nft_set_reset(table, nft_pernet(net)->base_seq,
> > -					nelems);
> > +	audit_log_nft_set_reset(table, nft_net->base_seq, nelems);
> >  
> >  	return err;
> >  }
> > @@ -9140,7 +9217,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
> >  		.policy		= nft_set_elem_list_policy,
> >  	},
> >  	[NFT_MSG_GETSETELEM_RESET] = {
> > -		.call		= nf_tables_getsetelem,
> > +		.call		= nf_tables_getsetelem_reset,
> 
> This diff is weird. This is a complete new function, but it show like
> new code in nf_tables_getsetelem(), this is difficult to track at
> review.
> 
> What did it change for this diff to happen?

Well, the new function nf_tables_getsetelem_reset is very similar to
nf_tables_getsetelem, so diff is a bit too creative reducing the diff
size. Also, I see that labels refer to the wrong function name. It's a
funny result for something like this:

| old_function() {
| 	old_header;
| 	common_body;
| 	old_body;
| }

Adding 'new_function' may be done like so:

| old_function() {
| 	old_header;
|+      common_body;
|+	old_body;
|+}
|+new_function() {
|+      new_header;
| 	common_body;
|@@...@@ old_function
|-      old_body;
|+      new_body;
| }

I'll keep an eye on this for v3. If it still happens, maybe some diff
option helps or reordering where I add the new function.

Cheers, Phil

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 7/8] netfilter: nf_tables: Pass reset bit in nft_set_dump_ctx
  2023-09-29 10:18         ` Phil Sutter
@ 2023-09-29 10:56           ` Pablo Neira Ayuso
  2023-09-29 11:12             ` Phil Sutter
  0 siblings, 1 reply; 34+ messages in thread
From: Pablo Neira Ayuso @ 2023-09-29 10:56 UTC (permalink / raw)
  To: Phil Sutter, Florian Westphal, netfilter-devel

On Fri, Sep 29, 2023 at 12:18:28PM +0200, Phil Sutter wrote:
> On Fri, Sep 29, 2023 at 12:15:16PM +0200, Pablo Neira Ayuso wrote:
> > On Fri, Sep 29, 2023 at 12:08:18PM +0200, Phil Sutter wrote:
> > > On Thu, Sep 28, 2023 at 08:53:11PM +0200, Pablo Neira Ayuso wrote:
> > > > On Thu, Sep 28, 2023 at 06:52:43PM +0200, Phil Sutter wrote:
> > > > > Relieve the dump callback from having to check nlmsg_type upon each
> > > > > call. Prep work for set element reset locking.
> > > > 
> > > > Maybe add this as a preparation patch first place in this series,
> > > > rather making this cleanup at this late stage of the batch.
> > > 
> > > Sure, no problem. I extracted it from v1 of patch 8 and so they are
> > > closely related.
> > > 
> > > Maybe I should split the series up in per-callback ones? I'd start with
> > > the getsetelem_reset one as that is most cumbersome it seems.
> > 
> > Thanks.
> > 
> > Side note: I also read a comment from Florian regarding the use of
> > ctx.table. You have to be very careful with what you cache in the dump
> > context area, since such pointer might just go away.
> > 
> > So far this code caches was "careful" to cache only to check if the
> > table was still there, but iterating over the table list again
> > (another safer approach could be to use the table handle which is
> > unique).
> > 
> > All this is also related to the chunked nature of netlink dumps
> > (in other words, userspace retrieves part of it in every
> > netlink_recvmsg() call).
> 
> Good point. I think we may reduce all this to 'strdup(table->name)' and
> not care what happens in other CPUs. The only requirement is to cache
> table->family for audit logging also (IIRC). I'll give this a try.

table handle is u64 and you don't need to clone, right? Handle
allocation is monotonic.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests
  2023-09-28 17:46   ` Florian Westphal
  2023-09-28 18:47     ` Pablo Neira Ayuso
@ 2023-09-29 11:03     ` Phil Sutter
  1 sibling, 0 replies; 34+ messages in thread
From: Phil Sutter @ 2023-09-29 11:03 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Pablo Neira Ayuso, netfilter-devel

On Thu, Sep 28, 2023 at 07:46:30PM +0200, Florian Westphal wrote:
> Phil Sutter <phil@nwl.cc> wrote:
> > +static int nf_tables_dumpreset_set(struct sk_buff *skb,
> > +				   struct netlink_callback *cb)
> > +{
> > +	struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
> > +	struct nft_set_dump_ctx *dump_ctx = cb->data;
> > +	int ret, skip = cb->args[0];
> > +
> > +	mutex_lock(&nft_net->commit_mutex);
> > +	ret = nf_tables_dump_set(skb, cb);
> > +	mutex_unlock(&nft_net->commit_mutex);
> > +
> > +	if (cb->args[0] > skip)
> > +		audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq,
> > +					cb->args[0] - skip);
> > +
> 
> Once commit_mutex is dropped, parallel user can
> delete table, and ctx.table references garbage.
> 
> So I think this needs to be done under mutex.

OK, will do.

> >  		c.data = &dump_ctx;
> > @@ -6108,18 +6178,25 @@ static int nf_tables_getsetelem(struct sk_buff *skb,
> >  	if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
> >  		return -EINVAL;
> >  
> > +	if (!try_module_get(THIS_MODULE))
> > +		return -EINVAL;
> > +	rcu_read_unlock();
> > +	mutex_lock(&nft_net->commit_mutex);
> > +	rcu_read_lock();
> 
> Why do we need to regain the rcu read lock here?
> Are we tripping over a now bogus rcu_derefence check or is there
> another reason?

Yes, I got a lockdep warning because rhashtable_lookup() called from
nft_rhash_get() calls rht_dereference_rcu() which wants either ht->mutex
or RCU read lock held.

Cheers, Phil

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 7/8] netfilter: nf_tables: Pass reset bit in nft_set_dump_ctx
  2023-09-29 10:56           ` Pablo Neira Ayuso
@ 2023-09-29 11:12             ` Phil Sutter
  0 siblings, 0 replies; 34+ messages in thread
From: Phil Sutter @ 2023-09-29 11:12 UTC (permalink / raw)
  To: Pablo Neira Ayuso; +Cc: Florian Westphal, netfilter-devel

On Fri, Sep 29, 2023 at 12:56:15PM +0200, Pablo Neira Ayuso wrote:
> On Fri, Sep 29, 2023 at 12:18:28PM +0200, Phil Sutter wrote:
> > On Fri, Sep 29, 2023 at 12:15:16PM +0200, Pablo Neira Ayuso wrote:
> > > On Fri, Sep 29, 2023 at 12:08:18PM +0200, Phil Sutter wrote:
> > > > On Thu, Sep 28, 2023 at 08:53:11PM +0200, Pablo Neira Ayuso wrote:
> > > > > On Thu, Sep 28, 2023 at 06:52:43PM +0200, Phil Sutter wrote:
> > > > > > Relieve the dump callback from having to check nlmsg_type upon each
> > > > > > call. Prep work for set element reset locking.
> > > > > 
> > > > > Maybe add this as a preparation patch first place in this series,
> > > > > rather making this cleanup at this late stage of the batch.
> > > > 
> > > > Sure, no problem. I extracted it from v1 of patch 8 and so they are
> > > > closely related.
> > > > 
> > > > Maybe I should split the series up in per-callback ones? I'd start with
> > > > the getsetelem_reset one as that is most cumbersome it seems.
> > > 
> > > Thanks.
> > > 
> > > Side note: I also read a comment from Florian regarding the use of
> > > ctx.table. You have to be very careful with what you cache in the dump
> > > context area, since such pointer might just go away.
> > > 
> > > So far this code caches was "careful" to cache only to check if the
> > > table was still there, but iterating over the table list again
> > > (another safer approach could be to use the table handle which is
> > > unique).
> > > 
> > > All this is also related to the chunked nature of netlink dumps
> > > (in other words, userspace retrieves part of it in every
> > > netlink_recvmsg() call).
> > 
> > Good point. I think we may reduce all this to 'strdup(table->name)' and
> > not care what happens in other CPUs. The only requirement is to cache
> > table->family for audit logging also (IIRC). I'll give this a try.
> 
> table handle is u64 and you don't need to clone, right? Handle
> allocation is monotonic.

The intention was to use it for audit logging as well which requires the
name (and family). Pulling anything that accesses the cached data into
the critical section is probably more straightforward.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests
  2023-09-28 20:07             ` Florian Westphal
@ 2023-09-29 11:25               ` Phil Sutter
  2023-09-29 11:30                 ` Florian Westphal
  0 siblings, 1 reply; 34+ messages in thread
From: Phil Sutter @ 2023-09-29 11:25 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Pablo Neira Ayuso, netfilter-devel

On Thu, Sep 28, 2023 at 10:07:51PM +0200, Florian Westphal wrote:
> Florian Westphal <fw@strlen.de> wrote:
> > Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > > I'd say its semantically the same thing, we alter state.
> > > > 
> > > > We even emit audit records to tell userspace that there is state
> > > > change.
> > > 
> > > This is a netlink event, how does the mutex help in that regard?
> > 
> > It will prevent two concurrent 'reset dumps' from messing
> > up internal state of counters, quota, etc.
> > > > Also, are you sure spinlock is appropriate here?
> > > > 
> > > > For full-ruleset resets we might be doing quite some
> > > > traverals.
> > > 
> > > I said several times, we grab and release this for each
> > > netlink_recvmsg(), commit_mutex helps us in no way to fix the "two
> > > concurrent dump-and-reset problem".
> > 
> > It does, any lock prevents the 'concurrent reset problem'.
> 
> Reading Jozsefs email:
> 
> The locking prevents problems with concurrent resets,
> but not concurrent modifcation with one (or more) resets.
> 
> Phil, what is the intention with the reset?
> If the idea is to atomically reset AND list everything
> (old values are shown) then yes, this approach doesn't work
> either and something ipset-alike has to be done, i.e.
> completely preventing any new transaction while a reset
> "dump" is in progress.

Sure, the functionality is to fetch old values while resetting so they
are not lost (and may be used for accounting, etc.). The question is
what we want to guarantee. There's still the non-dump path, so if
someone wants transactional safety they could still reset rules
individually.

The whole "what if my other hand pulls the trigger while I'm still
drawing the gun" scenario is a bit too academic for my taste. ;)

> Pablo, do you think we should combine this with the
> ealier-discussed "perpetual dump restart" problem?
> 
> Userspace could request 'stable' dump that locks
> out writers for the entire duration, and kernel could
> force it automatically for resets.
> 
> I don't really like it though because misbehaving userspace
> can lock out writers.

Make them inactive and free only after the dump is done? IIUC,
nft_active_genmask() will return true again though after the second
update, right?

Cheers, Phil

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests
  2023-09-29 11:25               ` Phil Sutter
@ 2023-09-29 11:30                 ` Florian Westphal
  2023-09-29 11:45                   ` Phil Sutter
  0 siblings, 1 reply; 34+ messages in thread
From: Florian Westphal @ 2023-09-29 11:30 UTC (permalink / raw)
  To: Phil Sutter, Florian Westphal, Pablo Neira Ayuso, netfilter-devel

Phil Sutter <phil@nwl.cc> wrote:
> On Thu, Sep 28, 2023 at 10:07:51PM +0200, Florian Westphal wrote:
> > I don't really like it though because misbehaving userspace
> > can lock out writers.
> 
> Make them inactive and free only after the dump is done? IIUC,
> nft_active_genmask() will return true again though after the second
> update, right?

Yes, however, in case of update and 'reset dump', we'll set the
NLM_F_DUMP_INTR flag, so userspace would restart the dump.

AFAIU, this means the original values of 'already-reset' counters
are lost given nft will restart the 'reset dump'.

Alternative is make nft not restart if reset-dump was requested,
but in that case the dump can be incomplete.

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests
  2023-09-29 11:30                 ` Florian Westphal
@ 2023-09-29 11:45                   ` Phil Sutter
  0 siblings, 0 replies; 34+ messages in thread
From: Phil Sutter @ 2023-09-29 11:45 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Pablo Neira Ayuso, netfilter-devel

On Fri, Sep 29, 2023 at 01:30:43PM +0200, Florian Westphal wrote:
> Phil Sutter <phil@nwl.cc> wrote:
> > On Thu, Sep 28, 2023 at 10:07:51PM +0200, Florian Westphal wrote:
> > > I don't really like it though because misbehaving userspace
> > > can lock out writers.
> > 
> > Make them inactive and free only after the dump is done? IIUC,
> > nft_active_genmask() will return true again though after the second
> > update, right?
> 
> Yes, however, in case of update and 'reset dump', we'll set the
> NLM_F_DUMP_INTR flag, so userspace would restart the dump.
> 
> AFAIU, this means the original values of 'already-reset' counters
> are lost given nft will restart the 'reset dump'.
> 
> Alternative is make nft not restart if reset-dump was requested,
> but in that case the dump can be incomplete.

Modification of the data being dump-reset is unsolvable anyway, unless
we can undo the reset. Not having to return EINTR for unrelated
modifications would help already, though may just be yet another
half-ass solution.

I'd honestly just document the unreliability of 'reset rules' and point
at 'reset rule' for a safe variant. (Assuming the non-dump path is
actually safe?!)

Cheers, Phil

^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2023-09-29 11:46 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-09-28 16:52 [nf PATCH v2 0/8] Introduce locking for reset requests Phil Sutter
2023-09-28 16:52 ` [nf PATCH v2 1/8] netfilter: nf_tables: Don't allocate nft_rule_dump_ctx Phil Sutter
2023-09-28 18:49   ` Pablo Neira Ayuso
2023-09-29 10:15     ` Phil Sutter
2023-09-28 19:00   ` Florian Westphal
2023-09-29 10:13     ` Phil Sutter
2023-09-28 16:52 ` [nf PATCH v2 2/8] netfilter: nf_tables: Introduce nf_tables_getrule_single() Phil Sutter
2023-09-28 16:52 ` [nf PATCH v2 3/8] netfilter: nf_tables: Add locking for NFT_MSG_GETRULE_RESET requests Phil Sutter
2023-09-28 16:52 ` [nf PATCH v2 4/8] netfilter: nf_tables: Introduce struct nft_obj_dump_ctx Phil Sutter
2023-09-28 16:52 ` [nf PATCH v2 5/8] netfilter: nf_tables: Introduce nf_tables_getobj_single Phil Sutter
2023-09-28 16:52 ` [nf PATCH v2 6/8] netfilter: nf_tables: Add locking for NFT_MSG_GETOBJ_RESET requests Phil Sutter
2023-09-28 16:52 ` [nf PATCH v2 7/8] netfilter: nf_tables: Pass reset bit in nft_set_dump_ctx Phil Sutter
2023-09-28 18:53   ` Pablo Neira Ayuso
2023-09-29 10:08     ` Phil Sutter
2023-09-29 10:15       ` Pablo Neira Ayuso
2023-09-29 10:18         ` Phil Sutter
2023-09-29 10:56           ` Pablo Neira Ayuso
2023-09-29 11:12             ` Phil Sutter
2023-09-28 16:52 ` [nf PATCH v2 8/8] netfilter: nf_tables: Add locking for NFT_MSG_GETSETELEM_RESET requests Phil Sutter
2023-09-28 17:46   ` Florian Westphal
2023-09-28 18:47     ` Pablo Neira Ayuso
2023-09-28 18:57       ` Florian Westphal
2023-09-28 19:04         ` Pablo Neira Ayuso
2023-09-28 19:21           ` Florian Westphal
2023-09-28 20:07             ` Florian Westphal
2023-09-29 11:25               ` Phil Sutter
2023-09-29 11:30                 ` Florian Westphal
2023-09-29 11:45                   ` Phil Sutter
2023-09-28 19:39           ` Jozsef Kadlecsik
2023-09-28 20:09             ` Florian Westphal
2023-09-28 20:25               ` Jozsef Kadlecsik
2023-09-29 11:03     ` Phil Sutter
2023-09-28 18:51   ` Pablo Neira Ayuso
2023-09-29 10:28     ` Phil Sutter

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).