netfilter-devel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH nf-next] netfilter: nf_tables: avoid chain re-validation if possible
@ 2025-11-26 11:47 Florian Westphal
  2025-11-29  1:22 ` Hamza Mahfooz
  0 siblings, 1 reply; 5+ messages in thread
From: Florian Westphal @ 2025-11-26 11:47 UTC (permalink / raw)
  To: netfilter-devel; +Cc: hamzamahfooz, Florian Westphal

Consider:
  input -> j2 -> j3
  input -> j2 -> j3
  input -> j1 -> j2 -> j3

Then the second rule does not need to revalidate j2, and, by extension j3.
We need to validate it only for rule 3.

This is needed because chain loop detection also ensures we do not
exceed the jump stack: Just because we know that j2 is cycle free, its
last jump might now exceed the allowed stack.  We also need to update
the new largest call depth for all the reachable nodes.

Care has to be taken to revalidate even if the chain depth won't be an
issue, as the chain validation also ensures that expressions are not
called from invalid context (e.g., masquerade from a filter chain
or NAT prerouting hook).

Therefore we also need to stash the base chain context (type, hooknum)
and revalidate if the chain became reachable from a different hook or type.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 Not urgent, can be deferred to next cycle.

 include/net/netfilter/nf_tables.h | 30 +++++++++----
 net/netfilter/nf_tables_api.c     | 70 +++++++++++++++++++++++++++++--
 2 files changed, 88 insertions(+), 12 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index fab7dc73f738..3f91a4f0dcaa 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1091,6 +1091,26 @@ struct nft_rule_blob {
 		__attribute__((aligned(__alignof__(struct nft_rule_dp))));
 };
 
+enum nft_chain_types {
+	NFT_CHAIN_T_DEFAULT = 0,
+	NFT_CHAIN_T_ROUTE,
+	NFT_CHAIN_T_NAT,
+	NFT_CHAIN_T_MAX
+};
+
+/**
+ *	struct nft_chain_validate_state - cache validation state
+ *
+ *	@depth: chain was validated for call level <= depth
+ *	@basetype_mask: chain receives packets from basetypes
+ *	@hook_mask: chain receives packets from hook numbers
+ */
+struct nft_chain_validate_state {
+	u8			basetype_mask;
+	u8			hook_mask[NFT_CHAIN_T_MAX];
+	u8			depth;
+};
+
 /**
  *	struct nft_chain - nf_tables chain
  *
@@ -1128,9 +1148,10 @@ struct nft_chain {
 
 	/* Only used during control plane commit phase: */
 	struct nft_rule_blob		*blob_next;
+	struct nft_chain_validate_state vstate;
 };
 
-int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain);
+int nft_chain_validate(const struct nft_ctx *ctx, struct nft_chain *chain);
 int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set,
 			 const struct nft_set_iter *iter,
 			 struct nft_elem_priv *elem_priv);
@@ -1138,13 +1159,6 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set);
 int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);
 void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);
 
-enum nft_chain_types {
-	NFT_CHAIN_T_DEFAULT = 0,
-	NFT_CHAIN_T_ROUTE,
-	NFT_CHAIN_T_NAT,
-	NFT_CHAIN_T_MAX
-};
-
 /**
  * 	struct nft_chain_type - nf_tables chain type info
  *
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index f3de2f9bbebf..763690850a39 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -123,6 +123,30 @@ static void nft_validate_state_update(struct nft_table *table, u8 new_validate_s
 
 	table->validate_state = new_validate_state;
 }
+
+static bool nft_chain_vstate_valid(const struct nft_ctx *ctx,
+				   const struct nft_chain *chain)
+{
+	const struct nft_base_chain *base_chain;
+	enum nft_chain_types type;
+	u8 hooknum;
+
+	if (!nft_is_base_chain(ctx->chain))
+		return false;
+
+	base_chain = nft_base_chain(ctx->chain);
+	hooknum = base_chain->ops.hooknum;
+	type = base_chain->type->type;
+
+	/* chain is already validated for this call depth */
+	if (chain->vstate.depth >= ctx->level &&
+	    chain->vstate.basetype_mask & BIT(type) &&
+	    chain->vstate.hook_mask[type] & BIT(hooknum))
+		return true;
+
+	return false;
+}
+
 static void nf_tables_trans_destroy_work(struct work_struct *w);
 
 static void nft_trans_gc_work(struct work_struct *work);
@@ -4079,6 +4103,29 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *r
 	nf_tables_rule_destroy(ctx, rule);
 }
 
+static void nft_chain_vstate_update(const struct nft_ctx *ctx, struct nft_chain *chain)
+{
+	const struct nft_base_chain *base_chain;
+	enum nft_chain_types type;
+	u8 hooknum;
+
+	if (!nft_is_base_chain(ctx->chain)) {
+		memset(&chain->vstate, 0, sizeof(chain->vstate));
+		return;
+	}
+
+	base_chain = nft_base_chain(ctx->chain);
+	hooknum = base_chain->ops.hooknum;
+	type = base_chain->type->type;
+
+	BUILD_BUG_ON(BIT(NF_INET_NUMHOOKS) > U8_MAX);
+
+	chain->vstate.basetype_mask |= BIT(type);
+	chain->vstate.hook_mask[type] |= BIT(hooknum);
+	if (chain->vstate.depth < ctx->level)
+		chain->vstate.depth = ctx->level;
+}
+
 /** nft_chain_validate - loop detection and hook validation
  *
  * @ctx: context containing call depth and base chain
@@ -4088,15 +4135,25 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *r
  * and set lookups until either the jump limit is hit or all reachable
  * chains have been validated.
  */
-int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
+int nft_chain_validate(const struct nft_ctx *ctx, struct nft_chain *chain)
 {
 	struct nft_expr *expr, *last;
 	struct nft_rule *rule;
 	int err;
 
+	BUILD_BUG_ON(NFT_JUMP_STACK_SIZE > 255);
 	if (ctx->level == NFT_JUMP_STACK_SIZE)
 		return -EMLINK;
 
+	if (ctx->level > 0) {
+		/* jumps to base chains are not allowed. */
+		if (nft_is_base_chain(chain))
+			return -ELOOP;
+
+		if (nft_chain_vstate_valid(ctx, chain))
+			return 0;
+	}
+
 	list_for_each_entry(rule, &chain->rules, list) {
 		if (fatal_signal_pending(current))
 			return -EINTR;
@@ -4117,6 +4174,7 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
 		}
 	}
 
+	nft_chain_vstate_update(ctx, chain);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(nft_chain_validate);
@@ -4128,7 +4186,7 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
 		.net	= net,
 		.family	= table->family,
 	};
-	int err;
+	int err = 0;
 
 	list_for_each_entry(chain, &table->chains, list) {
 		if (!nft_is_base_chain(chain))
@@ -4137,12 +4195,16 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
 		ctx.chain = chain;
 		err = nft_chain_validate(&ctx, chain);
 		if (err < 0)
-			return err;
+			goto err;
 
 		cond_resched();
 	}
 
-	return 0;
+err:
+	list_for_each_entry(chain, &table->chains, list)
+		memset(&chain->vstate, 0, sizeof(chain->vstate));
+
+	return err;
 }
 
 int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set,
-- 
2.51.2


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH nf-next] netfilter: nf_tables: avoid chain re-validation if possible
  2025-11-26 11:47 [PATCH nf-next] netfilter: nf_tables: avoid chain re-validation if possible Florian Westphal
@ 2025-11-29  1:22 ` Hamza Mahfooz
  2025-11-29  1:32   ` Florian Westphal
  0 siblings, 1 reply; 5+ messages in thread
From: Hamza Mahfooz @ 2025-11-29  1:22 UTC (permalink / raw)
  To: Florian Westphal; +Cc: netfilter-devel

On Wed, Nov 26, 2025 at 12:47:00PM +0100, Florian Westphal wrote:
> Consider:
>   input -> j2 -> j3
>   input -> j2 -> j3
>   input -> j1 -> j2 -> j3
> 
> Then the second rule does not need to revalidate j2, and, by extension j3.
> We need to validate it only for rule 3.
> 
> This is needed because chain loop detection also ensures we do not
> exceed the jump stack: Just because we know that j2 is cycle free, its
> last jump might now exceed the allowed stack.  We also need to update
> the new largest call depth for all the reachable nodes.
> 
> Care has to be taken to revalidate even if the chain depth won't be an
> issue, as the chain validation also ensures that expressions are not
> called from invalid context (e.g., masquerade from a filter chain
> or NAT prerouting hook).
> 
> Therefore we also need to stash the base chain context (type, hooknum)
> and revalidate if the chain became reachable from a different hook or type.

The issue is reproducible with this version of the patch applied, unless
I make the following change:

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 1cf9f0aa1f49..a7b415c53df6 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4145,14 +4145,8 @@ int nft_chain_validate(const struct nft_ctx *ctx, struct nft_chain *chain)
 	if (ctx->level == NFT_JUMP_STACK_SIZE)
 		return -EMLINK;
 
-	if (ctx->level > 0) {
-		/* jumps to base chains are not allowed. */
-		if (nft_is_base_chain(chain))
-			return -ELOOP;
-
-		if (nft_chain_vstate_valid(ctx, chain))
-			return 0;
-	}
+	if (ctx->level && nft_chain_vstate_valid(ctx, chain))
+		return 0;
 
 	list_for_each_entry(rule, &chain->rules, list) {
 		if (fatal_signal_pending(current))

It is also worth noting that I'm still seeing the cpu usage spike up to
100% for a couple of seconds (attributed to an iptables process) with
this version of the patch (even with the above change), while the
previous rendition seemd to have resolved that.

Hamza

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH nf-next] netfilter: nf_tables: avoid chain re-validation if possible
  2025-11-29  1:22 ` Hamza Mahfooz
@ 2025-11-29  1:32   ` Florian Westphal
  2025-12-01 19:48     ` Hamza Mahfooz
  0 siblings, 1 reply; 5+ messages in thread
From: Florian Westphal @ 2025-11-29  1:32 UTC (permalink / raw)
  To: Hamza Mahfooz; +Cc: netfilter-devel

Hamza Mahfooz <hamzamahfooz@linux.microsoft.com> wrote:
> The issue is reproducible with this version of the patch applied, unless
> I make the following change:
> 
> diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
> index 1cf9f0aa1f49..a7b415c53df6 100644
> --- a/net/netfilter/nf_tables_api.c
> +++ b/net/netfilter/nf_tables_api.c
> @@ -4145,14 +4145,8 @@ int nft_chain_validate(const struct nft_ctx *ctx, struct nft_chain *chain)
>  	if (ctx->level == NFT_JUMP_STACK_SIZE)
>  		return -EMLINK;
>  
> -	if (ctx->level > 0) {
> -		/* jumps to base chains are not allowed. */
> -		if (nft_is_base_chain(chain))
> -			return -ELOOP;
> -
> -		if (nft_chain_vstate_valid(ctx, chain))
> -			return 0;
> -	}
> +	if (ctx->level && nft_chain_vstate_valid(ctx, chain))
> +		return 0;

Looks like a placebo change to me?
Also, the nft_is_base_chain(chain) check is required.

> It is also worth noting that I'm still seeing the cpu usage spike up to
> 100% for a couple of seconds (attributed to an iptables process) with
> this version of the patch (even with the above change), while the
> previous rendition seemd to have resolved that.

The previous version makes illegal shortcuts (as in, not validating
when it has to), it cannot be applied.

That said, I have flagged this patch as deferred anyway, there are too
many conflicting changes flying around.

I'll resubmit in a few weeks when -next opens up again.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH nf-next] netfilter: nf_tables: avoid chain re-validation if possible
  2025-11-29  1:32   ` Florian Westphal
@ 2025-12-01 19:48     ` Hamza Mahfooz
  2025-12-01 21:57       ` Florian Westphal
  0 siblings, 1 reply; 5+ messages in thread
From: Hamza Mahfooz @ 2025-12-01 19:48 UTC (permalink / raw)
  To: Florian Westphal; +Cc: netfilter-devel

On Sat, Nov 29, 2025 at 02:32:15AM +0100, Florian Westphal wrote:
> Looks like a placebo change to me?
> Also, the nft_is_base_chain(chain) check is required.
> 

Ya, seems like the delta was due to variances in test runs.
Sorry about the noise.

> 
> The previous version makes illegal shortcuts (as in, not validating
> when it has to), it cannot be applied.
> 
> That said, I have flagged this patch as deferred anyway, there are too
> many conflicting changes flying around.
> 
> I'll resubmit in a few weeks when -next opens up again.

Makes sense, in any case this patch significantly reduces the soft
lockup rate, so feel free to add:

Tested-by: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>

When you get around to resubmitting it after the merge window closes.

Hamza

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH nf-next] netfilter: nf_tables: avoid chain re-validation if possible
  2025-12-01 19:48     ` Hamza Mahfooz
@ 2025-12-01 21:57       ` Florian Westphal
  0 siblings, 0 replies; 5+ messages in thread
From: Florian Westphal @ 2025-12-01 21:57 UTC (permalink / raw)
  To: Hamza Mahfooz; +Cc: netfilter-devel

Hamza Mahfooz <hamzamahfooz@linux.microsoft.com> wrote:
> On Sat, Nov 29, 2025 at 02:32:15AM +0100, Florian Westphal wrote:
> > I'll resubmit in a few weeks when -next opens up again.
> 
> Makes sense, in any case this patch significantly reduces the soft
> lockup rate, so feel free to add:
> 
> Tested-by: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>

Thanks for testing!

Now that you mention this: you could try to revert
314c82841602 ("netfilter: nf_tables: can't schedule in nft_chain_validate")
to get rid of all softlockup warnings:

As of a60a5abe19d6 ("netfilter: nf_tables: allow iter callbacks to sleep")
the iterator-cant-schedule no longer applies.

If i'm right you could submit a bug fix revert for nf tree.

Thanks!

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2025-12-01 21:58 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-11-26 11:47 [PATCH nf-next] netfilter: nf_tables: avoid chain re-validation if possible Florian Westphal
2025-11-29  1:22 ` Hamza Mahfooz
2025-11-29  1:32   ` Florian Westphal
2025-12-01 19:48     ` Hamza Mahfooz
2025-12-01 21:57       ` Florian Westphal

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).