[PATCH net 04/14] netfilter: flowtable: fix and simplify IP6IP6 tunnel handling

Netdev List
 help / color / mirror / Atom feed

From: Pablo Neira Ayuso <pablo@netfilter.org>
To: netfilter-devel@vger.kernel.org
Cc: davem@davemloft.net, netdev@vger.kernel.org, kuba@kernel.org,
	pabeni@redhat.com, edumazet@google.com, fw@strlen.de,
	horms@kernel.org
Subject: [PATCH net 04/14] netfilter: flowtable: fix and simplify IP6IP6 tunnel handling
Date: Sun, 21 Jun 2026 00:27:28 +0200	[thread overview]
Message-ID: <20260620222738.112506-5-pablo@netfilter.org> (raw)
In-Reply-To: <20260620222738.112506-1-pablo@netfilter.org>

From: Lorenzo Bianconi <lorenzo@kernel.org>

Fix nf_flow_ip6_tunnel_proto() to use pskb_may_pull() instead of
skb_header_pointer() to ensure the outer IPv6 header is in the skb
headroom, which is required for subsequent packet processing. Move
ctx->offset update inside the IPPROTO_IPV6 conditional block since it
should only be adjusted when an IP6IP6 tunnel is actually detected.
Simplify the rx path by removing ipv6_skip_exthdr() and checking
ip6h->nexthdr directly, as the flowtable fast path only handles simple
IP6IP6 encapsulation without extension headers.
Drop the tunnel encapsulation limit destination option support from the
tx path to match, since the rx path no longer handles extension headers.
Remove the encap_limit parameter from nf_flow_offload_ipv6_forward(),
nf_flow_tunnel_ip6ip6_push() and nf_flow_tunnel_v6_push(), along with
the ipv6_tel_txoption struct and related headroom/MTU adjustments.

Fixes: d98103575dcdd ("netfilter: flowtable: Add IP6IP6 rx sw acceleration")
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/ipv6/ip6_tunnel.c                         |  7 ++
 net/netfilter/nf_flow_table_ip.c              | 80 +++++--------------
 .../selftests/net/netfilter/nft_flowtable.sh  |  8 +-
 3 files changed, 30 insertions(+), 65 deletions(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index d7c90a8533ec..bf8e40af60b0 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1851,6 +1851,13 @@ static int ip6_tnl_fill_forward_path(struct net_device_path_ctx *ctx,
 	struct dst_entry *dst;
 	int err;
 
+	if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) {
+		/* encaplimit option is currently not supported is
+		 * sw-acceleration path.
+		 */
+		return -EOPNOTSUPP;
+	}
+
 	dst = ip6_route_output(dev_net(ctx->dev), NULL, &fl6);
 	if (!dst->error) {
 		path->type = DEV_PATH_TUN;
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 9c05a50d6013..e7a3fb2b2d94 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -347,29 +347,23 @@ static bool nf_flow_ip6_tunnel_proto(struct nf_flowtable_ctx *ctx,
 				     struct sk_buff *skb)
 {
 #if IS_ENABLED(CONFIG_IPV6)
-	struct ipv6hdr *ip6h, _ip6h;
-	__be16 frag_off;
-	u8 nexthdr;
-	int hdrlen;
+	struct ipv6hdr *ip6h;
 
-	ip6h = skb_header_pointer(skb, ctx->offset, sizeof(*ip6h), &_ip6h);
-	if (!ip6h)
+	if (!pskb_may_pull(skb, sizeof(*ip6h) + ctx->offset))
 		return false;
 
+	ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset);
 	if (ip6h->hop_limit <= 1)
 		return false;
 
-	nexthdr = ip6h->nexthdr;
-	hdrlen = ipv6_skip_exthdr(skb, sizeof(*ip6h) + ctx->offset, &nexthdr,
-				  &frag_off);
-	if (hdrlen < 0)
+	if (ipv6_ext_hdr(ip6h->nexthdr))
 		return false;
 
-	if (nexthdr == IPPROTO_IPV6) {
-		ctx->tun.hdr_size = hdrlen;
-		ctx->tun.proto = IPPROTO_IPV6;
+	if (ip6h->nexthdr == IPPROTO_IPV6) {
+		ctx->tun.proto = ip6h->nexthdr;
+		ctx->tun.hdr_size = sizeof(*ip6h);
+		ctx->offset += ctx->tun.hdr_size;
 	}
-	ctx->offset += ctx->tun.hdr_size;
 
 	return true;
 #else
@@ -648,25 +642,19 @@ static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb,
 	return 0;
 }
 
-struct ipv6_tel_txoption {
-	struct ipv6_txoptions ops;
-	__u8 dst_opt[8];
-};
-
 static int nf_flow_tunnel_ip6ip6_push(struct net *net, struct sk_buff *skb,
 				      struct flow_offload_tuple *tuple,
-				      struct in6_addr **ip6_daddr,
-				      int encap_limit)
+				      struct in6_addr **ip6_daddr)
 {
 	struct ipv6hdr *ip6h = (struct ipv6hdr *)skb_network_header(skb);
-	u8 hop_limit = ip6h->hop_limit, proto = IPPROTO_IPV6;
 	struct rtable *rt = dst_rtable(tuple->dst_cache);
 	__u8 dsfield = ipv6_get_dsfield(ip6h);
 	struct flowi6 fl6 = {
 		.daddr = tuple->tun.src_v6,
 		.saddr = tuple->tun.dst_v6,
-		.flowi6_proto = proto,
+		.flowi6_proto = IPPROTO_IPV6,
 	};
+	u8 hop_limit = ip6h->hop_limit;
 	int err, mtu;
 	u32 headroom;
 
@@ -674,41 +662,18 @@ static int nf_flow_tunnel_ip6ip6_push(struct net *net, struct sk_buff *skb,
 	if (err)
 		return err;
 
-	skb_set_inner_ipproto(skb, proto);
+	skb_set_inner_ipproto(skb, IPPROTO_IPV6);
 	headroom = sizeof(*ip6h) + LL_RESERVED_SPACE(rt->dst.dev) +
 		   rt->dst.header_len;
-	if (encap_limit)
-		headroom += 8;
 	err = skb_cow_head(skb, headroom);
 	if (err)
 		return err;
 
 	skb_scrub_packet(skb, true);
 	mtu = dst_mtu(&rt->dst) - sizeof(*ip6h);
-	if (encap_limit)
-		mtu -= 8;
 	mtu = max(mtu, IPV6_MIN_MTU);
 	skb_dst_update_pmtu_no_confirm(skb, mtu);
 
-	if (encap_limit > 0) {
-		struct ipv6_tel_txoption opt = {
-			.dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT,
-			.dst_opt[3] = 1,
-			.dst_opt[4] = encap_limit,
-			.dst_opt[5] = IPV6_TLV_PADN,
-			.dst_opt[6] = 1,
-		};
-		struct ipv6_opt_hdr *hopt;
-
-		opt.ops.dst1opt = (struct ipv6_opt_hdr *)opt.dst_opt;
-		opt.ops.opt_nflen = 8;
-
-		hopt = skb_push(skb, ipv6_optlen(opt.ops.dst1opt));
-		memcpy(hopt, opt.ops.dst1opt, ipv6_optlen(opt.ops.dst1opt));
-		hopt->nexthdr = IPPROTO_IPV6;
-		proto = NEXTHDR_DEST;
-	}
-
 	skb_push(skb, sizeof(*ip6h));
 	skb_reset_network_header(skb);
 
@@ -716,7 +681,7 @@ static int nf_flow_tunnel_ip6ip6_push(struct net *net, struct sk_buff *skb,
 	ip6_flow_hdr(ip6h, dsfield,
 		     ip6_make_flowlabel(net, skb, fl6.flowlabel, true, &fl6));
 	ip6h->hop_limit = hop_limit;
-	ip6h->nexthdr = proto;
+	ip6h->nexthdr = IPPROTO_IPV6;
 	ip6h->daddr = tuple->tun.src_v6;
 	ip6h->saddr = tuple->tun.dst_v6;
 	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(*ip6h));
@@ -729,12 +694,10 @@ static int nf_flow_tunnel_ip6ip6_push(struct net *net, struct sk_buff *skb,
 
 static int nf_flow_tunnel_v6_push(struct net *net, struct sk_buff *skb,
 				  struct flow_offload_tuple *tuple,
-				  struct in6_addr **ip6_daddr,
-				  int encap_limit)
+				  struct in6_addr **ip6_daddr)
 {
 	if (tuple->tun_num)
-		return nf_flow_tunnel_ip6ip6_push(net, skb, tuple, ip6_daddr,
-						  encap_limit);
+		return nf_flow_tunnel_ip6ip6_push(net, skb, tuple, ip6_daddr);
 
 	return 0;
 }
@@ -1089,7 +1052,7 @@ static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb,
 static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx,
 					struct nf_flowtable *flow_table,
 					struct flow_offload_tuple_rhash *tuplehash,
-					struct sk_buff *skb, int encap_limit)
+					struct sk_buff *skb)
 {
 	enum flow_offload_tuple_dir dir;
 	struct flow_offload *flow;
@@ -1100,11 +1063,8 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx,
 	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
 
 	mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset;
-	if (flow->tuplehash[!dir].tuple.tun_num) {
+	if (flow->tuplehash[!dir].tuple.tun_num)
 		mtu -= sizeof(*ip6h);
-		if (encap_limit > 0)
-			mtu -= 8; /* encap limit option */
-	}
 
 	if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
 		return 0;
@@ -1158,7 +1118,6 @@ unsigned int
 nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 			  const struct nf_hook_state *state)
 {
-	int encap_limit = IPV6_DEFAULT_TNL_ENCAP_LIMIT;
 	struct flow_offload_tuple_rhash *tuplehash;
 	struct nf_flowtable *flow_table = priv;
 	struct flow_offload_tuple *other_tuple;
@@ -1177,8 +1136,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 	if (tuplehash == NULL)
 		return NF_ACCEPT;
 
-	ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb,
-					   encap_limit);
+	ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb);
 	if (ret < 0)
 		return NF_DROP;
 	else if (ret == 0)
@@ -1198,7 +1156,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 	ip6_daddr = &other_tuple->src_v6;
 
 	if (nf_flow_tunnel_v6_push(state->net, skb, other_tuple,
-				   &ip6_daddr, encap_limit) < 0)
+				   &ip6_daddr) < 0)
 		return NF_DROP;
 
 	switch (tuplehash->tuple.xmit_type) {
diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh
index 7a34ef468975..08ad07500e8a 100755
--- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh
+++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh
@@ -592,7 +592,7 @@ ip -net "$nsr1" link set tun0 up
 ip -net "$nsr1" addr add 192.168.100.1/24 dev tun0
 ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null
 
-ip -net "$nsr1" link add name tun6 type ip6tnl local fee1:2::1 remote fee1:2::2
+ip -net "$nsr1" link add name tun6 type ip6tnl local fee1:2::1 remote fee1:2::2 encaplimit none
 ip -net "$nsr1" link set tun6 up
 ip -net "$nsr1" addr add fee1:3::1/64 dev tun6 nodad
 
@@ -601,7 +601,7 @@ ip -net "$nsr2" link set tun0 up
 ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0
 ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null
 
-ip -net "$nsr2" link add name tun6 type ip6tnl local fee1:2::2 remote fee1:2::1 || ret=1
+ip -net "$nsr2" link add name tun6 type ip6tnl local fee1:2::2 remote fee1:2::1 encaplimit none || ret=1
 ip -net "$nsr2" link set tun6 up
 ip -net "$nsr2" addr add fee1:3::2/64 dev tun6 nodad
 
@@ -651,7 +651,7 @@ ip -net "$nsr1" route change default via 192.168.200.2
 ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null
 ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0.10 accept'
 
-ip -net "$nsr1" link add name tun6.10 type ip6tnl local fee1:4::1 remote fee1:4::2
+ip -net "$nsr1" link add name tun6.10 type ip6tnl local fee1:4::1 remote fee1:4::2 encaplimit none
 ip -net "$nsr1" link set tun6.10 up
 ip -net "$nsr1" addr add fee1:5::1/64 dev tun6.10 nodad
 ip -6 -net "$nsr1" route delete default
@@ -670,7 +670,7 @@ ip -net "$nsr2" addr add 192.168.200.2/24 dev tun0.10
 ip -net "$nsr2" route change default via 192.168.200.1
 ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null
 
-ip -net "$nsr2" link add name tun6.10 type ip6tnl local fee1:4::2 remote fee1:4::1 || ret=1
+ip -net "$nsr2" link add name tun6.10 type ip6tnl local fee1:4::2 remote fee1:4::1 encaplimit none || ret=1
 ip -net "$nsr2" link set tun6.10 up
 ip -net "$nsr2" addr add fee1:5::2/64 dev tun6.10 nodad
 ip -6 -net "$nsr2" route delete default
-- 
2.47.3

next prev parent reply	other threads:[~2026-06-20 22:27 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-20 22:27 [PATCH net,v2 00/14] Netfilter fixes for net Pablo Neira Ayuso
2026-06-20 22:27 ` [PATCH net 01/14] netfilter: flowtable: fix offloaded ct timeout never being extended Pablo Neira Ayuso
2026-06-20 22:27 ` [PATCH net 02/14] netfilter: nf_queue: pin bridge device while NFQUEUE holds fake dst Pablo Neira Ayuso
2026-06-20 22:27 ` [PATCH net 03/14] netfilter: xt_cluster: reject template conntracks in hash match Pablo Neira Ayuso
2026-06-20 22:27 ` Pablo Neira Ayuso [this message]
2026-06-20 22:27 ` [PATCH net 05/14] netfilter: ipset: Don't use test_bit() in lockless RCU readers in hash types Pablo Neira Ayuso
2026-06-20 22:27 ` [PATCH net 06/14] netfilter: ipset: Don't use test_bit() in lockless RCU readers in bitmap types Pablo Neira Ayuso
2026-06-20 22:27 ` [PATCH net 07/14] netfilter: ipset: fix order of kfree_rcu() and rcu_assign_pointer() Pablo Neira Ayuso
2026-06-20 22:27 ` [PATCH net 08/14] netfilter: ipset: make sure gc is properly stopped Pablo Neira Ayuso
2026-06-20 22:27 ` [PATCH net 09/14] netfilter: nft_payload: reject offsets exceeding 65535 bytes Pablo Neira Ayuso
2026-06-20 22:27 ` [PATCH net 10/14] netfilter: nft_meta_bridge: add validate callback for get operations Pablo Neira Ayuso
2026-06-20 22:27 ` [PATCH net 11/14] netfilter: nft_flow_offload: zero device address for non-ether case Pablo Neira Ayuso
2026-06-20 22:27 ` [PATCH net 12/14] netfilter: nf_reject: skip iphdr options when looking for icmp header Pablo Neira Ayuso
2026-06-20 22:27 ` [PATCH net 13/14] netfilter: nf_conntrack_expect: use conntrack GC to reap expectations Pablo Neira Ayuso
2026-06-20 22:27 ` [PATCH net 14/14] netfilter: nft_meta_bridge: fix NFT_META_BRI_IIFPVID stack leak Pablo Neira Ayuso

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:d7c90a8533e dfblob:bf8e40af60b dfblob:9c05a50d601
dfblob:e7a3fb2b2d9 dfblob:7a34ef46897 dfblob:08ad07500e8 )
 OR (
bs:"[PATCH net 04/14] netfilter: flowtable: fix and simplify IP6IP6 tunnel handling" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260620222738.112506-5-pablo@netfilter.org \
    --to=pablo@netfilter.org \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=fw@strlen.de \
    --cc=horms@kernel.org \
    --cc=kuba@kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=netfilter-devel@vger.kernel.org \
    --cc=pabeni@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox