> Hi Lorenzo, > > On Tue, Jun 09, 2026 at 02:31:03PM +0200, Lorenzo Bianconi wrote: > > > On Tue, Jun 09, 2026 at 12:28:08PM +0200, Lorenzo Bianconi wrote: > > > > On Jun 09, Pablo Neira Ayuso wrote: > > > > > Hi Lorenzo, > > > > > > > > > > On Mon, Jun 08, 2026 at 07:06:52PM +0200, Lorenzo Bianconi wrote: > > > > > > Switch nf_flow_ip6_tunnel_proto() from skb_header_pointer() to > > > > > > pskb_may_pull() for header validation, aligning it with the approach > > > > > > used in nf_flow_ip4_tunnel_proto(). > > > > > > Move ctx->offset update inside the IPPROTO_IPV6 conditional block since > > > > > > it should only be adjusted when a tunnel is actually detected. > > > > > > While at it, use nexthdr instead of the hardcoded IPPROTO_IPV6 constant > > > > > > when setting ctx->tun.proto. > > > > > > > > > > > > Fixes: d98103575dcdd ("netfilter: flowtable: Add IP6IP6 rx sw acceleration") > > > > > > Signed-off-by: Lorenzo Bianconi > > > > > > --- > > > > > > net/netfilter/nf_flow_table_ip.c | 10 +++++----- > > > > > > 1 file changed, 5 insertions(+), 5 deletions(-) > > > > > > > > > > > > diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c > > > > > > index 9c05a50d6013..2946399ab715 100644 > > > > > > --- a/net/netfilter/nf_flow_table_ip.c > > > > > > +++ b/net/netfilter/nf_flow_table_ip.c > > > > > > @@ -347,15 +347,15 @@ static bool nf_flow_ip6_tunnel_proto(struct nf_flowtable_ctx *ctx, > > > > > > struct sk_buff *skb) > > > > > > { > > > > > > #if IS_ENABLED(CONFIG_IPV6) > > > > > > - struct ipv6hdr *ip6h, _ip6h; > > > > > > + struct ipv6hdr *ip6h; > > > > > > __be16 frag_off; > > > > > > u8 nexthdr; > > > > > > int hdrlen; > > > > > > > > > > > > - ip6h = skb_header_pointer(skb, ctx->offset, sizeof(*ip6h), &_ip6h); > > > > > > - if (!ip6h) > > > > > > + if (!pskb_may_pull(skb, sizeof(*ip6h) + ctx->offset)) > > > > > > return false; > > > > > > > > > > > > + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); > > > > > > if (ip6h->hop_limit <= 1) > > > > > > return false; > > > > > > > > > > Not shown in the patch, but is there still a corner case here that > > > > > needs to be covered? > > > > > > > > > > ipv6_skip_exthdr() uses skb_header_pointer() internal, then another > > > > > pskb_may_pull() is needed to make sure no other IPv6 extension header > > > > > sits between the outer and the inner IPPROTO_IPV6 header, allowing to > > > > > be in a non-linear area of the skb? > > > > > > > > > > > @@ -367,9 +367,9 @@ static bool nf_flow_ip6_tunnel_proto(struct nf_flowtable_ctx *ctx, > > > > > > > > > > > > > > > > I mean: > > > > > > > > > > if (!pskb_may_pull(skb, hdrlen)) > > > > > return false; > > > > > > > > > > where hdrlen is what ipv6_skip_exthdr() returns. > > > > > > > > > > Then, I think it should be safe to call skb_pull() on > > > > > ctx->tun.hdr_size. > > > > > > > > > > Let me know, thanks. > > > > > > > > I think you are right, here we need to run: > > > > > > > > if (!pskb_may_pull(skb, hdrlen)) > > > > return false; > > > > > > > > in order to be sure we can pull ctx->tun.hdr_size in nf_flow_ip_tunnel_pop(). > > > > Doing so, we can roll-back to the original skb_header_pointer() to access the > > > > outer ip6 header here. What do you think? > > > > > > Yes, initial skb_header_pointer() then pskb_may_pull(skb, hdrlen) to > > > ensure the entire should be fine. > > > > > > I think this need one more fix: This needs to resort to classic path > > > if there are intermediate extension headers sitting in between the > > > outer and inner headers in IP6IP6, ie. ipv6_ext_hdr() == true. Those > > > extensions need to be handled by the IPv6 stack. > > > > In my setup we have just a single Destination Option extension header (60) > > between the outer and the inner IPV6 headers. In order to check if we have > > other extensions headers other than Destination Option (and if so, send the > > packet the networking stack) I guess we need to implement something similar > > to ipv6_skip_exthdr(), agree? > > Maybe simpler check? If nexthdr is immediately IP6IP6 (ie. no > intermediate headers are in place), then handle this from the > flowtable datapath, otherwise fallback to classic. Thus, no new > special parser function is needed. Hi Pablo, ack, I agree. I guess we can limit the flowtable acceleration to the case of a IP6IP6 tunnel created with encaplimit set to none: $ip link add name tun0 type ip6tnl local remote encaplimit none > > > > nf_flow_ip6_tunnel_proto() needs to be fixed to deal with this. > > > > Do you want to do it with a dedicated patch or do you prefer to do it in this > > one? > > I think a single patch to fix the issues in nf_flow_ip6_tunnel_proto() > should be fine. I cooked this patch, it works fine. What do you think? Regards, Lorenzo diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 9c05a50d6013..5cb50414a491 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -347,29 +347,20 @@ static bool nf_flow_ip6_tunnel_proto(struct nf_flowtable_ctx *ctx, struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) - struct ipv6hdr *ip6h, _ip6h; - __be16 frag_off; - u8 nexthdr; - int hdrlen; + struct ipv6hdr *ip6h; - ip6h = skb_header_pointer(skb, ctx->offset, sizeof(*ip6h), &_ip6h); - if (!ip6h) + if (!pskb_may_pull(skb, sizeof(*ip6h) + ctx->offset)) return false; + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); if (ip6h->hop_limit <= 1) return false; - nexthdr = ip6h->nexthdr; - hdrlen = ipv6_skip_exthdr(skb, sizeof(*ip6h) + ctx->offset, &nexthdr, - &frag_off); - if (hdrlen < 0) - return false; - - if (nexthdr == IPPROTO_IPV6) { - ctx->tun.hdr_size = hdrlen; - ctx->tun.proto = IPPROTO_IPV6; + if (ip6h->nexthdr == IPPROTO_IPV6) { + ctx->tun.proto = ip6h->nexthdr; + ctx->tun.hdr_size = sizeof(*ip6h); + ctx->offset += ctx->tun.hdr_size; } - ctx->offset += ctx->tun.hdr_size; return true; #else @@ -648,25 +639,19 @@ static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb, return 0; } -struct ipv6_tel_txoption { - struct ipv6_txoptions ops; - __u8 dst_opt[8]; -}; - static int nf_flow_tunnel_ip6ip6_push(struct net *net, struct sk_buff *skb, struct flow_offload_tuple *tuple, - struct in6_addr **ip6_daddr, - int encap_limit) + struct in6_addr **ip6_daddr) { struct ipv6hdr *ip6h = (struct ipv6hdr *)skb_network_header(skb); - u8 hop_limit = ip6h->hop_limit, proto = IPPROTO_IPV6; struct rtable *rt = dst_rtable(tuple->dst_cache); __u8 dsfield = ipv6_get_dsfield(ip6h); struct flowi6 fl6 = { .daddr = tuple->tun.src_v6, .saddr = tuple->tun.dst_v6, - .flowi6_proto = proto, + .flowi6_proto = IPPROTO_IPV6, }; + u8 hop_limit = ip6h->hop_limit; int err, mtu; u32 headroom; @@ -674,41 +659,18 @@ static int nf_flow_tunnel_ip6ip6_push(struct net *net, struct sk_buff *skb, if (err) return err; - skb_set_inner_ipproto(skb, proto); + skb_set_inner_ipproto(skb, IPPROTO_IPV6); headroom = sizeof(*ip6h) + LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; - if (encap_limit) - headroom += 8; err = skb_cow_head(skb, headroom); if (err) return err; skb_scrub_packet(skb, true); mtu = dst_mtu(&rt->dst) - sizeof(*ip6h); - if (encap_limit) - mtu -= 8; mtu = max(mtu, IPV6_MIN_MTU); skb_dst_update_pmtu_no_confirm(skb, mtu); - if (encap_limit > 0) { - struct ipv6_tel_txoption opt = { - .dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT, - .dst_opt[3] = 1, - .dst_opt[4] = encap_limit, - .dst_opt[5] = IPV6_TLV_PADN, - .dst_opt[6] = 1, - }; - struct ipv6_opt_hdr *hopt; - - opt.ops.dst1opt = (struct ipv6_opt_hdr *)opt.dst_opt; - opt.ops.opt_nflen = 8; - - hopt = skb_push(skb, ipv6_optlen(opt.ops.dst1opt)); - memcpy(hopt, opt.ops.dst1opt, ipv6_optlen(opt.ops.dst1opt)); - hopt->nexthdr = IPPROTO_IPV6; - proto = NEXTHDR_DEST; - } - skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); @@ -716,7 +678,7 @@ static int nf_flow_tunnel_ip6ip6_push(struct net *net, struct sk_buff *skb, ip6_flow_hdr(ip6h, dsfield, ip6_make_flowlabel(net, skb, fl6.flowlabel, true, &fl6)); ip6h->hop_limit = hop_limit; - ip6h->nexthdr = proto; + ip6h->nexthdr = IPPROTO_IPV6; ip6h->daddr = tuple->tun.src_v6; ip6h->saddr = tuple->tun.dst_v6; ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(*ip6h)); @@ -729,12 +691,10 @@ static int nf_flow_tunnel_ip6ip6_push(struct net *net, struct sk_buff *skb, static int nf_flow_tunnel_v6_push(struct net *net, struct sk_buff *skb, struct flow_offload_tuple *tuple, - struct in6_addr **ip6_daddr, - int encap_limit) + struct in6_addr **ip6_daddr) { if (tuple->tun_num) - return nf_flow_tunnel_ip6ip6_push(net, skb, tuple, ip6_daddr, - encap_limit); + return nf_flow_tunnel_ip6ip6_push(net, skb, tuple, ip6_daddr); return 0; } @@ -1089,7 +1049,7 @@ static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, struct nf_flowtable *flow_table, struct flow_offload_tuple_rhash *tuplehash, - struct sk_buff *skb, int encap_limit) + struct sk_buff *skb) { enum flow_offload_tuple_dir dir; struct flow_offload *flow; @@ -1100,11 +1060,8 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; - if (flow->tuplehash[!dir].tuple.tun_num) { + if (flow->tuplehash[!dir].tuple.tun_num) mtu -= sizeof(*ip6h); - if (encap_limit > 0) - mtu -= 8; /* encap limit option */ - } if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return 0; @@ -1158,7 +1115,6 @@ unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - int encap_limit = IPV6_DEFAULT_TNL_ENCAP_LIMIT; struct flow_offload_tuple_rhash *tuplehash; struct nf_flowtable *flow_table = priv; struct flow_offload_tuple *other_tuple; @@ -1177,8 +1133,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (tuplehash == NULL) return NF_ACCEPT; - ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb, - encap_limit); + ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb); if (ret < 0) return NF_DROP; else if (ret == 0) @@ -1198,7 +1153,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, ip6_daddr = &other_tuple->src_v6; if (nf_flow_tunnel_v6_push(state->net, skb, other_tuple, - &ip6_daddr, encap_limit) < 0) + &ip6_daddr) < 0) return NF_DROP; switch (tuplehash->tuple.xmit_type) { > > My understanding is that IP6IP6 tunnels are special, because it is > handled in the received path as local traffic, then decapsulation > happens and packet follows local output path. This is different to > what we already have where packets follow forwarding path, not local > input. The flowtable should only pull the outer IP6 header. > > > > And I suspect nf_flow_ip4_tunnel_proto() with IP options have the same > > > problem, the flowtable need to resort to the classic stack path. > > > > In the IPv4 case, if the packet has options nf_flow_ip4_tunnel_proto() will > > return false and so the packet will be sent to the networking stack. > > OK, then IPv4 is fine, thanks for explaining.