* [PATCH nf-next v2 0/4] Add IP6IP6 flowtable SW acceleration
@ 2025-12-09 7:35 Lorenzo Bianconi
2025-12-09 7:35 ` [PATCH nf-next v2 1/4] netfilter: Introduce tunnel metadata info in nf_flowtable_ctx struct Lorenzo Bianconi
` (3 more replies)
0 siblings, 4 replies; 9+ messages in thread
From: Lorenzo Bianconi @ 2025-12-09 7:35 UTC (permalink / raw)
To: Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal,
Phil Sutter, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Simon Horman, David Ahern, Shuah Khan
Cc: netfilter-devel, coreteam, netdev, linux-kselftest,
Lorenzo Bianconi
Introduce SW acceleration for IP6IP6 tunnels in the netfilter flowtable
infrastructure.
---
Changes in v2:
- Fix compilation when CONFIG_IPV6 is disabled
- Rely on ipv6_skip_exthdr() in nf_flow_ip6_tunnel_proto() to avoid
use-after-free issues
- Drop patch 2/5 from v1
- Link to v1: https://lore.kernel.org/r/20251207-b4-flowtable-offload-ip6ip6-v1-0-18e3ab7f748c@kernel.org
---
Lorenzo Bianconi (4):
netfilter: Introduce tunnel metadata info in nf_flowtable_ctx struct
netfilter: flowtable: Add IP6IP6 rx sw acceleration
netfilter: flowtable: Add IP6IP6 tx sw acceleration
selftests: netfilter: nft_flowtable.sh: Add IP6IP6 flowtable selftest
net/ipv6/ip6_tunnel.c | 27 +++
net/netfilter/nf_flow_table_ip.c | 229 ++++++++++++++++++---
.../selftests/net/netfilter/nft_flowtable.sh | 62 +++++-
3 files changed, 275 insertions(+), 43 deletions(-)
---
base-commit: f8156ef0fd8232055396ebf1e044fa06fb8bc388
change-id: 20251207-b4-flowtable-offload-ip6ip6-8e9a2c6f3a77
Best regards,
--
Lorenzo Bianconi <lorenzo@kernel.org>
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH nf-next v2 1/4] netfilter: Introduce tunnel metadata info in nf_flowtable_ctx struct
2025-12-09 7:35 [PATCH nf-next v2 0/4] Add IP6IP6 flowtable SW acceleration Lorenzo Bianconi
@ 2025-12-09 7:35 ` Lorenzo Bianconi
2026-01-13 15:11 ` Florian Westphal
2025-12-09 7:35 ` [PATCH nf-next v2 2/4] netfilter: flowtable: Add IP6IP6 rx sw acceleration Lorenzo Bianconi
` (2 subsequent siblings)
3 siblings, 1 reply; 9+ messages in thread
From: Lorenzo Bianconi @ 2025-12-09 7:35 UTC (permalink / raw)
To: Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal,
Phil Sutter, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Simon Horman, David Ahern, Shuah Khan
Cc: netfilter-devel, coreteam, netdev, linux-kselftest,
Lorenzo Bianconi
This is a preliminary patch to introduce IP6IP6 flowtable acceleration.
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
net/netfilter/nf_flow_table_ip.c | 80 ++++++++++++++++++++++------------------
1 file changed, 44 insertions(+), 36 deletions(-)
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index e128b0fe9a7bf50b458df9940d629ea08c521871..14c01b59f76569170057d2465ee5953efb557bcc 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -142,7 +142,18 @@ static bool ip_has_options(unsigned int thoff)
return thoff != sizeof(struct iphdr);
}
-static void nf_flow_tuple_encap(struct sk_buff *skb,
+struct nf_flowtable_ctx {
+ const struct net_device *in;
+ u32 offset;
+ u32 hdrsize;
+ struct {
+ u32 offset;
+ u8 proto;
+ } tun;
+};
+
+static void nf_flow_tuple_encap(struct nf_flowtable_ctx *ctx,
+ struct sk_buff *skb,
struct flow_offload_tuple *tuple)
{
__be16 inner_proto = skb->protocol;
@@ -174,22 +185,15 @@ static void nf_flow_tuple_encap(struct sk_buff *skb,
break;
}
- if (inner_proto == htons(ETH_P_IP)) {
+ if (inner_proto == htons(ETH_P_IP) &&
+ ctx->tun.proto == IPPROTO_IPIP) {
iph = (struct iphdr *)(skb_network_header(skb) + offset);
- if (iph->protocol == IPPROTO_IPIP) {
- tuple->tun.dst_v4.s_addr = iph->daddr;
- tuple->tun.src_v4.s_addr = iph->saddr;
- tuple->tun.l3_proto = IPPROTO_IPIP;
- }
+ tuple->tun.dst_v4.s_addr = iph->daddr;
+ tuple->tun.src_v4.s_addr = iph->saddr;
+ tuple->tun.l3_proto = IPPROTO_IPIP;
}
}
-struct nf_flowtable_ctx {
- const struct net_device *in;
- u32 offset;
- u32 hdrsize;
-};
-
static int nf_flow_tuple_ip(struct nf_flowtable_ctx *ctx, struct sk_buff *skb,
struct flow_offload_tuple *tuple)
{
@@ -257,7 +261,7 @@ static int nf_flow_tuple_ip(struct nf_flowtable_ctx *ctx, struct sk_buff *skb,
tuple->l3proto = AF_INET;
tuple->l4proto = ipproto;
tuple->iifidx = ctx->in->ifindex;
- nf_flow_tuple_encap(skb, tuple);
+ nf_flow_tuple_encap(ctx, skb, tuple);
return 0;
}
@@ -293,15 +297,16 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
return NF_STOLEN;
}
-static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize)
+static bool nf_flow_ip4_tunnel_proto(struct nf_flowtable_ctx *ctx,
+ struct sk_buff *skb)
{
struct iphdr *iph;
u16 size;
- if (!pskb_may_pull(skb, sizeof(*iph) + *psize))
+ if (!pskb_may_pull(skb, sizeof(*iph) + ctx->offset))
return false;
- iph = (struct iphdr *)(skb_network_header(skb) + *psize);
+ iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset);
size = iph->ihl << 2;
if (ip_is_fragment(iph) || unlikely(ip_has_options(size)))
@@ -310,25 +315,27 @@ static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize)
if (iph->ttl <= 1)
return false;
- if (iph->protocol == IPPROTO_IPIP)
- *psize += size;
+ if (iph->protocol == IPPROTO_IPIP) {
+ ctx->tun.proto = IPPROTO_IPIP;
+ ctx->tun.offset = size;
+ ctx->offset += size;
+ }
return true;
}
-static void nf_flow_ip4_tunnel_pop(struct sk_buff *skb)
+static void nf_flow_ip4_tunnel_pop(struct nf_flowtable_ctx *ctx,
+ struct sk_buff *skb)
{
- struct iphdr *iph = (struct iphdr *)skb_network_header(skb);
-
- if (iph->protocol != IPPROTO_IPIP)
+ if (ctx->tun.proto != IPPROTO_IPIP)
return;
- skb_pull(skb, iph->ihl << 2);
+ skb_pull(skb, ctx->tun.offset);
skb_reset_network_header(skb);
}
-static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto,
- u32 *offset)
+static bool nf_flow_skb_encap_protocol(struct nf_flowtable_ctx *ctx,
+ struct sk_buff *skb, __be16 proto)
{
__be16 inner_proto = skb->protocol;
struct vlan_ethhdr *veth;
@@ -341,7 +348,7 @@ static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto,
veth = (struct vlan_ethhdr *)skb_mac_header(skb);
if (veth->h_vlan_encapsulated_proto == proto) {
- *offset += VLAN_HLEN;
+ ctx->offset += VLAN_HLEN;
inner_proto = proto;
ret = true;
}
@@ -349,19 +356,20 @@ static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto,
case htons(ETH_P_PPP_SES):
if (nf_flow_pppoe_proto(skb, &inner_proto) &&
inner_proto == proto) {
- *offset += PPPOE_SES_HLEN;
+ ctx->offset += PPPOE_SES_HLEN;
ret = true;
}
break;
}
if (inner_proto == htons(ETH_P_IP))
- ret = nf_flow_ip4_tunnel_proto(skb, offset);
+ ret = nf_flow_ip4_tunnel_proto(ctx, skb);
return ret;
}
-static void nf_flow_encap_pop(struct sk_buff *skb,
+static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx,
+ struct sk_buff *skb,
struct flow_offload_tuple_rhash *tuplehash)
{
struct vlan_hdr *vlan_hdr;
@@ -388,7 +396,7 @@ static void nf_flow_encap_pop(struct sk_buff *skb,
}
if (skb->protocol == htons(ETH_P_IP))
- nf_flow_ip4_tunnel_pop(skb);
+ nf_flow_ip4_tunnel_pop(ctx, skb);
}
struct nf_flow_xmit {
@@ -414,7 +422,7 @@ nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx,
{
struct flow_offload_tuple tuple = {};
- if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset))
+ if (!nf_flow_skb_encap_protocol(ctx, skb, htons(ETH_P_IP)))
return NULL;
if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0)
@@ -458,7 +466,7 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx,
flow_offload_refresh(flow_table, flow, false);
- nf_flow_encap_pop(skb, tuplehash);
+ nf_flow_encap_pop(ctx, skb, tuplehash);
thoff -= ctx->offset;
iph = ip_hdr(skb);
@@ -836,7 +844,7 @@ static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb,
tuple->l3proto = AF_INET6;
tuple->l4proto = nexthdr;
tuple->iifidx = ctx->in->ifindex;
- nf_flow_tuple_encap(skb, tuple);
+ nf_flow_tuple_encap(ctx, skb, tuple);
return 0;
}
@@ -873,7 +881,7 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx,
flow_offload_refresh(flow_table, flow, false);
- nf_flow_encap_pop(skb, tuplehash);
+ nf_flow_encap_pop(ctx, skb, tuplehash);
ip6h = ipv6_hdr(skb);
nf_flow_nat_ipv6(flow, skb, dir, ip6h);
@@ -895,7 +903,7 @@ nf_flow_offload_ipv6_lookup(struct nf_flowtable_ctx *ctx,
struct flow_offload_tuple tuple = {};
if (skb->protocol != htons(ETH_P_IPV6) &&
- !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &ctx->offset))
+ !nf_flow_skb_encap_protocol(ctx, skb, htons(ETH_P_IPV6)))
return NULL;
if (nf_flow_tuple_ipv6(ctx, skb, &tuple) < 0)
--
2.52.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH nf-next v2 2/4] netfilter: flowtable: Add IP6IP6 rx sw acceleration
2025-12-09 7:35 [PATCH nf-next v2 0/4] Add IP6IP6 flowtable SW acceleration Lorenzo Bianconi
2025-12-09 7:35 ` [PATCH nf-next v2 1/4] netfilter: Introduce tunnel metadata info in nf_flowtable_ctx struct Lorenzo Bianconi
@ 2025-12-09 7:35 ` Lorenzo Bianconi
2026-01-13 15:23 ` Florian Westphal
2025-12-09 7:35 ` [PATCH nf-next v2 3/4] netfilter: flowtable: Add IP6IP6 tx " Lorenzo Bianconi
2025-12-09 7:35 ` [PATCH nf-next v2 4/4] selftests: netfilter: nft_flowtable.sh: Add IP6IP6 flowtable selftest Lorenzo Bianconi
3 siblings, 1 reply; 9+ messages in thread
From: Lorenzo Bianconi @ 2025-12-09 7:35 UTC (permalink / raw)
To: Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal,
Phil Sutter, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Simon Horman, David Ahern, Shuah Khan
Cc: netfilter-devel, coreteam, netdev, linux-kselftest,
Lorenzo Bianconi
Introduce sw acceleration for rx path of IP6IP6 tunnels relying on the
netfilter flowtable infrastructure. Subsequent patches will add sw
acceleration for IP6IP6 tunnels tx path.
IP6IP6 rx sw acceleration can be tested running the following scenario
where the traffic is forwarded between two NICs (eth0 and eth1) and an
IP6IP6 tunnel is used to access a remote site (using eth1 as the underlay
device):
ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (2001:db8:3::2)
$ip addr show
6: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff
inet6 2001:db8:1::2/64 scope global nodad
valid_lft forever preferred_lft forever
7: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff
inet6 2001:db8:2::1/64 scope global nodad
valid_lft forever preferred_lft forever
8: tun0@NONE: <POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000
link/tunnel6 2001:db8:2::1 peer 2001:db8:2::2 permaddr ce9c:2940:7dcc::
inet6 2002:db8:1::1/64 scope global nodad
valid_lft forever preferred_lft forever
$ip -6 route show
2001:db8:1::/64 dev eth0 proto kernel metric 256 pref medium
2001:db8:2::/64 dev eth1 proto kernel metric 256 pref medium
2002:db8:1::/64 dev tun0 proto kernel metric 256 pref medium
default via 2002:db8:1::2 dev tun0 metric 1024 pref medium
$nft list ruleset
table inet filter {
flowtable ft {
hook ingress priority filter
devices = { eth0, eth1 }
}
chain forward {
type filter hook forward priority filter; policy accept;
meta l4proto { tcp, udp } flow add @ft
}
}
Reproducing the scenario described above using veths I got the following
results:
- TCP stream received from the IP6IP6 tunnel:
- net-next: (baseline) ~ 81Gbps
- net-next + IP6IP6 flowtable support: ~112Gbps
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
net/ipv6/ip6_tunnel.c | 27 +++++++++++++
net/netfilter/nf_flow_table_ip.c | 83 +++++++++++++++++++++++++++++++++-------
2 files changed, 97 insertions(+), 13 deletions(-)
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 6405072050e0ef7521ca1fdddc4a0252e2159d2a..10341bfc16bd16a43290015952bd9a57658e6ae1 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1828,6 +1828,32 @@ int ip6_tnl_encap_setup(struct ip6_tnl *t,
}
EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup);
+static int ip6_tnl_fill_forward_path(struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+ struct ip6_tnl *t = netdev_priv(ctx->dev);
+ struct flowi6 fl6 = {
+ .daddr = t->parms.raddr,
+ };
+ struct dst_entry *dst;
+ int err;
+
+ dst = ip6_route_output(dev_net(ctx->dev), NULL, &fl6);
+ if (!dst->error) {
+ path->type = DEV_PATH_TUN;
+ path->tun.src_v6 = t->parms.laddr;
+ path->tun.dst_v6 = t->parms.raddr;
+ path->tun.l3_proto = IPPROTO_IPV6;
+ path->dev = ctx->dev;
+ ctx->dev = dst->dev;
+ }
+
+ err = dst->error;
+ dst_release(dst);
+
+ return err;
+}
+
static const struct net_device_ops ip6_tnl_netdev_ops = {
.ndo_init = ip6_tnl_dev_init,
.ndo_uninit = ip6_tnl_dev_uninit,
@@ -1836,6 +1862,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
.ndo_change_mtu = ip6_tnl_change_mtu,
.ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
+ .ndo_fill_forward_path = ip6_tnl_fill_forward_path,
};
#define IPXIPX_FEATURES (NETIF_F_SG | \
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 14c01b59f76569170057d2465ee5953efb557bcc..8323f44a1ef172f16300a5c2c628464a99b2c47a 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -159,6 +159,7 @@ static void nf_flow_tuple_encap(struct nf_flowtable_ctx *ctx,
__be16 inner_proto = skb->protocol;
struct vlan_ethhdr *veth;
struct pppoe_hdr *phdr;
+ struct ipv6hdr *ip6h;
struct iphdr *iph;
u16 offset = 0;
int i = 0;
@@ -185,12 +186,25 @@ static void nf_flow_tuple_encap(struct nf_flowtable_ctx *ctx,
break;
}
- if (inner_proto == htons(ETH_P_IP) &&
- ctx->tun.proto == IPPROTO_IPIP) {
+ switch (inner_proto) {
+ case htons(ETH_P_IP):
iph = (struct iphdr *)(skb_network_header(skb) + offset);
- tuple->tun.dst_v4.s_addr = iph->daddr;
- tuple->tun.src_v4.s_addr = iph->saddr;
- tuple->tun.l3_proto = IPPROTO_IPIP;
+ if (ctx->tun.proto == IPPROTO_IPIP) {
+ tuple->tun.dst_v4.s_addr = iph->daddr;
+ tuple->tun.src_v4.s_addr = iph->saddr;
+ tuple->tun.l3_proto = IPPROTO_IPIP;
+ }
+ break;
+ case htons(ETH_P_IPV6):
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
+ if (ctx->tun.proto == IPPROTO_IPV6) {
+ tuple->tun.dst_v6 = ip6h->daddr;
+ tuple->tun.src_v6 = ip6h->saddr;
+ tuple->tun.l3_proto = IPPROTO_IPV6;
+ }
+ break;
+ default:
+ break;
}
}
@@ -324,10 +338,45 @@ static bool nf_flow_ip4_tunnel_proto(struct nf_flowtable_ctx *ctx,
return true;
}
-static void nf_flow_ip4_tunnel_pop(struct nf_flowtable_ctx *ctx,
- struct sk_buff *skb)
+static bool nf_flow_ip6_tunnel_proto(struct nf_flowtable_ctx *ctx,
+ struct sk_buff *skb)
{
- if (ctx->tun.proto != IPPROTO_IPIP)
+#if IS_ENABLED(CONFIG_IPV6)
+ struct ipv6hdr *ip6h;
+ __be16 frag_off;
+ u8 nexthdr;
+ int hdrlen;
+
+ if (!pskb_may_pull(skb, sizeof(*ip6h) + ctx->offset))
+ return false;
+
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset);
+ if (ip6h->hop_limit <= 1)
+ return false;
+
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+ hdrlen = ipv6_skip_exthdr(skb, sizeof(*ip6h) + ctx->offset, &nexthdr,
+ &frag_off);
+ if (hdrlen < 0)
+ return false;
+
+ if (nexthdr == IPPROTO_IPV6) {
+ ctx->tun.offset = hdrlen;
+ ctx->tun.proto = IPPROTO_IPV6;
+ }
+ ctx->offset += ctx->tun.offset;
+
+ return true;
+#else
+ return false;
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+}
+
+static void nf_flow_ip_tunnel_pop(struct nf_flowtable_ctx *ctx,
+ struct sk_buff *skb)
+{
+ if (ctx->tun.proto != IPPROTO_IPIP &&
+ ctx->tun.proto != IPPROTO_IPV6)
return;
skb_pull(skb, ctx->tun.offset);
@@ -362,8 +411,16 @@ static bool nf_flow_skb_encap_protocol(struct nf_flowtable_ctx *ctx,
break;
}
- if (inner_proto == htons(ETH_P_IP))
+ switch (inner_proto) {
+ case htons(ETH_P_IP):
ret = nf_flow_ip4_tunnel_proto(ctx, skb);
+ break;
+ case htons(ETH_P_IPV6):
+ ret = nf_flow_ip6_tunnel_proto(ctx, skb);
+ break;
+ default:
+ break;
+ }
return ret;
}
@@ -395,8 +452,9 @@ static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx,
}
}
- if (skb->protocol == htons(ETH_P_IP))
- nf_flow_ip4_tunnel_pop(ctx, skb);
+ if (skb->protocol == htons(ETH_P_IP) ||
+ skb->protocol == htons(ETH_P_IPV6))
+ nf_flow_ip_tunnel_pop(ctx, skb);
}
struct nf_flow_xmit {
@@ -902,8 +960,7 @@ nf_flow_offload_ipv6_lookup(struct nf_flowtable_ctx *ctx,
{
struct flow_offload_tuple tuple = {};
- if (skb->protocol != htons(ETH_P_IPV6) &&
- !nf_flow_skb_encap_protocol(ctx, skb, htons(ETH_P_IPV6)))
+ if (!nf_flow_skb_encap_protocol(ctx, skb, htons(ETH_P_IPV6)))
return NULL;
if (nf_flow_tuple_ipv6(ctx, skb, &tuple) < 0)
--
2.52.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH nf-next v2 3/4] netfilter: flowtable: Add IP6IP6 tx sw acceleration
2025-12-09 7:35 [PATCH nf-next v2 0/4] Add IP6IP6 flowtable SW acceleration Lorenzo Bianconi
2025-12-09 7:35 ` [PATCH nf-next v2 1/4] netfilter: Introduce tunnel metadata info in nf_flowtable_ctx struct Lorenzo Bianconi
2025-12-09 7:35 ` [PATCH nf-next v2 2/4] netfilter: flowtable: Add IP6IP6 rx sw acceleration Lorenzo Bianconi
@ 2025-12-09 7:35 ` Lorenzo Bianconi
2025-12-09 7:35 ` [PATCH nf-next v2 4/4] selftests: netfilter: nft_flowtable.sh: Add IP6IP6 flowtable selftest Lorenzo Bianconi
3 siblings, 0 replies; 9+ messages in thread
From: Lorenzo Bianconi @ 2025-12-09 7:35 UTC (permalink / raw)
To: Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal,
Phil Sutter, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Simon Horman, David Ahern, Shuah Khan
Cc: netfilter-devel, coreteam, netdev, linux-kselftest,
Lorenzo Bianconi
Introduce sw acceleration for tx path of IP6IP6 tunnels relying on the
netfilter flowtable infrastructure.
IP6IP6 tx sw acceleration can be tested running the following scenario
where the traffic is forwarded between two NICs (eth0 and eth1) and an
IP6IP6 tunnel is used to access a remote site (using eth1 as the underlay
device):
ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (2001:db8:3::2)
$ip addr show
6: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff
inet6 2001:db8:1::2/64 scope global nodad
valid_lft forever preferred_lft forever
7: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff
inet6 2001:db8:2::1/64 scope global nodad
valid_lft forever preferred_lft forever
8: tun0@NONE: <POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000
link/tunnel6 2001:db8:2::1 peer 2001:db8:2::2 permaddr ce9c:2940:7dcc::
inet6 2002:db8:1::1/64 scope global nodad
valid_lft forever preferred_lft forever
$ip -6 route show
2001:db8:1::/64 dev eth0 proto kernel metric 256 pref medium
2001:db8:2::/64 dev eth1 proto kernel metric 256 pref medium
2002:db8:1::/64 dev tun0 proto kernel metric 256 pref medium
default via 2002:db8:1::2 dev tun0 metric 1024 pref medium
$nft list ruleset
table inet filter {
flowtable ft {
hook ingress priority filter
devices = { eth0, eth1 }
}
chain forward {
type filter hook forward priority filter; policy accept;
meta l4proto { tcp, udp } flow add @ft
}
}
Reproducing the scenario described above using veths I got the following
results:
- TCP stream received from the IP6IP6 tunnel:
- net-next: (baseline) ~93Gbps
- net-next + IP6IP6 flowtable support: ~98Gbps
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
net/netfilter/nf_flow_table_ip.c | 96 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 96 insertions(+)
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 8323f44a1ef172f16300a5c2c628464a99b2c47a..937fd8cd085f459f22d6923592255cad2843746b 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -12,6 +12,7 @@
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
+#include <net/ip6_tunnel.h>
#include <net/neighbour.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack_acct.h>
@@ -633,6 +634,94 @@ static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb,
return 0;
}
+struct ipv6_tel_txoption {
+ struct ipv6_txoptions ops;
+ __u8 dst_opt[8];
+};
+
+static int nf_flow_tunnel_ip6ip6_push(struct net *net, struct sk_buff *skb,
+ struct flow_offload_tuple *tuple,
+ struct in6_addr **ip6_daddr)
+{
+ struct ipv6hdr *ip6h = (struct ipv6hdr *)skb_network_header(skb);
+ int err, mtu, encap_limit = IPV6_DEFAULT_TNL_ENCAP_LIMIT;
+ u8 hop_limit = ip6h->hop_limit, proto = IPPROTO_IPV6;
+ struct rtable *rt = dst_rtable(tuple->dst_cache);
+ __u8 dsfield = ipv6_get_dsfield(ip6h);
+ struct flowi6 fl6 = {
+ .daddr = tuple->tun.src_v6,
+ .saddr = tuple->tun.dst_v6,
+ .flowi6_proto = proto,
+ };
+ u32 headroom;
+
+ err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6);
+ if (err)
+ return err;
+
+ skb_set_inner_ipproto(skb, proto);
+ headroom = sizeof(*ip6h) + LL_RESERVED_SPACE(rt->dst.dev) +
+ rt->dst.header_len;
+ if (encap_limit)
+ headroom += 8;
+ err = skb_cow_head(skb, headroom);
+ if (err)
+ return err;
+
+ skb_scrub_packet(skb, true);
+ mtu = dst_mtu(&rt->dst) - sizeof(*ip6h);
+ if (encap_limit)
+ mtu -= 8;
+ mtu = max(mtu, IPV6_MIN_MTU);
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
+
+ if (encap_limit > 0) {
+ struct ipv6_tel_txoption opt = {
+ .dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT,
+ .dst_opt[3] = 1,
+ .dst_opt[4] = encap_limit,
+ .dst_opt[5] = IPV6_TLV_PADN,
+ .dst_opt[6] = 1,
+ };
+ struct ipv6_opt_hdr *hopt;
+
+ opt.ops.dst1opt = (struct ipv6_opt_hdr *)opt.dst_opt;
+ opt.ops.opt_nflen = 8;
+
+ hopt = skb_push(skb, ipv6_optlen(opt.ops.dst1opt));
+ memcpy(hopt, opt.ops.dst1opt, ipv6_optlen(opt.ops.dst1opt));
+ hopt->nexthdr = IPPROTO_IPV6;
+ proto = NEXTHDR_DEST;
+ }
+
+ skb_push(skb, sizeof(*ip6h));
+ skb_reset_network_header(skb);
+
+ ip6h = ipv6_hdr(skb);
+ ip6_flow_hdr(ip6h, dsfield,
+ ip6_make_flowlabel(net, skb, fl6.flowlabel, true, &fl6));
+ ip6h->hop_limit = hop_limit;
+ ip6h->nexthdr = proto;
+ ip6h->daddr = tuple->tun.src_v6;
+ ip6h->saddr = tuple->tun.dst_v6;
+ ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(*ip6h));
+ IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
+
+ *ip6_daddr = &tuple->tun.src_v6;
+
+ return 0;
+}
+
+static int nf_flow_tunnel_v6_push(struct net *net, struct sk_buff *skb,
+ struct flow_offload_tuple *tuple,
+ struct in6_addr **ip6_daddr)
+{
+ if (tuple->tun_num)
+ return nf_flow_tunnel_ip6ip6_push(net, skb, tuple, ip6_daddr);
+
+ return 0;
+}
+
static int nf_flow_encap_push(struct sk_buff *skb,
struct flow_offload_tuple *tuple)
{
@@ -921,6 +1010,9 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx,
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset;
+ if (flow->tuplehash[!dir].tuple.tun_num)
+ mtu -= sizeof(*ip6h);
+
if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
return 0;
@@ -1010,6 +1102,10 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
other_tuple = &flow->tuplehash[!dir].tuple;
ip6_daddr = &other_tuple->src_v6;
+ if (nf_flow_tunnel_v6_push(state->net, skb, other_tuple,
+ &ip6_daddr) < 0)
+ return NF_DROP;
+
if (nf_flow_encap_push(skb, other_tuple) < 0)
return NF_DROP;
--
2.52.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH nf-next v2 4/4] selftests: netfilter: nft_flowtable.sh: Add IP6IP6 flowtable selftest
2025-12-09 7:35 [PATCH nf-next v2 0/4] Add IP6IP6 flowtable SW acceleration Lorenzo Bianconi
` (2 preceding siblings ...)
2025-12-09 7:35 ` [PATCH nf-next v2 3/4] netfilter: flowtable: Add IP6IP6 tx " Lorenzo Bianconi
@ 2025-12-09 7:35 ` Lorenzo Bianconi
3 siblings, 0 replies; 9+ messages in thread
From: Lorenzo Bianconi @ 2025-12-09 7:35 UTC (permalink / raw)
To: Pablo Neira Ayuso, Jozsef Kadlecsik, Florian Westphal,
Phil Sutter, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Simon Horman, David Ahern, Shuah Khan
Cc: netfilter-devel, coreteam, netdev, linux-kselftest,
Lorenzo Bianconi
Similar to IPIP, introduce specific selftest for IP6IP6 flowtable SW
acceleration in nft_flowtable.sh
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
.../selftests/net/netfilter/nft_flowtable.sh | 62 ++++++++++++++++++----
1 file changed, 53 insertions(+), 9 deletions(-)
diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh
index 24b4e60b91451e7ea7f6a041b0335233047c6242..bc98baba56c638cad35478109a3776d6d93c34a8 100755
--- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh
+++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh
@@ -590,16 +590,28 @@ ip -net "$nsr1" link set tun0 up
ip -net "$nsr1" addr add 192.168.100.1/24 dev tun0
ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null
+ip -net "$nsr1" link add name tun6 type ip6tnl local fee1:2::1 remote fee1:2::2
+ip -net "$nsr1" link set tun6 up
+ip -net "$nsr1" addr add fee1:3::1/64 dev tun6 nodad
+
ip -net "$nsr2" link add name tun0 type ipip local 192.168.10.2 remote 192.168.10.1
ip -net "$nsr2" link set tun0 up
ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0
ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null
+ip -net "$nsr2" link add name tun6 type ip6tnl local fee1:2::2 remote fee1:2::1
+ip -net "$nsr2" link set tun6 up
+ip -net "$nsr2" addr add fee1:3::2/64 dev tun6 nodad
+
ip -net "$nsr1" route change default via 192.168.100.2
ip -net "$nsr2" route change default via 192.168.100.1
+ip -6 -net "$nsr1" route change default via fee1:3::2
+ip -6 -net "$nsr2" route change default via fee1:3::1
ip -net "$ns2" route add default via 10.0.2.1
+ip -6 -net "$ns2" route add default via dead:2::1
ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0 accept'
+ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun6 accept'
ip netns exec "$nsr1" nft -a insert rule inet filter forward \
'meta oif "veth0" tcp sport 12345 ct mark set 1 flow add @f1 counter name routed_repl accept'
@@ -609,28 +621,51 @@ if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel"; then
ret=1
fi
+if test_tcp_forwarding "$ns1" "$ns2" 1 6 "[dead:2::99]" 12345; then
+ echo "PASS: flow offload for ns1/ns2 IP6IP6 tunnel"
+else
+ echo "FAIL: flow offload for ns1/ns2 with IP6IP6 tunnel" 1>&2
+ ip netns exec "$nsr1" nft list ruleset
+ ret=1
+fi
+
# Create vlan tagged devices for IPIP traffic.
ip -net "$nsr1" link add link veth1 name veth1.10 type vlan id 10
ip -net "$nsr1" link set veth1.10 up
ip -net "$nsr1" addr add 192.168.20.1/24 dev veth1.10
+ip -net "$nsr1" addr add fee1:4::1/64 dev veth1.10 nodad
ip netns exec "$nsr1" sysctl net.ipv4.conf.veth1/10.forwarding=1 > /dev/null
ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif veth1.10 accept'
-ip -net "$nsr1" link add name tun1 type ipip local 192.168.20.1 remote 192.168.20.2
-ip -net "$nsr1" link set tun1 up
-ip -net "$nsr1" addr add 192.168.200.1/24 dev tun1
+
+ip -net "$nsr1" link add name tun0.10 type ipip local 192.168.20.1 remote 192.168.20.2
+ip -net "$nsr1" link set tun0.10 up
+ip -net "$nsr1" addr add 192.168.200.1/24 dev tun0.10
ip -net "$nsr1" route change default via 192.168.200.2
-ip netns exec "$nsr1" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null
-ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun1 accept'
+ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null
+ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0.10 accept'
+
+ip -net "$nsr1" link add name tun6.10 type ip6tnl local fee1:4::1 remote fee1:4::2
+ip -net "$nsr1" link set tun6.10 up
+ip -net "$nsr1" addr add fee1:5::1/64 dev tun6.10 nodad
+ip -6 -net "$nsr1" route change default via fee1:5::2
+ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun6.10 accept'
ip -net "$nsr2" link add link veth0 name veth0.10 type vlan id 10
ip -net "$nsr2" link set veth0.10 up
ip -net "$nsr2" addr add 192.168.20.2/24 dev veth0.10
+ip -net "$nsr2" addr add fee1:4::2/64 dev veth0.10 nodad
ip netns exec "$nsr2" sysctl net.ipv4.conf.veth0/10.forwarding=1 > /dev/null
-ip -net "$nsr2" link add name tun1 type ipip local 192.168.20.2 remote 192.168.20.1
-ip -net "$nsr2" link set tun1 up
-ip -net "$nsr2" addr add 192.168.200.2/24 dev tun1
+
+ip -net "$nsr2" link add name tun0.10 type ipip local 192.168.20.2 remote 192.168.20.1
+ip -net "$nsr2" link set tun0.10 up
+ip -net "$nsr2" addr add 192.168.200.2/24 dev tun0.10
ip -net "$nsr2" route change default via 192.168.200.1
-ip netns exec "$nsr2" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null
+ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null
+
+ip -net "$nsr2" link add name tun6.10 type ip6tnl local fee1:4::2 remote fee1:4::1
+ip -net "$nsr2" link set tun6.10 up
+ip -net "$nsr2" addr add fee1:5::2/64 dev tun6.10 nodad
+ip -6 -net "$nsr2" route change default via fee1:5::1
if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then
echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel over vlan" 1>&2
@@ -638,10 +673,19 @@ if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then
ret=1
fi
+if test_tcp_forwarding "$ns1" "$ns2" 1 6 "[dead:2::99]" 12345; then
+ echo "PASS: flow offload for ns1/ns2 IP6IP6 tunnel over vlan"
+else
+ echo "FAIL: flow offload for ns1/ns2 with IP6IP6 tunnel over vlan" 1>&2
+ ip netns exec "$nsr1" nft list ruleset
+ ret=1
+fi
+
# Restore the previous configuration
ip -net "$nsr1" route change default via 192.168.10.2
ip -net "$nsr2" route change default via 192.168.10.1
ip -net "$ns2" route del default via 10.0.2.1
+ip -6 -net "$ns2" route del default via dead:2::1
}
# Another test:
--
2.52.0
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH nf-next v2 1/4] netfilter: Introduce tunnel metadata info in nf_flowtable_ctx struct
2025-12-09 7:35 ` [PATCH nf-next v2 1/4] netfilter: Introduce tunnel metadata info in nf_flowtable_ctx struct Lorenzo Bianconi
@ 2026-01-13 15:11 ` Florian Westphal
2026-01-15 23:00 ` Lorenzo Bianconi
0 siblings, 1 reply; 9+ messages in thread
From: Florian Westphal @ 2026-01-13 15:11 UTC (permalink / raw)
To: Lorenzo Bianconi
Cc: Pablo Neira Ayuso, Jozsef Kadlecsik, Phil Sutter, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
David Ahern, Shuah Khan, netfilter-devel, coreteam, netdev,
linux-kselftest
Lorenzo Bianconi <lorenzo@kernel.org> wrote:
> This is a preliminary patch to introduce IP6IP6 flowtable acceleration.
Would you mind extending this a little bit?
AFAICS this prepares for IP6IP6 by removing the 'its ipv4'
assumptions resp. adding needed 'its ipv4' checks:
no ipv6 support is added here.
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
> net/netfilter/nf_flow_table_ip.c | 80 ++++++++++++++++++++++------------------
> 1 file changed, 44 insertions(+), 36 deletions(-)
>
> diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
> index e128b0fe9a7bf50b458df9940d629ea08c521871..14c01b59f76569170057d2465ee5953efb557bcc 100644
> --- a/net/netfilter/nf_flow_table_ip.c
> +++ b/net/netfilter/nf_flow_table_ip.c
> @@ -142,7 +142,18 @@ static bool ip_has_options(unsigned int thoff)
> return thoff != sizeof(struct iphdr);
> }
>
> -static void nf_flow_tuple_encap(struct sk_buff *skb,
> +struct nf_flowtable_ctx {
> + const struct net_device *in;
> + u32 offset;
> + u32 hdrsize;
> + struct {
> + u32 offset;
> + u8 proto;
> + } tun;
> +};
Could you add comments for the members here?
In particular, we now have @offset and @tun.offset.
I can guess that the offset is the start of the inner
ip header and tun.offset is the start of the header
following the inner ip header.
This patch would perhaps be easier to review if the
pure move of the ctx structure and passing the extra
'ctx' arg would be in a separate patch.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH nf-next v2 2/4] netfilter: flowtable: Add IP6IP6 rx sw acceleration
2025-12-09 7:35 ` [PATCH nf-next v2 2/4] netfilter: flowtable: Add IP6IP6 rx sw acceleration Lorenzo Bianconi
@ 2026-01-13 15:23 ` Florian Westphal
2026-01-16 8:10 ` Lorenzo Bianconi
0 siblings, 1 reply; 9+ messages in thread
From: Florian Westphal @ 2026-01-13 15:23 UTC (permalink / raw)
To: Lorenzo Bianconi
Cc: Pablo Neira Ayuso, Jozsef Kadlecsik, Phil Sutter, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
David Ahern, Shuah Khan, netfilter-devel, coreteam, netdev,
linux-kselftest
Lorenzo Bianconi <lorenzo@kernel.org> wrote:
> Introduce sw acceleration for rx path of IP6IP6 tunnels relying on the
> netfilter flowtable infrastructure. Subsequent patches will add sw
> acceleration for IP6IP6 tunnels tx path.
> IP6IP6 rx sw acceleration can be tested running the following scenario
> where the traffic is forwarded between two NICs (eth0 and eth1) and an
> IP6IP6 tunnel is used to access a remote site (using eth1 as the underlay
> device):
>
> ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (2001:db8:3::2)
>
> $ip addr show
> 6: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
> link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff
> inet6 2001:db8:1::2/64 scope global nodad
> valid_lft forever preferred_lft forever
> 7: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
> link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff
> inet6 2001:db8:2::1/64 scope global nodad
> valid_lft forever preferred_lft forever
> 8: tun0@NONE: <POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000
> link/tunnel6 2001:db8:2::1 peer 2001:db8:2::2 permaddr ce9c:2940:7dcc::
> inet6 2002:db8:1::1/64 scope global nodad
> valid_lft forever preferred_lft forever
>
> $ip -6 route show
> 2001:db8:1::/64 dev eth0 proto kernel metric 256 pref medium
> 2001:db8:2::/64 dev eth1 proto kernel metric 256 pref medium
> 2002:db8:1::/64 dev tun0 proto kernel metric 256 pref medium
> default via 2002:db8:1::2 dev tun0 metric 1024 pref medium
>
> $nft list ruleset
> table inet filter {
> flowtable ft {
> hook ingress priority filter
> devices = { eth0, eth1 }
> }
>
> chain forward {
> type filter hook forward priority filter; policy accept;
> meta l4proto { tcp, udp } flow add @ft
> }
> }
>
> Reproducing the scenario described above using veths I got the following
> results:
> - TCP stream received from the IPIP tunnel:
> - net-next: (baseline) ~ 81Gbps
> - net-next + IP6IP6 flowtbale support: ~112Gbps
>
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
> net/ipv6/ip6_tunnel.c | 27 +++++++++++++
> net/netfilter/nf_flow_table_ip.c | 83 +++++++++++++++++++++++++++++++++-------
> 2 files changed, 97 insertions(+), 13 deletions(-)
>
> diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
> index 6405072050e0ef7521ca1fdddc4a0252e2159d2a..10341bfc16bd16a43290015952bd9a57658e6ae1 100644
> --- a/net/ipv6/ip6_tunnel.c
> +++ b/net/ipv6/ip6_tunnel.c
> @@ -1828,6 +1828,32 @@ int ip6_tnl_encap_setup(struct ip6_tnl *t,
> }
> EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup);
>
> +static int ip6_tnl_fill_forward_path(struct net_device_path_ctx *ctx,
> + struct net_device_path *path)
> +{
> + struct ip6_tnl *t = netdev_priv(ctx->dev);
> + struct flowi6 fl6 = {
> + .daddr = t->parms.raddr,
> + };
> + struct dst_entry *dst;
> + int err;
> +
> + dst = ip6_route_output(dev_net(ctx->dev), NULL, &fl6);
> + if (!dst->error) {
> + path->type = DEV_PATH_TUN;
> + path->tun.src_v6 = t->parms.laddr;
> + path->tun.dst_v6 = t->parms.raddr;
> + path->tun.l3_proto = IPPROTO_IPV6;
> + path->dev = ctx->dev;
> + ctx->dev = dst->dev;
> + }
> +
> + err = dst->error;
> + dst_release(dst);
> +
> + return err;
> +}
> +
> static const struct net_device_ops ip6_tnl_netdev_ops = {
> .ndo_init = ip6_tnl_dev_init,
> .ndo_uninit = ip6_tnl_dev_uninit,
> @@ -1836,6 +1862,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
> .ndo_change_mtu = ip6_tnl_change_mtu,
> .ndo_get_stats64 = dev_get_tstats64,
> .ndo_get_iflink = ip6_tnl_get_iflink,
> + .ndo_fill_forward_path = ip6_tnl_fill_forward_path,
> };
>
> #define IPXIPX_FEATURES (NETIF_F_SG | \
> diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
> index 14c01b59f76569170057d2465ee5953efb557bcc..8323f44a1ef172f16300a5c2c628464a99b2c47a 100644
> --- a/net/netfilter/nf_flow_table_ip.c
> +++ b/net/netfilter/nf_flow_table_ip.c
> @@ -159,6 +159,7 @@ static void nf_flow_tuple_encap(struct nf_flowtable_ctx *ctx,
> __be16 inner_proto = skb->protocol;
> struct vlan_ethhdr *veth;
> struct pppoe_hdr *phdr;
> + struct ipv6hdr *ip6h;
> struct iphdr *iph;
> u16 offset = 0;
> int i = 0;
> @@ -185,12 +186,25 @@ static void nf_flow_tuple_encap(struct nf_flowtable_ctx *ctx,
> break;
> }
>
> - if (inner_proto == htons(ETH_P_IP) &&
> - ctx->tun.proto == IPPROTO_IPIP) {
This change is done in the preceding patch, then removed again?
Looks like the previous patch should leave the
code as-is?
> + switch (inner_proto) {
> + case htons(ETH_P_IP):
> iph = (struct iphdr *)(skb_network_header(skb) + offset);
> - tuple->tun.dst_v4.s_addr = iph->daddr;
> - tuple->tun.src_v4.s_addr = iph->saddr;
> - tuple->tun.l3_proto = IPPROTO_IPIP;
> + if (ctx->tun.proto == IPPROTO_IPIP) {
> + tuple->tun.dst_v4.s_addr = iph->daddr;
> + tuple->tun.src_v4.s_addr = iph->saddr;
> + tuple->tun.l3_proto = IPPROTO_IPIP;
> + }
> + break;
> + case htons(ETH_P_IPV6):
> + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
> + if (ctx->tun.proto == IPPROTO_IPV6) {
> + tuple->tun.dst_v6 = ip6h->daddr;
> + tuple->tun.src_v6 = ip6h->saddr;
> + tuple->tun.l3_proto = IPPROTO_IPV6;
> + }
> + break;
> + default:
> + break;
> }
> }
>
> @@ -324,10 +338,45 @@ static bool nf_flow_ip4_tunnel_proto(struct nf_flowtable_ctx *ctx,
> return true;
> }
>
> -static void nf_flow_ip4_tunnel_pop(struct nf_flowtable_ctx *ctx,
> - struct sk_buff *skb)
> +static bool nf_flow_ip6_tunnel_proto(struct nf_flowtable_ctx *ctx,
> + struct sk_buff *skb)
> {
> - if (ctx->tun.proto != IPPROTO_IPIP)
> +#if IS_ENABLED(CONFIG_IPV6)
> + struct ipv6hdr *ip6h;
> + __be16 frag_off;
> + u8 nexthdr;
> + int hdrlen;
> +
> + if (!pskb_may_pull(skb, sizeof(*ip6h) + ctx->offset))
> + return false;
> +
> + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset);
> + if (ip6h->hop_limit <= 1)
> + return false;
There are multiple places where we do a pull on the skb; is this
needed? Could this be replaced by skb_header_pointer()?
Doing a skb->head realloc might be expensive and it's more
error-prone.
Or is there a requirement that the ctx->offsets can be
accessed via skb->head/data?
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH nf-next v2 1/4] netfilter: Introduce tunnel metadata info in nf_flowtable_ctx struct
2026-01-13 15:11 ` Florian Westphal
@ 2026-01-15 23:00 ` Lorenzo Bianconi
0 siblings, 0 replies; 9+ messages in thread
From: Lorenzo Bianconi @ 2026-01-15 23:00 UTC (permalink / raw)
To: Florian Westphal
Cc: Pablo Neira Ayuso, Jozsef Kadlecsik, Phil Sutter, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
David Ahern, Shuah Khan, netfilter-devel, coreteam, netdev,
linux-kselftest
[-- Attachment #1: Type: text/plain, Size: 1741 bytes --]
> Lorenzo Bianconi <lorenzo@kernel.org> wrote:
> > This is a preliminary patch to introduce IP6IP6 flowtable acceleration.
>
> Would you mind extending this a little bit?
> AFAICS this prepares for IP6IP6 by removing the 'its ipv4'
> assumptions resp. adding needed 'its ipv4' checks:
> no ipv6 support is added here.
ack, I will expand the commit log.
>
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > ---
> > net/netfilter/nf_flow_table_ip.c | 80 ++++++++++++++++++++++------------------
> > 1 file changed, 44 insertions(+), 36 deletions(-)
> >
> > diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
> > index e128b0fe9a7bf50b458df9940d629ea08c521871..14c01b59f76569170057d2465ee5953efb557bcc 100644
> > --- a/net/netfilter/nf_flow_table_ip.c
> > +++ b/net/netfilter/nf_flow_table_ip.c
> > @@ -142,7 +142,18 @@ static bool ip_has_options(unsigned int thoff)
> > return thoff != sizeof(struct iphdr);
> > }
> >
> > -static void nf_flow_tuple_encap(struct sk_buff *skb,
> > +struct nf_flowtable_ctx {
> > + const struct net_device *in;
> > + u32 offset;
> > + u32 hdrsize;
> > + struct {
> > + u32 offset;
> > + u8 proto;
> > + } tun;
> > +};
>
> Could you add comments for the members here?
ack, I will do.
>
> In particular, we now have @offset and @tun.offset.
>
> I can guess that the offset is the start of the inner
> ip header and tun.offset is the start of the header
> following the inner ip header.
ack, right.
>
> This patch would perhaps be easier to review if the
> pure move of the ctx structure and passing the extra
> 'ctx' arg would be in a separate patch.
Ack, I will do it in v3.
Regards,
Lorenzo
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH nf-next v2 2/4] netfilter: flowtable: Add IP6IP6 rx sw acceleration
2026-01-13 15:23 ` Florian Westphal
@ 2026-01-16 8:10 ` Lorenzo Bianconi
0 siblings, 0 replies; 9+ messages in thread
From: Lorenzo Bianconi @ 2026-01-16 8:10 UTC (permalink / raw)
To: Florian Westphal
Cc: Pablo Neira Ayuso, Jozsef Kadlecsik, Phil Sutter, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
David Ahern, Shuah Khan, netfilter-devel, coreteam, netdev,
linux-kselftest
[-- Attachment #1: Type: text/plain, Size: 7081 bytes --]
> Lorenzo Bianconi <lorenzo@kernel.org> wrote:
> > Introduce sw acceleration for rx path of IP6IP6 tunnels relying on the
> > netfilter flowtable infrastructure. Subsequent patches will add sw
> > acceleration for IP6IP6 tunnels tx path.
> > IP6IP6 rx sw acceleration can be tested running the following scenario
> > where the traffic is forwarded between two NICs (eth0 and eth1) and an
> > IP6IP6 tunnel is used to access a remote site (using eth1 as the underlay
> > device):
> >
> > ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (2001:db8:3::2)
> >
> > $ip addr show
> > 6: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
> > link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff
> > inet6 2001:db8:1::2/64 scope global nodad
> > valid_lft forever preferred_lft forever
> > 7: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
> > link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff
> > inet6 2001:db8:2::1/64 scope global nodad
> > valid_lft forever preferred_lft forever
> > 8: tun0@NONE: <POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000
> > link/tunnel6 2001:db8:2::1 peer 2001:db8:2::2 permaddr ce9c:2940:7dcc::
> > inet6 2002:db8:1::1/64 scope global nodad
> > valid_lft forever preferred_lft forever
> >
> > $ip -6 route show
> > 2001:db8:1::/64 dev eth0 proto kernel metric 256 pref medium
> > 2001:db8:2::/64 dev eth1 proto kernel metric 256 pref medium
> > 2002:db8:1::/64 dev tun0 proto kernel metric 256 pref medium
> > default via 2002:db8:1::2 dev tun0 metric 1024 pref medium
> >
> > $nft list ruleset
> > table inet filter {
> > flowtable ft {
> > hook ingress priority filter
> > devices = { eth0, eth1 }
> > }
> >
> > chain forward {
> > type filter hook forward priority filter; policy accept;
> > meta l4proto { tcp, udp } flow add @ft
> > }
> > }
> >
> > Reproducing the scenario described above using veths I got the following
> > results:
> > - TCP stream received from the IPIP tunnel:
> > - net-next: (baseline) ~ 81Gbps
> > - net-next + IP6IP6 flowtbale support: ~112Gbps
> >
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > ---
> > net/ipv6/ip6_tunnel.c | 27 +++++++++++++
> > net/netfilter/nf_flow_table_ip.c | 83 +++++++++++++++++++++++++++++++++-------
> > 2 files changed, 97 insertions(+), 13 deletions(-)
> >
> > diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
> > index 6405072050e0ef7521ca1fdddc4a0252e2159d2a..10341bfc16bd16a43290015952bd9a57658e6ae1 100644
> > --- a/net/ipv6/ip6_tunnel.c
> > +++ b/net/ipv6/ip6_tunnel.c
> > @@ -1828,6 +1828,32 @@ int ip6_tnl_encap_setup(struct ip6_tnl *t,
> > }
> > EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup);
> >
> > +static int ip6_tnl_fill_forward_path(struct net_device_path_ctx *ctx,
> > + struct net_device_path *path)
> > +{
> > + struct ip6_tnl *t = netdev_priv(ctx->dev);
> > + struct flowi6 fl6 = {
> > + .daddr = t->parms.raddr,
> > + };
> > + struct dst_entry *dst;
> > + int err;
> > +
> > + dst = ip6_route_output(dev_net(ctx->dev), NULL, &fl6);
> > + if (!dst->error) {
> > + path->type = DEV_PATH_TUN;
> > + path->tun.src_v6 = t->parms.laddr;
> > + path->tun.dst_v6 = t->parms.raddr;
> > + path->tun.l3_proto = IPPROTO_IPV6;
> > + path->dev = ctx->dev;
> > + ctx->dev = dst->dev;
> > + }
> > +
> > + err = dst->error;
> > + dst_release(dst);
> > +
> > + return err;
> > +}
> > +
> > static const struct net_device_ops ip6_tnl_netdev_ops = {
> > .ndo_init = ip6_tnl_dev_init,
> > .ndo_uninit = ip6_tnl_dev_uninit,
> > @@ -1836,6 +1862,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
> > .ndo_change_mtu = ip6_tnl_change_mtu,
> > .ndo_get_stats64 = dev_get_tstats64,
> > .ndo_get_iflink = ip6_tnl_get_iflink,
> > + .ndo_fill_forward_path = ip6_tnl_fill_forward_path,
> > };
> >
> > #define IPXIPX_FEATURES (NETIF_F_SG | \
> > diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
> > index 14c01b59f76569170057d2465ee5953efb557bcc..8323f44a1ef172f16300a5c2c628464a99b2c47a 100644
> > --- a/net/netfilter/nf_flow_table_ip.c
> > +++ b/net/netfilter/nf_flow_table_ip.c
> > @@ -159,6 +159,7 @@ static void nf_flow_tuple_encap(struct nf_flowtable_ctx *ctx,
> > __be16 inner_proto = skb->protocol;
> > struct vlan_ethhdr *veth;
> > struct pppoe_hdr *phdr;
> > + struct ipv6hdr *ip6h;
> > struct iphdr *iph;
> > u16 offset = 0;
> > int i = 0;
> > @@ -185,12 +186,25 @@ static void nf_flow_tuple_encap(struct nf_flowtable_ctx *ctx,
> > break;
> > }
> >
> > - if (inner_proto == htons(ETH_P_IP) &&
> > - ctx->tun.proto == IPPROTO_IPIP) {
>
> This change is done in the preceeding patch, then removed again?
> Looks like the previous patch should leave the
> code as-is?
ack, right. I will fix it.
>
> > + switch (inner_proto) {
> > + case htons(ETH_P_IP):
> > iph = (struct iphdr *)(skb_network_header(skb) + offset);
> > - tuple->tun.dst_v4.s_addr = iph->daddr;
> > - tuple->tun.src_v4.s_addr = iph->saddr;
> > - tuple->tun.l3_proto = IPPROTO_IPIP;
> > + if (ctx->tun.proto == IPPROTO_IPIP) {
> > + tuple->tun.dst_v4.s_addr = iph->daddr;
> > + tuple->tun.src_v4.s_addr = iph->saddr;
> > + tuple->tun.l3_proto = IPPROTO_IPIP;
> > + }
> > + break;
> > + case htons(ETH_P_IPV6):
> > + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
> > + if (ctx->tun.proto == IPPROTO_IPV6) {
> > + tuple->tun.dst_v6 = ip6h->daddr;
> > + tuple->tun.src_v6 = ip6h->saddr;
> > + tuple->tun.l3_proto = IPPROTO_IPV6;
> > + }
> > + break;
> > + default:
> > + break;
> > }
> > }
> >
> > @@ -324,10 +338,45 @@ static bool nf_flow_ip4_tunnel_proto(struct nf_flowtable_ctx *ctx,
> > return true;
> > }
> >
> > -static void nf_flow_ip4_tunnel_pop(struct nf_flowtable_ctx *ctx,
> > - struct sk_buff *skb)
> > +static bool nf_flow_ip6_tunnel_proto(struct nf_flowtable_ctx *ctx,
> > + struct sk_buff *skb)
> > {
> > - if (ctx->tun.proto != IPPROTO_IPIP)
> > +#if IS_ENABLED(CONFIG_IPV6)
> > + struct ipv6hdr *ip6h;
> > + __be16 frag_off;
> > + u8 nexthdr;
> > + int hdrlen;
> > +
> > + if (!pskb_may_pull(skb, sizeof(*ip6h) + ctx->offset))
> > + return false;
> > +
> > + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset);
> > + if (ip6h->hop_limit <= 1)
> > + return false;
>
> There are multiple places where we do a pull on the skb, is this
> needed? Could this be replaced by skb_header_pointer() ?
ack, I will fix it in v3.
Regards,
Lorenzo
>
> doing skb->head realloc might be expensive and its more
> error prone.
>
> Or is there a requirement that the ctx->offsets can be
> accessed via skb->head/data?
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2026-01-16 8:10 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-12-09 7:35 [PATCH nf-next v2 0/4] Add IP6IP6 flowtable SW acceleration Lorenzo Bianconi
2025-12-09 7:35 ` [PATCH nf-next v2 1/4] netfilter: Introduce tunnel metadata info in nf_flowtable_ctx struct Lorenzo Bianconi
2026-01-13 15:11 ` Florian Westphal
2026-01-15 23:00 ` Lorenzo Bianconi
2025-12-09 7:35 ` [PATCH nf-next v2 2/4] netfilter: flowtable: Add IP6IP6 rx sw acceleration Lorenzo Bianconi
2026-01-13 15:23 ` Florian Westphal
2026-01-16 8:10 ` Lorenzo Bianconi
2025-12-09 7:35 ` [PATCH nf-next v2 3/4] netfilter: flowtable: Add IP6IP6 tx " Lorenzo Bianconi
2025-12-09 7:35 ` [PATCH nf-next v2 4/4] selftests: netfilter: nft_flowtable.sh: Add IP6IP6 flowtable selftest Lorenzo Bianconi
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox