From: Ralf Lici <ralf@mandelbit.com>
To: netdev@vger.kernel.org
Cc: "Daniel Gröber" <dxld@darkboxed.org>,
"Ralf Lici" <ralf@mandelbit.com>,
"Antonio Quartulli" <antonio@mandelbit.com>,
"Andrew Lunn" <andrew+netdev@lunn.ch>,
"David S. Miller" <davem@davemloft.net>,
"Eric Dumazet" <edumazet@google.com>,
"Jakub Kicinski" <kuba@kernel.org>,
"Paolo Abeni" <pabeni@redhat.com>,
linux-kernel@vger.kernel.org
Subject: [RFC net-next 10/15] ipxlat: add 4to6 pre-fragmentation path
Date: Thu, 19 Mar 2026 16:12:19 +0100 [thread overview]
Message-ID: <20260319151230.655687-11-ralf@mandelbit.com> (raw)
In-Reply-To: <20260319151230.655687-1-ralf@mandelbit.com>
RFC 7915 requires handling packets that would exceed the translated IPv6
size constraints. Add a pre-fragmentation planning/action path that
invokes kernel fragmentation helpers before translation, carries
fragment size through skb metadata, and then reinjects fragments into
the normal translation path.
Signed-off-by: Ralf Lici <ralf@mandelbit.com>
---
drivers/net/ipxlat/dispatch.c | 99 ++++++++++++++++++++++++++++++-
drivers/net/ipxlat/translate_46.c | 59 +++++++++++++++++-
drivers/net/ipxlat/translate_46.h | 11 ++++
3 files changed, 166 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ipxlat/dispatch.c b/drivers/net/ipxlat/dispatch.c
index b8b9b930b04c..b58191d4b2c9 100644
--- a/drivers/net/ipxlat/dispatch.c
+++ b/drivers/net/ipxlat/dispatch.c
@@ -47,6 +47,16 @@ enum ipxlat_action ipxlat_translate(struct ipxlat_priv *ipxlat,
if (unlikely(ipxlat_v4_validate_skb(ipxlat, skb)))
return ipxlat_resolve_failed_action(skb);
+ /* 4->6 prefrag plan stores per-skb frag_max_size
+ * when the packet must be split before translation
+ * (DF clear and translated size
+ * above PMTU/threshold).
+ */
+ if (unlikely(ipxlat_46_plan_prefrag(ipxlat, skb)))
+ return ipxlat_resolve_failed_action(skb);
+ if (unlikely(ipxlat_skb_cb(skb)->frag_max_size))
+ return IPXLAT_ACT_PRE_FRAG;
+
if (unlikely(ipxlat_46_translate(ipxlat, skb)))
return ipxlat_resolve_failed_action(skb);
@@ -120,6 +130,76 @@ void ipxlat_emit_icmp_error(struct ipxlat_priv *ipxlat, struct sk_buff *inner)
}
}
+/* MTU callback of the synthetic fragmentation dst: report the current MTU
+ * of the net_device attached to @dst.
+ */
+static unsigned int ipxlat_frag_dst_get_mtu(const struct dst_entry *dst)
+{
+	const struct net_device *mtu_dev = dst->dev;
+
+	return READ_ONCE(mtu_dev->mtu);
+}
+
+/* dst_ops handed to dst_init() by ipxlat_46_fragment_pkt; only the address
+ * family and the MTU hook are populated.
+ */
+static struct dst_ops ipxlat_frag_dst_ops = {
+	.mtu	= ipxlat_frag_dst_get_mtu,
+	.family	= AF_UNSPEC,
+};
+
+/**
+ * ipxlat_46_frag_output - per-fragment output hook passed to ip_do_fragment
+ * @net: network namespace (not used by this callback)
+ * @sk: socket associated with the packet (not used by this callback)
+ * @skb: one fragment; the private area of skb->dev is used as the translator
+ *
+ * Feeds the fragment back into ipxlat_process_skb with allow_pre_frag set to
+ * false, so a fragment that would still need splitting is not fragmented a
+ * second time.
+ *
+ * Return: whatever ipxlat_process_skb returns for this fragment.
+ */
+static int ipxlat_46_frag_output(struct net *net, struct sock *sk,
+				 struct sk_buff *skb)
+{
+	return ipxlat_process_skb(netdev_priv(skb->dev), skb, false);
+}
+
+/**
+ * ipxlat_46_fragment_pkt - fragment oversized 4->6 input before translation
+ * @ipxlat: translator private context
+ * @skb: original packet to fragment
+ * @frag_max_size: fragment size cap, copied into IPCB(skb)->frag_max_size
+ * for ip_do_fragment. ipxlat_46_plan_prefrag derives it as the IPv6 size
+ * threshold minus the L3 header growth, which suggests it bounds the whole
+ * IPv4 fragment rather than only its payload (NOTE(review): confirm
+ * against ip_do_fragment).
+ *
+ * Installs a temporary synthetic dst, backed by a struct rtable on this
+ * function's stack, so ip_do_fragment can read MTU and then reinjects each
+ * produced fragment back into ipxlat through ipxlat_46_frag_output.
+ *
+ * The caller treats @skb as consumed once this returns, whatever the result.
+ *
+ * Return: the value returned by ip_do_fragment (0 on success, negative errno
+ * on fragmentation failure).
+ */
+static int ipxlat_46_fragment_pkt(struct ipxlat_priv *ipxlat,
+ struct sk_buff *skb, u16 frag_max_size)
+{
+ const unsigned long orig_dst = skb->_skb_refdst;
+ struct rtable ipxlat_rt = {};
+ int err;
+
+ /* ip_do_fragment needs a dst object to query mtu; ipxlat_frag_dst_ops
+ * supplies only the family and the mtu callback
+ */
+ dst_init(&ipxlat_rt.dst, &ipxlat_frag_dst_ops, NULL, DST_OBSOLETE_NONE,
+ DST_NOCOUNT);
+
+ /* use translator netdev as mtu source for the temporary dst */
+ ipxlat_rt.dst.dev = ipxlat->dev;
+
+ /* setup the skb for fragmentation: attach the synthetic dst as a NOREF
+ * dst and start from a zeroed IP control block so that frag_max_size
+ * is the only field set.
+ * NOTE(review): if ipxlat_skb_cb() shares skb->cb with IPCB(), the
+ * memset also wipes the ipxlat metadata; the cap is passed in by
+ * value, so it is not lost - confirm nothing else is needed later.
+ */
+ skb_dst_set_noref(skb, &ipxlat_rt.dst);
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ IPCB(skb)->frag_max_size = frag_max_size;
+
+ /* fragment and reinject each frag in the translator */
+ err = ip_do_fragment(dev_net(ipxlat->dev), skb->sk, skb,
+ ipxlat_46_frag_output);
+
+ /* drop original dst ref (saved in orig_dst before the swap) replaced by
+ * the synthetic NOREF dst
+ */
+ refdst_drop(orig_dst);
+
+ return err;
+}
+
static void ipxlat_forward_pkt(struct ipxlat_priv *ipxlat, struct sk_buff *skb)
{
const unsigned int len = skb->len;
@@ -141,14 +221,29 @@ int ipxlat_process_skb(struct ipxlat_priv *ipxlat, struct sk_buff *skb,
enum ipxlat_action action;
int err = -EINVAL;
- (void)allow_pre_frag;
-
action = ipxlat_translate(ipxlat, skb);
switch (action) {
case IPXLAT_ACT_FWD:
dev_dstats_tx_add(ipxlat->dev, skb->len);
ipxlat_forward_pkt(ipxlat, skb);
return 0;
+ case IPXLAT_ACT_PRE_FRAG:
+ /* prefrag is allowed only once to avoid unbounded loops */
+ if (unlikely(!allow_pre_frag)) {
+ err = -ELOOP;
+ goto drop_free;
+ }
+
+ /* fragment first, then reinject each fragment through
+ * ipxlat_process_skb via ipxlat_46_frag_output
+ */
+ err = ipxlat_46_fragment_pkt(ipxlat, skb,
+ ipxlat_skb_cb(skb)->frag_max_size);
+ /* fragment path already consumed/freed skb */
+ skb = NULL;
+ if (unlikely(err))
+ goto drop_free;
+ return 0;
case IPXLAT_ACT_ICMP_ERR:
dev_dstats_tx_dropped(ipxlat->dev);
ipxlat_emit_icmp_error(ipxlat, skb);
diff --git a/drivers/net/ipxlat/translate_46.c b/drivers/net/ipxlat/translate_46.c
index aec8500db2c2..0b79ca07c771 100644
--- a/drivers/net/ipxlat/translate_46.c
+++ b/drivers/net/ipxlat/translate_46.c
@@ -87,6 +87,63 @@ unsigned int ipxlat_46_lookup_pmtu6(struct ipxlat_priv *ipxlat,
return mtu6;
}
+/**
+ * ipxlat_46_plan_prefrag - decide whether a 4->6 packet needs pre-fragmentation
+ * @ipxlat: translator private context
+ * @skb: IPv4 packet about to be translated
+ *
+ * Estimates the translated IPv6 length (IPv4 total length plus the L3 header
+ * size change, counting a fragment header when the input is already a
+ * fragment) and compares it with the smaller of the looked-up IPv6 PMTU and
+ * ipxlat->lowest_ipv6_mtu. When the estimate is larger and DF is clear, the
+ * fragment size cap is stored in cb->frag_max_size, clamped to IP_MAX_MTU;
+ * otherwise cb->frag_max_size is left at 0.
+ *
+ * Return: 0 in every path of this implementation.
+ */
+int ipxlat_46_plan_prefrag(struct ipxlat_priv *ipxlat, struct sk_buff *skb)
+{
+	struct ipxlat_cb *meta = ipxlat_skb_cb(skb);
+	const struct iphdr *iph4 = ip_hdr(skb);
+	unsigned int hdr4_len, hdr6_len, len4, len6, pmtu6, limit6, cap;
+	int grow, frag_grow;
+
+	/* a non-zero cap at entry is unexpected: warn and start from zero */
+	if (unlikely(meta->frag_max_size)) {
+		DEBUG_NET_WARN_ON_ONCE(1);
+		meta->frag_max_size = 0;
+	}
+
+	len4 = iph_totlen(skb, iph4);
+	hdr4_len = meta->l3_hdr_len;
+
+	/* an input fragment also gains an IPv6 fragment header */
+	hdr6_len = sizeof(struct ipv6hdr);
+	if (ip_is_fragment(iph4))
+		hdr6_len += sizeof(struct frag_hdr);
+
+	grow = (int)hdr6_len - (int)hdr4_len;
+	len6 = len4 + grow;
+
+	pmtu6 = ipxlat_46_lookup_pmtu6(ipxlat, skb, iph4);
+	limit6 = min(pmtu6, READ_ONCE(ipxlat->lowest_ipv6_mtu));
+
+	if (likely(len6 <= limit6))
+		return 0;
+
+	/* DF set: never pre-fragment locally. The IPv6 forwarding path is
+	 * left to raise PTB when needed, with feedback relying on the reverse
+	 * 6->4 ICMP translation path.
+	 */
+	if (likely(iph4->frag_off & htons(IP_DF)))
+		return 0;
+
+	/* DF clear: budget for IPv6 header plus fragment header, whether or
+	 * not the input already was a fragment.
+	 */
+	frag_grow = (int)(sizeof(struct ipv6hdr) + sizeof(struct frag_hdr)) -
+		    (int)hdr4_len;
+	cap = limit6 - frag_grow;
+
+	/* per-skb cap: ipxlat_46_fragment_pkt copies it into
+	 * IPCB(skb)->frag_max_size before calling ip_do_fragment
+	 */
+	meta->frag_max_size = min_t(unsigned int, cap, IP_MAX_MTU);
+	return 0;
+}
+
/**
* ipxlat_46_translate - translate one validated packet from IPv4 to IPv6
* @ipxlat: translator private context
@@ -182,7 +239,7 @@ int ipxlat_46_translate(struct ipxlat_priv *ipxlat, struct sk_buff *skb)
err = ipxlat_46_outer_udp(skb, &outer4);
break;
case IPPROTO_ICMP:
- err = ipxlat_46_icmp(ipxlat, skb);
+ err = -EPROTONOSUPPORT;
break;
default:
err = 0;
diff --git a/drivers/net/ipxlat/translate_46.h b/drivers/net/ipxlat/translate_46.h
index 75def10d0cad..6ba409c94185 100644
--- a/drivers/net/ipxlat/translate_46.h
+++ b/drivers/net/ipxlat/translate_46.h
@@ -61,6 +61,17 @@ unsigned int ipxlat_46_lookup_pmtu6(struct ipxlat_priv *ipxlat,
const struct sk_buff *skb,
const struct iphdr *in4);
+/**
+ * ipxlat_46_plan_prefrag - decide whether IPv4 packet must be pre-fragmented
+ * @ipxlat: translator private context
+ * @skb: packet being translated
+ *
+ * Sets cb->frag_max_size when pre-fragmentation is required and leaves it at
+ * zero otherwise; ipxlat_translate returns IPXLAT_ACT_PRE_FRAG when the
+ * field is non-zero after this call.
+ *
+ * Return: 0 on success. Callers treat any non-zero value as failure, although
+ * the implementation in translate_46.c currently has no failure path.
+ */
+int ipxlat_46_plan_prefrag(struct ipxlat_priv *ipxlat, struct sk_buff *skb);
+
/**
* ipxlat_46_translate - translate outer packet from IPv4 to IPv6 in place
* @ipxlat: translator private context
--
2.53.0
next prev parent reply other threads:[~2026-03-19 15:18 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-19 15:12 [RFC net-next 00/15] Introducing ipxlat: a stateless IPv4/IPv6 translation device Ralf Lici
2026-03-19 15:12 ` [RFC net-next 01/15] drivers/net: add ipxlat netdevice skeleton and build plumbing Ralf Lici
2026-03-19 15:12 ` [RFC net-next 02/15] ipxlat: add RFC 6052 address conversion helpers Ralf Lici
2026-03-19 15:12 ` [RFC net-next 03/15] ipxlat: add packet metadata control block helpers Ralf Lici
2026-03-19 15:12 ` [RFC net-next 04/15] ipxlat: add IPv4 packet validation path Ralf Lici
2026-03-19 15:12 ` [RFC net-next 05/15] ipxlat: add IPv6 " Ralf Lici
2026-03-19 15:12 ` [RFC net-next 06/15] ipxlat: add transport checksum and offload helpers Ralf Lici
2026-03-19 15:12 ` [RFC net-next 07/15] ipxlat: add 4to6 and 6to4 TCP/UDP translation helpers Ralf Lici
2026-03-19 15:12 ` [RFC net-next 08/15] ipxlat: add translation engine and dispatch core Ralf Lici
2026-03-19 15:12 ` [RFC net-next 09/15] ipxlat: emit translator-generated ICMP errors on drop Ralf Lici
2026-03-19 15:12 ` Ralf Lici [this message]
2026-03-19 15:12 ` [RFC net-next 11/15] ipxlat: add ICMP informational translation paths Ralf Lici
2026-03-19 15:12 ` [RFC net-next 12/15] ipxlat: add ICMP error translation and quoted-inner handling Ralf Lici
2026-03-19 15:12 ` [RFC net-next 13/15] ipxlat: add netlink control plane and uapi Ralf Lici
2026-03-19 15:12 ` [RFC net-next 14/15] selftests: net: add ipxlat coverage Ralf Lici
2026-03-19 15:12 ` [RFC net-next 15/15] Documentation: networking: add ipxlat translator guide Ralf Lici
2026-03-19 22:11 ` Jonathan Corbet
2026-03-24 9:55 ` Ralf Lici
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260319151230.655687-11-ralf@mandelbit.com \
--to=ralf@mandelbit.com \
--cc=andrew+netdev@lunn.ch \
--cc=antonio@mandelbit.com \
--cc=davem@davemloft.net \
--cc=dxld@darkboxed.org \
--cc=edumazet@google.com \
--cc=kuba@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox