Netdev List

Netdev List
 help / color / mirror / Atom feed

* [Patch net-next v1 6/7] r8169: move struct ethtool_ops
From: javen @ 2026-05-06  8:13 UTC (permalink / raw)
  To: hkallweit1, nic_swsd, andrew+netdev, davem, edumazet, kuba,
	pabeni, horms
  Cc: netdev, linux-kernel, Javen Xu
In-Reply-To: <20260506081326.767-1-javen_xu@realsil.com.cn>

From: Javen Xu <javen_xu@realsil.com.cn>

This patch moves struct ethtool_ops, with no functional changes. It prepares
for the next patch.

Signed-off-by: Javen Xu <javen_xu@realsil.com.cn>
---
 drivers/net/ethernet/realtek/r8169_main.c | 56 +++++++++++------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index 9b42cee24b8a..6e682a5538d3 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -2558,34 +2558,6 @@ static int rtl8169_set_link_ksettings(struct net_device *ndev,
 	return 0;
 }
 
-static const struct ethtool_ops rtl8169_ethtool_ops = {
-	.supported_coalesce_params = ETHTOOL_COALESCE_USECS |
-				     ETHTOOL_COALESCE_MAX_FRAMES,
-	.get_drvinfo		= rtl8169_get_drvinfo,
-	.get_regs_len		= rtl8169_get_regs_len,
-	.get_link		= ethtool_op_get_link,
-	.get_coalesce		= rtl_get_coalesce,
-	.set_coalesce		= rtl_set_coalesce,
-	.get_regs		= rtl8169_get_regs,
-	.get_wol		= rtl8169_get_wol,
-	.set_wol		= rtl8169_set_wol,
-	.get_strings		= rtl8169_get_strings,
-	.get_sset_count		= rtl8169_get_sset_count,
-	.get_ethtool_stats	= rtl8169_get_ethtool_stats,
-	.get_ts_info		= ethtool_op_get_ts_info,
-	.nway_reset		= phy_ethtool_nway_reset,
-	.get_eee		= rtl8169_get_eee,
-	.set_eee		= rtl8169_set_eee,
-	.get_link_ksettings	= phy_ethtool_get_link_ksettings,
-	.set_link_ksettings	= rtl8169_set_link_ksettings,
-	.get_ringparam		= rtl8169_get_ringparam,
-	.get_pause_stats	= rtl8169_get_pause_stats,
-	.get_pauseparam		= rtl8169_get_pauseparam,
-	.set_pauseparam		= rtl8169_set_pauseparam,
-	.get_eth_mac_stats	= rtl8169_get_eth_mac_stats,
-	.get_eth_ctrl_stats	= rtl8169_get_eth_ctrl_stats,
-};
-
 static const struct rtl_chip_info *rtl8169_get_chip_version(u32 xid, bool gmii)
 {
 	/* Chips combining a 1Gbps MAC with a 100Mbps PHY */
@@ -6400,6 +6372,34 @@ static void r8169_init_napi(struct rtl8169_private *tp)
 	}
 }
 
+static const struct ethtool_ops rtl8169_ethtool_ops = {
+	.supported_coalesce_params = ETHTOOL_COALESCE_USECS |
+				     ETHTOOL_COALESCE_MAX_FRAMES,
+	.get_drvinfo		= rtl8169_get_drvinfo,
+	.get_regs_len		= rtl8169_get_regs_len,
+	.get_link		= ethtool_op_get_link,
+	.get_coalesce		= rtl_get_coalesce,
+	.set_coalesce		= rtl_set_coalesce,
+	.get_regs		= rtl8169_get_regs,
+	.get_wol		= rtl8169_get_wol,
+	.set_wol		= rtl8169_set_wol,
+	.get_strings		= rtl8169_get_strings,
+	.get_sset_count		= rtl8169_get_sset_count,
+	.get_ethtool_stats	= rtl8169_get_ethtool_stats,
+	.get_ts_info		= ethtool_op_get_ts_info,
+	.nway_reset		= phy_ethtool_nway_reset,
+	.get_eee		= rtl8169_get_eee,
+	.set_eee		= rtl8169_set_eee,
+	.get_link_ksettings	= phy_ethtool_get_link_ksettings,
+	.set_link_ksettings	= rtl8169_set_link_ksettings,
+	.get_ringparam		= rtl8169_get_ringparam,
+	.get_pause_stats	= rtl8169_get_pause_stats,
+	.get_pauseparam		= rtl8169_get_pauseparam,
+	.set_pauseparam		= rtl8169_set_pauseparam,
+	.get_eth_mac_stats	= rtl8169_get_eth_mac_stats,
+	.get_eth_ctrl_stats	= rtl8169_get_eth_ctrl_stats,
+};
+
 static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	const struct rtl_chip_info *chip;
-- 
2.43.0


^ permalink raw reply related

* [Patch net-next v1 7/7] r8169: add support for ethtool
From: javen @ 2026-05-06  8:13 UTC (permalink / raw)
  To: hkallweit1, nic_swsd, andrew+netdev, davem, edumazet, kuba,
	pabeni, horms
  Cc: netdev, linux-kernel, Javen Xu
In-Reply-To: <20260506081326.767-1-javen_xu@realsil.com.cn>

From: Javen Xu <javen_xu@realsil.com.cn>

This patch adds support for changing the number of rx queues via ethtool.
The rx queue count can be set to 1, 2, 4 or 8 with "ethtool -L eth1 rx <num>".

Signed-off-by: Javen Xu <javen_xu@realsil.com.cn>
---
 drivers/net/ethernet/realtek/r8169_main.c | 133 ++++++++++++++++++++++
 1 file changed, 133 insertions(+)

diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index 6e682a5538d3..305c5eaf16f8 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -6372,6 +6372,137 @@ static void r8169_init_napi(struct rtl8169_private *tp)
 	}
 }
 
+static void rtl8169_get_channels(struct net_device *dev,
+				 struct ethtool_channels *ch)
+{
+	struct rtl8169_private *tp = netdev_priv(dev);
+
+	ch->max_rx = tp->hw_supp_num_rx_queues;
+	ch->max_tx = 1;
+	ch->max_other = 0;
+	ch->max_combined = 0;
+
+	ch->rx_count = tp->num_rx_rings;
+	ch->tx_count = 1;
+	ch->other_count = 0;
+	ch->combined_count = 0;
+}
+
+static int rtl8169_realloc_rx(struct rtl8169_private *tp,
+			      struct rtl8169_rx_ring *new_rx,
+			      int new_count)
+{
+	int i, ret;
+
+	new_rx[0].rdsar_reg = RxDescAddrLow;
+	for (i = 1; i < new_count; i++)
+		new_rx[i].rdsar_reg = (u16)(RDSAR_Q1_LOW + (i - 1) * 8);
+
+	for (i = 0; i < new_count; i++)
+		new_rx[i].num_rx_desc = NUM_RX_DESC;
+
+	for (i = 0; i < new_count; i++) {
+		struct rtl8169_rx_ring *ring = &new_rx[i];
+
+		ring->rx_desc_alloc_size = (NUM_RX_DESC + 1) * sizeof(struct RxDesc);
+		ring->rx_desc_array = dma_alloc_coherent(&tp->pci_dev->dev,
+							 ring->rx_desc_alloc_size,
+							 &ring->rx_phy_addr,
+							 GFP_KERNEL);
+		if (!ring->rx_desc_array) {
+			ret = -ENOMEM;
+			goto err_free;
+		}
+
+		memset(ring->rx_databuff, 0, sizeof(ring->rx_databuff));
+		ret = rtl8169_rx_fill(tp, ring);
+		if (ret) {
+			dma_free_coherent(&tp->pci_dev->dev, ring->rx_desc_alloc_size,
+					  ring->rx_desc_array, ring->rx_phy_addr);
+			goto err_free;
+		}
+	}
+	return 0;
+
+err_free:
+	while (--i >= 0) {
+		rtl8169_rx_clear(tp, &new_rx[i]);
+		dma_free_coherent(&tp->pci_dev->dev, new_rx[i].rx_desc_alloc_size,
+				  new_rx[i].rx_desc_array, new_rx[i].rx_phy_addr);
+	}
+	return ret;
+}
+
+static int rtl8169_set_channels(struct net_device *dev,
+				struct ethtool_channels *ch)
+{
+	struct rtl8169_private *tp = netdev_priv(dev);
+	bool if_running = netif_running(dev);
+	struct rtl8169_rx_ring *new_rx;
+	u8 old_tx_desc_type = tp->init_rx_desc_type;
+	u8 new_desc_type;
+	bool new_rss_enable;
+	int i, ret;
+
+	if (!tp->rss_support && (ch->rx_count > 1 || ch->tx_count > 1)) {
+		netdev_warn(dev, "This chip does not support multiple channels/RSS.\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (!(tp->features & RTL_VEC_MAP_ENABLE))
+		return -EINVAL;
+
+	new_rss_enable = (ch->rx_count > 1 && tp->rss_support);
+	new_desc_type = new_rss_enable ? RX_DESC_RING_TYPE_RSS : RX_DESC_RING_TYPE_DEFAULT;
+	tp->init_rx_desc_type = new_desc_type;
+
+	if (!if_running) {
+		tp->num_rx_rings = ch->rx_count;
+		tp->rss_enable = new_rss_enable;
+		return 0;
+	}
+
+	new_rx = kcalloc(R8169_MAX_RX_QUEUES, sizeof(*new_rx), GFP_KERNEL);
+	if (!new_rx)
+		return -ENOMEM;
+
+	ret = rtl8169_realloc_rx(tp, new_rx, ch->rx_count);
+	if (ret) {
+		kfree(new_rx);
+		tp->init_rx_desc_type = old_tx_desc_type;
+		return ret;
+	}
+
+	netif_stop_queue(dev);
+	rtl8169_down(tp);
+
+	for (i = 0; i < tp->num_rx_rings; i++)
+		rtl8169_rx_clear(tp, &tp->rx_ring[i]);
+	rtl8169_free_rx_desc(tp);
+
+	tp->num_rx_rings = ch->rx_count;
+	tp->rss_enable = new_rss_enable;
+
+	memset(tp->rx_ring, 0, sizeof(tp->rx_ring));
+	memcpy(tp->rx_ring, new_rx, sizeof(*new_rx) * ch->rx_count);
+
+	for (i = 0; i < tp->hw_supp_indir_tbl_entries; i++) {
+		if (tp->rss_enable)
+			tp->rss_indir_tbl[i] = ethtool_rxfh_indir_default(i, tp->num_rx_rings);
+		else
+			tp->rss_indir_tbl[i] = 0;
+	}
+
+	rtl_set_irq_mask(tp);
+
+	rtl8169_up(tp);
+	netif_start_queue(dev);
+
+	kfree(new_rx);
+
+	return 0;
+}
+
 static const struct ethtool_ops rtl8169_ethtool_ops = {
 	.supported_coalesce_params = ETHTOOL_COALESCE_USECS |
 				     ETHTOOL_COALESCE_MAX_FRAMES,
@@ -6390,6 +6521,8 @@ static const struct ethtool_ops rtl8169_ethtool_ops = {
 	.nway_reset		= phy_ethtool_nway_reset,
 	.get_eee		= rtl8169_get_eee,
 	.set_eee		= rtl8169_set_eee,
+	.get_channels		= rtl8169_get_channels,
+	.set_channels		= rtl8169_set_channels,
 	.get_link_ksettings	= phy_ethtool_get_link_ksettings,
 	.set_link_ksettings	= rtl8169_set_link_ksettings,
 	.get_ringparam		= rtl8169_get_ringparam,
-- 
2.43.0


^ permalink raw reply related

* [PATCH ipsec-next v2] esp: Consolidate esp4 and esp6
From: Steffen Klassert @ 2026-05-06  8:15 UTC (permalink / raw)
  To: netdev; +Cc: Sabrina Dubroca, Simon Horman, Tobias Brunner, Herbert Xu, devel

This patch merges common code of esp4.c and esp6.c into
xfrm_esp.c. This almost halves the size of the ESP
implementation for the price of three indirect calls
on UDP/TCP encapsulation. No functional changes.

Changes from the RFC version:

- Fix a typo in the commit message.

- Remove some old comments that don't make sense anymore.

- Let the ->input_encap functions return the needed offsets.

- Remove the IP_MAX_MTU check from UDP/TCP encap.
  The IPv4/IPv6 local_out function will do that check later.

- The comment on IPv4 ESP offload with UDP encapsulation
  is true for IPv4 and IPv6, so remove the IPv4 from the
  comment.

Changes since v1:

- Remove some now unused code.

- Whitespace fixes.

- Cleanup the header length calculation in the input path for
  UDP/TCP encapsulation.

- Move the skb_pull_rcsum() call to the generic esp_input_done2()
  function.

- Move the skb_postpull_rcsum() call to the offloading codepath,
  it is needed only there.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Tested-by: Tobias Brunner <tobias@strongswan.org>
---
 include/net/esp.h       |    9 +-
 include/net/xfrm.h      |    3 +
 net/ipv4/esp4.c         | 1068 ++------------------------------------
 net/ipv4/esp4_offload.c |    3 +-
 net/ipv6/esp6.c         | 1086 ++-------------------------------------
 net/ipv6/esp6_offload.c |   12 +-
 net/xfrm/Makefile       |    1 +
 net/xfrm/xfrm_esp.c     | 1012 ++++++++++++++++++++++++++++++++++++
 8 files changed, 1135 insertions(+), 2059 deletions(-)
 create mode 100644 net/xfrm/xfrm_esp.c

diff --git a/include/net/esp.h b/include/net/esp.h
index 322950727dd0..e1ae485ac1c7 100644
--- a/include/net/esp.h
+++ b/include/net/esp.h
@@ -44,7 +44,10 @@ struct esp_info {
 int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp);
 int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp);
 int esp_input_done2(struct sk_buff *skb, int err);
-int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp);
-int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp);
-int esp6_input_done2(struct sk_buff *skb, int err);
+int esp_init_aead(struct xfrm_state *x, struct netlink_ext_ack *extack);
+int esp_init_authenc(struct xfrm_state *x, struct netlink_ext_ack *extack);
+void esp_destroy(struct xfrm_state *x);
+int esp_input(struct xfrm_state *x, struct sk_buff *skb);
+int esp_output(struct xfrm_state *x, struct sk_buff *skb);
+
 #endif
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 10d3edde6b2f..d99b6f57ecda 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -455,7 +455,10 @@ struct xfrm_type {
 					      struct netlink_ext_ack *extack);
 	void			(*destructor)(struct xfrm_state *);
 	int			(*input)(struct xfrm_state *, struct sk_buff *skb);
+	int			(*input_encap)(struct sk_buff *skb, struct xfrm_state *x);
 	int			(*output)(struct xfrm_state *, struct sk_buff *pskb);
+	struct sock		*(*find_tcp_sk)(struct xfrm_state *x);
+	void			(*output_encap_csum)(struct sk_buff *skb);
 	int			(*reject)(struct xfrm_state *, struct sk_buff *,
 					  const struct flowi *);
 };
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 6dfc0bcdef65..0498bbb5060e 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -1,123 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0-only
-#define pr_fmt(fmt) "IPsec: " fmt
 
 #include <crypto/aead.h>
 #include <crypto/authenc.h>
-#include <linux/err.h>
-#include <linux/module.h>
 #include <net/ip.h>
-#include <net/xfrm.h>
 #include <net/esp.h>
-#include <linux/scatterlist.h>
-#include <linux/kernel.h>
-#include <linux/pfkeyv2.h>
-#include <linux/rtnetlink.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/in6.h>
 #include <net/icmp.h>
-#include <net/protocol.h>
-#include <net/udp.h>
-#include <net/tcp.h>
 #include <net/espintcp.h>
-#include <linux/skbuff_ref.h>
-
-#include <linux/highmem.h>
-
-struct esp_skb_cb {
-	struct xfrm_skb_cb xfrm;
-	void *tmp;
-};
-
-struct esp_output_extra {
-	__be32 seqhi;
-	u32 esphoff;
-};
-
-#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
-
-/*
- * Allocate an AEAD request structure with extra space for SG and IV.
- *
- * For alignment considerations the IV is placed at the front, followed
- * by the request and finally the SG list.
- *
- * TODO: Use spare space in skb for this where possible.
- */
-static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int extralen)
-{
-	unsigned int len;
-
-	len = extralen;
-
-	len += crypto_aead_ivsize(aead);
-
-	if (len) {
-		len += crypto_aead_alignmask(aead) &
-		       ~(crypto_tfm_ctx_alignment() - 1);
-		len = ALIGN(len, crypto_tfm_ctx_alignment());
-	}
-
-	len += sizeof(struct aead_request) + crypto_aead_reqsize(aead);
-	len = ALIGN(len, __alignof__(struct scatterlist));
-
-	len += sizeof(struct scatterlist) * nfrags;
-
-	return kmalloc(len, GFP_ATOMIC);
-}
-
-static inline void *esp_tmp_extra(void *tmp)
-{
-	return PTR_ALIGN(tmp, __alignof__(struct esp_output_extra));
-}
-
-static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int extralen)
-{
-	return crypto_aead_ivsize(aead) ?
-	       PTR_ALIGN((u8 *)tmp + extralen,
-			 crypto_aead_alignmask(aead) + 1) : tmp + extralen;
-}
-
-static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
-{
-	struct aead_request *req;
-
-	req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
-				crypto_tfm_ctx_alignment());
-	aead_request_set_tfm(req, aead);
-	return req;
-}
-
-static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
-					     struct aead_request *req)
-{
-	return (void *)ALIGN((unsigned long)(req + 1) +
-			     crypto_aead_reqsize(aead),
-			     __alignof__(struct scatterlist));
-}
-
-static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
-{
-	struct crypto_aead *aead = x->data;
-	int extralen = 0;
-	u8 *iv;
-	struct aead_request *req;
-	struct scatterlist *sg;
-
-	if (x->props.flags & XFRM_STATE_ESN)
-		extralen += sizeof(struct esp_output_extra);
-
-	iv = esp_tmp_iv(aead, tmp, extralen);
-	req = esp_tmp_req(aead, iv);
-
-	/* Unref skb_frag_pages in the src scatterlist if necessary.
-	 * Skip the first sg which comes from skb->data.
-	 */
-	if (req->src != req->dst)
-		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
-			skb_page_unref(page_to_netmem(sg_page(sg)),
-				       skb->pp_recycle);
-}
 
 #ifdef CONFIG_INET_ESPINTCP
 static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
@@ -145,790 +33,69 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
 	return sk;
 }
 
-static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
-{
-	struct sock *sk;
-	int err;
-
-	rcu_read_lock();
-
-	sk = esp_find_tcp_sk(x);
-	err = PTR_ERR_OR_ZERO(sk);
-	if (err) {
-		kfree_skb(skb);
-		goto out;
-	}
-
-	bh_lock_sock(sk);
-	if (sock_owned_by_user(sk))
-		err = espintcp_queue_out(sk, skb);
-	else
-		err = espintcp_push_skb(sk, skb);
-	bh_unlock_sock(sk);
-
-	sock_put(sk);
-
-out:
-	rcu_read_unlock();
-	return err;
-}
-
-static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk,
-				   struct sk_buff *skb)
-{
-	struct dst_entry *dst = skb_dst(skb);
-	struct xfrm_state *x = dst->xfrm;
-
-	return esp_output_tcp_finish(x, skb);
-}
-
-static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
-{
-	int err;
-
-	local_bh_disable();
-	err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb);
-	local_bh_enable();
-
-	/* EINPROGRESS just happens to do the right thing.  It
-	 * actually means that the skb has been consumed and
-	 * isn't coming back.
-	 */
-	return err ?: -EINPROGRESS;
-}
 #else
-static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
 {
 	WARN_ON(1);
-	return -EOPNOTSUPP;
+	return ERR_PTR(-EOPNOTSUPP);
 }
 #endif
 
-static void esp_output_done(void *data, int err)
-{
-	struct sk_buff *skb = data;
-	struct xfrm_offload *xo = xfrm_offload(skb);
-	void *tmp;
-	struct xfrm_state *x;
-
-	if (xo && (xo->flags & XFRM_DEV_RESUME)) {
-		struct sec_path *sp = skb_sec_path(skb);
-
-		x = sp->xvec[sp->len - 1];
-	} else {
-		x = skb_dst(skb)->xfrm;
-	}
-
-	tmp = ESP_SKB_CB(skb)->tmp;
-	esp_ssg_unref(x, tmp, skb);
-	kfree(tmp);
-
-	if (xo && (xo->flags & XFRM_DEV_RESUME)) {
-		if (err) {
-			XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
-			kfree_skb(skb);
-			return;
-		}
-
-		skb_push(skb, skb->data - skb_mac_header(skb));
-		secpath_reset(skb);
-		xfrm_dev_resume(skb);
-	} else {
-		if (!err &&
-		    x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) {
-			err = esp_output_tail_tcp(x, skb);
-			if (err != -EINPROGRESS)
-				kfree_skb(skb);
-		} else {
-			xfrm_output_resume(skb_to_full_sk(skb), skb, err);
-		}
-	}
-}
-
-/* Move ESP header back into place. */
-static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
-{
-	struct ip_esp_hdr *esph = (void *)(skb->data + offset);
-	void *tmp = ESP_SKB_CB(skb)->tmp;
-	__be32 *seqhi = esp_tmp_extra(tmp);
-
-	esph->seq_no = esph->spi;
-	esph->spi = *seqhi;
-}
-
-static void esp_output_restore_header(struct sk_buff *skb)
-{
-	void *tmp = ESP_SKB_CB(skb)->tmp;
-	struct esp_output_extra *extra = esp_tmp_extra(tmp);
-
-	esp_restore_header(skb, skb_transport_offset(skb) + extra->esphoff -
-				sizeof(__be32));
-}
-
-static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb,
-					       struct xfrm_state *x,
-					       struct ip_esp_hdr *esph,
-					       struct esp_output_extra *extra)
-{
-	/* For ESN we move the header forward by 4 bytes to
-	 * accommodate the high bits.  We will move it back after
-	 * encryption.
-	 */
-	if ((x->props.flags & XFRM_STATE_ESN)) {
-		__u32 seqhi;
-		struct xfrm_offload *xo = xfrm_offload(skb);
-
-		if (xo)
-			seqhi = xo->seq.hi;
-		else
-			seqhi = XFRM_SKB_CB(skb)->seq.output.hi;
-
-		extra->esphoff = (unsigned char *)esph -
-				 skb_transport_header(skb);
-		esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
-		extra->seqhi = esph->spi;
-		esph->seq_no = htonl(seqhi);
-	}
-
-	esph->spi = x->id.spi;
-
-	return esph;
-}
-
-static void esp_output_done_esn(void *data, int err)
+static void esp4_output_encap_csum(struct sk_buff *skb)
 {
-	struct sk_buff *skb = data;
-
-	esp_output_restore_header(skb);
-	esp_output_done(data, err);
-}
-
-static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb,
-					       int encap_type,
-					       struct esp_info *esp,
-					       __be16 sport,
-					       __be16 dport)
-{
-	struct udphdr *uh;
-	unsigned int len;
-	struct xfrm_offload *xo = xfrm_offload(skb);
-
-	len = skb->len + esp->tailen - skb_transport_offset(skb);
-	if (len + sizeof(struct iphdr) > IP_MAX_MTU)
-		return ERR_PTR(-EMSGSIZE);
-
-	uh = (struct udphdr *)esp->esph;
-	uh->source = sport;
-	uh->dest = dport;
-	uh->len = htons(len);
-	uh->check = 0;
-
-	/* For IPv4 ESP with UDP encapsulation, if xo is not null, the skb is in the crypto offload
-	 * data path, which means that esp_output_udp_encap is called outside of the XFRM stack.
-	 * In this case, the mac header doesn't point to the IPv4 protocol field, so don't set it.
-	 */
-	if (!xo || encap_type != UDP_ENCAP_ESPINUDP)
-		*skb_mac_header(skb) = IPPROTO_UDP;
-
-	return (struct ip_esp_hdr *)(uh + 1);
-}
-
-#ifdef CONFIG_INET_ESPINTCP
-static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x,
-						    struct sk_buff *skb,
-						    struct esp_info *esp)
-{
-	__be16 *lenp = (void *)esp->esph;
-	struct ip_esp_hdr *esph;
-	unsigned int len;
-	struct sock *sk;
-
-	len = skb->len + esp->tailen - skb_transport_offset(skb);
-	if (len > IP_MAX_MTU)
-		return ERR_PTR(-EMSGSIZE);
-
-	rcu_read_lock();
-	sk = esp_find_tcp_sk(x);
-	rcu_read_unlock();
-
-	if (IS_ERR(sk))
-		return ERR_CAST(sk);
-
-	sock_put(sk);
-
-	*lenp = htons(len);
-	esph = (struct ip_esp_hdr *)(lenp + 1);
-
-	return esph;
-}
-#else
-static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x,
-						    struct sk_buff *skb,
-						    struct esp_info *esp)
-{
-	return ERR_PTR(-EOPNOTSUPP);
 }
-#endif
 
-static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb,
-			    struct esp_info *esp)
+static int esp4_input_encap(struct sk_buff *skb, struct xfrm_state *x)
 {
+	const struct iphdr *iph = ip_hdr(skb);
+	int ihl = iph->ihl * 4;
 	struct xfrm_encap_tmpl *encap = x->encap;
-	struct ip_esp_hdr *esph;
-	__be16 sport, dport;
-	int encap_type;
-
-	spin_lock_bh(&x->lock);
-	sport = encap->encap_sport;
-	dport = encap->encap_dport;
-	encap_type = encap->encap_type;
-	spin_unlock_bh(&x->lock);
+	struct tcphdr *th = (void *)(skb_network_header(skb) + ihl);
+	struct udphdr *uh = (void *)(skb_network_header(skb) + ihl);
+	int ret = skb_network_header_len(skb);
+	__be16 source;
 
-	switch (encap_type) {
-	default:
-	case UDP_ENCAP_ESPINUDP:
-		esph = esp_output_udp_encap(skb, encap_type, esp, sport, dport);
-		break;
+	switch (x->encap->encap_type) {
 	case TCP_ENCAP_ESPINTCP:
-		esph = esp_output_tcp_encap(x, skb, esp);
+		source = th->source;
+		ret -= sizeof(struct tcphdr);
 		break;
-	}
-
-	if (IS_ERR(esph))
-		return PTR_ERR(esph);
-
-	esp->esph = esph;
-
-	return 0;
-}
-
-int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
-{
-	u8 *tail;
-	int nfrags;
-	int esph_offset;
-	struct page *page;
-	struct sk_buff *trailer;
-	int tailen = esp->tailen;
-
-	/* this is non-NULL only with TCP/UDP Encapsulation */
-	if (x->encap) {
-		int err = esp_output_encap(x, skb, esp);
-
-		if (err < 0)
-			return err;
-	}
-
-	if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE ||
-	    ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE)
-		goto cow;
-
-	if (!skb_cloned(skb)) {
-		if (tailen <= skb_tailroom(skb)) {
-			nfrags = 1;
-			trailer = skb;
-			tail = skb_tail_pointer(trailer);
-
-			goto skip_cow;
-		} else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
-			   && !skb_has_frag_list(skb)) {
-			int allocsize;
-			struct sock *sk = skb->sk;
-			struct page_frag *pfrag = &x->xfrag;
-
-			esp->inplace = false;
-
-			allocsize = ALIGN(tailen, L1_CACHE_BYTES);
-
-			spin_lock_bh(&x->lock);
-
-			if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
-				spin_unlock_bh(&x->lock);
-				goto cow;
-			}
-
-			page = pfrag->page;
-			get_page(page);
-
-			tail = page_address(page) + pfrag->offset;
-
-			esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
-
-			nfrags = skb_shinfo(skb)->nr_frags;
-
-			__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
-					     tailen);
-			skb_shinfo(skb)->nr_frags = ++nfrags;
-
-			pfrag->offset = pfrag->offset + allocsize;
-
-			spin_unlock_bh(&x->lock);
-
-			nfrags++;
-
-			skb_len_add(skb, tailen);
-			if (sk && sk_fullsock(sk))
-				refcount_add(tailen, &sk->sk_wmem_alloc);
-
-			goto out;
-		}
-	}
-
-cow:
-	esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb);
-
-	nfrags = skb_cow_data(skb, tailen, &trailer);
-	if (nfrags < 0)
-		goto out;
-	tail = skb_tail_pointer(trailer);
-	esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset);
-
-skip_cow:
-	esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
-	pskb_put(skb, trailer, tailen);
-
-out:
-	return nfrags;
-}
-EXPORT_SYMBOL_GPL(esp_output_head);
-
-int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
-{
-	u8 *iv;
-	int alen;
-	void *tmp;
-	int ivlen;
-	int assoclen;
-	int extralen;
-	struct page *page;
-	struct ip_esp_hdr *esph;
-	struct crypto_aead *aead;
-	struct aead_request *req;
-	struct scatterlist *sg, *dsg;
-	struct esp_output_extra *extra;
-	int err = -ENOMEM;
-
-	assoclen = sizeof(struct ip_esp_hdr);
-	extralen = 0;
-
-	if (x->props.flags & XFRM_STATE_ESN) {
-		extralen += sizeof(*extra);
-		assoclen += sizeof(__be32);
-	}
-
-	aead = x->data;
-	alen = crypto_aead_authsize(aead);
-	ivlen = crypto_aead_ivsize(aead);
-
-	tmp = esp_alloc_tmp(aead, esp->nfrags + 2, extralen);
-	if (!tmp)
-		goto error;
-
-	extra = esp_tmp_extra(tmp);
-	iv = esp_tmp_iv(aead, tmp, extralen);
-	req = esp_tmp_req(aead, iv);
-	sg = esp_req_sg(aead, req);
-
-	if (esp->inplace)
-		dsg = sg;
-	else
-		dsg = &sg[esp->nfrags];
-
-	esph = esp_output_set_extra(skb, x, esp->esph, extra);
-	esp->esph = esph;
-
-	sg_init_table(sg, esp->nfrags);
-	err = skb_to_sgvec(skb, sg,
-		           (unsigned char *)esph - skb->data,
-		           assoclen + ivlen + esp->clen + alen);
-	if (unlikely(err < 0))
-		goto error_free;
-
-	if (!esp->inplace) {
-		int allocsize;
-		struct page_frag *pfrag = &x->xfrag;
-
-		allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
-
-		spin_lock_bh(&x->lock);
-		if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
-			spin_unlock_bh(&x->lock);
-			goto error_free;
-		}
-
-		skb_shinfo(skb)->nr_frags = 1;
-
-		page = pfrag->page;
-		get_page(page);
-		/* replace page frags in skb with new page */
-		__skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
-		pfrag->offset = pfrag->offset + allocsize;
-		spin_unlock_bh(&x->lock);
-
-		sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
-		err = skb_to_sgvec(skb, dsg,
-			           (unsigned char *)esph - skb->data,
-			           assoclen + ivlen + esp->clen + alen);
-		if (unlikely(err < 0))
-			goto error_free;
-	}
-
-	if ((x->props.flags & XFRM_STATE_ESN))
-		aead_request_set_callback(req, 0, esp_output_done_esn, skb);
-	else
-		aead_request_set_callback(req, 0, esp_output_done, skb);
-
-	aead_request_set_crypt(req, sg, dsg, ivlen + esp->clen, iv);
-	aead_request_set_ad(req, assoclen);
-
-	memset(iv, 0, ivlen);
-	memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&esp->seqno + 8 - min(ivlen, 8),
-	       min(ivlen, 8));
-
-	ESP_SKB_CB(skb)->tmp = tmp;
-	err = crypto_aead_encrypt(req);
-
-	switch (err) {
-	case -EINPROGRESS:
-		goto error;
-
-	case -ENOSPC:
-		err = NET_XMIT_DROP;
+	case UDP_ENCAP_ESPINUDP:
+		source = uh->source;
+		ret -= sizeof(struct udphdr);
 		break;
-
-	case 0:
-		if ((x->props.flags & XFRM_STATE_ESN))
-			esp_output_restore_header(skb);
-	}
-
-	if (sg != dsg)
-		esp_ssg_unref(x, tmp, skb);
-
-	if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
-		err = esp_output_tail_tcp(x, skb);
-
-error_free:
-	kfree(tmp);
-error:
-	return err;
-}
-EXPORT_SYMBOL_GPL(esp_output_tail);
-
-static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
-{
-	int alen;
-	int blksize;
-	struct ip_esp_hdr *esph;
-	struct crypto_aead *aead;
-	struct esp_info esp;
-
-	esp.inplace = true;
-
-	esp.proto = *skb_mac_header(skb);
-	*skb_mac_header(skb) = IPPROTO_ESP;
-
-	/* skb is pure payload to encrypt */
-
-	aead = x->data;
-	alen = crypto_aead_authsize(aead);
-
-	esp.tfclen = 0;
-	if (x->tfcpad) {
-		struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
-		u32 padto;
-
-		padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached));
-		if (skb->len < padto)
-			esp.tfclen = padto - skb->len;
-	}
-	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
-	esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize);
-	esp.plen = esp.clen - skb->len - esp.tfclen;
-	esp.tailen = esp.tfclen + esp.plen + alen;
-
-	esp.esph = ip_esp_hdr(skb);
-
-	esp.nfrags = esp_output_head(x, skb, &esp);
-	if (esp.nfrags < 0)
-		return esp.nfrags;
-
-	esph = esp.esph;
-	esph->spi = x->id.spi;
-
-	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
-	esp.seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
-				 ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
-
-	skb_push(skb, -skb_network_offset(skb));
-
-	return esp_output_tail(x, skb, &esp);
-}
-
-static inline int esp_remove_trailer(struct sk_buff *skb)
-{
-	struct xfrm_state *x = xfrm_input_state(skb);
-	struct crypto_aead *aead = x->data;
-	int alen, hlen, elen;
-	int padlen, trimlen;
-	__wsum csumdiff;
-	u8 nexthdr[2];
-	int ret;
-
-	alen = crypto_aead_authsize(aead);
-	hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
-	elen = skb->len - hlen;
-
-	if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2))
-		BUG();
-
-	ret = -EINVAL;
-	padlen = nexthdr[0];
-	if (padlen + 2 + alen >= elen) {
-		net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n",
-				    padlen + 2, elen - alen);
-		goto out;
-	}
-
-	trimlen = alen + padlen + 2;
-	if (skb->ip_summed == CHECKSUM_COMPLETE) {
-		csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0);
-		skb->csum = csum_block_sub(skb->csum, csumdiff,
-					   skb->len - trimlen);
-	}
-	ret = pskb_trim(skb, skb->len - trimlen);
-	if (unlikely(ret))
-		return ret;
-
-	ret = nexthdr[1];
-
-out:
-	return ret;
-}
-
-int esp_input_done2(struct sk_buff *skb, int err)
-{
-	const struct iphdr *iph;
-	struct xfrm_state *x = xfrm_input_state(skb);
-	struct xfrm_offload *xo = xfrm_offload(skb);
-	struct crypto_aead *aead = x->data;
-	int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
-	int ihl;
-
-	if (!xo || !(xo->flags & CRYPTO_DONE))
-		kfree(ESP_SKB_CB(skb)->tmp);
-
-	if (unlikely(err))
-		goto out;
-
-	err = esp_remove_trailer(skb);
-	if (unlikely(err < 0))
+	default:
+		WARN_ON_ONCE(1);
+		ret = -1;
 		goto out;
-
-	iph = ip_hdr(skb);
-	ihl = iph->ihl * 4;
-
-	if (x->encap) {
-		struct xfrm_encap_tmpl *encap = x->encap;
-		struct tcphdr *th = (void *)(skb_network_header(skb) + ihl);
-		struct udphdr *uh = (void *)(skb_network_header(skb) + ihl);
-		__be16 source;
-
-		switch (x->encap->encap_type) {
-		case TCP_ENCAP_ESPINTCP:
-			source = th->source;
-			break;
-		case UDP_ENCAP_ESPINUDP:
-			source = uh->source;
-			break;
-		default:
-			WARN_ON_ONCE(1);
-			err = -EINVAL;
-			goto out;
-		}
-
-		/*
-		 * 1) if the NAT-T peer's IP or port changed then
-		 *    advertise the change to the keying daemon.
-		 *    This is an inbound SA, so just compare
-		 *    SRC ports.
-		 */
-		if (iph->saddr != x->props.saddr.a4 ||
-		    source != encap->encap_sport) {
-			xfrm_address_t ipaddr;
-
-			ipaddr.a4 = iph->saddr;
-			km_new_mapping(x, &ipaddr, source);
-
-			/* XXX: perhaps add an extra
-			 * policy check here, to see
-			 * if we should allow or
-			 * reject a packet from a
-			 * different source
-			 * address/port.
-			 */
-		}
-
-		/*
-		 * 2) ignore UDP/TCP checksums in case
-		 *    of NAT-T in Transport Mode, or
-		 *    perform other post-processing fixes
-		 *    as per draft-ietf-ipsec-udp-encaps-06,
-		 *    section 3.1.2
-		 */
-		if (x->props.mode == XFRM_MODE_TRANSPORT)
-			skb->ip_summed = CHECKSUM_UNNECESSARY;
 	}
 
-	skb_pull_rcsum(skb, hlen);
-	if (x->props.mode == XFRM_MODE_TUNNEL ||
-	    x->props.mode == XFRM_MODE_IPTFS)
-		skb_reset_transport_header(skb);
-	else
-		skb_set_transport_header(skb, -ihl);
-
-	/* RFC4303: Drop dummy packets without any error */
-	if (err == IPPROTO_NONE)
-		err = -EINVAL;
-
-out:
-	return err;
-}
-EXPORT_SYMBOL_GPL(esp_input_done2);
-
-static void esp_input_done(void *data, int err)
-{
-	struct sk_buff *skb = data;
-
-	xfrm_input_resume(skb, esp_input_done2(skb, err));
-}
-
-static void esp_input_restore_header(struct sk_buff *skb)
-{
-	esp_restore_header(skb, 0);
-	__skb_pull(skb, 4);
-}
-
-static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
-{
-	struct xfrm_state *x = xfrm_input_state(skb);
-	struct ip_esp_hdr *esph;
-
-	/* For ESN we move the header forward by 4 bytes to
-	 * accommodate the high bits.  We will move it back after
-	 * decryption.
+	/*
+	 * 1) if the NAT-T peer's IP or port changed then
+	 *    advertise the change to the keying daemon.
+	 *    This is an inbound SA, so just compare
+	 *    SRC ports.
 	 */
-	if ((x->props.flags & XFRM_STATE_ESN)) {
-		esph = skb_push(skb, 4);
-		*seqhi = esph->spi;
-		esph->spi = esph->seq_no;
-		esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
-	}
-}
-
-static void esp_input_done_esn(void *data, int err)
-{
-	struct sk_buff *skb = data;
+	if (iph->saddr != x->props.saddr.a4 ||
+	    source != encap->encap_sport) {
+		xfrm_address_t ipaddr;
 
-	esp_input_restore_header(skb);
-	esp_input_done(data, err);
-}
-
-/*
- * Note: detecting truncated vs. non-truncated authentication data is very
- * expensive, so we only support truncated data, which is the recommended
- * and common case.
- */
-static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
-{
-	struct crypto_aead *aead = x->data;
-	struct aead_request *req;
-	struct sk_buff *trailer;
-	int ivlen = crypto_aead_ivsize(aead);
-	int elen = skb->len - sizeof(struct ip_esp_hdr) - ivlen;
-	int nfrags;
-	int assoclen;
-	int seqhilen;
-	__be32 *seqhi;
-	void *tmp;
-	u8 *iv;
-	struct scatterlist *sg;
-	int err = -EINVAL;
-
-	if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + ivlen))
-		goto out;
-
-	if (elen <= 0)
-		goto out;
-
-	assoclen = sizeof(struct ip_esp_hdr);
-	seqhilen = 0;
-
-	if (x->props.flags & XFRM_STATE_ESN) {
-		seqhilen += sizeof(__be32);
-		assoclen += seqhilen;
-	}
-
-	if (!skb_cloned(skb)) {
-		if (!skb_is_nonlinear(skb)) {
-			nfrags = 1;
-
-			goto skip_cow;
-		} else if (!skb_has_frag_list(skb)) {
-			nfrags = skb_shinfo(skb)->nr_frags;
-			nfrags++;
-
-			goto skip_cow;
-		}
-	}
-
-	err = skb_cow_data(skb, 0, &trailer);
-	if (err < 0)
-		goto out;
-
-	nfrags = err;
-
-skip_cow:
-	err = -ENOMEM;
-	tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
-	if (!tmp)
-		goto out;
-
-	ESP_SKB_CB(skb)->tmp = tmp;
-	seqhi = esp_tmp_extra(tmp);
-	iv = esp_tmp_iv(aead, tmp, seqhilen);
-	req = esp_tmp_req(aead, iv);
-	sg = esp_req_sg(aead, req);
-
-	esp_input_set_header(skb, seqhi);
-
-	sg_init_table(sg, nfrags);
-	err = skb_to_sgvec(skb, sg, 0, skb->len);
-	if (unlikely(err < 0)) {
-		kfree(tmp);
-		goto out;
+		ipaddr.a4 = iph->saddr;
+		km_new_mapping(x, &ipaddr, source);
 	}
 
-	skb->ip_summed = CHECKSUM_NONE;
-
-	if ((x->props.flags & XFRM_STATE_ESN))
-		aead_request_set_callback(req, 0, esp_input_done_esn, skb);
-	else
-		aead_request_set_callback(req, 0, esp_input_done, skb);
-
-	aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
-	aead_request_set_ad(req, assoclen);
-
-	err = crypto_aead_decrypt(req);
-	if (err == -EINPROGRESS)
-		goto out;
-
-	if ((x->props.flags & XFRM_STATE_ESN))
-		esp_input_restore_header(skb);
-
-	err = esp_input_done2(skb, err);
+	/*
+	 * 2) ignore UDP/TCP checksums in case
+	 *    of NAT-T in Transport Mode, or
+	 *    perform other post-processing fixes
+	 *    as per draft-ietf-ipsec-udp-encaps-06,
+	 *    section 3.1.2
+	 */
+	if (x->props.mode == XFRM_MODE_TRANSPORT)
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
 
 out:
-	return err;
+	return ret;
 }
 
 static int esp4_err(struct sk_buff *skb, u32 info)
@@ -963,146 +130,6 @@ static int esp4_err(struct sk_buff *skb, u32 info)
 	return 0;
 }
 
-static void esp_destroy(struct xfrm_state *x)
-{
-	struct crypto_aead *aead = x->data;
-
-	if (!aead)
-		return;
-
-	crypto_free_aead(aead);
-}
-
-static int esp_init_aead(struct xfrm_state *x, struct netlink_ext_ack *extack)
-{
-	char aead_name[CRYPTO_MAX_ALG_NAME];
-	struct crypto_aead *aead;
-	int err;
-
-	if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
-		     x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) {
-		NL_SET_ERR_MSG(extack, "Algorithm name is too long");
-		return -ENAMETOOLONG;
-	}
-
-	aead = crypto_alloc_aead(aead_name, 0, 0);
-	err = PTR_ERR(aead);
-	if (IS_ERR(aead))
-		goto error;
-
-	x->data = aead;
-
-	err = crypto_aead_setkey(aead, x->aead->alg_key,
-				 (x->aead->alg_key_len + 7) / 8);
-	if (err)
-		goto error;
-
-	err = crypto_aead_setauthsize(aead, x->aead->alg_icv_len / 8);
-	if (err)
-		goto error;
-
-	return 0;
-
-error:
-	NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
-	return err;
-}
-
-static int esp_init_authenc(struct xfrm_state *x,
-			    struct netlink_ext_ack *extack)
-{
-	struct crypto_aead *aead;
-	struct crypto_authenc_key_param *param;
-	struct rtattr *rta;
-	char *key;
-	char *p;
-	char authenc_name[CRYPTO_MAX_ALG_NAME];
-	unsigned int keylen;
-	int err;
-
-	err = -ENAMETOOLONG;
-
-	if ((x->props.flags & XFRM_STATE_ESN)) {
-		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
-			     "%s%sauthencesn(%s,%s)%s",
-			     x->geniv ?: "", x->geniv ? "(" : "",
-			     x->aalg ? x->aalg->alg_name : "digest_null",
-			     x->ealg->alg_name,
-			     x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
-			NL_SET_ERR_MSG(extack, "Algorithm name is too long");
-			goto error;
-		}
-	} else {
-		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
-			     "%s%sauthenc(%s,%s)%s",
-			     x->geniv ?: "", x->geniv ? "(" : "",
-			     x->aalg ? x->aalg->alg_name : "digest_null",
-			     x->ealg->alg_name,
-			     x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
-			NL_SET_ERR_MSG(extack, "Algorithm name is too long");
-			goto error;
-		}
-	}
-
-	aead = crypto_alloc_aead(authenc_name, 0, 0);
-	err = PTR_ERR(aead);
-	if (IS_ERR(aead)) {
-		NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
-		goto error;
-	}
-
-	x->data = aead;
-
-	keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
-		 (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
-	err = -ENOMEM;
-	key = kmalloc(keylen, GFP_KERNEL);
-	if (!key)
-		goto error;
-
-	p = key;
-	rta = (void *)p;
-	rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
-	rta->rta_len = RTA_LENGTH(sizeof(*param));
-	param = RTA_DATA(rta);
-	p += RTA_SPACE(sizeof(*param));
-
-	if (x->aalg) {
-		struct xfrm_algo_desc *aalg_desc;
-
-		memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8);
-		p += (x->aalg->alg_key_len + 7) / 8;
-
-		aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
-		BUG_ON(!aalg_desc);
-
-		err = -EINVAL;
-		if (aalg_desc->uinfo.auth.icv_fullbits / 8 !=
-		    crypto_aead_authsize(aead)) {
-			NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
-			goto free_key;
-		}
-
-		err = crypto_aead_setauthsize(
-			aead, x->aalg->alg_trunc_len / 8);
-		if (err) {
-			NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
-			goto free_key;
-		}
-	}
-
-	param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
-	memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8);
-
-	err = crypto_aead_setkey(aead, key, keylen);
-
-free_key:
-	kfree_sensitive(key);
-
-error:
-	return err;
-}
-
 static int esp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 {
 	struct crypto_aead *aead;
@@ -1167,13 +194,16 @@ static int esp4_rcv_cb(struct sk_buff *skb, int err)
 
 static const struct xfrm_type esp_type =
 {
-	.owner		= THIS_MODULE,
-	.proto	     	= IPPROTO_ESP,
-	.flags		= XFRM_TYPE_REPLAY_PROT,
-	.init_state	= esp_init_state,
-	.destructor	= esp_destroy,
-	.input		= esp_input,
-	.output		= esp_output,
+	.owner			= THIS_MODULE,
+	.proto			= IPPROTO_ESP,
+	.flags			= XFRM_TYPE_REPLAY_PROT,
+	.init_state		= esp_init_state,
+	.destructor		= esp_destroy,
+	.input			= esp_input,
+	.input_encap		= esp4_input_encap,
+	.output			= esp_output,
+	.find_tcp_sk		= esp_find_tcp_sk,
+	.output_encap_csum	= esp4_output_encap_csum,
 };
 
 static struct xfrm4_protocol esp4_protocol = {
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index abd77162f5e7..293623dacdd4 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -252,8 +252,9 @@ static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb)
 {
 	struct crypto_aead *aead = x->data;
 	struct xfrm_offload *xo = xfrm_offload(skb);
+	int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
 
-	if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead)))
+	if (!pskb_may_pull(skb, hlen))
 		return -EINVAL;
 
 	if (!(xo->flags & CRYPTO_DONE))
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 9f75313734f8..9d6e03051086 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -11,130 +11,15 @@
  *	This file is derived from net/ipv4/esp.c
  */
 
-#define pr_fmt(fmt) "IPv6: " fmt
-
 #include <crypto/aead.h>
 #include <crypto/authenc.h>
-#include <linux/err.h>
-#include <linux/module.h>
 #include <net/ip.h>
-#include <net/xfrm.h>
 #include <net/esp.h>
-#include <linux/scatterlist.h>
-#include <linux/kernel.h>
-#include <linux/pfkeyv2.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
 #include <net/ip6_checksum.h>
 #include <net/ip6_route.h>
-#include <net/icmp.h>
-#include <net/ipv6.h>
-#include <net/protocol.h>
-#include <net/udp.h>
 #include <linux/icmpv6.h>
-#include <net/tcp.h>
 #include <net/espintcp.h>
 #include <net/inet6_hashtables.h>
-#include <linux/skbuff_ref.h>
-
-#include <linux/highmem.h>
-
-struct esp_skb_cb {
-	struct xfrm_skb_cb xfrm;
-	void *tmp;
-};
-
-struct esp_output_extra {
-	__be32 seqhi;
-	u32 esphoff;
-};
-
-#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
-
-/*
- * Allocate an AEAD request structure with extra space for SG and IV.
- *
- * For alignment considerations the upper 32 bits of the sequence number are
- * placed at the front, if present. Followed by the IV, the request and finally
- * the SG list.
- *
- * TODO: Use spare space in skb for this where possible.
- */
-static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqihlen)
-{
-	unsigned int len;
-
-	len = seqihlen;
-
-	len += crypto_aead_ivsize(aead);
-
-	if (len) {
-		len += crypto_aead_alignmask(aead) &
-		       ~(crypto_tfm_ctx_alignment() - 1);
-		len = ALIGN(len, crypto_tfm_ctx_alignment());
-	}
-
-	len += sizeof(struct aead_request) + crypto_aead_reqsize(aead);
-	len = ALIGN(len, __alignof__(struct scatterlist));
-
-	len += sizeof(struct scatterlist) * nfrags;
-
-	return kmalloc(len, GFP_ATOMIC);
-}
-
-static inline void *esp_tmp_extra(void *tmp)
-{
-	return PTR_ALIGN(tmp, __alignof__(struct esp_output_extra));
-}
-
-static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
-{
-	return crypto_aead_ivsize(aead) ?
-	       PTR_ALIGN((u8 *)tmp + seqhilen,
-			 crypto_aead_alignmask(aead) + 1) : tmp + seqhilen;
-}
-
-static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
-{
-	struct aead_request *req;
-
-	req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
-				crypto_tfm_ctx_alignment());
-	aead_request_set_tfm(req, aead);
-	return req;
-}
-
-static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
-					     struct aead_request *req)
-{
-	return (void *)ALIGN((unsigned long)(req + 1) +
-			     crypto_aead_reqsize(aead),
-			     __alignof__(struct scatterlist));
-}
-
-static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
-{
-	struct crypto_aead *aead = x->data;
-	int extralen = 0;
-	u8 *iv;
-	struct aead_request *req;
-	struct scatterlist *sg;
-
-	if (x->props.flags & XFRM_STATE_ESN)
-		extralen += sizeof(struct esp_output_extra);
-
-	iv = esp_tmp_iv(aead, tmp, extralen);
-	req = esp_tmp_req(aead, iv);
-
-	/* Unref skb_frag_pages in the src scatterlist if necessary.
-	 * Skip the first sg which comes from skb->data.
-	 */
-	if (req->src != req->dst)
-		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
-			skb_page_unref(page_to_netmem(sg_page(sg)),
-				       skb->pp_recycle);
-}
 
 #ifdef CONFIG_INET6_ESPINTCP
 static struct sock *esp6_find_tcp_sk(struct xfrm_state *x)
@@ -162,66 +47,15 @@ static struct sock *esp6_find_tcp_sk(struct xfrm_state *x)
 	return sk;
 }
 
-static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
-{
-	struct sock *sk;
-	int err;
-
-	rcu_read_lock();
-
-	sk = esp6_find_tcp_sk(x);
-	err = PTR_ERR_OR_ZERO(sk);
-	if (err) {
-		kfree_skb(skb);
-		goto out;
-	}
-
-	bh_lock_sock(sk);
-	if (sock_owned_by_user(sk))
-		err = espintcp_queue_out(sk, skb);
-	else
-		err = espintcp_push_skb(sk, skb);
-	bh_unlock_sock(sk);
-
-	sock_put(sk);
-
-out:
-	rcu_read_unlock();
-	return err;
-}
-
-static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk,
-				   struct sk_buff *skb)
-{
-	struct dst_entry *dst = skb_dst(skb);
-	struct xfrm_state *x = dst->xfrm;
-
-	return esp_output_tcp_finish(x, skb);
-}
-
-static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
-{
-	int err;
-
-	local_bh_disable();
-	err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb);
-	local_bh_enable();
-
-	/* EINPROGRESS just happens to do the right thing.  It
-	 * actually means that the skb has been consumed and
-	 * isn't coming back.
-	 */
-	return err ?: -EINPROGRESS;
-}
 #else
-static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+static struct sock *esp6_find_tcp_sk(struct xfrm_state *x)
 {
 	WARN_ON(1);
-	return -EOPNOTSUPP;
+	return ERR_PTR(-EOPNOTSUPP);
 }
 #endif
 
-static void esp_output_encap_csum(struct sk_buff *skb)
+static void esp6_output_encap_csum(struct sk_buff *skb)
 {
 	/* UDP encap with IPv6 requires a valid checksum */
 	if (*skb_mac_header(skb) == IPPROTO_UDP) {
@@ -238,738 +72,63 @@ static void esp_output_encap_csum(struct sk_buff *skb)
 	}
 }
 
-static void esp_output_done(void *data, int err)
-{
-	struct sk_buff *skb = data;
-	struct xfrm_offload *xo = xfrm_offload(skb);
-	void *tmp;
-	struct xfrm_state *x;
-
-	if (xo && (xo->flags & XFRM_DEV_RESUME)) {
-		struct sec_path *sp = skb_sec_path(skb);
-
-		x = sp->xvec[sp->len - 1];
-	} else {
-		x = skb_dst(skb)->xfrm;
-	}
-
-	tmp = ESP_SKB_CB(skb)->tmp;
-	esp_ssg_unref(x, tmp, skb);
-	kfree(tmp);
-
-	esp_output_encap_csum(skb);
-
-	if (xo && (xo->flags & XFRM_DEV_RESUME)) {
-		if (err) {
-			XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
-			kfree_skb(skb);
-			return;
-		}
-
-		skb_push(skb, skb->data - skb_mac_header(skb));
-		secpath_reset(skb);
-		xfrm_dev_resume(skb);
-	} else {
-		if (!err &&
-		    x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) {
-			err = esp_output_tail_tcp(x, skb);
-			if (err != -EINPROGRESS)
-				kfree_skb(skb);
-		} else {
-			xfrm_output_resume(skb_to_full_sk(skb), skb, err);
-		}
-	}
-}
-
-/* Move ESP header back into place. */
-static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
-{
-	struct ip_esp_hdr *esph = (void *)(skb->data + offset);
-	void *tmp = ESP_SKB_CB(skb)->tmp;
-	__be32 *seqhi = esp_tmp_extra(tmp);
-
-	esph->seq_no = esph->spi;
-	esph->spi = *seqhi;
-}
-
-static void esp_output_restore_header(struct sk_buff *skb)
-{
-	void *tmp = ESP_SKB_CB(skb)->tmp;
-	struct esp_output_extra *extra = esp_tmp_extra(tmp);
-
-	esp_restore_header(skb, skb_transport_offset(skb) + extra->esphoff -
-				sizeof(__be32));
-}
-
-static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb,
-					     struct xfrm_state *x,
-					     struct ip_esp_hdr *esph,
-					     struct esp_output_extra *extra)
-{
-	/* For ESN we move the header forward by 4 bytes to
-	 * accommodate the high bits.  We will move it back after
-	 * encryption.
-	 */
-	if ((x->props.flags & XFRM_STATE_ESN)) {
-		__u32 seqhi;
-		struct xfrm_offload *xo = xfrm_offload(skb);
-
-		if (xo)
-			seqhi = xo->seq.hi;
-		else
-			seqhi = XFRM_SKB_CB(skb)->seq.output.hi;
-
-		extra->esphoff = (unsigned char *)esph -
-				 skb_transport_header(skb);
-		esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
-		extra->seqhi = esph->spi;
-		esph->seq_no = htonl(seqhi);
-	}
-
-	esph->spi = x->id.spi;
-
-	return esph;
-}
-
-static void esp_output_done_esn(void *data, int err)
-{
-	struct sk_buff *skb = data;
-
-	esp_output_restore_header(skb);
-	esp_output_done(data, err);
-}
-
-static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb,
-					       int encap_type,
-					       struct esp_info *esp,
-					       __be16 sport,
-					       __be16 dport)
+static int esp6_input_encap(struct sk_buff *skb, struct xfrm_state *x)
 {
+	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	int offset = skb_network_offset(skb) + sizeof(*ip6h);
+	struct xfrm_encap_tmpl *encap = x->encap;
+	int hdr_len = skb_network_header_len(skb);
+	u8 nexthdr = ip6h->nexthdr;
+	__be16 frag_off, source;
 	struct udphdr *uh;
-	unsigned int len;
-
-	len = skb->len + esp->tailen - skb_transport_offset(skb);
-	if (len > U16_MAX)
-		return ERR_PTR(-EMSGSIZE);
-
-	uh = (struct udphdr *)esp->esph;
-	uh->source = sport;
-	uh->dest = dport;
-	uh->len = htons(len);
-	uh->check = 0;
-
-	*skb_mac_header(skb) = IPPROTO_UDP;
-
-	return (struct ip_esp_hdr *)(uh + 1);
-}
-
-#ifdef CONFIG_INET6_ESPINTCP
-static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x,
-						struct sk_buff *skb,
-						struct esp_info *esp)
-{
-	__be16 *lenp = (void *)esp->esph;
-	struct ip_esp_hdr *esph;
-	unsigned int len;
-	struct sock *sk;
-
-	len = skb->len + esp->tailen - skb_transport_offset(skb);
-	if (len > IP_MAX_MTU)
-		return ERR_PTR(-EMSGSIZE);
-
-	rcu_read_lock();
-	sk = esp6_find_tcp_sk(x);
-	rcu_read_unlock();
-
-	if (IS_ERR(sk))
-		return ERR_CAST(sk);
-
-	sock_put(sk);
-
-	*lenp = htons(len);
-	esph = (struct ip_esp_hdr *)(lenp + 1);
+	struct tcphdr *th;
+	int ret;
 
-	return esph;
-}
-#else
-static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x,
-						struct sk_buff *skb,
-						struct esp_info *esp)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-#endif
+	ret = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off);
+	if (ret == -1)
+		return ret;
 
-static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb,
-			    struct esp_info *esp)
-{
-	struct xfrm_encap_tmpl *encap = x->encap;
-	struct ip_esp_hdr *esph;
-	__be16 sport, dport;
-	int encap_type;
+	ret += hdr_len;
 
-	spin_lock_bh(&x->lock);
-	sport = encap->encap_sport;
-	dport = encap->encap_dport;
-	encap_type = encap->encap_type;
-	spin_unlock_bh(&x->lock);
+	uh = (void *)(skb->data + ret);
+	th = (void *)(skb->data + ret);
 
-	switch (encap_type) {
-	default:
-	case UDP_ENCAP_ESPINUDP:
-		esph = esp6_output_udp_encap(skb, encap_type, esp, sport, dport);
-		break;
+	switch (x->encap->encap_type) {
 	case TCP_ENCAP_ESPINTCP:
-		esph = esp6_output_tcp_encap(x, skb, esp);
+		source = th->source;
 		break;
-	}
-
-	if (IS_ERR(esph))
-		return PTR_ERR(esph);
-
-	esp->esph = esph;
-
-	return 0;
-}
-
-int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
-{
-	u8 *tail;
-	int nfrags;
-	int esph_offset;
-	struct page *page;
-	struct sk_buff *trailer;
-	int tailen = esp->tailen;
-
-	if (x->encap) {
-		int err = esp6_output_encap(x, skb, esp);
-
-		if (err < 0)
-			return err;
-	}
-
-	if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE ||
-	    ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE)
-		goto cow;
-
-	if (!skb_cloned(skb)) {
-		if (tailen <= skb_tailroom(skb)) {
-			nfrags = 1;
-			trailer = skb;
-			tail = skb_tail_pointer(trailer);
-
-			goto skip_cow;
-		} else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
-			   && !skb_has_frag_list(skb)) {
-			int allocsize;
-			struct sock *sk = skb->sk;
-			struct page_frag *pfrag = &x->xfrag;
-
-			esp->inplace = false;
-
-			allocsize = ALIGN(tailen, L1_CACHE_BYTES);
-
-			spin_lock_bh(&x->lock);
-
-			if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
-				spin_unlock_bh(&x->lock);
-				goto cow;
-			}
-
-			page = pfrag->page;
-			get_page(page);
-
-			tail = page_address(page) + pfrag->offset;
-
-			esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
-
-			nfrags = skb_shinfo(skb)->nr_frags;
-
-			__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
-					     tailen);
-			skb_shinfo(skb)->nr_frags = ++nfrags;
-
-			pfrag->offset = pfrag->offset + allocsize;
-
-			spin_unlock_bh(&x->lock);
-
-			nfrags++;
-
-			skb->len += tailen;
-			skb->data_len += tailen;
-			skb->truesize += tailen;
-			if (sk && sk_fullsock(sk))
-				refcount_add(tailen, &sk->sk_wmem_alloc);
-
-			goto out;
-		}
-	}
-
-cow:
-	esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb);
-
-	nfrags = skb_cow_data(skb, tailen, &trailer);
-	if (nfrags < 0)
-		goto out;
-	tail = skb_tail_pointer(trailer);
-	esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset);
-
-skip_cow:
-	esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
-	pskb_put(skb, trailer, tailen);
-
-out:
-	return nfrags;
-}
-EXPORT_SYMBOL_GPL(esp6_output_head);
-
-int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
-{
-	u8 *iv;
-	int alen;
-	void *tmp;
-	int ivlen;
-	int assoclen;
-	int extralen;
-	struct page *page;
-	struct ip_esp_hdr *esph;
-	struct aead_request *req;
-	struct crypto_aead *aead;
-	struct scatterlist *sg, *dsg;
-	struct esp_output_extra *extra;
-	int err = -ENOMEM;
-
-	assoclen = sizeof(struct ip_esp_hdr);
-	extralen = 0;
-
-	if (x->props.flags & XFRM_STATE_ESN) {
-		extralen += sizeof(*extra);
-		assoclen += sizeof(__be32);
-	}
-
-	aead = x->data;
-	alen = crypto_aead_authsize(aead);
-	ivlen = crypto_aead_ivsize(aead);
-
-	tmp = esp_alloc_tmp(aead, esp->nfrags + 2, extralen);
-	if (!tmp)
-		goto error;
-
-	extra = esp_tmp_extra(tmp);
-	iv = esp_tmp_iv(aead, tmp, extralen);
-	req = esp_tmp_req(aead, iv);
-	sg = esp_req_sg(aead, req);
-
-	if (esp->inplace)
-		dsg = sg;
-	else
-		dsg = &sg[esp->nfrags];
-
-	esph = esp_output_set_esn(skb, x, esp->esph, extra);
-	esp->esph = esph;
-
-	sg_init_table(sg, esp->nfrags);
-	err = skb_to_sgvec(skb, sg,
-		           (unsigned char *)esph - skb->data,
-		           assoclen + ivlen + esp->clen + alen);
-	if (unlikely(err < 0))
-		goto error_free;
-
-	if (!esp->inplace) {
-		int allocsize;
-		struct page_frag *pfrag = &x->xfrag;
-
-		allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
-
-		spin_lock_bh(&x->lock);
-		if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
-			spin_unlock_bh(&x->lock);
-			goto error_free;
-		}
-
-		skb_shinfo(skb)->nr_frags = 1;
-
-		page = pfrag->page;
-		get_page(page);
-		/* replace page frags in skb with new page */
-		__skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
-		pfrag->offset = pfrag->offset + allocsize;
-		spin_unlock_bh(&x->lock);
-
-		sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
-		err = skb_to_sgvec(skb, dsg,
-			           (unsigned char *)esph - skb->data,
-			           assoclen + ivlen + esp->clen + alen);
-		if (unlikely(err < 0))
-			goto error_free;
-	}
-
-	if ((x->props.flags & XFRM_STATE_ESN))
-		aead_request_set_callback(req, 0, esp_output_done_esn, skb);
-	else
-		aead_request_set_callback(req, 0, esp_output_done, skb);
-
-	aead_request_set_crypt(req, sg, dsg, ivlen + esp->clen, iv);
-	aead_request_set_ad(req, assoclen);
-
-	memset(iv, 0, ivlen);
-	memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&esp->seqno + 8 - min(ivlen, 8),
-	       min(ivlen, 8));
-
-	ESP_SKB_CB(skb)->tmp = tmp;
-	err = crypto_aead_encrypt(req);
-
-	switch (err) {
-	case -EINPROGRESS:
-		goto error;
-
-	case -ENOSPC:
-		err = NET_XMIT_DROP;
+	case UDP_ENCAP_ESPINUDP:
+		source = uh->source;
 		break;
-
-	case 0:
-		if ((x->props.flags & XFRM_STATE_ESN))
-			esp_output_restore_header(skb);
-		esp_output_encap_csum(skb);
-	}
-
-	if (sg != dsg)
-		esp_ssg_unref(x, tmp, skb);
-
-	if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
-		err = esp_output_tail_tcp(x, skb);
-
-error_free:
-	kfree(tmp);
-error:
-	return err;
-}
-EXPORT_SYMBOL_GPL(esp6_output_tail);
-
-static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
-{
-	int alen;
-	int blksize;
-	struct ip_esp_hdr *esph;
-	struct crypto_aead *aead;
-	struct esp_info esp;
-
-	esp.inplace = true;
-
-	esp.proto = *skb_mac_header(skb);
-	*skb_mac_header(skb) = IPPROTO_ESP;
-
-	/* skb is pure payload to encrypt */
-
-	aead = x->data;
-	alen = crypto_aead_authsize(aead);
-
-	esp.tfclen = 0;
-	if (x->tfcpad) {
-		struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
-		u32 padto;
-
-		padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached));
-		if (skb->len < padto)
-			esp.tfclen = padto - skb->len;
-	}
-	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
-	esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize);
-	esp.plen = esp.clen - skb->len - esp.tfclen;
-	esp.tailen = esp.tfclen + esp.plen + alen;
-
-	esp.esph = ip_esp_hdr(skb);
-
-	esp.nfrags = esp6_output_head(x, skb, &esp);
-	if (esp.nfrags < 0)
-		return esp.nfrags;
-
-	esph = esp.esph;
-	esph->spi = x->id.spi;
-
-	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
-	esp.seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
-			    ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
-
-	skb_push(skb, -skb_network_offset(skb));
-
-	return esp6_output_tail(x, skb, &esp);
-}
-
-static inline int esp_remove_trailer(struct sk_buff *skb)
-{
-	struct xfrm_state *x = xfrm_input_state(skb);
-	struct crypto_aead *aead = x->data;
-	int alen, hlen, elen;
-	int padlen, trimlen;
-	__wsum csumdiff;
-	u8 nexthdr[2];
-	int ret;
-
-	alen = crypto_aead_authsize(aead);
-	hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
-	elen = skb->len - hlen;
-
-	ret = skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2);
-	BUG_ON(ret);
-
-	ret = -EINVAL;
-	padlen = nexthdr[0];
-	if (padlen + 2 + alen >= elen) {
-		net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n",
-				    padlen + 2, elen - alen);
-		goto out;
-	}
-
-	trimlen = alen + padlen + 2;
-	if (skb->ip_summed == CHECKSUM_COMPLETE) {
-		csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0);
-		skb->csum = csum_block_sub(skb->csum, csumdiff,
-					   skb->len - trimlen);
-	}
-	ret = pskb_trim(skb, skb->len - trimlen);
-	if (unlikely(ret))
-		return ret;
-
-	ret = nexthdr[1];
-
-out:
-	return ret;
-}
-
-int esp6_input_done2(struct sk_buff *skb, int err)
-{
-	struct xfrm_state *x = xfrm_input_state(skb);
-	struct xfrm_offload *xo = xfrm_offload(skb);
-	struct crypto_aead *aead = x->data;
-	int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
-	int hdr_len = skb_network_header_len(skb);
-
-	if (!xo || !(xo->flags & CRYPTO_DONE))
-		kfree(ESP_SKB_CB(skb)->tmp);
-
-	if (unlikely(err))
-		goto out;
-
-	err = esp_remove_trailer(skb);
-	if (unlikely(err < 0))
-		goto out;
-
-	if (x->encap) {
-		const struct ipv6hdr *ip6h = ipv6_hdr(skb);
-		int offset = skb_network_offset(skb) + sizeof(*ip6h);
-		struct xfrm_encap_tmpl *encap = x->encap;
-		u8 nexthdr = ip6h->nexthdr;
-		__be16 frag_off, source;
-		struct udphdr *uh;
-		struct tcphdr *th;
-
-		offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off);
-		if (offset == -1) {
-			err = -EINVAL;
-			goto out;
-		}
-
-		uh = (void *)(skb->data + offset);
-		th = (void *)(skb->data + offset);
-		hdr_len += offset;
-
-		switch (x->encap->encap_type) {
-		case TCP_ENCAP_ESPINTCP:
-			source = th->source;
-			break;
-		case UDP_ENCAP_ESPINUDP:
-			source = uh->source;
-			break;
-		default:
-			WARN_ON_ONCE(1);
-			err = -EINVAL;
-			goto out;
-		}
-
-		/*
-		 * 1) if the NAT-T peer's IP or port changed then
-		 *    advertise the change to the keying daemon.
-		 *    This is an inbound SA, so just compare
-		 *    SRC ports.
-		 */
-		if (!ipv6_addr_equal(&ip6h->saddr, &x->props.saddr.in6) ||
-		    source != encap->encap_sport) {
-			xfrm_address_t ipaddr;
-
-			memcpy(&ipaddr.a6, &ip6h->saddr.s6_addr, sizeof(ipaddr.a6));
-			km_new_mapping(x, &ipaddr, source);
-
-			/* XXX: perhaps add an extra
-			 * policy check here, to see
-			 * if we should allow or
-			 * reject a packet from a
-			 * different source
-			 * address/port.
-			 */
-		}
-
-		/*
-		 * 2) ignore UDP/TCP checksums in case
-		 *    of NAT-T in Transport Mode, or
-		 *    perform other post-processing fixes
-		 *    as per draft-ietf-ipsec-udp-encaps-06,
-		 *    section 3.1.2
-		 */
-		if (x->props.mode == XFRM_MODE_TRANSPORT)
-			skb->ip_summed = CHECKSUM_UNNECESSARY;
+	default:
+		WARN_ON_ONCE(1);
+		return -1;
 	}
 
-	skb_postpull_rcsum(skb, skb_network_header(skb),
-			   skb_network_header_len(skb));
-	skb_pull_rcsum(skb, hlen);
-	if (x->props.mode == XFRM_MODE_TUNNEL ||
-	    x->props.mode == XFRM_MODE_IPTFS)
-		skb_reset_transport_header(skb);
-	else
-		skb_set_transport_header(skb, -hdr_len);
-
-	/* RFC4303: Drop dummy packets without any error */
-	if (err == IPPROTO_NONE)
-		err = -EINVAL;
-
-out:
-	return err;
-}
-EXPORT_SYMBOL_GPL(esp6_input_done2);
-
-static void esp_input_done(void *data, int err)
-{
-	struct sk_buff *skb = data;
-
-	xfrm_input_resume(skb, esp6_input_done2(skb, err));
-}
-
-static void esp_input_restore_header(struct sk_buff *skb)
-{
-	esp_restore_header(skb, 0);
-	__skb_pull(skb, 4);
-}
-
-static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
-{
-	struct xfrm_state *x = xfrm_input_state(skb);
-
-	/* For ESN we move the header forward by 4 bytes to
-	 * accommodate the high bits.  We will move it back after
-	 * decryption.
+	/*
+	 * 1) if the NAT-T peer's IP or port changed then
+	 *    advertise the change to the keying daemon.
+	 *    This is an inbound SA, so just compare
+	 *    SRC ports.
 	 */
-	if ((x->props.flags & XFRM_STATE_ESN)) {
-		struct ip_esp_hdr *esph = skb_push(skb, 4);
-
-		*seqhi = esph->spi;
-		esph->spi = esph->seq_no;
-		esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
-	}
-}
-
-static void esp_input_done_esn(void *data, int err)
-{
-	struct sk_buff *skb = data;
-
-	esp_input_restore_header(skb);
-	esp_input_done(data, err);
-}
-
-static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
-{
-	struct crypto_aead *aead = x->data;
-	struct aead_request *req;
-	struct sk_buff *trailer;
-	int ivlen = crypto_aead_ivsize(aead);
-	int elen = skb->len - sizeof(struct ip_esp_hdr) - ivlen;
-	int nfrags;
-	int assoclen;
-	int seqhilen;
-	int ret = 0;
-	void *tmp;
-	__be32 *seqhi;
-	u8 *iv;
-	struct scatterlist *sg;
-
-	if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + ivlen)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (elen <= 0) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	assoclen = sizeof(struct ip_esp_hdr);
-	seqhilen = 0;
-
-	if (x->props.flags & XFRM_STATE_ESN) {
-		seqhilen += sizeof(__be32);
-		assoclen += seqhilen;
-	}
-
-	if (!skb_cloned(skb)) {
-		if (!skb_is_nonlinear(skb)) {
-			nfrags = 1;
-
-			goto skip_cow;
-		} else if (!skb_has_frag_list(skb)) {
-			nfrags = skb_shinfo(skb)->nr_frags;
-			nfrags++;
-
-			goto skip_cow;
-		}
-	}
-
-	nfrags = skb_cow_data(skb, 0, &trailer);
-	if (nfrags < 0) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (!ipv6_addr_equal(&ip6h->saddr, &x->props.saddr.in6) ||
+	    source != encap->encap_sport) {
+		xfrm_address_t ipaddr;
 
-skip_cow:
-	ret = -ENOMEM;
-	tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
-	if (!tmp)
-		goto out;
-
-	ESP_SKB_CB(skb)->tmp = tmp;
-	seqhi = esp_tmp_extra(tmp);
-	iv = esp_tmp_iv(aead, tmp, seqhilen);
-	req = esp_tmp_req(aead, iv);
-	sg = esp_req_sg(aead, req);
-
-	esp_input_set_header(skb, seqhi);
-
-	sg_init_table(sg, nfrags);
-	ret = skb_to_sgvec(skb, sg, 0, skb->len);
-	if (unlikely(ret < 0)) {
-		kfree(tmp);
-		goto out;
+		memcpy(&ipaddr.a6, &ip6h->saddr.s6_addr, sizeof(ipaddr.a6));
+		km_new_mapping(x, &ipaddr, source);
 	}
 
-	skb->ip_summed = CHECKSUM_NONE;
-
-	if ((x->props.flags & XFRM_STATE_ESN))
-		aead_request_set_callback(req, 0, esp_input_done_esn, skb);
-	else
-		aead_request_set_callback(req, 0, esp_input_done, skb);
-
-	aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
-	aead_request_set_ad(req, assoclen);
-
-	ret = crypto_aead_decrypt(req);
-	if (ret == -EINPROGRESS)
-		goto out;
-
-	if ((x->props.flags & XFRM_STATE_ESN))
-		esp_input_restore_header(skb);
-
-	ret = esp6_input_done2(skb, ret);
+	/*
+	 * 2) ignore UDP/TCP checksums in case
+	 *    of NAT-T in Transport Mode, or
+	 *    perform other post-processing fixes
+	 *    as per draft-ietf-ipsec-udp-encaps-06,
+	 *    section 3.1.2
+	 */
+	if (x->props.mode == XFRM_MODE_TRANSPORT)
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
 
-out:
 	return ret;
 }
 
@@ -1000,146 +159,6 @@ static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	return 0;
 }
 
-static void esp6_destroy(struct xfrm_state *x)
-{
-	struct crypto_aead *aead = x->data;
-
-	if (!aead)
-		return;
-
-	crypto_free_aead(aead);
-}
-
-static int esp_init_aead(struct xfrm_state *x, struct netlink_ext_ack *extack)
-{
-	char aead_name[CRYPTO_MAX_ALG_NAME];
-	struct crypto_aead *aead;
-	int err;
-
-	if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
-		     x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) {
-		NL_SET_ERR_MSG(extack, "Algorithm name is too long");
-		return -ENAMETOOLONG;
-	}
-
-	aead = crypto_alloc_aead(aead_name, 0, 0);
-	err = PTR_ERR(aead);
-	if (IS_ERR(aead))
-		goto error;
-
-	x->data = aead;
-
-	err = crypto_aead_setkey(aead, x->aead->alg_key,
-				 (x->aead->alg_key_len + 7) / 8);
-	if (err)
-		goto error;
-
-	err = crypto_aead_setauthsize(aead, x->aead->alg_icv_len / 8);
-	if (err)
-		goto error;
-
-	return 0;
-
-error:
-	NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
-	return err;
-}
-
-static int esp_init_authenc(struct xfrm_state *x,
-			    struct netlink_ext_ack *extack)
-{
-	struct crypto_aead *aead;
-	struct crypto_authenc_key_param *param;
-	struct rtattr *rta;
-	char *key;
-	char *p;
-	char authenc_name[CRYPTO_MAX_ALG_NAME];
-	unsigned int keylen;
-	int err;
-
-	err = -ENAMETOOLONG;
-
-	if ((x->props.flags & XFRM_STATE_ESN)) {
-		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
-			     "%s%sauthencesn(%s,%s)%s",
-			     x->geniv ?: "", x->geniv ? "(" : "",
-			     x->aalg ? x->aalg->alg_name : "digest_null",
-			     x->ealg->alg_name,
-			     x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
-			NL_SET_ERR_MSG(extack, "Algorithm name is too long");
-			goto error;
-		}
-	} else {
-		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
-			     "%s%sauthenc(%s,%s)%s",
-			     x->geniv ?: "", x->geniv ? "(" : "",
-			     x->aalg ? x->aalg->alg_name : "digest_null",
-			     x->ealg->alg_name,
-			     x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
-			NL_SET_ERR_MSG(extack, "Algorithm name is too long");
-			goto error;
-		}
-	}
-
-	aead = crypto_alloc_aead(authenc_name, 0, 0);
-	err = PTR_ERR(aead);
-	if (IS_ERR(aead)) {
-		NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
-		goto error;
-	}
-
-	x->data = aead;
-
-	keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
-		 (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
-	err = -ENOMEM;
-	key = kmalloc(keylen, GFP_KERNEL);
-	if (!key)
-		goto error;
-
-	p = key;
-	rta = (void *)p;
-	rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
-	rta->rta_len = RTA_LENGTH(sizeof(*param));
-	param = RTA_DATA(rta);
-	p += RTA_SPACE(sizeof(*param));
-
-	if (x->aalg) {
-		struct xfrm_algo_desc *aalg_desc;
-
-		memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8);
-		p += (x->aalg->alg_key_len + 7) / 8;
-
-		aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
-		BUG_ON(!aalg_desc);
-
-		err = -EINVAL;
-		if (aalg_desc->uinfo.auth.icv_fullbits / 8 !=
-		    crypto_aead_authsize(aead)) {
-			NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
-			goto free_key;
-		}
-
-		err = crypto_aead_setauthsize(
-			aead, x->aalg->alg_trunc_len / 8);
-		if (err) {
-			NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
-			goto free_key;
-		}
-	}
-
-	param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
-	memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8);
-
-	err = crypto_aead_setkey(aead, key, keylen);
-
-free_key:
-	kfree(key);
-
-error:
-	return err;
-}
-
 static int esp6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 {
 	struct crypto_aead *aead;
@@ -1213,13 +232,16 @@ static int esp6_rcv_cb(struct sk_buff *skb, int err)
 }
 
 static const struct xfrm_type esp6_type = {
-	.owner		= THIS_MODULE,
-	.proto		= IPPROTO_ESP,
-	.flags		= XFRM_TYPE_REPLAY_PROT,
-	.init_state	= esp6_init_state,
-	.destructor	= esp6_destroy,
-	.input		= esp6_input,
-	.output		= esp6_output,
+	.owner			= THIS_MODULE,
+	.proto			= IPPROTO_ESP,
+	.flags			= XFRM_TYPE_REPLAY_PROT,
+	.init_state		= esp6_init_state,
+	.destructor		= esp_destroy,
+	.input			= esp_input,
+	.input_encap		= esp6_input_encap,
+	.output			= esp_output,
+	.find_tcp_sk		= esp6_find_tcp_sk,
+	.output_encap_csum	= esp6_output_encap_csum,
 };
 
 static struct xfrm6_protocol esp6_protocol = {
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index 22895521a57d..b1c38d7d286e 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -290,14 +290,18 @@ static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb)
 {
 	struct crypto_aead *aead = x->data;
 	struct xfrm_offload *xo = xfrm_offload(skb);
+	int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
 
-	if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead)))
+	if (!pskb_may_pull(skb, hlen))
 		return -EINVAL;
 
 	if (!(xo->flags & CRYPTO_DONE))
 		skb->ip_summed = CHECKSUM_NONE;
+	else
+		skb_postpull_rcsum(skb, skb_network_header(skb),
+				   skb_network_header_len(skb));
 
-	return esp6_input_done2(skb, 0);
+	return esp_input_done2(skb, 0);
 }
 
 static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb,  netdev_features_t features)
@@ -340,7 +344,7 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb,  netdev_features
 	esp.tailen = esp.tfclen + esp.plen + alen;
 
 	if (!hw_offload || !skb_is_gso(skb)) {
-		esp.nfrags = esp6_output_head(x, skb, &esp);
+		esp.nfrags = esp_output_head(x, skb, &esp);
 		if (esp.nfrags < 0)
 			return esp.nfrags;
 	}
@@ -384,7 +388,7 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb,  netdev_features
 		return 0;
 	}
 
-	err = esp6_output_tail(x, skb, &esp);
+	err = esp_output_tail(x, skb, &esp);
 	if (err)
 		return err;
 
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index 5a1787587cb3..2a8995a34bdd 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o
 obj-$(CONFIG_XFRM_IPTFS) += xfrm_iptfs.o
 obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o
 obj-$(CONFIG_DEBUG_INFO_BTF) += xfrm_state_bpf.o
+obj-$(CONFIG_XFRM_ESP) += xfrm_esp.o
diff --git a/net/xfrm/xfrm_esp.c b/net/xfrm/xfrm_esp.c
new file mode 100644
index 000000000000..4f75e1ace3bb
--- /dev/null
+++ b/net/xfrm/xfrm_esp.c
@@ -0,0 +1,1012 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/in6.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/espintcp.h>
+#include <linux/skbuff_ref.h>
+
+#include <linux/highmem.h>
+
+struct esp_skb_cb {
+	struct xfrm_skb_cb xfrm;
+	void *tmp;
+};
+
+struct esp_output_extra {
+	__be32 seqhi;
+	u32 esphoff;
+};
+
+#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
+
+/*
+ * Allocate an AEAD request structure with extra space for SG and IV.
+ *
+ * For alignment considerations the IV is placed at the front, followed
+ * by the request and finally the SG list.
+ *
+ * TODO: Use spare space in skb for this where possible.
+ */
+static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int extralen)
+{
+	unsigned int len;
+
+	len = extralen;
+
+	len += crypto_aead_ivsize(aead);
+
+	if (len) {
+		len += crypto_aead_alignmask(aead) &
+		       ~(crypto_tfm_ctx_alignment() - 1);
+		len = ALIGN(len, crypto_tfm_ctx_alignment());
+	}
+
+	len += sizeof(struct aead_request) + crypto_aead_reqsize(aead);
+	len = ALIGN(len, __alignof__(struct scatterlist));
+
+	len += sizeof(struct scatterlist) * nfrags;
+
+	return kmalloc(len, GFP_ATOMIC);
+}
+
+static inline void *esp_tmp_extra(void *tmp)
+{
+	return PTR_ALIGN(tmp, __alignof__(struct esp_output_extra));
+}
+
+static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int extralen)
+{
+	return crypto_aead_ivsize(aead) ?
+	       PTR_ALIGN((u8 *)tmp + extralen,
+			 crypto_aead_alignmask(aead) + 1) : tmp + extralen;
+}
+
+static inline struct aead_request *esp_tmp_req(struct crypto_aead *aead, u8 *iv)
+{
+	struct aead_request *req;
+
+	req = (void *)PTR_ALIGN(iv + crypto_aead_ivsize(aead),
+				crypto_tfm_ctx_alignment());
+	aead_request_set_tfm(req, aead);
+	return req;
+}
+
+static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
+					     struct aead_request *req)
+{
+	return (void *)ALIGN((unsigned long)(req + 1) +
+			     crypto_aead_reqsize(aead),
+			     __alignof__(struct scatterlist));
+}
+
+static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
+{
+	struct crypto_aead *aead = x->data;
+	int extralen = 0;
+	u8 *iv;
+	struct aead_request *req;
+	struct scatterlist *sg;
+
+	if (x->props.flags & XFRM_STATE_ESN)
+		extralen += sizeof(struct esp_output_extra);
+
+	iv = esp_tmp_iv(aead, tmp, extralen);
+	req = esp_tmp_req(aead, iv);
+
+	/* Unref skb_frag_pages in the src scatterlist if necessary.
+	 * Skip the first sg which comes from skb->data.
+	 */
+	if (req->src != req->dst)
+		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
+			skb_page_unref(page_to_netmem(sg_page(sg)),
+				       skb->pp_recycle);
+}
+
+#ifdef CONFIG_INET_ESPINTCP
+static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct sock *sk;
+	int err;
+
+	rcu_read_lock();
+
+	sk = x->type->find_tcp_sk(x);
+	err = PTR_ERR_OR_ZERO(sk);
+	if (err) {
+		kfree_skb(skb);
+		goto out;
+	}
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk))
+		err = espintcp_queue_out(sk, skb);
+	else
+		err = espintcp_push_skb(sk, skb);
+	bh_unlock_sock(sk);
+
+	sock_put(sk);
+
+out:
+	rcu_read_unlock();
+	return err;
+}
+
+static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk,
+				   struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct xfrm_state *x = dst->xfrm;
+
+	return esp_output_tcp_finish(x, skb);
+}
+
+static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err;
+
+	local_bh_disable();
+	err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb);
+	local_bh_enable();
+
+	/* EINPROGRESS just happens to do the right thing.  It
+	 * actually means that the skb has been consumed and
+	 * isn't coming back.
+	 */
+	return err ?: -EINPROGRESS;
+}
+#else
+static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+{
+	WARN_ON(1);
+	return -EOPNOTSUPP;
+}
+#endif
+
+static void esp_output_done(void *data, int err)
+{
+	struct sk_buff *skb = data;
+	struct xfrm_offload *xo = xfrm_offload(skb);
+	void *tmp;
+	struct xfrm_state *x;
+
+	if (xo && (xo->flags & XFRM_DEV_RESUME)) {
+		struct sec_path *sp = skb_sec_path(skb);
+
+		x = sp->xvec[sp->len - 1];
+	} else {
+		x = skb_dst(skb)->xfrm;
+	}
+
+	tmp = ESP_SKB_CB(skb)->tmp;
+	esp_ssg_unref(x, tmp, skb);
+	kfree(tmp);
+
+	x->type->output_encap_csum(skb);
+
+	if (xo && (xo->flags & XFRM_DEV_RESUME)) {
+		if (err) {
+			XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
+			kfree_skb(skb);
+			return;
+		}
+
+		skb_push(skb, skb->data - skb_mac_header(skb));
+		secpath_reset(skb);
+		xfrm_dev_resume(skb);
+	} else {
+		if (!err &&
+		    x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) {
+			err = esp_output_tail_tcp(x, skb);
+			if (err != -EINPROGRESS)
+				kfree_skb(skb);
+		} else {
+			xfrm_output_resume(skb_to_full_sk(skb), skb, err);
+		}
+	}
+}
+
+/* Move ESP header back into place. */
+static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
+{
+	struct ip_esp_hdr *esph = (void *)(skb->data + offset);
+	void *tmp = ESP_SKB_CB(skb)->tmp;
+	__be32 *seqhi = esp_tmp_extra(tmp);
+
+	esph->seq_no = esph->spi;
+	esph->spi = *seqhi;
+}
+
+static void esp_output_restore_header(struct sk_buff *skb)
+{
+	void *tmp = ESP_SKB_CB(skb)->tmp;
+	struct esp_output_extra *extra = esp_tmp_extra(tmp);
+
+	esp_restore_header(skb, skb_transport_offset(skb) + extra->esphoff -
+				sizeof(__be32));
+}
+
+static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb,
+					     struct xfrm_state *x,
+					     struct ip_esp_hdr *esph,
+					     struct esp_output_extra *extra)
+{
+	/* For ESN we move the header forward by 4 bytes to
+	 * accommodate the high bits.  We will move it back after
+	 * encryption.
+	 */
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		__u32 seqhi;
+		struct xfrm_offload *xo = xfrm_offload(skb);
+
+		if (xo)
+			seqhi = xo->seq.hi;
+		else
+			seqhi = XFRM_SKB_CB(skb)->seq.output.hi;
+
+		extra->esphoff = (unsigned char *)esph -
+				 skb_transport_header(skb);
+		esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
+		extra->seqhi = esph->spi;
+		esph->seq_no = htonl(seqhi);
+	}
+
+	esph->spi = x->id.spi;
+
+	return esph;
+}
+
+static void esp_output_done_esn(void *data, int err)
+{
+	struct sk_buff *skb = data;
+
+	esp_output_restore_header(skb);
+	esp_output_done(data, err);
+}
+
+static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb,
+					       int encap_type,
+					       struct esp_info *esp,
+					       __be16 sport,
+					       __be16 dport)
+{
+	struct udphdr *uh;
+	unsigned int len;
+	struct xfrm_offload *xo = xfrm_offload(skb);
+
+	len = skb->len + esp->tailen - skb_transport_offset(skb);
+
+	uh = (struct udphdr *)esp->esph;
+	uh->source = sport;
+	uh->dest = dport;
+	uh->len = htons(len);
+	uh->check = 0;
+
+	/* For ESP with UDP encapsulation, if xo is not null, the skb is in the crypto offload
+	 * data path, which means that esp_output_udp_encap is called outside of the XFRM stack.
+	 * In this case, the mac header doesn't point to the IPv4 protocol field, so don't set it.
+	 */
+	if (!xo || encap_type != UDP_ENCAP_ESPINUDP)
+		*skb_mac_header(skb) = IPPROTO_UDP;
+
+	return (struct ip_esp_hdr *)(uh + 1);
+}
+
+#ifdef CONFIG_INET_ESPINTCP
+static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x,
+					       struct sk_buff *skb,
+					       struct esp_info *esp)
+{
+	__be16 *lenp = (void *)esp->esph;
+	struct ip_esp_hdr *esph;
+	unsigned int len;
+	struct sock *sk;
+
+	len = skb->len + esp->tailen - skb_transport_offset(skb);
+
+	rcu_read_lock();
+	sk = x->type->find_tcp_sk(x);
+	rcu_read_unlock();
+
+	if (IS_ERR(sk))
+		return ERR_CAST(sk);
+
+	sock_put(sk);
+
+	*lenp = htons(len);
+	esph = (struct ip_esp_hdr *)(lenp + 1);
+
+	return esph;
+}
+#else
+static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x,
+					       struct sk_buff *skb,
+					       struct esp_info *esp)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+#endif
+
+static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb,
+			    struct esp_info *esp)
+{
+	struct xfrm_encap_tmpl *encap = x->encap;
+	struct ip_esp_hdr *esph;
+	__be16 sport, dport;
+	int encap_type;
+
+	esph = ERR_PTR(-EOPNOTSUPP);
+
+	spin_lock_bh(&x->lock);
+	sport = encap->encap_sport;
+	dport = encap->encap_dport;
+	encap_type = encap->encap_type;
+	spin_unlock_bh(&x->lock);
+
+	switch (encap_type) {
+	default:
+	case UDP_ENCAP_ESPINUDP:
+		esph = esp_output_udp_encap(skb, encap_type, esp, sport, dport);
+		break;
+	case TCP_ENCAP_ESPINTCP:
+		esph = esp_output_tcp_encap(x, skb, esp);
+		break;
+	}
+
+	if (IS_ERR(esph))
+		return PTR_ERR(esph);
+
+	esp->esph = esph;
+
+	return 0;
+}
+
+int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
+{
+	u8 *tail;
+	int nfrags;
+	int esph_offset;
+	struct page *page;
+	struct sk_buff *trailer;
+	int tailen = esp->tailen;
+
+	/* this is non-NULL only with TCP/UDP Encapsulation */
+	if (x->encap) {
+		int err = esp_output_encap(x, skb, esp);
+
+		if (err < 0)
+			return err;
+	}
+
+	if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE ||
+	    ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE)
+		goto cow;
+
+	if (!skb_cloned(skb)) {
+		if (tailen <= skb_tailroom(skb)) {
+			nfrags = 1;
+			trailer = skb;
+			tail = skb_tail_pointer(trailer);
+
+			goto skip_cow;
+		} else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS) &&
+			    !skb_has_frag_list(skb)) {
+			int allocsize;
+			struct sock *sk = skb->sk;
+			struct page_frag *pfrag = &x->xfrag;
+
+			esp->inplace = false;
+
+			allocsize = ALIGN(tailen, L1_CACHE_BYTES);
+
+			spin_lock_bh(&x->lock);
+
+			if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+				spin_unlock_bh(&x->lock);
+				goto cow;
+			}
+
+			page = pfrag->page;
+			get_page(page);
+
+			tail = page_address(page) + pfrag->offset;
+
+			esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
+
+			nfrags = skb_shinfo(skb)->nr_frags;
+
+			__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
+					     tailen);
+			skb_shinfo(skb)->nr_frags = ++nfrags;
+
+			pfrag->offset = pfrag->offset + allocsize;
+
+			spin_unlock_bh(&x->lock);
+
+			nfrags++;
+
+			skb_len_add(skb, tailen);
+			if (sk && sk_fullsock(sk))
+				refcount_add(tailen, &sk->sk_wmem_alloc);
+
+			goto out;
+		}
+	}
+
+cow:
+	esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb);
+
+	nfrags = skb_cow_data(skb, tailen, &trailer);
+	if (nfrags < 0)
+		goto out;
+	tail = skb_tail_pointer(trailer);
+	esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset);
+
+skip_cow:
+	esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
+	pskb_put(skb, trailer, tailen);
+
+out:
+	return nfrags;
+}
+EXPORT_SYMBOL_GPL(esp_output_head);
+
+int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
+{
+	u8 *iv;
+	int alen;
+	void *tmp;
+	int ivlen;
+	int assoclen;
+	int extralen;
+	struct page *page;
+	struct ip_esp_hdr *esph;
+	struct crypto_aead *aead;
+	struct aead_request *req;
+	struct scatterlist *sg, *dsg;
+	struct esp_output_extra *extra;
+	int err = -ENOMEM;
+
+	assoclen = sizeof(struct ip_esp_hdr);
+	extralen = 0;
+
+	if (x->props.flags & XFRM_STATE_ESN) {
+		extralen += sizeof(*extra);
+		assoclen += sizeof(__be32);
+	}
+
+	aead = x->data;
+	alen = crypto_aead_authsize(aead);
+	ivlen = crypto_aead_ivsize(aead);
+
+	tmp = esp_alloc_tmp(aead, esp->nfrags + 2, extralen);
+	if (!tmp)
+		goto error;
+
+	extra = esp_tmp_extra(tmp);
+	iv = esp_tmp_iv(aead, tmp, extralen);
+	req = esp_tmp_req(aead, iv);
+	sg = esp_req_sg(aead, req);
+
+	if (esp->inplace)
+		dsg = sg;
+	else
+		dsg = &sg[esp->nfrags];
+
+	esph = esp_output_set_esn(skb, x, esp->esph, extra);
+	esp->esph = esph;
+
+	sg_init_table(sg, esp->nfrags);
+	err = skb_to_sgvec(skb, sg,
+			   (unsigned char *)esph - skb->data,
+			    assoclen + ivlen + esp->clen + alen);
+	if (unlikely(err < 0))
+		goto error_free;
+
+	if (!esp->inplace) {
+		int allocsize;
+		struct page_frag *pfrag = &x->xfrag;
+
+		allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
+
+		spin_lock_bh(&x->lock);
+		if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+			spin_unlock_bh(&x->lock);
+			goto error_free;
+		}
+
+		skb_shinfo(skb)->nr_frags = 1;
+
+		page = pfrag->page;
+		get_page(page);
+		/* replace page frags in skb with new page */
+		__skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
+		pfrag->offset = pfrag->offset + allocsize;
+		spin_unlock_bh(&x->lock);
+
+		sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
+		err = skb_to_sgvec(skb, dsg,
+				   (unsigned char *)esph - skb->data,
+				    assoclen + ivlen + esp->clen + alen);
+		if (unlikely(err < 0))
+			goto error_free;
+	}
+
+	if ((x->props.flags & XFRM_STATE_ESN))
+		aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+	else
+		aead_request_set_callback(req, 0, esp_output_done, skb);
+
+	aead_request_set_crypt(req, sg, dsg, ivlen + esp->clen, iv);
+	aead_request_set_ad(req, assoclen);
+
+	memset(iv, 0, ivlen);
+	memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&esp->seqno + 8 - min(ivlen, 8),
+	       min(ivlen, 8));
+
+	ESP_SKB_CB(skb)->tmp = tmp;
+	err = crypto_aead_encrypt(req);
+
+	switch (err) {
+	case -EINPROGRESS:
+		goto error;
+
+	case -ENOSPC:
+		err = NET_XMIT_DROP;
+		break;
+
+	case 0:
+		if ((x->props.flags & XFRM_STATE_ESN))
+			esp_output_restore_header(skb);
+		x->type->output_encap_csum(skb);
+	}
+
+	if (sg != dsg)
+		esp_ssg_unref(x, tmp, skb);
+
+	if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
+		err = esp_output_tail_tcp(x, skb);
+
+error_free:
+	kfree(tmp);
+error:
+	return err;
+}
+EXPORT_SYMBOL_GPL(esp_output_tail);
+
+int esp_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int alen;
+	int blksize;
+	struct ip_esp_hdr *esph;
+	struct crypto_aead *aead;
+	struct esp_info esp;
+
+	esp.inplace = true;
+
+	esp.proto = *skb_mac_header(skb);
+	*skb_mac_header(skb) = IPPROTO_ESP;
+
+	/* skb is pure payload to encrypt */
+
+	aead = x->data;
+	alen = crypto_aead_authsize(aead);
+
+	esp.tfclen = 0;
+	if (x->tfcpad) {
+		struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
+		u32 padto;
+
+		padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached));
+		if (skb->len < padto)
+			esp.tfclen = padto - skb->len;
+	}
+	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+	esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize);
+	esp.plen = esp.clen - skb->len - esp.tfclen;
+	esp.tailen = esp.tfclen + esp.plen + alen;
+
+	esp.esph = ip_esp_hdr(skb);
+
+	esp.nfrags = esp_output_head(x, skb, &esp);
+	if (esp.nfrags < 0)
+		return esp.nfrags;
+
+	esph = esp.esph;
+	esph->spi = x->id.spi;
+
+	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+	esp.seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
+				 ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
+
+	skb_push(skb, -skb_network_offset(skb));
+
+	return esp_output_tail(x, skb, &esp);
+}
+EXPORT_SYMBOL_GPL(esp_output);
+
+static inline int esp_remove_trailer(struct sk_buff *skb)
+{
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct crypto_aead *aead = x->data;
+	int alen, hlen, elen;
+	int padlen, trimlen;
+	__wsum csumdiff;
+	u8 nexthdr[2];
+	int ret;
+
+	alen = crypto_aead_authsize(aead);
+	hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+	elen = skb->len - hlen;
+
+	if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2))
+		BUG();
+
+	ret = -EINVAL;
+	padlen = nexthdr[0];
+	if (padlen + 2 + alen >= elen) {
+		net_dbg_ratelimited("ipsec esp packet is garbage padlen=%d, elen=%d\n",
+				    padlen + 2, elen - alen);
+		goto out;
+	}
+
+	trimlen = alen + padlen + 2;
+	if (skb->ip_summed == CHECKSUM_COMPLETE) {
+		csumdiff = skb_checksum(skb, skb->len - trimlen, trimlen, 0);
+		skb->csum = csum_block_sub(skb->csum, csumdiff,
+					   skb->len - trimlen);
+	}
+	ret = pskb_trim(skb, skb->len - trimlen);
+	if (unlikely(ret))
+		return ret;
+
+	ret = nexthdr[1];
+
+out:
+	return ret;
+}
+
+int esp_input_done2(struct sk_buff *skb, int err)
+{
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct xfrm_offload *xo = xfrm_offload(skb);
+	struct crypto_aead *aead = x->data;
+	int hdr_len = skb_network_header_len(skb);
+	int nexthdr;
+	int hlen;
+
+	if (!xo || !(xo->flags & CRYPTO_DONE))
+		kfree(ESP_SKB_CB(skb)->tmp);
+
+	if (unlikely(err))
+		goto out;
+
+	err = esp_remove_trailer(skb);
+	if (unlikely(err < 0))
+		goto out;
+
+	nexthdr = err;
+
+	if (x->encap) {
+		hdr_len = x->type->input_encap(skb, x);
+		if (unlikely(hdr_len == -1)) {
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
+	skb_pull_rcsum(skb, hlen);
+
+	if (x->props.mode == XFRM_MODE_TUNNEL ||
+	    x->props.mode == XFRM_MODE_IPTFS)
+		skb_reset_transport_header(skb);
+	else
+		skb_set_transport_header(skb, -hdr_len);
+
+	/* RFC4303: Drop dummy packets without any error */
+	if (nexthdr == IPPROTO_NONE)
+		err = -EINVAL;
+	else
+		err = nexthdr;
+
+out:
+	return err;
+}
+EXPORT_SYMBOL_GPL(esp_input_done2);
+
+static void esp_input_done(void *data, int err)
+{
+	struct sk_buff *skb = data;
+
+	xfrm_input_resume(skb, esp_input_done2(skb, err));
+}
+
+static void esp_input_restore_header(struct sk_buff *skb)
+{
+	esp_restore_header(skb, 0);
+	__skb_pull(skb, 4);
+}
+
+static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
+{
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct ip_esp_hdr *esph;
+
+	/* For ESN we move the header forward by 4 bytes to
+	 * accommodate the high bits.  We will move it back after
+	 * decryption.
+	 */
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		esph = skb_push(skb, 4);
+		*seqhi = esph->spi;
+		esph->spi = esph->seq_no;
+		esph->seq_no = XFRM_SKB_CB(skb)->seq.input.hi;
+	}
+}
+
+static void esp_input_done_esn(void *data, int err)
+{
+	struct sk_buff *skb = data;
+
+	esp_input_restore_header(skb);
+	esp_input_done(data, err);
+}
+
+/*
+ * Note: detecting truncated vs. non-truncated authentication data is very
+ * expensive, so we only support truncated data, which is the recommended
+ * and common case.
+ */
+int esp_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct crypto_aead *aead = x->data;
+	struct aead_request *req;
+	struct sk_buff *trailer;
+	int ivlen = crypto_aead_ivsize(aead);
+	int elen = skb->len - sizeof(struct ip_esp_hdr) - ivlen;
+	int nfrags;
+	int assoclen;
+	int seqhilen;
+	__be32 *seqhi;
+	void *tmp;
+	u8 *iv;
+	struct scatterlist *sg;
+	int err = -EINVAL;
+
+	if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + ivlen))
+		goto out;
+
+	if (elen <= 0)
+		goto out;
+
+	assoclen = sizeof(struct ip_esp_hdr);
+	seqhilen = 0;
+
+	if (x->props.flags & XFRM_STATE_ESN) {
+		seqhilen += sizeof(__be32);
+		assoclen += seqhilen;
+	}
+
+	if (!skb_cloned(skb)) {
+		if (!skb_is_nonlinear(skb)) {
+			nfrags = 1;
+
+			goto skip_cow;
+		} else if (!skb_has_frag_list(skb)) {
+			nfrags = skb_shinfo(skb)->nr_frags;
+			nfrags++;
+
+			goto skip_cow;
+		}
+	}
+
+	err = skb_cow_data(skb, 0, &trailer);
+	if (err < 0)
+		goto out;
+
+	nfrags = err;
+
+skip_cow:
+	err = -ENOMEM;
+	tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
+	if (!tmp)
+		goto out;
+
+	ESP_SKB_CB(skb)->tmp = tmp;
+	seqhi = esp_tmp_extra(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
+	req = esp_tmp_req(aead, iv);
+	sg = esp_req_sg(aead, req);
+
+	esp_input_set_header(skb, seqhi);
+
+	sg_init_table(sg, nfrags);
+	err = skb_to_sgvec(skb, sg, 0, skb->len);
+	if (unlikely(err < 0)) {
+		kfree(tmp);
+		goto out;
+	}
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	if ((x->props.flags & XFRM_STATE_ESN))
+		aead_request_set_callback(req, 0, esp_input_done_esn, skb);
+	else
+		aead_request_set_callback(req, 0, esp_input_done, skb);
+
+	aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
+	aead_request_set_ad(req, assoclen);
+
+	err = crypto_aead_decrypt(req);
+	if (err == -EINPROGRESS)
+		goto out;
+
+	if ((x->props.flags & XFRM_STATE_ESN))
+		esp_input_restore_header(skb);
+
+	err = esp_input_done2(skb, err);
+
+out:
+	return err;
+}
+EXPORT_SYMBOL_GPL(esp_input);
+
+void esp_destroy(struct xfrm_state *x)
+{
+	struct crypto_aead *aead = x->data;
+
+	if (!aead)
+		return;
+
+	crypto_free_aead(aead);
+}
+EXPORT_SYMBOL_GPL(esp_destroy);
+
+int esp_init_aead(struct xfrm_state *x, struct netlink_ext_ack *extack)
+{
+	char aead_name[CRYPTO_MAX_ALG_NAME];
+	struct crypto_aead *aead;
+	int err;
+
+	if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
+		     x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) {
+		NL_SET_ERR_MSG(extack, "Algorithm name is too long");
+		return -ENAMETOOLONG;
+	}
+
+	aead = crypto_alloc_aead(aead_name, 0, 0);
+	err = PTR_ERR(aead);
+	if (IS_ERR(aead))
+		goto error;
+
+	x->data = aead;
+
+	err = crypto_aead_setkey(aead, x->aead->alg_key,
+				 (x->aead->alg_key_len + 7) / 8);
+	if (err)
+		goto error;
+
+	err = crypto_aead_setauthsize(aead, x->aead->alg_icv_len / 8);
+	if (err)
+		goto error;
+
+	return 0;
+
+error:
+	NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
+	return err;
+}
+EXPORT_SYMBOL_GPL(esp_init_aead);
+
+int esp_init_authenc(struct xfrm_state *x, struct netlink_ext_ack *extack)
+{
+	struct crypto_aead *aead;
+	struct crypto_authenc_key_param *param;
+	struct rtattr *rta;
+	char *key;
+	char *p;
+	char authenc_name[CRYPTO_MAX_ALG_NAME];
+	unsigned int keylen;
+	int err;
+
+	err = -ENAMETOOLONG;
+
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+			     "%s%sauthencesn(%s,%s)%s",
+			     x->geniv ?: "", x->geniv ? "(" : "",
+			     x->aalg ? x->aalg->alg_name : "digest_null",
+			     x->ealg->alg_name,
+			     x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
+			NL_SET_ERR_MSG(extack, "Algorithm name is too long");
+			goto error;
+		}
+	} else {
+		if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
+			     "%s%sauthenc(%s,%s)%s",
+			     x->geniv ?: "", x->geniv ? "(" : "",
+			     x->aalg ? x->aalg->alg_name : "digest_null",
+			     x->ealg->alg_name,
+			     x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
+			NL_SET_ERR_MSG(extack, "Algorithm name is too long");
+			goto error;
+		}
+	}
+
+	aead = crypto_alloc_aead(authenc_name, 0, 0);
+	err = PTR_ERR(aead);
+	if (IS_ERR(aead)) {
+		NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
+		goto error;
+	}
+
+	x->data = aead;
+
+	keylen = (x->aalg ? (x->aalg->alg_key_len + 7) / 8 : 0) +
+		 (x->ealg->alg_key_len + 7) / 8 + RTA_SPACE(sizeof(*param));
+	err = -ENOMEM;
+	key = kmalloc(keylen, GFP_KERNEL);
+	if (!key)
+		goto error;
+
+	p = key;
+	rta = (void *)p;
+	rta->rta_type = CRYPTO_AUTHENC_KEYA_PARAM;
+	rta->rta_len = RTA_LENGTH(sizeof(*param));
+	param = RTA_DATA(rta);
+	p += RTA_SPACE(sizeof(*param));
+
+	if (x->aalg) {
+		struct xfrm_algo_desc *aalg_desc;
+
+		memcpy(p, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8);
+		p += (x->aalg->alg_key_len + 7) / 8;
+
+		aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+		BUG_ON(!aalg_desc);
+
+		err = -EINVAL;
+		if (aalg_desc->uinfo.auth.icv_fullbits / 8 !=
+		    crypto_aead_authsize(aead)) {
+			NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
+			goto free_key;
+		}
+
+		err = crypto_aead_setauthsize(aead,
+					      x->aalg->alg_trunc_len / 8);
+		if (err) {
+			NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
+			goto free_key;
+		}
+	}
+
+	param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
+	memcpy(p, x->ealg->alg_key, (x->ealg->alg_key_len + 7) / 8);
+
+	err = crypto_aead_setkey(aead, key, keylen);
+
+free_key:
+	kfree_sensitive(key);
+
+error:
+	return err;
+}
+EXPORT_SYMBOL_GPL(esp_init_authenc);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Generic ESP");
-- 
2.43.0


^ permalink raw reply related

* RE: [PATCH v5 net-next 11/15] net: dsa: netc: add phylink MAC operations
From: Wei Fang @ 2026-05-06  8:20 UTC (permalink / raw)
  To: Claudiu Manoil, Vladimir Oltean, Clark Wang,
	andrew+netdev@lunn.ch, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, pabeni@redhat.com, robh@kernel.org,
	krzk+dt@kernel.org, conor+dt@kernel.org, f.fainelli@gmail.com,
	Frank Li, chleroy@kernel.org, horms@kernel.org,
	linux@armlinux.org.uk
  Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	devicetree@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
	linux-arm-kernel@lists.infradead.org, imx@lists.linux.dev
In-Reply-To: <20260430024945.3413973-12-wei.fang@nxp.com>

There are some comments from Sashiko; see my responses inline.

> +static void netc_mac_port_rmw(struct netc_port *np, u32 reg,
> +			      u32 mask, u32 val)
> +{
> +	u32 old, new;
> +
> +	if (is_netc_pseudo_port(np))
> +		return;
> +
> +	WARN_ON((mask | val) != mask);
> +
> +	old = netc_port_rd(np, reg);
> +	new = (old & ~mask) | val;
> +	if (new == old)
> +		return;
> +
> +	netc_port_wr(np, reg, new);
> +	if (np->caps.pmac)
> +		netc_port_wr(np, reg + NETC_PMAC_OFFSET, new);

Does this code risk corrupting the pseudo-MAC (PMAC) configuration?

By reading the original register value exclusively from the primary MAC
(MAC 0) and writing the resulting new value to both MAC 0 and the PMAC,
any independent configuration stored in the unmasked bits of the PMAC could
be blindly overwritten.

Additionally, if the new value matches the old value for MAC 0, the function
returns early. Could this leave the PMAC permanently out of sync if its state
needed updating but MAC 0 did not?

Here "pmac" means the preemption MAC, not the pseudo MAC; I will add a
comment to the pmac definition. The purpose of netc_mac_port_rmw() is to
keep the pMAC and eMAC configurations the same, so this is a false positive.

[...]

> +static void netc_port_mac_tx_graceful_stop(struct netc_port *np) {
> +	netc_port_rmw(np, NETC_POR, POR_TXDIS, POR_TXDIS);
> +
> +	if (is_netc_pseudo_port(np))
> +		return;
> +
> +	netc_port_wait_tx_empty(np, 0);
> +	if (np->caps.pmac)
> +		netc_port_wait_tx_empty(np, 1);
> +
> +	netc_mac_port_rmw(np, NETC_PM_CMD_CFG(0), PM_CMD_CFG_TX_EN,
> 0); }

Can this sequence race with the Port actively pushing the remainder of a packet
to the MAC?

The code disables the Port TX (POR_TXDIS) and then immediately waits for the
MAC TX FIFO to empty. However, without polling the PSR_TX_BUSY bit in NETC_PSR
to ensure the port has finished its work (similar to how PSR_RX_BUSY is checked
in netc_port_mac_rx_graceful_stop), it appears the MAC TX FIFO could
momentarily empty while the port is still pushing data.

Would this lead to prematurely disabling the MAC TX (PM_CMD_CFG_TX_EN = 0) and
potentially truncating egress packets or leaving the hardware in an invalid
state?

This is also a false positive. The procedures for configuring graceful stop
for TX and RX are different, and both are configured in accordance with the
NETC reference manual.

^ permalink raw reply

* Re: [PATCH net-next 5/5] dt-bindings: net: Add bindings for the ADIN1140
From: Krzysztof Kozlowski @ 2026-05-06  8:20 UTC (permalink / raw)
  To: Ciprian Regus
  Cc: Parthiban Veerasooran, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Jonathan Corbet,
	Shuah Khan, Andrew Lunn, Heiner Kallweit, Russell King,
	Rob Herring, Krzysztof Kozlowski, Conor Dooley, netdev,
	linux-kernel, linux-doc, devicetree
In-Reply-To: <20260503-adin1140-driver-v1-5-dd043cdd88f0@analog.com>

On Sun, May 03, 2026 at 02:24:54AM +0300, Ciprian Regus wrote:
> Add DT bindings for the ADIN1140 10BASE-T1S MACPHY. Update the
> MAINTAINERS entry to include the bindings file as well.

Besides the other review comments, two things, since I expect a v2 anyway:

A nit, subject: drop second/last, redundant "bindings for the". The
"dt-bindings" prefix is already stating that these are bindings.
See also:
https://elixir.bootlin.com/linux/v6.17-rc3/source/Documentation/devicetree/bindings/submitting-patches.rst#L18

> 
> Signed-off-by: Ciprian Regus <ciprian.regus@analog.com>
> ---
>  .../devicetree/bindings/net/adi,adin1140.yaml      | 69 ++++++++++++++++++++++
>  MAINTAINERS                                        |  1 +
>  2 files changed, 70 insertions(+)
> 
> diff --git a/Documentation/devicetree/bindings/net/adi,adin1140.yaml b/Documentation/devicetree/bindings/net/adi,adin1140.yaml
> new file mode 100644
> index 000000000000..26cd40d36f9b
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/adi,adin1140.yaml
> @@ -0,0 +1,69 @@
> +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> +%YAML 1.2
> +---
> +$id: http://devicetree.org/schemas/net/adi,adin1140.yaml#
> +$schema: http://devicetree.org/meta-schemas/core.yaml#
> +
> +title: ADI ADIN1140 10BASE-T1S MAC-PHY
> +
> +maintainers:
> +  - Ciprian Regus <ciprian.regus@analog.com>
> +
> +description: |
> +  The ADIN1140 (also called AD3306) is a low power single port
> +  10BASE-T1S MAC-PHY. It integrates an Ethernet PHY with a MAC
> +  and all the associated analog circuitry.
> +  The device implements the Open Alliance TC6 10BASE-T1x MAC-PHY
> +  Serial Interface specification and is compliant with the
> +  IEEE 802.3cg-2019 Ethernet standard for 10 Mbps single pair
> +  Ethernet (SPE). The device has a 4-wire SPI interface for
> +  communication between the MAC and host processor.
> +
> +allOf:
> +  - $ref: /schemas/net/ethernet-controller.yaml#
> +  - $ref: /schemas/spi/spi-peripheral-props.yaml#
> +
> +properties:
> +  compatible:
> +    enum:
> +      - adi,adin1140
> +      - adi,ad3306

I guess reversed order as numbers are before letters in most sorting.


Best regards,
Krzysztof


^ permalink raw reply

* [PATCH net v8 0/2] ipv6: flowlabel: per-netns budget for unprivileged callers
From: Maoyi Xie @ 2026-05-06  8:24 UTC (permalink / raw)
  To: David S . Miller
  Cc: Jakub Kicinski, Paolo Abeni, Eric Dumazet, David Ahern,
	Alexey Kuznetsov, Willem de Bruijn, Willem de Bruijn, netdev,
	linux-kernel, stable, Maoyi Xie

From: Maoyi Xie <maoyi.xie@ntu.edu.sg>

This series fixes the cross-tenant DoS in net/ipv6/ip6_flowlabel.c.
v1 through v6 were single-patch postings, each in its own thread.
v6 review pointed out that the existing fl_size read in
mem_check() and the corresponding write in fl_intern() are not in
the same critical section. v7 split the work into 2 patches.

Patch 1/2 is a prerequisite. It moves spin_lock_bh(&ip6_fl_lock)
and the matching unlock from fl_intern() into its only caller
ipv6_flowlabel_get(), so the mem_check() call runs under the same
critical section as the fl_intern() insert. With all writers and
the read of fl_size under the lock, fl_size is converted from
atomic_t to plain int. This is independent of the per-netns
budget. It also makes 2/2 backportable without conflicts.

Patch 2/2 is the v6 patch, rebased on 1/2.

  - flowlabel_count is plain int rather than atomic_t, since the
    previous patch put all writers and readers under ip6_fl_lock.
  - In ip6_fl_gc(), fl_free() is now placed below the fl_size
    and flowlabel_count decrements, removing the v6 cache of
    fl->fl_net.
  - In ip6_fl_purge(), fl_free() stays in its original position.
    The function argument net is used for flowlabel_count.
  - mem_check() uses spaces around the / operator on all four
    expressions, addressing the checkpatch note in v6 review.

Numeric budget (preserved from v6):

  pre-patch:
    global non-CAP_NET_ADMIN budget = FL_MAX_SIZE - FL_MAX_SIZE/4
                                    = 4096 - 1024 = 3072
    per-actor reach                 = 3072

  post-patch:
    FL_MAX_SIZE doubled to 8192
    global non-CAP_NET_ADMIN budget = 8192 - 2048 = 6144
    per-netns ceiling               = 6144 / 2 = 3072
    per-actor reach                 = 3072 (preserved)

CAP_NET_ADMIN against init_user_ns still bypasses both caps.

Reproducer (KASAN VM, 4 cores, qemu): unprivileged netns A holds
3072 flowlabels via 100 procs. Fresh unprivileged netns B then
allocates 32 flowlabels (the FL_MAX_PER_SOCK ceiling for one
socket), the same as a clean baseline. Without the per-netns
ceiling, netns A could push fl_size past FL_MAX_SIZE - FL_MAX_SIZE
/ 4 and netns B would see allocations denied.

v8:
  - 1/2: replaced the "Caller must hold ip6_fl_lock" comment in
    fl_intern() with lockdep_assert_held(&ip6_fl_lock), matching
    the runtime check already used in mem_check(), per Willem's
    review.
  - 1/2: added Fixes: 1da177e4c3f4 trailer to match 2/2, per
    Willem's review.
  - Carried forward Reviewed-by: Willem de Bruijn on both
    patches.
  - No code change beyond the lockdep_assert_held swap.
v7:
  - 2-patch series: 1/2 (lock prep) and 2/2 (v6 rebased on 1/2).
  - 2/2: flowlabel_count int, fl_free() reorder removed in
    ip6_fl_purge(), checkpatch / spacing in mem_check() fixed.
v6: rebased onto current net (resolves the conflict on
    include/net/netns/ipv6.h that v5 hit). fl_free() restored
    to its pre-series position, with fl->fl_net cached locally
    in ip6_fl_gc().
v5: replaced the per-netns ceiling FL_MAX_SIZE/8 with the
    computed unpriv_user_limit = (FL_MAX_SIZE - FL_MAX_SIZE/4)/2,
    which evaluates to 3072.
v4: addressed Willem's v3 review on netdev. Dropped the
    flowlabel_has_excl cacheline argument in favour of "fills
    the existing 4-byte hole after ipmr_seq".
v3: addressed Willem's review on the private security@ thread.
    Merged FL_MAX_SIZE doubling, dropped test data, moved
    flowlabel_count near ipmr_seq, inlined fl->fl_net in
    ip6_fl_gc().
v2: per-netns counter + cap, sent to security@ as a 2-patch
    series.
v1: fix-shape sketch in original disclosure.

Maoyi Xie (2):
  ipv6: flowlabel: take ip6_fl_lock across mem_check and fl_intern
  ipv6: flowlabel: enforce per-netns limit for unprivileged callers

 include/net/netns/ipv6.h |  1 +
 net/ipv6/ip6_flowlabel.c | 46 +++++++++++++++++++++++++++-------------
 2 files changed, 32 insertions(+), 15 deletions(-)


base-commit: ebb639024ebd47a13a511cce6ae630c15e4b3126
-- 
2.34.1


^ permalink raw reply

* [PATCH net v8 1/2] ipv6: flowlabel: take ip6_fl_lock across mem_check and fl_intern
From: Maoyi Xie @ 2026-05-06  8:24 UTC (permalink / raw)
  To: David S . Miller
  Cc: Jakub Kicinski, Paolo Abeni, Eric Dumazet, David Ahern,
	Alexey Kuznetsov, Willem de Bruijn, Willem de Bruijn, netdev,
	linux-kernel, stable, Maoyi Xie
In-Reply-To: <20260506082416.2259567-1-maoyixie.tju@gmail.com>

From: Maoyi Xie <maoyi.xie@ntu.edu.sg>

mem_check() in net/ipv6/ip6_flowlabel.c reads fl_size without
holding ip6_fl_lock. fl_intern() takes the lock immediately
afterwards. The two checks therefore race against concurrent
fl_intern, ip6_fl_gc and ip6_fl_purge writers, which makes the
mem_check budget check approximate.

Move spin_lock_bh(&ip6_fl_lock) and the matching unlock from
fl_intern() into its only caller ipv6_flowlabel_get(). The
mem_check() call now runs under the same critical section as the
fl_intern() insert, so the budget check is exact.

With all writers and the read of fl_size under ip6_fl_lock,
convert fl_size from atomic_t to plain int. The four sites that
update or read fl_size are fl_intern (insert path), ip6_fl_gc
(garbage collector, the !sched check and the per-entry decrement),
ip6_fl_purge (per-netns purge), and mem_check (budget check), and
all four now run under ip6_fl_lock.

This is a prerequisite for adding a per-netns budget alongside
fl_size. The follow-up patch adds netns_ipv6::flowlabel_count and
folds it into mem_check().

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Suggested-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Maoyi Xie <maoyi.xie@ntu.edu.sg>
---
 net/ipv6/ip6_flowlabel.c | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index c92f98c6f..a89746431 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -40,7 +40,7 @@
 #define FL_HASH_MASK	255
 #define FL_HASH(l)	(ntohl(l)&FL_HASH_MASK)
 
-static atomic_t fl_size = ATOMIC_INIT(0);
+static int fl_size;
 static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1];
 
 static void ip6_fl_gc(struct timer_list *unused);
@@ -163,7 +163,7 @@ static void ip6_fl_gc(struct timer_list *unused)
 				if (time_after_eq(now, ttd)) {
 					*flp = fl->next;
 					fl_free(fl);
-					atomic_dec(&fl_size);
+					fl_size--;
 					continue;
 				}
 				if (!sched || time_before(ttd, sched))
@@ -172,7 +172,7 @@ static void ip6_fl_gc(struct timer_list *unused)
 			flp = &fl->next;
 		}
 	}
-	if (!sched && atomic_read(&fl_size))
+	if (!sched && fl_size)
 		sched = now + FL_MAX_LINGER;
 	if (sched) {
 		mod_timer(&ip6_fl_gc_timer, sched);
@@ -196,7 +196,7 @@ static void __net_exit ip6_fl_purge(struct net *net)
 			    atomic_read(&fl->users) == 0) {
 				*flp = fl->next;
 				fl_free(fl);
-				atomic_dec(&fl_size);
+				fl_size--;
 				continue;
 			}
 			flp = &fl->next;
@@ -210,10 +210,10 @@ static struct ip6_flowlabel *fl_intern(struct net *net,
 {
 	struct ip6_flowlabel *lfl;
 
+	lockdep_assert_held(&ip6_fl_lock);
+
 	fl->label = label & IPV6_FLOWLABEL_MASK;
 
-	rcu_read_lock();
-	spin_lock_bh(&ip6_fl_lock);
 	if (label == 0) {
 		for (;;) {
 			fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK;
@@ -235,8 +235,6 @@ static struct ip6_flowlabel *fl_intern(struct net *net,
 		lfl = __fl_lookup(net, fl->label);
 		if (lfl) {
 			atomic_inc(&lfl->users);
-			spin_unlock_bh(&ip6_fl_lock);
-			rcu_read_unlock();
 			return lfl;
 		}
 	}
@@ -244,9 +242,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net,
 	fl->lastuse = jiffies;
 	fl->next = fl_ht[FL_HASH(fl->label)];
 	rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl);
-	atomic_inc(&fl_size);
-	spin_unlock_bh(&ip6_fl_lock);
-	rcu_read_unlock();
+	fl_size++;
 	return NULL;
 }
 
@@ -464,10 +460,14 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
 
 static int mem_check(struct sock *sk)
 {
-	int room = FL_MAX_SIZE - atomic_read(&fl_size);
+	int room;
 	struct ipv6_fl_socklist *sfl;
 	int count = 0;
 
+	lockdep_assert_held(&ip6_fl_lock);
+
+	room = FL_MAX_SIZE - fl_size;
+
 	if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK)
 		return 0;
 
@@ -692,11 +692,19 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq,
 	if (!sfl1)
 		goto done;
 
+	rcu_read_lock();
+	spin_lock_bh(&ip6_fl_lock);
 	err = mem_check(sk);
+	if (err == 0)
+		fl1 = fl_intern(net, fl, freq->flr_label);
+	else
+		fl1 = NULL;
+	spin_unlock_bh(&ip6_fl_lock);
+	rcu_read_unlock();
+
 	if (err != 0)
 		goto done;
 
-	fl1 = fl_intern(net, fl, freq->flr_label);
 	if (fl1)
 		goto recheck;
 
-- 
2.34.1


^ permalink raw reply related

* [PATCH net v8 2/2] ipv6: flowlabel: enforce per-netns limit for unprivileged callers
From: Maoyi Xie @ 2026-05-06  8:24 UTC (permalink / raw)
  To: David S . Miller
  Cc: Jakub Kicinski, Paolo Abeni, Eric Dumazet, David Ahern,
	Alexey Kuznetsov, Willem de Bruijn, Willem de Bruijn, netdev,
	linux-kernel, stable, Maoyi Xie
In-Reply-To: <20260506082416.2259567-1-maoyixie.tju@gmail.com>

From: Maoyi Xie <maoyi.xie@ntu.edu.sg>

fl_size, fl_ht and ip6_fl_lock in net/ipv6/ip6_flowlabel.c are
file scope and shared across netns. mem_check() reads fl_size to
decide whether to deny non-CAP_NET_ADMIN callers. capable() runs
against init_user_ns, so an unprivileged user in any non-init
userns can push fl_size past FL_MAX_SIZE - FL_MAX_SIZE / 4 and
starve every other unprivileged userns on the host.

Add struct netns_ipv6::flowlabel_count, bumped and decremented
next to fl_size in fl_intern, ip6_fl_gc and ip6_fl_purge. The new
field fills the existing 4-byte hole after ipmr_seq, so struct
netns_ipv6 stays the same size on 64-bit builds.

Bump FL_MAX_SIZE from 4096 to 8192. It has been 4096 since the
file was added. Machines and connection counts have grown.

mem_check() folds an extra per-netns ceiling into the existing
non-CAP_NET_ADMIN conditional. The ceiling is half of the total
budget that unprivileged callers have ever been able to use, i.e.
(FL_MAX_SIZE - FL_MAX_SIZE / 4) / 2 = 3072 entries. With
FL_MAX_SIZE doubled, this preserves the original per-user reach
of 3K (what an unprivileged caller could already obtain before
this change), while forcing an attacker to spread allocations
across at least two netns to exhaust the global non-CAP_NET_ADMIN
budget.

CAP_NET_ADMIN against init_user_ns still bypasses both caps.

The previous patch took ip6_fl_lock across mem_check and
fl_intern, so the new flowlabel_count read in mem_check and the
new flowlabel_count++ in fl_intern run under the same critical
section. flowlabel_count is therefore plain int, like fl_size.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Suggested-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Cc: stable@vger.kernel.org # v5.15+
Signed-off-by: Maoyi Xie <maoyi.xie@ntu.edu.sg>
---
 include/net/netns/ipv6.h |  1 +
 net/ipv6/ip6_flowlabel.c | 14 +++++++++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 499e42881..875916d60 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -119,6 +119,7 @@ struct netns_ipv6 {
 	struct fib_notifier_ops	*notifier_ops;
 	struct fib_notifier_ops	*ip6mr_notifier_ops;
 	atomic_t		ipmr_seq;
+	int			flowlabel_count;
 	struct {
 		struct hlist_head head;
 		spinlock_t	lock;
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index a89746431..b1ccdf0dc 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -36,7 +36,7 @@
 /* FL hash table */

 #define FL_MAX_PER_SOCK	32
-#define FL_MAX_SIZE	4096
+#define FL_MAX_SIZE	8192
 #define FL_HASH_MASK	255
 #define FL_HASH(l)	(ntohl(l)&FL_HASH_MASK)

@@ -162,8 +162,9 @@ static void ip6_fl_gc(struct timer_list *unused)
 				ttd = fl->expires;
 				if (time_after_eq(now, ttd)) {
 					*flp = fl->next;
-					fl_free(fl);
 					fl_size--;
+					fl->fl_net->ipv6.flowlabel_count--;
+					fl_free(fl);
 					continue;
 				}
 				if (!sched || time_before(ttd, sched))
@@ -197,6 +198,7 @@ static void __net_exit ip6_fl_purge(struct net *net)
 				*flp = fl->next;
 				fl_free(fl);
 				fl_size--;
+				net->ipv6.flowlabel_count--;
 				continue;
 			}
 			flp = &fl->next;
@@ -243,6 +245,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net,
 	fl->next = fl_ht[FL_HASH(fl->label)];
 	rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl);
 	fl_size++;
+	net->ipv6.flowlabel_count++;
 	return NULL;
 }

@@ -460,6 +463,9 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,

 static int mem_check(struct sock *sk)
 {
+	const int unpriv_total_limit = FL_MAX_SIZE - (FL_MAX_SIZE / 4);
+	const int unpriv_user_limit = unpriv_total_limit / 2;
+	struct net *net = sock_net(sk);
 	int room;
 	struct ipv6_fl_socklist *sfl;
 	int count = 0;
@@ -478,7 +484,9 @@ static int mem_check(struct sock *sk)

 	if (room <= 0 ||
 	    ((count >= FL_MAX_PER_SOCK ||
-	      (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) &&
+	      (count > 0 && room < FL_MAX_SIZE / 2) ||
+	      room < FL_MAX_SIZE / 4 ||
+	      net->ipv6.flowlabel_count >= unpriv_user_limit) &&
 	     !capable(CAP_NET_ADMIN)))
 		return -ENOBUFS;

-- 
2.34.1

^ permalink raw reply related

* RE: [PATCH net] net: wan: fsl_ucc_hdlc: free tx_skbuff in uhdlc_memclean
From: Holger Brunck @ 2026-05-06  8:27 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Christophe Leroy (CS GROUP), netdev@vger.kernel.org,
	linuxppc-dev@lists.ozlabs.org, andrew+netdev@lunn.ch,
	qiang.zhao@nxp.com, horms@kernel.org
In-Reply-To: <20260505164736.26d203a8@kernel.org>

> On Tue, 5 May 2026 08:33:34 +0000 Holger Brunck wrote:
> > > I don't think you can just kfree() an skb like this.
> > >
> > > I think you have to call dev_kfree_skb_any() instead.
> >
> > yes you are right or at least dev_kfree_skb() as the error handling
> > code in ucc_hdlc_tx does.
> 
> Please make it clear in the commit message how you discovered the issue and
> how you tested your patches.

currently the driver is broken if you compile it as a module and try to reload it.
The issue was discovered by a sashiko review for a previous patch of mine:
 https://sashiko.dev/#/patchset/20260429114208.941011-1-holger.brunck%40hitachienergy.com

Still there are further issues remaining in the driver. I will update the patch and
the commit message.

Best regards
Holger

^ permalink raw reply

* RE: [PATCH net-next 1/6] bridge: uapi: Add neigh_forward_grat netlink attributes
From: Danielle Ratson @ 2026-05-06  8:31 UTC (permalink / raw)
  To: Ido Schimmel, Jakub Kicinski
  Cc: netdev@vger.kernel.org, donald.hunter@gmail.com,
	davem@davemloft.net, edumazet@google.com, pabeni@redhat.com,
	horms@kernel.org, razor@blackwall.org, andrew+netdev@lunn.ch,
	shuah@kernel.org, ast@fiberby.net, liuhangbin@gmail.com,
	daniel@iogearbox.net, Andy Roulin, fmaurer@redhat.com,
	sdf.kernel@gmail.com, sd@queasysnail.net, kees@kernel.org,
	nickgarlis@gmail.com, amorenoz@redhat.com, alasdair@mcwilliam.dev,
	johannes.wiesboeck@aisec.fraunhofer.de, Petr Machata,
	linux-kernel@vger.kernel.org, bridge@lists.linux.dev,
	linux-kselftest@vger.kernel.org
In-Reply-To: <20260506070334.GA617302@shredder>

> -----Original Message-----
> From: Ido Schimmel <idosch@nvidia.com>
> Sent: Wednesday, 6 May 2026 10:04
> To: Jakub Kicinski <kuba@kernel.org>
> Cc: Danielle Ratson <danieller@nvidia.com>; netdev@vger.kernel.org;
> donald.hunter@gmail.com; davem@davemloft.net; edumazet@google.com;
> pabeni@redhat.com; horms@kernel.org; razor@blackwall.org;
> andrew+netdev@lunn.ch; shuah@kernel.org; ast@fiberby.net;
> liuhangbin@gmail.com; daniel@iogearbox.net; Andy Roulin
> <aroulin@nvidia.com>; fmaurer@redhat.com; sdf.kernel@gmail.com;
> sd@queasysnail.net; kees@kernel.org; nickgarlis@gmail.com;
> amorenoz@redhat.com; alasdair@mcwilliam.dev;
> johannes.wiesboeck@aisec.fraunhofer.de; Petr Machata
> <petrm@nvidia.com>; linux-kernel@vger.kernel.org; bridge@lists.linux.dev;
> linux-kselftest@vger.kernel.org
> Subject: Re: [PATCH net-next 1/6] bridge: uapi: Add neigh_forward_grat
> netlink attributes
> 
> On Tue, May 05, 2026 at 07:00:44PM -0700, Jakub Kicinski wrote:
> > On Sun, 3 May 2026 10:35:27 +0300 Danielle Ratson wrote:
> > > --- a/Documentation/netlink/specs/rt-link.yaml
> > > +++ b/Documentation/netlink/specs/rt-link.yaml
> > > @@ -1700,6 +1700,9 @@ attribute-sets:
> > >        -
> > >          name: backup-nhid
> > >          type: u32
> > > +      -
> > > +        name: neigh-forward-grat
> > > +        type: flag
> >
> > I think this should be u8 ? neigh-vlan-suppress looks buggy too
> 
> I pointed this out during internal review, but assumed I am missing something
> since almost all the attributes use flag when they are in fact u8. We can fix

This is in fact the reason why I also changed it myself to use flag before sending.

> neigh-forward-grat to use u8 in v2 and change the rest in net. To be clear, I
> believe the following should be converted from flag to u8:
> 
> mode, guard, protect, fast-leave, learning, unicast-flood, proxyarp, learning-
> sync, proxyarp-wifi, mcast-flood, mcast-to-ucast, vlan-tunnel, bcast-flood,
> neigh-suppress, isolated, mrp-ring-open, mrp-in-open, locked, mab, neigh-
> vlan-suppress
> 

So should we proceed as Ido suggested?

> > flag is a type without a payload, the presence of the attr is the
> > entire information
> >
> > None of the AIs seem to catch this, I think you may have over-split
> > this submission a little bit. This patch may have been better off
> > squashed into patch 4 ?

It seems like the patch has enough content, but I can squash. I guess I'll split the commit between patches 4 and 5 accordingly.

> 
> Related: The AI also did not catch that the spec was missing (easy to forget for
> rtnetlink). Do you think it's worth adding to review-prompts?

^ permalink raw reply

* [PATCH net 0/1] net: stmmac: dwmac-nuvoton: fix NULL pointer dereference in nvt_set_phy_intf_sel()
From: Joey Lu @ 2026-05-06  8:46 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, mcoquelin.stm32
  Cc: alexandre.torgue, linux-arm-kernel, netdev, linux-stm32,
	linux-kernel, Joey Lu

This series fixes a NULL pointer dereference bug introduced in the
initial dwmac-nuvoton glue driver. The struct nvt_priv_data::dev field
was never initialized after devm_kzalloc(), leaving it NULL. When
phylink later calls nvt_set_phy_intf_sel() for interface selection,
the callback dereferences priv->dev via nvt_gmac_get_delay(), triggering
a NULL pointer dereference.

Joey Lu (1):
  net: stmmac: dwmac-nuvoton: fix NULL pointer dereference in
    nvt_set_phy_intf_sel()

 drivers/net/ethernet/stmicro/stmmac/dwmac-nuvoton.c | 2 ++
 1 file changed, 2 insertions(+)

-- 
2.43.0

^ permalink raw reply

* [PATCH net 1/1] net: stmmac: dwmac-nuvoton: fix NULL pointer dereference in nvt_set_phy_intf_sel()
From: Joey Lu @ 2026-05-06  8:46 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, mcoquelin.stm32
  Cc: alexandre.torgue, linux-arm-kernel, netdev, linux-stm32,
	linux-kernel, Joey Lu
In-Reply-To: <20260506084614.192894-1-a0987203069@gmail.com>

priv->dev was never initialized after devm_kzalloc() allocates the
private data structure. When nvt_set_phy_intf_sel() is later invoked
via the phylink interface_select callback, it calls
nvt_gmac_get_delay(priv->dev, ...) which dereferences the NULL pointer.

Fix this by assigning priv->dev = dev immediately after allocation.

Fixes: 4d7c557f58ef ("net: stmmac: dwmac-nuvoton: Add dwmac glue for Nuvoton MA35 family")
Signed-off-by: Joey Lu <a0987203069@gmail.com>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-nuvoton.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-nuvoton.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-nuvoton.c
index e2240b68ad98..2ab6ecac6422 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-nuvoton.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-nuvoton.c
@@ -100,6 +100,8 @@ static int nvt_gmac_probe(struct platform_device *pdev)
 	if (!priv)
 		return dev_err_probe(dev, -ENOMEM, "Failed to allocate private data\n");
 
+	priv->dev = dev;
+
 	priv->regmap = syscon_regmap_lookup_by_phandle_args(dev->of_node, "nuvoton,sys",
 							    1, &priv->macid);
 	if (IS_ERR(priv->regmap))
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH net 0/8] IPVS fixes for net
From: Julian Anastasov @ 2026-05-06  8:56 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: netfilter-devel, davem, netdev, kuba, pabeni, edumazet, fw, horms,
	longman, lvs-devel
In-Reply-To: <20260505001648.360569-1-pablo@netfilter.org>


	Hello,

On Tue, 5 May 2026, Pablo Neira Ayuso wrote:

> Hi,
> 
> The following batch contains IPVS fixes for net to address issues
> from the latest net-next pull request.
> 
> Julian Anastasov made the following summary:
> 
> 1-3) Fixes for the recently added resizable hash tables
>  
> 4) dest from trash can be leaked if ip_vs_start_estimator() fails
>  
> 5) fixed races and locking for the estimation kthreads
>  
> 6) fix for wrong roundup_pow_of_two() usage in the resizable hash
>    tables
>  
> 7-8) v2 of the changes from Waiman Long to properly guard against
>   the housekeeping_cpumask() updates:
>  
>   https://lore.kernel.org/netfilter-devel/20260331165015.2777765-1-longman@redhat.com/
>  
>   I added missing Fixes tag. The original description:
>  
>   Since commit 041ee6f3727a ("kthread: Rely on HK_TYPE_DOMAIN for preferred
>   affinity management"), the HK_TYPE_KTHREAD housekeeping cpumask may no
>   longer be correct in showing the actual CPU affinity of kthreads that
>   have no predefined CPU affinity. As the ipvs networking code is still
>   using HK_TYPE_KTHREAD, we need to make HK_TYPE_KTHREAD reflect the
>   reality.
>  
>   This patch series makes HK_TYPE_KTHREAD an alias of HK_TYPE_DOMAIN
>   and uses RCU to protect access to the HK_TYPE_KTHREAD housekeeping
>   cpumask.
> 
> Julian plans to post a nf-next patch to limit the connections by using
> "conn_max" sysctl. With Simon Horman, they agreed that this is an old
> problem that we do not have a limit of connections and it is not a
> stopper for this patchset.
> 
> Please, pull these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git nf-26-05-05
> 
> Thanks.
> 
> ----------------------------------------------------------------
> 
> The following changes since commit bd3a4795d5744f59a1f485379f1303e5e606f377:
> 
>   selftests: tls: add test for data loss on small pipe (2026-05-02 18:27:14 -0700)
> 
> are available in the Git repository at:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf.git tags/nf-26-05-05
> 
> for you to fetch changes up to 8f78b749f3da0f43990490b4c1193b5ede3eec0a:
> 
>   sched/isolation: Make HK_TYPE_KTHREAD an alias of HK_TYPE_DOMAIN (2026-05-05 01:52:55 +0200)
> 
> ----------------------------------------------------------------
> netfilter pull request 26-05-05
> 
> ----------------------------------------------------------------
> Julian Anastasov (6):
>       ipvs: fixes for the new ip_vs_status info
>       ipvs: fix races around the conn_lfactor and svc_lfactor sysctl vars
>       ipvs: fix the spin_lock usage for RT build
>       ipvs: do not leak dest after get from dest trash
>       ipvs: fix races around est_mutex and est_cpulist
>       ipvs: fix shift-out-of-bounds in ip_vs_rht_desired_size
> 
> Waiman Long (2):
>       ipvs: Guard access of HK_TYPE_KTHREAD cpumask with RCU
>       sched/isolation: Make HK_TYPE_KTHREAD an alias of HK_TYPE_DOMAIN
> 
>  include/linux/sched/isolation.h |   6 +-
>  include/net/ip_vs.h             |  31 ++++++--
>  net/netfilter/ipvs/ip_vs_conn.c |  76 ++++++++++---------
>  net/netfilter/ipvs/ip_vs_core.c |   2 +-
>  net/netfilter/ipvs/ip_vs_ctl.c  | 164 +++++++++++++++++++++++++++++-----------
>  net/netfilter/ipvs/ip_vs_est.c  |  83 +++++++++++---------
>  6 files changed, 241 insertions(+), 121 deletions(-)

	Here are some comments after the last review from
Sashiko:

https://sashiko.dev/#/patchset/20260505001648.360569-1-pablo%40netfilter.org

Patch 1:
- while ip_vs_dst_event() should loop and ensure all dev
references are released, a single change of svc_table_changes
does not indicate that the old references were dropped by ip_vs_flush() or
ip_vs_del_service(). I'll post a new change to abort the loop
once we are sure the services have been released at least once.

Patch 5:
- after executing ip_vs_est_calc_phase(), data can
remain only for kt0 because all estimators are stopped,
unlinked and the kt data structures for kt > 0 are empty
and as result freed and the kthread tasks stopped (which
happens early). After this, kt 0 calls
ip_vs_est_drain_temp_list() as part of its loop,
so it will eventually call ip_vs_est_add_kthread()
and ip_vs_est_reload_start() to request kthread tasks
to be started if data for new kthreads are created.
So, I don't see a problem here.

Patch 6:
- we will add conn_max sysctl soon

Patch 7 and 8:
- I cannot decide how valid the concerns in the review are.

Regards

--
Julian Anastasov <ja@ssi.bg>


^ permalink raw reply

* Re: [PATCH net 2/3] ovpn: ensure packet delivery happens with BH disabled
From: Antonio Quartulli @ 2026-05-06  9:00 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: netdev, edumazet, sd, davem, pabeni, ralf, andrew+netdev, horms,
	shuah, linux-kselftest
In-Reply-To: <20260506010035.1563280-1-kuba@kernel.org>

Hi Jakub

On 06/05/2026 03:00, Jakub Kicinski wrote:
[...]
> This protects the success path, but do the error and drop paths also need
> BH protection?
> 
> If ovpn_decrypt_post() is called in process context for TCP connections, an
> error condition jumps to the drop label:
> 
> ovpn_decrypt_post() {
>          ...
>          if (unlikely(ret < 0))
>                  goto drop;
>          ...
> drop:
>          if (unlikely(skb))
>                  dev_dstats_rx_dropped(peer->ovpn->dev);
>          ...
> }
> 
> Since dev_dstats_rx_dropped() updates the same per-CPU dstats structure
> without disabling bottom halves, could it still be vulnerable to softirq
> preemption and stat corruption?
> 

Actually we were already looking into this.
However, since this needs a separate analysis, I wanted to get this 
fixed in a follow up patch.

Would it be ok to pull this PR as is, so we don't hold back the 
outstanding fixes?

Then we will address the issue highlighted by Sashiko in a new patch.
The problem is similar, but may need to be fixed differently.


Regards,

-- 
Antonio Quartulli
OpenVPN Inc.


^ permalink raw reply

* RE: [PATCH net-next v2 1/6] net: ngbe: implement libwx reset ops
From: Jiawen Wu @ 2026-05-06  9:05 UTC (permalink / raw)
  To: 'Jakub Kicinski'
  Cc: netdev, mengyuanlou, andrew+netdev, davem, edumazet, pabeni,
	richardcochran, linux, horms, kees, larysa.zaremba, leitao, joe,
	jacob.e.keller, fabio.baltieri, netdev, mengyuanlou,
	andrew+netdev, davem, edumazet, pabeni, richardcochran, linux,
	horms, kees, larysa.zaremba, leitao, joe, jacob.e.keller,
	fabio.baltieri
In-Reply-To: <20260503021529.4127250-1-kuba@kernel.org>

On Sun, May 3, 2026 10:15 AM, Jakub Kicinski wrote:
> This is an AI-generated review of your patch. The human sending this
> email has considered the AI review valid, or at least plausible.
> ---
> net: ngbe: implement libwx reset ops
> 
> This commit implements the wx->do_reset() operation for the ngbe driver,
> allowing the library module to trigger a reset by bringing the interface
> down and back up.
> 
> > diff --git a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c
> > index d8e3827a8b1f2..bd905e2675758 100644
> > --- a/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c
> > +++ b/drivers/net/ethernet/wangxun/ngbe/ngbe_main.c
> 
> [ ... ]
> 
> > +static void ngbe_reinit_locked(struct wx *wx)
> > +{
> > +	netif_trans_update(wx->netdev);
> > +
> > +	mutex_lock(&wx->reset_lock);
> > +	set_bit(WX_STATE_RESETTING, wx->state);
> > +
> > +	ngbe_down(wx);
> > +	ngbe_up(wx);
> > +
> > +	clear_bit(WX_STATE_RESETTING, wx->state);
> 
> Does this introduce a regression by using WX_STATE_RESETTING as an ad-hoc
> lock here?
> 
> While this pattern appears to exist elsewhere in the driver, adding it
> to ngbe_reinit_locked() relies on a design where background readers might
> check test_bit(WX_STATE_RESETTING, wx->state) without holding reset_lock.
> 
> Since the flag check and subsequent accesses are not atomic, a reader could
> test the flag just before it is set and then proceed to access shared state
> concurrently with the ngbe_down() and ngbe_up() teardown path.
> 
> Could this result in torn or corrupted reads, and would it be safer to use a
> proper synchronization mechanism like RCU or a shared lock for readers?

I think we have come up with a good way to deal with this in the previous
discussion:
https://lore.kernel.org/all/990c010b-f716-499d-b3ca-e8296003c37e@intel.com

Currently, only wx_update_stats() in the watchdog needs to test the
WX_STATE_RESETTING bit, because the hardware statistics cannot be counted
during the reset. Even if wx_update_stats() tests the flag before
ngbe_reinit_locked() sets it, the work task will be canceled before the reset.

> 
> > +	mutex_unlock(&wx->reset_lock);
> > +}
> > +
> > +void ngbe_do_reset(struct net_device *netdev)
> 
> [ ... ]
> 


^ permalink raw reply

* [PATCH net v2] net: napi: Avoid gro timer misfiring at end of busypoll
From: Dragos Tatulea @ 2026-05-06  9:08 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Björn Töpel, Daniel Borkmann
  Cc: dtatulea, Gal Pressman, Joe Damato, Frederik Deweerdt,
	Martin Karsten, Tariq Toukan, Cosmin Ratiu, netdev, linux-kernel

When in irq deferral mode (defer-hard-irqs > 0), a short enough
gro-flush timeout can trigger before NAPI_STATE_SCHED is cleared if the
last poll in busy_poll_stop() takes too long. This can have the effect
of leaving the queue stuck with interrupts disabled and no timer armed
which results in a tx timeout if there is no subsequent busypoll cycle.

To prevent this, defer the gro-flush timer arm after the last poll.

Fixes: 7fd3253a7de6 ("net: Introduce preferred busy-polling")
Co-developed-by: Martin Karsten <mkarsten@uwaterloo.ca>
Signed-off-by: Martin Karsten <mkarsten@uwaterloo.ca>
Signed-off-by: Dragos Tatulea <dtatulea@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
---
Changes since RFC [1]:
- Sending only fix to net.
- Made commit message clearer and more succinct.
- Fixed timer arming to happen after clearing the NAPI_STATE_SCHED bit
- Arm timer after clearing NAPI_STATE_SCHED and drop IRQ disable.

[1] https://lore.kernel.org/all/20260428175134.1197036-3-dtatulea@nvidia.com/
---
 net/core/dev.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 06c195906231..3ebd69988d51 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6862,9 +6862,9 @@ static void skb_defer_free_flush(void)
 
 #if defined(CONFIG_NET_RX_BUSY_POLL)
 
-static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
+static void __busy_poll_stop(struct napi_struct *napi, unsigned long timeout)
 {
-	if (!skip_schedule) {
+	if (!timeout) {
 		gro_normal_list(&napi->gro);
 		__napi_schedule(napi);
 		return;
@@ -6874,6 +6874,8 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
 	gro_flush_normal(&napi->gro, HZ >= 1000);
 
 	clear_bit(NAPI_STATE_SCHED, &napi->state);
+	hrtimer_start(&napi->timer, ns_to_ktime(timeout),
+		      HRTIMER_MODE_REL_PINNED);
 }
 
 enum {
@@ -6885,8 +6887,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
 			   unsigned flags, u16 budget)
 {
 	struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
-	bool skip_schedule = false;
-	unsigned long timeout;
+	unsigned long timeout = 0;
 	int rc;
 
 	/* Busy polling means there is a high chance device driver hard irq
@@ -6906,10 +6907,12 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
 
 	if (flags & NAPI_F_PREFER_BUSY_POLL) {
 		napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
-		timeout = napi_get_gro_flush_timeout(napi);
-		if (napi->defer_hard_irqs_count && timeout) {
-			hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
-			skip_schedule = true;
+		if (napi->defer_hard_irqs_count) {
+			/* A short enough gro flush timeout and long enough
+			 * poll can result in timer firing too early.
+			 * Timer will be armed later if necessary.
+			 */
+			timeout = napi_get_gro_flush_timeout(napi);
 		}
 	}
 
@@ -6924,7 +6927,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
 	trace_napi_poll(napi, rc, budget);
 	netpoll_poll_unlock(have_poll_lock);
 	if (rc == budget)
-		__busy_poll_stop(napi, skip_schedule);
+		__busy_poll_stop(napi, timeout);
 	bpf_net_ctx_clear(bpf_net_ctx);
 	local_bh_enable();
 }
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH net-next v2] declance: Remove IRQF_ONESHOT
From: Maciej W. Rozycki @ 2026-05-06  9:25 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: netdev, linux-mips, Jakub Kicinski, Andrew Lunn, David S. Miller,
	Eric Dumazet, Paolo Abeni
In-Reply-To: <20260505152450.1KYVS2pr@linutronix.de>

On Tue, 5 May 2026, Sebastian Andrzej Siewior wrote:

> I'm not sure if you may need to change the primary handler if the
> interrupt flow is EOI and cascading based on what you wrote. If you have
> access to the HW then it should be easy to test given the
> `threadirqs' argument should expose problems.

 The interrupt is exceedingly rare, I've only seen it actually fire maybe 
a dozen times across all my systems in 25+ years.  It happens when there 
is a memory read error on DMA, such as an uncorrected ECC or parity error 
(depending on the system variant), or a bus timeout.

 It should be possible to orchestrate it, such as by making the LANCE DMA 
pointer register refer to an unpopulated location in the system address map; 
memory ECC errors can be induced too by the DRAM controller's diagnostic 
feature.  It seems enough hassle though I'd rather get things right by the 
spec.

 Thanks for the hint as to the `threadirqs' facility though, it may come 
up helpful sometime.

  Maciej

^ permalink raw reply

* [PATCH net-next v2] ipv4: Flush the FIB once per dev nexthop removal
From: Cosmin Ratiu @ 2026-05-06  9:27 UTC (permalink / raw)
  To: netdev
  Cc: David Ahern, Ido Schimmel, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Simon Horman, Paolo Abeni, Cosmin Ratiu

When a device is going down, all nexthops on it are removed, and for
each nexthop being removed the FIB table is flushed, which does a full
trie traversal looking for entries marked RTNH_F_DEAD and removing them.
The performance of this is O(N x R), with N being number of dev nexthops
and R being number of IPv4 routes.

The RTNL is held the entire time.

When there are many nexthops to be removed and many routing entries,
this can result in the RTNL being held for multiple minutes, which
causes unhappiness in other processes trying to acquire the RTNL (e.g.
systemd-networkd for DHCP renewals).

In a complicated deployment with multiple vxlan devices, each having
16K nexthops and a total of 128K ipv4 routes, this is exactly what
happens:

nexthop_flush_dev()                # loops over 16K nexthops
  -> remove_nexthop()
    -> __remove_nexthop()
      -> __remove_nexthop_fib()    # marks fi->fib_flags |= RTNH_F_DEAD
        -> fib_flush()             # for EACH nexthop!
	  -> fib_table_flush()     # walks the ENTIRE FIB, 128K entries

Change that so that a nexthop_flush_dev() does a single fib_flush()
after all nexthops are removed. This is done with:
- __remove_nexthop_fib() no longer flushes the FIB, instead returns
  whether a flush is needed and is marked with __must_check.
- __remove_nexthop() and remove_nexthop() propagate this return value up
  with __must_check, which was also added to remove_nexthop_from_groups.
- A new wrapper is defined, remove_one_nexthop() which calls
  remove_nexthop() and flushes if necessary.
- The two direct callers of __remove_nexthop() get a WARN_ON, since the
  nh about to be removed should not have any FIB entries referencing it
  when replacing or inserting a new one.
- Callers which need to remove a single nexthop were migrated to
  remove_one_nexthop().
- Callers which need to remove multiple nexthops keep track in a local
  bool whether a flush is needed and call flush once at the end.
- This is plumbed through group removal as well, so when removing a leaf
  nh causes a parent group to lose its last member, the group's flush is
  also deferred, accumulated via remove_nexthop_from_groups() ->
  remove_nh_grp_entry() -> remove_nexthop(). remove_nh_grp_entry() gets
  a __must_check as well.

This dramatically improves performance from O(N x R) to O(N + R).

Releasing a nexthop reference in remove_nexthop() now no longer frees
it. Instead, it is deleted when the last fib_info pointing to it gets
freed via free_fib_info_rcu(). All routing code is already careful not
to take into consideration routes marked with RTNH_F_DEAD.

Tested with:
DEV=eth2
ip link set up dev $DEV
ip link add testnh0 link $DEV type macvlan mode bridge
ip addr add 198.51.100.1/24 dev testnh0
ip link set testnh0 up

seq 1 65536 | \
sed 's/.*/nexthop add id & via 198.51.100.2 dev testnh0/' | \
ip -batch -

i=1
for a in $(seq 0 255); do
  for b in $(seq 0 255); do
    echo "route add 10.${a}.${b}.0/32 nhid $i"
    i=$((i + 1))
  done
done | ip -batch -

time ip link set testnh0 down
ip link del testnh0

Without this patch:
real	0m32.601s
user	0m0.000s
sys	0m32.511s

With this patch:
real	0m0.209s
user	0m0.000s
sys	0m0.153s

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
---
 net/ipv4/nexthop.c | 88 +++++++++++++++++++++++++++++-----------------
 1 file changed, 56 insertions(+), 32 deletions(-)

V1 -> V2:
- Fixes xmas tree in a couple places (Kuniyuki Iwashima)
- Added __must_check to remove_nexthop_from_groups() (Kuniyuki Iwashima)

diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index f92fcc39fc4c..6e25ed804099 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -20,8 +20,8 @@
 #define NH_RES_DEFAULT_IDLE_TIMER	(120 * HZ)
 #define NH_RES_DEFAULT_UNBALANCED_TIMER	0	/* No forced rebalancing. */
 
-static void remove_nexthop(struct net *net, struct nexthop *nh,
-			   struct nl_info *nlinfo);
+static bool __must_check remove_nexthop(struct net *net, struct nexthop *nh,
+					struct nl_info *nlinfo);
 
 #define NH_DEV_HASHBITS  8
 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
@@ -2016,9 +2016,9 @@ static void nh_hthr_group_rebalance(struct nh_group *nhg)
 	}
 }
 
-static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
-				struct nl_info *nlinfo,
-				struct list_head *deferred_free)
+static bool __must_check
+remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
+		    struct nl_info *nlinfo, struct list_head *deferred_free)
 {
 	struct nh_grp_entry *nhges, *new_nhges;
 	struct nexthop *nhp = nhge->nh_parent;
@@ -2033,10 +2033,8 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
 	newg = nhg->spare;
 
 	/* last entry, keep it visible and remove the parent */
-	if (nhg->num_nh == 1) {
-		remove_nexthop(net, nhp, nlinfo);
-		return;
-	}
+	if (nhg->num_nh == 1)
+		return remove_nexthop(net, nhp, nlinfo);
 
 	newg->has_v4 = false;
 	newg->is_multipath = nhg->is_multipath;
@@ -2093,22 +2091,27 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
 
 	if (nlinfo)
 		nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
+
+	return false;
 }
 
-static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
-				       struct nl_info *nlinfo)
+static bool __must_check
+remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
+			   struct nl_info *nlinfo)
 {
 	struct nh_grp_entry *nhge, *tmp;
 	LIST_HEAD(deferred_free);
+	bool need_flush = false;
 
 	/* If there is nothing to do, let's avoid the costly call to
 	 * synchronize_net()
 	 */
 	if (list_empty(&nh->grp_list))
-		return;
+		return false;
 
 	list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list)
-		remove_nh_grp_entry(net, nhge, nlinfo, &deferred_free);
+		need_flush |= remove_nh_grp_entry(net, nhge, nlinfo,
+						   &deferred_free);
 
 	/* make sure all see the newly published array before releasing rtnl */
 	synchronize_net();
@@ -2118,6 +2121,8 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
 		list_del(&nhge->nh_list);
 		free_percpu(nhge->stats);
 	}
+
+	return need_flush;
 }
 
 static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
@@ -2142,18 +2147,15 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
 }
 
 /* not called for nexthop replace */
-static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
+static bool __must_check __remove_nexthop_fib(struct net *net,
+					      struct nexthop *nh)
 {
+	bool need_flush = !list_empty(&nh->fi_list);
 	struct fib6_info *f6i;
-	bool do_flush = false;
 	struct fib_info *fi;
 
-	list_for_each_entry(fi, &nh->fi_list, nh_list) {
+	list_for_each_entry(fi, &nh->fi_list, nh_list)
 		fi->fib_flags |= RTNH_F_DEAD;
-		do_flush = true;
-	}
-	if (do_flush)
-		fib_flush(net);
 
 	spin_lock_bh(&nh->lock);
 
@@ -2173,12 +2175,14 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 	}
 
 	spin_unlock_bh(&nh->lock);
+
+	return need_flush;
 }
 
-static void __remove_nexthop(struct net *net, struct nexthop *nh,
-			     struct nl_info *nlinfo)
+static bool __must_check __remove_nexthop(struct net *net, struct nexthop *nh,
+					  struct nl_info *nlinfo)
 {
-	__remove_nexthop_fib(net, nh);
+	bool need_flush = __remove_nexthop_fib(net, nh);
 
 	if (nh->is_group) {
 		remove_nexthop_group(nh, nlinfo);
@@ -2189,13 +2193,17 @@ static void __remove_nexthop(struct net *net, struct nexthop *nh,
 		if (nhi->fib_nhc.nhc_dev)
 			hlist_del(&nhi->dev_hash);
 
-		remove_nexthop_from_groups(net, nh, nlinfo);
+		need_flush |= remove_nexthop_from_groups(net, nh, nlinfo);
 	}
+
+	return need_flush;
 }
 
-static void remove_nexthop(struct net *net, struct nexthop *nh,
-			   struct nl_info *nlinfo)
+static bool __must_check remove_nexthop(struct net *net, struct nexthop *nh,
+					struct nl_info *nlinfo)
 {
+	bool need_flush;
+
 	call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL);
 
 	/* remove from the tree */
@@ -2204,10 +2212,19 @@ static void remove_nexthop(struct net *net, struct nexthop *nh,
 	if (nlinfo)
 		nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
 
-	__remove_nexthop(net, nh, nlinfo);
+	need_flush = __remove_nexthop(net, nh, nlinfo);
 	nh_base_seq_inc(net);
 
 	nexthop_put(nh);
+
+	return need_flush;
+}
+
+static void remove_one_nexthop(struct net *net, struct nexthop *nh,
+			       struct nl_info *nlinfo)
+{
+	if (remove_nexthop(net, nh, nlinfo))
+		fib_flush(net);
 }
 
 /* if any FIB entries reference this nexthop, any dst entries
@@ -2592,7 +2609,7 @@ static int replace_nexthop(struct net *net, struct nexthop *old,
 	if (!err) {
 		nh_rt_cache_flush(net, old, new);
 
-		__remove_nexthop(net, new, NULL);
+		WARN_ON(__remove_nexthop(net, new, NULL));
 		nexthop_put(new);
 	}
 
@@ -2699,6 +2716,7 @@ static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
 	unsigned int hash = nh_dev_hashfn(dev->ifindex);
 	struct net *net = dev_net(dev);
 	struct hlist_head *head = &net->nexthop.devhash[hash];
+	bool need_flush = false;
 	struct hlist_node *n;
 	struct nh_info *nhi;
 
@@ -2710,22 +2728,28 @@ static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
 		    (event == NETDEV_DOWN || event == NETDEV_CHANGE))
 			continue;
 
-		remove_nexthop(net, nhi->nh_parent, NULL);
+		need_flush |= remove_nexthop(net, nhi->nh_parent, NULL);
 	}
+
+	if (need_flush)
+		fib_flush(net);
 }
 
 /* rtnl; called when net namespace is deleted */
 static void flush_all_nexthops(struct net *net)
 {
 	struct rb_root *root = &net->nexthop.rb_root;
+	bool need_flush = false;
 	struct rb_node *node;
 	struct nexthop *nh;
 
 	while ((node = rb_first(root))) {
 		nh = rb_entry(node, struct nexthop, rb_node);
-		remove_nexthop(net, nh, NULL);
+		need_flush |= remove_nexthop(net, nh, NULL);
 		cond_resched();
 	}
+	if (need_flush)
+		fib_flush(net);
 }
 
 static struct nexthop *nexthop_create_group(struct net *net,
@@ -2994,7 +3018,7 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
 
 	err = insert_nexthop(net, nh, cfg, extack);
 	if (err) {
-		__remove_nexthop(net, nh, NULL);
+		WARN_ON(__remove_nexthop(net, nh, NULL));
 		nexthop_put(nh);
 		nh = ERR_PTR(err);
 	}
@@ -3363,7 +3387,7 @@ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	nh = nexthop_find_by_id(net, id);
 	if (nh)
-		remove_nexthop(net, nh, &nlinfo);
+		remove_one_nexthop(net, nh, &nlinfo);
 	else
 		err = -ENOENT;
 
-- 
2.53.0


^ permalink raw reply related

* RE: [PATCH net] net: wan: fsl_ucc_hdlc: return NETDEV_TX_OK if skb was freed
From: Holger Brunck @ 2026-05-06  9:35 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: netdev@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
	andrew+netdev@lunn.ch, chleroy@kernel.org, qiang.zhao@nxp.com,
	horms@kernel.org
In-Reply-To: <20260505182220.29e5f34e@kernel.org>

> 
> On Mon,  4 May 2026 19:44:06 +0200 Holger Brunck wrote:
> > If the skb was freed in the ucc_hdlc_tx function and the packet marked
> > as dropped we need to return NETDEV_TX_OK. Otherwise the above layer
> > will try to requeue an already freed skb.
> 
> Is this really true? I thought negative returns mean drop.

The API suggests using only NETDEV_TX_OK or NETDEV_TX_BUSY as return values.
I checked several drivers and they are usually returning NETDEV_TX_OK if an
error occurred and the driver consumed the packet. But you are right
dev_xmit_complete will also return true if the return code is smaller than zero
and the packet is not requeued. Should I update the commit message or should
the patch be dropped?

Best regards
Holger

^ permalink raw reply

* Re: [PATCH iwl-next v4 0/3] igc: add support for forcing link speed without autonegotiation
From: David Laight @ 2026-05-06  9:40 UTC (permalink / raw)
  To: Abdul Rahim, Faizal
  Cc: KhaiWenTan, anthony.l.nguyen, andrew+netdev, davem, edumazet,
	kuba, pabeni, intel-wired-lan, netdev, linux-kernel,
	faizal.abdul.rahim, hong.aun.looi, khai.wen.tan
In-Reply-To: <63b186e0-046d-496e-8ae4-d68cd5eb5817@linux.intel.com>

On Wed, 6 May 2026 14:21:59 +0800
"Abdul Rahim, Faizal" <faizal.abdul.rahim@linux.intel.com> wrote:

> On 30/4/2026 10:41 pm, David Laight wrote:
> > On Tue, 28 Apr 2026 14:00:06 +0800
> > KhaiWenTan <khai.wen.tan@linux.intel.com> wrote:
> >   
> >> From: Faizal Rahim <faizal.abdul.rahim@linux.intel.com>
> >>
> >> This series adds support for forcing 10/100 Mb/s link speed via ethtool
> >> when autonegotiation is disabled on the igc driver.  
> > 
> > I'll ask 'why' ?
> > 
> > In particular forcing half/full duplex has always been a very good way
> > of 'breaking' a network connection.
> > 
> > It really is much better to restrict the advertised link modes and let
> > the autodetect/autonegotiation logic in the phy/mac do its job.
> > 
> > About the only thing I can think of is to force 10M HDX when connected
> > to a remote system that supports 10M/100M HDX.
> > In that case you need to send out single link test pulses, not the
> > burst used to identify 100M HDX, or the pattern encoded on the burst
> > used by autonegotiation.
> > But you need to go back to the mid 1990s to find such systems.
> > Anything that supports FDX will do autonegotiation.
> > 
> > 	David
> >   
> 
> There's a use case requested:
> 
> Profinet Certification tool reports that forcing a link speed without
> auto-negotiation is not working.
> Forcing the link speed is a critical feature for the industrial automation
> "fast-start" use case. When there is a connection lost, the system must
> come back up as fast as possible. In PROFINET, that means to force the
> speed and rejoin the controller loops. Without supporting forcing the speed
> to 100M in Foxville, the certification tool would not be able to certify
> the availability of this feature.
> 
> I'm hoping this context is enough to justify the need?

Is auto-negotiation of the 'low' speed actually that slow?
IIRC detecting 10G and above requires a lot of signal processing.
But 10/100 and hdx/fdx just uses the ANAR register value sent in the
link test pulses.
(IIRC 1G uses the ANAR pattern, but requires extra signal processing as well.
The higher speeds didn't exist when I was writing ethernet drivers.)

I've been on the 'wrong end' of hdx/fdx mismatches - you really don't
want to let people get there, it is terribly confusing.

There actually ought to be a way of setting the auto-negotiation
registers to 100M (HDX and/or FDX) and then transmitting as (say) 100M HDX
even before negotiation completes.
Then correcting hdx/fdx based on the received ANAR register.
Or, at least, sending out an ANAR that only contains what you are using.

The problem I always had was that the actual operating mode of the phy
wasn't in one of the standard registers.
So if you connected to a system that didn't do auto-negotiation the
phy would be using (say) 10M HDX, but the received ANAR register would
still contain a value from an earlier connection.
If the driver read that register from the phy it used the wrong duplex mode.
(The speed for 10/100 doesn't matter, the phy clocks the interface to the
mac at the right speed and the mac doesn't care.)

	David

^ permalink raw reply

* Re: [PATCH net-next 03/12] net: pcs: pcs-xpcs: Preserve BMCR_ANENBLE during link up
From: Daniel Thompson @ 2026-05-06  9:46 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Alex Elder, andrew+netdev, davem, edumazet, kuba, pabeni,
	maxime.chevallier, rmk+kernel, andersson, konradybcio, robh,
	krzk+dt, conor+dt, linusw, brgl, arnd, gregkh, mohd.anwar,
	a0987203069, alexandre.torgue, ast, boon.khai.ng, chenchuangyu,
	chenhuacai, daniel, hawk, hkallweit1, inochiama, john.fastabend,
	julianbraha, livelycarpet87, matthew.gerlach, mcoquelin.stm32, me,
	prabhakar.mahadev-lad.rj, richardcochran, rohan.g.thomas, sdf,
	siyanteng, weishangjuan, wens, netdev, bpf, linux-arm-msm,
	devicetree, linux-gpio, linux-stm32, linux-arm-kernel,
	linux-kernel
In-Reply-To: <91332fc1-11ed-444e-a211-699420cf0108@lunn.ch>

On Fri, May 01, 2026 at 07:06:15PM +0200, Andrew Lunn wrote:
> On Fri, May 01, 2026 at 10:54:11AM -0500, Alex Elder wrote:
> > From: Daniel Thompson <daniel@riscstar.com>
> >
> > Currently the XPCS found on Toshiba TC9564 (a.k.a. Qualcomm QPS615)
> > is unable to operate at 1000base-X and slower with a PHY connected
> > using SGMII/2500base-X (in our case a Qualcomm QCA8081). The link
> > negotiates speed correctly but the MAC can't get any packets out.
>
> We need to break this down into its components.
>
> I assume you are saying the PHY negotiates the media speed with the
> link partner. That is PHY talking to PHY.
>
> But we also have the PCS talking to the PHY. There can be inband
> signalling here, for SGMII and 2500BaseX. But not for over clocked
> SGMII, which is how some vendors implement 2500BaseX. SGMII signalling
> does not work when overclocked to 2.5G.
>
> > This attracted attention to the ANENABLE bit and we observed that the
> > bit is currently set during config and cleared during link up.
>
> Here we are talking about the PCS ANENABLE, not the PHY ANENABLE. So
> this is negotiation between the PCS and the PHY.
>
> > Preserving the bit during link up allows the system to work as expected.
>
> >  	int ret;
> >
> >  	if (neg_mode == PHYLINK_PCS_NEG_INBAND_ENABLED)
> >  		return;
>
> Think about this.
>
> >  	ret = xpcs_write(xpcs, MDIO_MMD_VEND2, MII_BMCR,
> > -			 mii_bmcr_encode_fixed(speed, duplex));
> > +			 mii_bmcr_encode_fixed(speed, duplex) | an_enable);
>
> And mii_bmcr_encode_fixed().

Thanks for highlighting that... and for being gentle with the clue
stick!

This patch will be gone the next time this patchset is published and
I'll get the phylink configured properly.


Daniel.

^ permalink raw reply

* Re: [PATCH net] vsock/virtio: fix potential unbounded skb queue
From: Arseniy Krasnov @ 2026-05-06  9:50 UTC (permalink / raw)
  To: Bobby Eshleman, Stefano Garzarella
  Cc: Eric Dumazet, Bobby Eshleman, Stefan Hajnoczi, Michael S. Tsirkin,
	David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
	netdev, eric.dumazet, Arseniy Krasnov, Jason Wang, Xuan Zhuo,
	Eugenio Pérez, kvm, virtualization
In-Reply-To: <afocwkfBHJ5u10rx@devvm29614.prn0.facebook.com>



05.05.2026 19:37, Bobby Eshleman wrote:
> On Tue, May 05, 2026 at 06:11:13PM +0200, Stefano Garzarella wrote:
>> On Tue, May 05, 2026 at 07:14:36AM -0700, Eric Dumazet wrote:
>>> On Tue, May 5, 2026 at 6:52 AM Stefano Garzarella <sgarzare@redhat.com> wrote:
>>>>
>>>> On Thu, Apr 30, 2026 at 12:26:52PM +0000, Eric Dumazet wrote:
>>>>> virtio_transport_inc_rx_pkt() checks vvs->rx_bytes + len > vvs->buf_alloc.
>>>>>
>>>>> virtio_transport_recv_enqueue() skips coalescing for packets
>>>>> with VIRTIO_VSOCK_SEQ_EOM.
>>>>>
>>>>> If fed with packets with len == 0 and VIRTIO_VSOCK_SEQ_EOM,
>>>>> a very large number of packets can be queued
>>>>> because vvs->rx_bytes stays at 0.
>>>>>
>>>>> Fix this by estimating the skb metadata size:
>>>>>
>>>>>       (Number of skbs in the queue) * SKB_TRUESIZE(0)
>>>>>
>>>>> Fixes: 077706165717 ("virtio/vsock: don't use skbuff state to account credit")
>>>>> Signed-off-by: Eric Dumazet <edumazet@google.com>
>>>>> Cc: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
>>>>> Cc: Stefan Hajnoczi <stefanha@redhat.com>
>>>>> Cc: Stefano Garzarella <sgarzare@redhat.com>
>>>>> Cc: "Michael S. Tsirkin" <mst@redhat.com>
>>>>> Cc: Jason Wang <jasowang@redhat.com>
>>>>> Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
>>>>> Cc: "Eugenio Pérez" <eperezma@redhat.com>
>>>>> Cc: kvm@vger.kernel.org
>>>>> Cc: virtualization@lists.linux.dev
>>>>> ---
>>>>> net/vmw_vsock/virtio_transport_common.c | 4 +++-
>>>>> 1 file changed, 3 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>>>>> index 416d533f493d7b07e9c77c43f741d28cfcd0953e..9b8014516f4fb1130ae184635fbba4dfee58bd64 100644
>>>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>>>> @@ -447,7 +447,9 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
>>>>> static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
>>>>>                                       u32 len)
>>>>> {
>>>>> -      if (vvs->buf_used + len > vvs->buf_alloc)
>>>>> +      u64 skb_overhead = (skb_queue_len(&vvs->rx_queue) + 1) * SKB_TRUESIZE(0);
>>>>> +
>>>>> +      if (skb_overhead + vvs->buf_used + len > vvs->buf_alloc)
>>>>>               return false;
>>>>
>>>> I'm not sure about this fix, I mean that maybe this is incomplete.
>>>> In virtio-vsock, there is a credit mechanism between the two peers:
>>>> https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-4850003
>>>>
>>>> This takes only the payload into account, so it’s true that this problem
>>>> exists; however, perhaps we should also inform the other peer of a lower
>>>> credit balance, otherwise the other peer will believe it has much more
>>>> credit than it actually does, send a large payload, and then the packet
>>>> will be discarded and the data lost (there are no retransmissions,
>>>> etc.).
>>>
>>> I dunno, perhaps revert 077706165717 ("virtio/vsock: don't use skbuff
>>> state to account credit")
>>> and find a better fix then?
>>
>> IIRC the same issue was there before the commit fixed by that one (commit
>> 71dc9ec9ac7d ("virtio/vsock: replace virtio_vsock_pkt with sk_buff")), so
>> not sure about reverting it TBH.
>>
>> CCing Arseniy and Bobby.

Thanks!

>>
>>>
>>> There is always a discrepancy between skb->len and skb->truesize.
>>> You will not be able to announce a 1MB window, and accept one million
>>> skb of 1-byte each.
>>>
>>> This kind of contract is broken.
>>>
>>
>> Yep, I agree, but before we start discarding data (and losing it), IMHO we
>> should at least inform the other peer that we're out of space.
>>
>> @Stefan, @Michael, do you think we can do something in the spec to avoid
>> this issue and in some way take into account also the metadata in the
>> credit. I mean to avoid the 1-byte packets flooding.
>>
>> Thanks,
>> Stefano
>>
>>
> 
> Indeed the old pre-fix skb code would have the same issue.
> 
> I can't think of any way around this without extending the spec.

Hi, thanks. I agree with Bobby that accounting for metadata (e.g. skb size here) was not implemented
"by design" in the credit logic - the other side of the data exchange knows nothing about it. The same
situation also existed before the skb implementation was added by Bobby. So it looks like the spec may need to be updated.

Thanks!


> 
> Best,
> Bobby


^ permalink raw reply

* [PATCH net-next v9 0/7] net: bcmgenet: add XDP support
From: Nicolai Buchwitz @ 2026-05-06  9:55 UTC (permalink / raw)
  To: netdev
  Cc: Justin Chen, Simon Horman, Mohsin Bashir, Doug Berger,
	Florian Fainelli, Broadcom internal kernel review list,
	Andrew Lunn, Eric Dumazet, Paolo Abeni, Nicolai Buchwitz,
	Alexei Starovoitov, Daniel Borkmann, David S. Miller,
	Jakub Kicinski, Jesper Dangaard Brouer, John Fastabend,
	Stanislav Fomichev, bpf

Add XDP support to the bcmgenet driver, covering XDP_PASS, XDP_DROP,
XDP_TX, XDP_REDIRECT, and ndo_xdp_xmit.

The first patch converts the RX path from the existing kmalloc-based
allocation to page_pool, which is a prerequisite for XDP. The remaining
patches incrementally add XDP functionality and per-action statistics.

Tested on Raspberry Pi CM4 (BCM2711, bcmgenet, 1Gbps link):
- XDP_PASS: 943 Mbit/s TX, 935 Mbit/s RX (no regression vs baseline)
- XDP_PASS latency: 0.164ms avg, 0% packet loss
- XDP_DROP: all inbound traffic blocked as expected
- XDP_TX: TX counter increments (packet reflection working)
- Link flap with XDP attached: no errors
- Program swap under iperf3 load: no errors
- Upstream XDP selftests (xdp.py): pass_sb, drop_sb, tx_sb passing
- XDP-based EtherCAT master (~37 kHz cycle rate, all packet processing
  in BPF/XDP), stable over multiple days

Previous versions:
  v8: https://lore.kernel.org/netdev/20260428205846.2625550-1-nb@tipi-net.de/
  v7: https://lore.kernel.org/netdev/20260416054743.1289191-1-nb@tipi-net.de/
  v6: https://lore.kernel.org/netdev/20260406083536.839517-1-nb@tipi-net.de/
  v5: https://lore.kernel.org/netdev/20260328230513.415790-1-nb@tipi-net.de/
  v4: https://lore.kernel.org/netdev/20260323120539.136029-1-nb@tipi-net.de/
  v3: https://lore.kernel.org/netdev/20260319115402.353509-1-nb@tipi-net.de/
  v2: https://lore.kernel.org/netdev/20260315214914.1555777-1-nb@tipi-net.de/
  v1: https://lore.kernel.org/netdev/20260313092101.1344954-1-nb@tipi-net.de/

Changes since v8 (all from Jakub Kicinski's review):
  - Patch 1:
    * Add explicit #include <linux/bpf.h> for XDP_PACKET_HEADROOM.
    * Remove unused priv->rx_buf_len after page_pool conversion.
    * Sync only the RSB and the actual received frame length in
      page_pool_dma_sync_for_cpu(), instead of the full RX_BUF_LENGTH.
    * Add a runt length guard (len < GENET_RSB_PAD) to prevent
      __skb_put underflow on broken HW.
  - Patch 4:
    * Guard netdev_tx_reset_queue() in bcmgenet_tx_reclaim()'s
      all=true path against ring->index == DESC_INDEX, fixing an
      out-of-bounds dev->_tx[16] access during interface teardown.
    * Assign explicit lowest priority to ring 16 so XDP_TX does not
      preempt normal SKB TX under strict-priority arbitration.
  - Patch 5:
    * Stop manipulating xdp_features_clear/set_redirect_target() in
      bcmgenet_xdp_setup(); ndo_xdp_xmit works without a local XDP
      program, so leave NETDEV_XDP_ACT_NDO_XMIT advertised at all times.
  - Patch 6:
    * Increment xdp_tx_err on xdp_convert_buff_to_frame() failure.
    * Add separate xdp_aborted counter for XDP_ABORTED and invalid-
      action returns, instead of folding them into xdp_drop.

Changes since v7:
  - No code changes; resubmitted after net-next reopened.

Changes since v6:
  - Removed GENET_XDP_HEADROOM alias, use XDP_PACKET_HEADROOM
    directly. (Jakub Kicinski)
  - Dropped redundant __GFP_NOWARN from page_pool_alloc_pages(),
    page_pool adds it automatically. (Jakub Kicinski)
  - Removed floating code block in desc_rx, moved variables to outer
    scope. (Jakub Kicinski)
  - Make bcmgenet_run_xdp() return XDP_PASS when no program is set,
    removing the if (xdp_prog) indentation from desc_rx.
    (Jakub Kicinski)

Changes since v5:
  - Refactored desc_rx: always prepare xdp_buff and use
    bcmgenet_xdp_build_skb for both XDP and non-XDP paths, treating
    no-prog as XDP_PASS. (Jakub Kicinski)
  - Removed synchronize_net() before bpf_prog_put(), RCU handles
    the grace period. (Jakub Kicinski)
  - Save status->rx_csum before running XDP program to prevent
    bpf_xdp_adjust_head from corrupting the RSB checksum.
    (Jakub Kicinski)
  - Tightened TSB headroom check to include sizeof(struct xdp_frame).
    (Jakub Kicinski)
  - Fixed reclaim gating: check for pending frames on the XDP TX ring
    instead of priv->xdp_prog, so in-flight frames are still reclaimed
    after XDP program detach. (Jakub Kicinski)
  - Removed dead len -= ETH_FCS_LEN in patch 1. (Mohsin Bashir)
  - Added patch 7: minimal ndo_change_mtu that rejects MTU values
    incompatible with XDP when a program is attached. (Mohsin Bashir,
    Florian Fainelli)

Changes since v4:
  - Fixed unused variable warning: moved tx_ring declaration from
    patch 4 to patch 5 where it is first used. (Jakub Kicinski)

Changes since v3:
  - Fixed xdp_prepare_buff() called with meta_valid=false, causing
    bcmgenet_xdp_build_skb() to compute metasize=UINT_MAX and corrupt
    skb meta_len. Now passes true. (Simon Horman)
  - Removed bcmgenet_dump_tx_queue() for ring 16 in bcmgenet_timeout().
    Ring 16 has no netdev TX queue, so netdev_get_tx_queue(dev, 16)
    accessed beyond the allocated _tx array. (Simon Horman)
  - Fixed checkpatch alignment warnings in patches 4 and 5.

Changes since v2:
  - Fixed page leak on partial bcmgenet_alloc_rx_buffers() failure:
    free already-allocated rx_cbs before destroying page pool.
    (Simon Horman)
  - Fixed GENET_Q16_TX_BD_CNT defined as 64 instead of 32.
    (Simon Horman)
  - Moved XDP TX ring to a separate struct member (xdp_tx_ring)
    instead of expanding tx_rings[] to DESC_INDEX+1. (Justin Chen)
  - Added synchronize_net() before bpf_prog_put() in XDP prog swap.
  - Removed goto drop_page inside switch; inlined page_pool_put
    calls in each failure path. (Justin Chen)
  - Removed unnecessary curly braces around case XDP_TX. (Justin Chen)
  - Moved int err hoisting from patch 2 to patch 1. (Justin Chen)
  - Kept return type on same line as function name, per driver
    convention. (Justin Chen)
  - XDP TX packets/bytes now counted in TX reclaim for standard
    network statistics.

Changes since v1:
  - Fixed tx_rings[DESC_INDEX] out-of-bounds access. Expanded array
    to DESC_INDEX+1 and initialized ring 16 with dedicated BDs.
  - Use ring 16 (hardware default descriptor ring) for XDP TX,
    isolating from normal SKB TX queues.
  - Piggyback ring 16 TX completion on RX NAPI poll (INTRL2_1 bit
    collision with RX ring 0).
  - Fixed ring 16 TX reclaim: skip INTRL2_1 clear, skip BQL
    completion, use non-destructive reclaim in RX poll path.
  - Prepend zeroed TSB before XDP TX frame data (TBUF_64B_EN requires
    64-byte struct status_64 prefix on all TX buffers).
  - Tested with upstream XDP selftests (xdp.py): pass_sb, drop_sb,
    tx_sb all passing. The multi-buffer tests (pass_mb, drop_mb,
    tx_mb) fail because bcmgenet does not support jumbo frames /
    MTU changes; I plan to add ndo_change_mtu support in a follow-up
    series.

Nicolai Buchwitz (7):
  net: bcmgenet: convert RX path to page_pool
  net: bcmgenet: register xdp_rxq_info for each RX ring
  net: bcmgenet: add basic XDP support (PASS/DROP)
  net: bcmgenet: add XDP_TX support
  net: bcmgenet: add XDP_REDIRECT and ndo_xdp_xmit support
  net: bcmgenet: add XDP statistics counters
  net: bcmgenet: reject MTU changes incompatible with XDP

 drivers/net/ethernet/broadcom/Kconfig         |   1 +
 .../net/ethernet/broadcom/genet/bcmgenet.c    | 655 +++++++++++++++---
 .../net/ethernet/broadcom/genet/bcmgenet.h    |  21 +-
 3 files changed, 573 insertions(+), 104 deletions(-)

--
2.51.0


^ permalink raw reply

* [PATCH net-next v9 2/7] net: bcmgenet: register xdp_rxq_info for each RX ring
From: Nicolai Buchwitz @ 2026-05-06  9:55 UTC (permalink / raw)
  To: netdev
  Cc: Justin Chen, Simon Horman, Mohsin Bashir, Doug Berger,
	Florian Fainelli, Broadcom internal kernel review list,
	Andrew Lunn, Eric Dumazet, Paolo Abeni, Nicolai Buchwitz,
	David S. Miller, Jakub Kicinski, Alexei Starovoitov,
	Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
	Stanislav Fomichev, linux-kernel, bpf
In-Reply-To: <20260506095553.55357-1-nb@tipi-net.de>

Register an xdp_rxq_info per RX ring and associate it with the ring's
page_pool via MEM_TYPE_PAGE_POOL. This is required infrastructure for
XDP program execution: the XDP framework needs to know the memory model
backing each RX queue for correct page lifecycle management.

No functional change - XDP programs are not yet attached or executed.

Signed-off-by: Nicolai Buchwitz <nb@tipi-net.de>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 18 ++++++++++++++++++
 drivers/net/ethernet/broadcom/genet/bcmgenet.h |  2 ++
 2 files changed, 20 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index df11c4977e8f..5bedc18685b0 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -2789,7 +2789,23 @@ static int bcmgenet_rx_ring_create_pool(struct bcmgenet_priv *priv,
 		return err;
 	}
 
+	err = xdp_rxq_info_reg(&ring->xdp_rxq, priv->dev, ring->index, 0);
+	if (err)
+		goto err_free_pp;
+
+	err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, MEM_TYPE_PAGE_POOL,
+					 ring->page_pool);
+	if (err)
+		goto err_unreg_rxq;
+
 	return 0;
+
+err_unreg_rxq:
+	xdp_rxq_info_unreg(&ring->xdp_rxq);
+err_free_pp:
+	page_pool_destroy(ring->page_pool);
+	ring->page_pool = NULL;
+	return err;
 }
 
 /* Initialize a RDMA ring */
@@ -2818,6 +2834,7 @@ static int bcmgenet_init_rx_ring(struct bcmgenet_priv *priv,
 	if (ret) {
 		for (i = 0; i < ring->size; i++)
 			bcmgenet_free_rx_cb(ring->cbs + i, ring->page_pool);
+		xdp_rxq_info_unreg(&ring->xdp_rxq);
 		page_pool_destroy(ring->page_pool);
 		ring->page_pool = NULL;
 		return ret;
@@ -3023,6 +3040,7 @@ static void bcmgenet_destroy_rx_page_pools(struct bcmgenet_priv *priv)
 	for (i = 0; i <= priv->hw_params->rx_queues; ++i) {
 		ring = &priv->rx_rings[i];
 		if (ring->page_pool) {
+			xdp_rxq_info_unreg(&ring->xdp_rxq);
 			page_pool_destroy(ring->page_pool);
 			ring->page_pool = NULL;
 		}
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
index 7203bde37b78..da7b7fee896f 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
@@ -16,6 +16,7 @@
 #include <linux/dim.h>
 #include <linux/ethtool.h>
 #include <net/page_pool/helpers.h>
+#include <net/xdp.h>
 
 #include "../unimac.h"
 
@@ -579,6 +580,7 @@ struct bcmgenet_rx_ring {
 	u32		rx_max_coalesced_frames;
 	u32		rx_coalesce_usecs;
 	struct page_pool *page_pool;
+	struct xdp_rxq_info xdp_rxq;
 	struct bcmgenet_priv *priv;
 };
 
-- 
2.51.0


^ permalink raw reply related

* [PATCH net-next v9 3/7] net: bcmgenet: add basic XDP support (PASS/DROP)
From: Nicolai Buchwitz @ 2026-05-06  9:55 UTC (permalink / raw)
  To: netdev
  Cc: Justin Chen, Simon Horman, Mohsin Bashir, Doug Berger,
	Florian Fainelli, Broadcom internal kernel review list,
	Andrew Lunn, Eric Dumazet, Paolo Abeni, Nicolai Buchwitz,
	David S. Miller, Jakub Kicinski, Alexei Starovoitov,
	Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
	Stanislav Fomichev, linux-kernel, bpf
In-Reply-To: <20260506095553.55357-1-nb@tipi-net.de>

Add XDP program attachment via ndo_bpf and execute XDP programs in the
RX path. XDP_PASS builds an SKB from the xdp_buff (honoring any
xdp_adjust_head/tail changes); XDP_DROP returns the page to the
page_pool without allocating an SKB.

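XDP_TX and XDP_REDIRECT are not yet supported: they are handled like an
invalid action (warned about and traced as an XDP exception), the page
is returned to the page_pool, and XDP_ABORTED is returned.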

Advertise NETDEV_XDP_ACT_BASIC in xdp_features.

Signed-off-by: Nicolai Buchwitz <nb@tipi-net.de>
---
 .../net/ethernet/broadcom/genet/bcmgenet.c    | 129 +++++++++++++++---
 .../net/ethernet/broadcom/genet/bcmgenet.h    |   4 +
 2 files changed, 116 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 5bedc18685b0..ee1d4ecc2b87 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -36,6 +36,8 @@
 #include <linux/ipv6.h>
 #include <linux/phy.h>
 #include <linux/bpf.h>
+#include <linux/bpf_trace.h>
+#include <linux/filter.h>
 
 #include <linux/unaligned.h>
 
@@ -2276,6 +2278,56 @@ static int bcmgenet_rx_refill(struct bcmgenet_rx_ring *ring,
 	return 0;
 }
 
+static struct sk_buff *bcmgenet_xdp_build_skb(struct bcmgenet_rx_ring *ring,
+					      struct xdp_buff *xdp)
+{
+	unsigned int metasize;
+	struct sk_buff *skb;
+
+	skb = napi_build_skb(xdp->data_hard_start, PAGE_SIZE);
+	if (unlikely(!skb))
+		return NULL;
+
+	skb_mark_for_recycle(skb);
+
+	metasize = xdp->data - xdp->data_meta;
+	skb_reserve(skb, xdp->data - xdp->data_hard_start);
+	__skb_put(skb, xdp->data_end - xdp->data);
+
+	if (metasize)
+		skb_metadata_set(skb, metasize);
+
+	return skb;
+}
+
+static unsigned int bcmgenet_run_xdp(struct bcmgenet_rx_ring *ring,
+				     struct bpf_prog *prog,
+				     struct xdp_buff *xdp,
+				     struct page *rx_page)
+{
+	unsigned int act;
+
+	if (!prog)
+		return XDP_PASS;
+
+	act = bpf_prog_run_xdp(prog, xdp);
+
+	switch (act) {
+	case XDP_PASS:
+		return XDP_PASS;
+	case XDP_DROP:
+		page_pool_put_full_page(ring->page_pool, rx_page, true);
+		return XDP_DROP;
+	default:
+		bpf_warn_invalid_xdp_action(ring->priv->dev, prog, act);
+		fallthrough;
+	case XDP_ABORTED:
+		trace_xdp_exception(ring->priv->dev, prog, act);
+		page_pool_put_full_page(ring->page_pool, rx_page, true);
+		return XDP_ABORTED;
+	}
+}
+
 /* bcmgenet_desc_rx - descriptor based rx process.
  * this could be called from bottom half, or from NAPI polling method.
  */
@@ -2285,6 +2337,7 @@ static unsigned int bcmgenet_desc_rx(struct bcmgenet_rx_ring *ring,
 	struct bcmgenet_rx_stats64 *stats = &ring->stats64;
 	struct bcmgenet_priv *priv = ring->priv;
 	struct net_device *dev = priv->dev;
+	struct bpf_prog *xdp_prog;
 	struct enet_cb *cb;
 	struct sk_buff *skb;
 	u32 dma_length_status;
@@ -2295,6 +2348,8 @@ static unsigned int bcmgenet_desc_rx(struct bcmgenet_rx_ring *ring,
 	unsigned int p_index, mask;
 	unsigned int discards;
 
+	xdp_prog = READ_ONCE(priv->xdp_prog);
+
 	/* Clear status before servicing to reduce spurious interrupts */
 	mask = 1 << (UMAC_IRQ1_RX_INTR_SHIFT + ring->index);
 	bcmgenet_intrl2_1_writel(priv, mask, INTRL2_CPU_CLEAR);
@@ -2326,9 +2381,12 @@ static unsigned int bcmgenet_desc_rx(struct bcmgenet_rx_ring *ring,
 	       (rxpktprocessed < budget)) {
 		struct status_64 *status;
 		struct page *rx_page;
+		unsigned int xdp_act;
 		unsigned int rx_off;
-		__be16 rx_csum;
+		struct xdp_buff xdp;
+		__be16 rx_csum = 0;
 		void *hard_start;
+		int pkt_len;
 
 		cb = &priv->rx_cbs[ring->read_ptr];
 
@@ -2413,30 +2471,34 @@ static unsigned int bcmgenet_desc_rx(struct bcmgenet_rx_ring *ring,
 			goto next;
 		} /* error packet */
 
-		/* Build SKB from the page - data starts at hard_start,
-		 * frame begins after RSB(64) + pad(2) = 66 bytes.
+		pkt_len = len - GENET_RSB_PAD;
+		if (priv->crc_fwd_en)
+			pkt_len -= ETH_FCS_LEN;
+
+		/* Save rx_csum before XDP runs - an XDP program
+		 * could overwrite the RSB via bpf_xdp_adjust_head.
 		 */
-		skb = napi_build_skb(hard_start, PAGE_SIZE - XDP_PACKET_HEADROOM);
-		if (unlikely(!skb)) {
-			BCMGENET_STATS64_INC(stats, dropped);
-			page_pool_put_full_page(ring->page_pool, rx_page,
-						true);
-			goto next;
-		}
+		if (dev->features & NETIF_F_RXCSUM)
+			rx_csum = (__force __be16)(status->rx_csum & 0xffff);
 
-		skb_mark_for_recycle(skb);
+		xdp_init_buff(&xdp, PAGE_SIZE, &ring->xdp_rxq);
+		xdp_prepare_buff(&xdp, page_address(rx_page),
+				 GENET_RX_HEADROOM, pkt_len, true);
 
-		/* Reserve the RSB + pad, then set the data length */
-		skb_reserve(skb, GENET_RSB_PAD);
-		__skb_put(skb, len - GENET_RSB_PAD);
+		xdp_act = bcmgenet_run_xdp(ring, xdp_prog, &xdp, rx_page);
+		if (xdp_act != XDP_PASS)
+			goto next;
 
-		if (priv->crc_fwd_en) {
-			skb_trim(skb, skb->len - ETH_FCS_LEN);
+		skb = bcmgenet_xdp_build_skb(ring, &xdp);
+		if (unlikely(!skb)) {
+			BCMGENET_STATS64_INC(stats, dropped);
+			page_pool_put_full_page(ring->page_pool,
+						rx_page, true);
+			goto next;
 		}
 
 		/* Set up checksum offload */
 		if (dev->features & NETIF_F_RXCSUM) {
-			rx_csum = (__force __be16)(status->rx_csum & 0xffff);
 			if (rx_csum) {
 				skb->csum = (__force __wsum)ntohs(rx_csum);
 				skb->ip_summed = CHECKSUM_COMPLETE;
@@ -3750,6 +3812,37 @@ static int bcmgenet_change_carrier(struct net_device *dev, bool new_carrier)
 	return 0;
 }
 
+static int bcmgenet_xdp_setup(struct net_device *dev,
+			      struct netdev_bpf *xdp)
+{
+	struct bcmgenet_priv *priv = netdev_priv(dev);
+	struct bpf_prog *old_prog;
+	struct bpf_prog *prog = xdp->prog;
+
+	if (prog && dev->mtu > PAGE_SIZE - GENET_RX_HEADROOM -
+	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) {
+		NL_SET_ERR_MSG_MOD(xdp->extack,
+				   "MTU too large for single-page XDP buffer");
+		return -EOPNOTSUPP;
+	}
+
+	old_prog = xchg(&priv->xdp_prog, prog);
+	if (old_prog)
+		bpf_prog_put(old_prog);
+
+	return 0;
+}
+
+static int bcmgenet_xdp(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return bcmgenet_xdp_setup(dev, xdp);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
 static const struct net_device_ops bcmgenet_netdev_ops = {
 	.ndo_open		= bcmgenet_open,
 	.ndo_stop		= bcmgenet_close,
@@ -3761,6 +3854,7 @@ static const struct net_device_ops bcmgenet_netdev_ops = {
 	.ndo_set_features	= bcmgenet_set_features,
 	.ndo_get_stats64	= bcmgenet_get_stats64,
 	.ndo_change_carrier	= bcmgenet_change_carrier,
+	.ndo_bpf		= bcmgenet_xdp,
 };
 
 /* GENET hardware parameters/characteristics */
@@ -4063,6 +4157,7 @@ static int bcmgenet_probe(struct platform_device *pdev)
 			 NETIF_F_RXCSUM;
 	dev->hw_features |= dev->features;
 	dev->vlan_features |= dev->features;
+	dev->xdp_features = NETDEV_XDP_ACT_BASIC;
 
 	netdev_sw_irq_coalesce_default_on(dev);
 
diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.h b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
index da7b7fee896f..3d65f0e4b4b4 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.h
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.h
@@ -16,6 +16,7 @@
 #include <linux/dim.h>
 #include <linux/ethtool.h>
 #include <net/page_pool/helpers.h>
+#include <linux/bpf.h>
 #include <net/xdp.h>
 
 #include "../unimac.h"
@@ -670,6 +671,9 @@ struct bcmgenet_priv {
 	u8 sopass[SOPASS_MAX];
 
 	struct bcmgenet_mib_counters mib;
+
+	/* XDP */
+	struct bpf_prog *xdp_prog;
 };
 
 static inline bool bcmgenet_has_40bits(struct bcmgenet_priv *priv)
-- 
2.51.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox