public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH net] net: flow_offload: protect driver_block_list in flow_block_cb_setup_simple()
From: Shigeru Yoshida @ 2026-02-08 11:00 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Pablo Neira Ayuso, Florian Westphal, Phil Sutter,
	Shigeru Yoshida
  Cc: syzbot+5a66db916cdde0dbcc1c, netdev, linux-kernel,
	netfilter-devel, coreteam

syzbot reported a list_del corruption in flow_block_cb_setup_simple(). [0]

flow_block_cb_setup_simple() accesses the driver_block_list (e.g.,
netdevsim's nsim_block_cb_list) without any synchronization. The
nftables offload path calls into this function via ndo_setup_tc while
holding the per-netns commit_mutex, but this mutex does not prevent
concurrent access from tasks in different network namespaces that
share the same driver_block_list, leading to list corruption:

- Task A (FLOW_BLOCK_BIND) calls list_add_tail() to insert a new
  flow_block_cb into driver_block_list.

- Task B (FLOW_BLOCK_UNBIND) concurrently calls list_del() on another
  flow_block_cb from the same list.

- The concurrent modifications corrupt the list pointers.

Fix this by adding a static mutex (flow_block_cb_list_lock) that
protects all driver_block_list operations within
flow_block_cb_setup_simple(). Also add a flow_block_cb_remove_driver()
helper for external callers that need to remove a block_cb from the
driver list under the same lock, and convert nft_indr_block_cleanup()
to use it.

[0]:
list_del corruption. prev->next should be ffff888028878200, but was ffffffff8e940fc0. (prev=ffffffff8e940fc0)
------------[ cut here ]------------
kernel BUG at lib/list_debug.c:64!
Oops: invalid opcode: 0000 [#1] SMP KASAN PTI
CPU: 1 UID: 0 PID: 6308 Comm: syz.3.231 Not tainted syzkaller #0 PREEMPT(full)
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/18/2025
RIP: 0010:__list_del_entry_valid_or_report+0x15a/0x190 lib/list_debug.c:62
[...]
Call Trace:
 <TASK>
 __list_del_entry_valid include/linux/list.h:124 [inline]
 __list_del_entry include/linux/list.h:215 [inline]
 list_del include/linux/list.h:229 [inline]
 flow_block_cb_setup_simple+0x62d/0x740 net/core/flow_offload.c:369
 nft_block_offload_cmd net/netfilter/nf_tables_offload.c:397 [inline]
 nft_chain_offload_cmd+0x293/0x660 net/netfilter/nf_tables_offload.c:451
 nft_flow_block_chain net/netfilter/nf_tables_offload.c:471 [inline]
 nft_flow_offload_chain net/netfilter/nf_tables_offload.c:513 [inline]
 nft_flow_rule_offload_commit+0x40d/0x1b60 net/netfilter/nf_tables_offload.c:592
 nf_tables_commit+0x675/0x8710 net/netfilter/nf_tables_api.c:10925
 nfnetlink_rcv_batch net/netfilter/nfnetlink.c:576 [inline]
 nfnetlink_rcv_skb_batch net/netfilter/nfnetlink.c:649 [inline]
 nfnetlink_rcv+0x1ac9/0x2590 net/netfilter/nfnetlink.c:667
 netlink_unicast_kernel net/netlink/af_netlink.c:1320 [inline]
 netlink_unicast+0x82c/0x9e0 net/netlink/af_netlink.c:1346
 netlink_sendmsg+0x805/0xb30 net/netlink/af_netlink.c:1896
 sock_sendmsg_nosec net/socket.c:727 [inline]
 __sock_sendmsg+0x219/0x270 net/socket.c:742
 ____sys_sendmsg+0x505/0x830 net/socket.c:2630
 ___sys_sendmsg+0x21f/0x2a0 net/socket.c:2684
 __sys_sendmsg net/socket.c:2716 [inline]
 __do_sys_sendmsg net/socket.c:2721 [inline]
 __se_sys_sendmsg net/socket.c:2719 [inline]
 __x64_sys_sendmsg+0x19b/0x260 net/socket.c:2719
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f

Fixes: 955bcb6ea0df ("drivers: net: use flow block API")
Reported-by: syzbot+5a66db916cdde0dbcc1c@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=5a66db916cdde0dbcc1c
Tested-by: syzbot+5a66db916cdde0dbcc1c@syzkaller.appspotmail.com
Signed-off-by: Shigeru Yoshida <syoshida@redhat.com>
---
 include/net/flow_offload.h        |  2 ++
 net/core/flow_offload.c           | 41 ++++++++++++++++++++++++-------
 net/netfilter/nf_tables_offload.c |  2 +-
 3 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 596ab9791e4d..ff6d2bcb2cca 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -673,6 +673,8 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f,
 			       flow_setup_cb_t *cb,
 			       void *cb_ident, void *cb_priv, bool ingress_only);
 
+void flow_block_cb_remove_driver(struct flow_block_cb *block_cb);
+
 enum flow_cls_command {
 	FLOW_CLS_REPLACE,
 	FLOW_CLS_DESTROY,
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index bc5169482710..137a44af5e1c 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -334,6 +334,8 @@ bool flow_block_cb_is_busy(flow_setup_cb_t *cb, void *cb_ident,
 }
 EXPORT_SYMBOL(flow_block_cb_is_busy);
 
+static DEFINE_MUTEX(flow_block_cb_list_lock);
+
 int flow_block_cb_setup_simple(struct flow_block_offload *f,
 			       struct list_head *driver_block_list,
 			       flow_setup_cb_t *cb,
@@ -341,6 +343,7 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f,
 			       bool ingress_only)
 {
 	struct flow_block_cb *block_cb;
+	int err = 0;
 
 	if (ingress_only &&
 	    f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
@@ -348,32 +351,52 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f,
 
 	f->driver_block_list = driver_block_list;
 
+	mutex_lock(&flow_block_cb_list_lock);
+
 	switch (f->command) {
 	case FLOW_BLOCK_BIND:
-		if (flow_block_cb_is_busy(cb, cb_ident, driver_block_list))
-			return -EBUSY;
+		if (flow_block_cb_is_busy(cb, cb_ident, driver_block_list)) {
+			err = -EBUSY;
+			break;
+		}
 
 		block_cb = flow_block_cb_alloc(cb, cb_ident, cb_priv, NULL);
-		if (IS_ERR(block_cb))
-			return PTR_ERR(block_cb);
+		if (IS_ERR(block_cb)) {
+			err = PTR_ERR(block_cb);
+			break;
+		}
 
 		flow_block_cb_add(block_cb, f);
 		list_add_tail(&block_cb->driver_list, driver_block_list);
-		return 0;
+		break;
 	case FLOW_BLOCK_UNBIND:
 		block_cb = flow_block_cb_lookup(f->block, cb, cb_ident);
-		if (!block_cb)
-			return -ENOENT;
+		if (!block_cb) {
+			err = -ENOENT;
+			break;
+		}
 
 		flow_block_cb_remove(block_cb, f);
 		list_del(&block_cb->driver_list);
-		return 0;
+		break;
 	default:
-		return -EOPNOTSUPP;
+		err = -EOPNOTSUPP;
+		break;
 	}
+
+	mutex_unlock(&flow_block_cb_list_lock);
+	return err;
 }
 EXPORT_SYMBOL(flow_block_cb_setup_simple);
 
+void flow_block_cb_remove_driver(struct flow_block_cb *block_cb)
+{
+	mutex_lock(&flow_block_cb_list_lock);
+	list_del(&block_cb->driver_list);
+	mutex_unlock(&flow_block_cb_list_lock);
+}
+EXPORT_SYMBOL(flow_block_cb_remove_driver);
+
 static DEFINE_MUTEX(flow_indr_block_lock);
 static LIST_HEAD(flow_block_indr_list);
 static LIST_HEAD(flow_block_indr_dev_list);
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
index fd30e205de84..d60838bceafb 100644
--- a/net/netfilter/nf_tables_offload.c
+++ b/net/netfilter/nf_tables_offload.c
@@ -414,7 +414,7 @@ static void nft_indr_block_cleanup(struct flow_block_cb *block_cb)
 				    basechain, &extack);
 	nft_net = nft_pernet(net);
 	mutex_lock(&nft_net->commit_mutex);
-	list_del(&block_cb->driver_list);
+	flow_block_cb_remove_driver(block_cb);
 	list_move(&block_cb->list, &bo.cb_list);
 	nft_flow_offload_unbind(&bo, basechain);
 	mutex_unlock(&nft_net->commit_mutex);
-- 
2.52.0


^ permalink raw reply related

* Re: [PATCH net-next v20 10/12] virtio_net: Add support for IPv6 ethtool steering
From: Michael S. Tsirkin @ 2026-02-08 10:08 UTC (permalink / raw)
  To: Daniel Jurgens
  Cc: netdev, jasowang, pabeni, virtualization, parav, shshitrit,
	yohadt, xuanzhuo, eperezma, jgg, kevin.tian, kuba, andrew+netdev,
	edumazet
In-Reply-To: <20260205224707.16995-11-danielj@nvidia.com>

> @@ -6111,20 +6176,38 @@ static int setup_ip_key_mask(struct virtio_net_ff_selector *selector,
>  			     u8 *key,
>  			     const struct ethtool_rx_flow_spec *fs)
>  {
> +	struct ipv6hdr *v6_m = (struct ipv6hdr *)&selector->mask;
>  	struct iphdr *v4_m = (struct iphdr *)&selector->mask;
> +	struct ipv6hdr *v6_k = (struct ipv6hdr *)key;
>  	struct iphdr *v4_k = (struct iphdr *)key;
>  
> -	selector->type = VIRTIO_NET_FF_MASK_TYPE_IPV4;
> -	selector->length = sizeof(struct iphdr);
> +	if (has_ipv6(fs->flow_type)) {
> +		selector->type = VIRTIO_NET_FF_MASK_TYPE_IPV6;
> +		selector->length = sizeof(struct ipv6hdr);
> +
> +		/* exclude tclass, it's not exposed properly struct ip6hdr */

do you mean:

+		/* exclude tclass, it's not exposed properly in struct ipv6hdr */

?

and maybe properly -> directly?

> +		if (fs->h_u.usr_ip6_spec.l4_4_bytes ||
> +		    fs->m_u.usr_ip6_spec.l4_4_bytes ||
> +		    fs->h_u.usr_ip6_spec.tclass ||
> +		    fs->m_u.usr_ip6_spec.tclass ||
> +		    fs->h_u.usr_ip6_spec.l4_proto ||
> +		    fs->m_u.usr_ip6_spec.l4_proto)
> +			return -EINVAL;
>  
> -	if (fs->h_u.usr_ip4_spec.l4_4_bytes ||
> -	    fs->h_u.usr_ip4_spec.ip_ver != ETH_RX_NFC_IP4 ||
> -	    fs->m_u.usr_ip4_spec.l4_4_bytes ||
> -	    fs->m_u.usr_ip4_spec.ip_ver ||
> -	    fs->m_u.usr_ip4_spec.proto)
> -		return -EINVAL;
> +		parse_ip6(v6_m, v6_k, fs);
> +	} else {
> +		selector->type = VIRTIO_NET_FF_MASK_TYPE_IPV4;
> +		selector->length = sizeof(struct iphdr);
> +
> +		if (fs->h_u.usr_ip4_spec.l4_4_bytes ||
> +		    fs->h_u.usr_ip4_spec.ip_ver != ETH_RX_NFC_IP4 ||
> +		    fs->m_u.usr_ip4_spec.l4_4_bytes ||
> +		    fs->m_u.usr_ip4_spec.ip_ver ||
> +		    fs->m_u.usr_ip4_spec.proto)
> +			return -EINVAL;
>  
> -	parse_ip4(v4_m, v4_k, fs);
> +		parse_ip4(v4_m, v4_k, fs);
> +	}
>  
>  	return 0;
>  }
> @@ -6194,7 +6277,7 @@ static int build_and_insert(struct virtnet_ff *ff,
>  
>  	setup_eth_hdr_key_mask(selector, key, fs, num_hdrs);
>  
> -	if (has_ipv4(fs->flow_type)) {
> +	if (has_ipv4(fs->flow_type) || has_ipv6(fs->flow_type)) {
>  		selector = next_selector(selector);
>  
>  		err = setup_ip_key_mask(selector, key + sizeof(struct ethhdr), fs);
> -- 
> 2.50.1


^ permalink raw reply

* [PATCH net-next v26 8/8] eea: introduce callback for ndo_get_stats64
From: Xuan Zhuo @ 2026-02-08  8:46 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Xuan Zhuo, Wen Gu, Philo Lu, Lorenzo Bianconi,
	Vadim Fedorenko, Dong Yibo, Heiner Kallweit, Lukas Bulwahn,
	Dust Li
In-Reply-To: <20260208084613.2658-1-xuanzhuo@linux.alibaba.com>

This commit introduces ndo_get_stats64 support.

Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Reviewed-by: Philo Lu <lulie@linux.alibaba.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 drivers/net/ethernet/alibaba/eea/eea_net.c | 55 ++++++++++++++++++++++
 drivers/net/ethernet/alibaba/eea/eea_net.h |  5 ++
 2 files changed, 60 insertions(+)

diff --git a/drivers/net/ethernet/alibaba/eea/eea_net.c b/drivers/net/ethernet/alibaba/eea/eea_net.c
index bb147a0866bc..d1e9a8d5eff1 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_net.c
+++ b/drivers/net/ethernet/alibaba/eea/eea_net.c
@@ -106,6 +106,8 @@ static void eea_bind_q_and_cfg(struct eea_net *enet,
 	struct eea_net_tx *tx;
 	int i;
 
+	spin_lock(&enet->stats_lock);
+
 	enet->cfg = ctx->cfg;
 	enet->rx = ctx->rx;
 	enet->tx = ctx->tx;
@@ -125,6 +127,8 @@ static void eea_bind_q_and_cfg(struct eea_net *enet,
 
 		blk->rx = rx;
 	}
+
+	spin_unlock(&enet->stats_lock);
 }
 
 static void eea_unbind_q_and_cfg(struct eea_net *enet,
@@ -134,6 +138,8 @@ static void eea_unbind_q_and_cfg(struct eea_net *enet,
 	struct eea_net_rx *rx;
 	int i;
 
+	spin_lock(&enet->stats_lock);
+
 	ctx->cfg = enet->cfg;
 	ctx->rx = enet->rx;
 	ctx->tx = enet->tx;
@@ -150,6 +156,8 @@ static void eea_unbind_q_and_cfg(struct eea_net *enet,
 
 		blk->rx = NULL;
 	}
+
+	spin_unlock(&enet->stats_lock);
 }
 
 static void eea_free_rxtx_q_mem(struct eea_net_init_ctx *ctx)
@@ -336,6 +344,50 @@ static int eea_netdev_open(struct net_device *netdev)
 	return err;
 }
 
+static void eea_stats(struct net_device *netdev, struct rtnl_link_stats64 *tot)
+{
+	struct eea_net *enet = netdev_priv(netdev);
+	u64 packets, bytes;
+	u32 start;
+	int i;
+
+	spin_lock(&enet->stats_lock);
+
+	if (enet->rx) {
+		for (i = 0; i < enet->cfg.rx_ring_num; i++) {
+			struct eea_net_rx *rx = enet->rx[i];
+
+			do {
+				start = u64_stats_fetch_begin(&rx->stats.syncp);
+				packets = u64_stats_read(&rx->stats.packets);
+				bytes = u64_stats_read(&rx->stats.bytes);
+			} while (u64_stats_fetch_retry(&rx->stats.syncp,
+						       start));
+
+			tot->rx_packets += packets;
+			tot->rx_bytes   += bytes;
+		}
+	}
+
+	if (enet->tx) {
+		for (i = 0; i < enet->cfg.tx_ring_num; i++) {
+			struct eea_net_tx *tx = &enet->tx[i];
+
+			do {
+				start = u64_stats_fetch_begin(&tx->stats.syncp);
+				packets = u64_stats_read(&tx->stats.packets);
+				bytes = u64_stats_read(&tx->stats.bytes);
+			} while (u64_stats_fetch_retry(&tx->stats.syncp,
+						       start));
+
+			tot->tx_packets += packets;
+			tot->tx_bytes   += bytes;
+		}
+	}
+
+	spin_unlock(&enet->stats_lock);
+}
+
 /* resources: ring, buffers, irq */
 int eea_reset_hw_resources(struct eea_net *enet, struct eea_net_init_ctx *ctx)
 {
@@ -565,6 +617,7 @@ static const struct net_device_ops eea_netdev = {
 	.ndo_stop           = eea_netdev_stop,
 	.ndo_start_xmit     = eea_tx_xmit,
 	.ndo_validate_addr  = eth_validate_addr,
+	.ndo_get_stats64    = eea_stats,
 	.ndo_features_check = passthru_features_check,
 	.ndo_tx_timeout     = eea_tx_timeout,
 };
@@ -599,6 +652,8 @@ static struct eea_net *eea_netdev_alloc(struct eea_device *edev, u32 pairs)
 		return NULL;
 	}
 
+	spin_lock_init(&enet->stats_lock);
+
 	return enet;
 }
 
diff --git a/drivers/net/ethernet/alibaba/eea/eea_net.h b/drivers/net/ethernet/alibaba/eea/eea_net.h
index 037585410ad1..001d105604ba 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_net.h
+++ b/drivers/net/ethernet/alibaba/eea/eea_net.h
@@ -161,6 +161,11 @@ struct eea_net {
 	u32 speed;
 
 	u64 hw_ts_offset;
+
+	/* Protect the tx and rx of struct eea_net, when eea_stats accesses the
+	 * stats from rx and tx queues.
+	 */
+	spinlock_t stats_lock;
 };
 
 int eea_net_probe(struct eea_device *edev);
-- 
2.32.0.3.g01195cf9f


^ permalink raw reply related

* [PATCH net-next v26 7/8] eea: introduce ethtool support
From: Xuan Zhuo @ 2026-02-08  8:46 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Xuan Zhuo, Wen Gu, Philo Lu, Lorenzo Bianconi,
	Vadim Fedorenko, Dong Yibo, Heiner Kallweit, Lukas Bulwahn,
	Dust Li, Andrew Lunn
In-Reply-To: <20260208084613.2658-1-xuanzhuo@linux.alibaba.com>

Add basic driver framework for the Alibaba Elastic Ethernet Adapter (EEA).

This commit introduces ethtool support.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Reviewed-by: Philo Lu <lulie@linux.alibaba.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 drivers/net/ethernet/alibaba/eea/Makefile     |   1 +
 .../net/ethernet/alibaba/eea/eea_ethtool.c    | 243 ++++++++++++++++++
 .../net/ethernet/alibaba/eea/eea_ethtool.h    |  49 ++++
 drivers/net/ethernet/alibaba/eea/eea_net.c    |   1 +
 drivers/net/ethernet/alibaba/eea/eea_net.h    |   5 +
 drivers/net/ethernet/alibaba/eea/eea_rx.c     |  26 +-
 drivers/net/ethernet/alibaba/eea/eea_tx.c     |  24 +-
 7 files changed, 346 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_ethtool.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_ethtool.h

diff --git a/drivers/net/ethernet/alibaba/eea/Makefile b/drivers/net/ethernet/alibaba/eea/Makefile
index fa34a005fa01..8f8fbb8d2d9a 100644
--- a/drivers/net/ethernet/alibaba/eea/Makefile
+++ b/drivers/net/ethernet/alibaba/eea/Makefile
@@ -4,5 +4,6 @@ eea-y := eea_ring.o \
 	eea_net.o \
 	eea_pci.o \
 	eea_adminq.o \
+	eea_ethtool.o \
 	eea_tx.o \
 	eea_rx.o
diff --git a/drivers/net/ethernet/alibaba/eea/eea_ethtool.c b/drivers/net/ethernet/alibaba/eea/eea_ethtool.c
new file mode 100644
index 000000000000..011a5532f8c2
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_ethtool.c
@@ -0,0 +1,243 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#include <linux/ethtool.h>
+#include <linux/ethtool_netlink.h>
+#include <linux/rtnetlink.h>
+
+#include "eea_adminq.h"
+
+struct eea_stat_desc {
+	char desc[ETH_GSTRING_LEN];
+	size_t offset;
+};
+
+#define EEA_TX_STAT(m)	{#m, offsetof(struct eea_tx_stats, m)}
+#define EEA_RX_STAT(m)	{#m, offsetof(struct eea_rx_stats, m)}
+
+static const struct eea_stat_desc eea_rx_stats_desc[] = {
+	EEA_RX_STAT(descs),
+	EEA_RX_STAT(kicks),
+};
+
+static const struct eea_stat_desc eea_tx_stats_desc[] = {
+	EEA_TX_STAT(descs),
+	EEA_TX_STAT(kicks),
+};
+
+#define EEA_TX_STATS_LEN	ARRAY_SIZE(eea_tx_stats_desc)
+#define EEA_RX_STATS_LEN	ARRAY_SIZE(eea_rx_stats_desc)
+
+static void eea_get_drvinfo(struct net_device *netdev,
+			    struct ethtool_drvinfo *info)
+{
+	struct eea_net *enet = netdev_priv(netdev);
+	struct eea_device *edev = enet->edev;
+
+	strscpy(info->driver,   KBUILD_MODNAME,     sizeof(info->driver));
+	strscpy(info->bus_info, eea_pci_name(edev), sizeof(info->bus_info));
+}
+
+static void eea_get_ringparam(struct net_device *netdev,
+			      struct ethtool_ringparam *ring,
+			      struct kernel_ethtool_ringparam *kernel_ring,
+			      struct netlink_ext_ack *extack)
+{
+	struct eea_net *enet = netdev_priv(netdev);
+
+	ring->rx_max_pending = enet->cfg_hw.rx_ring_depth;
+	ring->tx_max_pending = enet->cfg_hw.tx_ring_depth;
+	ring->rx_pending = enet->cfg.rx_ring_depth;
+	ring->tx_pending = enet->cfg.tx_ring_depth;
+
+	kernel_ring->tcp_data_split = enet->cfg.split_hdr ?
+				      ETHTOOL_TCP_DATA_SPLIT_ENABLED :
+				      ETHTOOL_TCP_DATA_SPLIT_DISABLED;
+}
+
+static int eea_set_ringparam(struct net_device *netdev,
+			     struct ethtool_ringparam *ring,
+			     struct kernel_ethtool_ringparam *kernel_ring,
+			     struct netlink_ext_ack *extack)
+{
+	struct eea_net *enet = netdev_priv(netdev);
+	struct eea_net_init_ctx ctx;
+	bool need_update = false;
+	struct eea_net_cfg *cfg;
+	bool sh;
+
+	eea_init_ctx(enet, &ctx);
+
+	cfg = &ctx.cfg;
+
+	if (ring->rx_pending != cfg->rx_ring_depth)
+		need_update = true;
+
+	if (ring->tx_pending != cfg->tx_ring_depth)
+		need_update = true;
+
+	sh = kernel_ring->tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_ENABLED;
+	if (sh != !!(cfg->split_hdr))
+		need_update = true;
+
+	if (!need_update)
+		return 0;
+
+	cfg->rx_ring_depth = ring->rx_pending;
+	cfg->tx_ring_depth = ring->tx_pending;
+
+	cfg->split_hdr = sh ? enet->cfg_hw.split_hdr : 0;
+
+	return eea_reset_hw_resources(enet, &ctx);
+}
+
+static int eea_set_channels(struct net_device *netdev,
+			    struct ethtool_channels *channels)
+{
+	struct eea_net *enet = netdev_priv(netdev);
+	u16 queue_pairs = channels->combined_count;
+	struct eea_net_init_ctx ctx;
+	struct eea_net_cfg *cfg;
+
+	eea_init_ctx(enet, &ctx);
+
+	cfg = &ctx.cfg;
+
+	cfg->rx_ring_num = queue_pairs;
+	cfg->tx_ring_num = queue_pairs;
+
+	return eea_reset_hw_resources(enet, &ctx);
+}
+
+static void eea_get_channels(struct net_device *netdev,
+			     struct ethtool_channels *channels)
+{
+	struct eea_net *enet = netdev_priv(netdev);
+
+	channels->combined_count = enet->cfg.rx_ring_num;
+	channels->max_combined   = enet->cfg_hw.rx_ring_num;
+}
+
+static void eea_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
+{
+	struct eea_net *enet = netdev_priv(netdev);
+	u8 *p = data;
+	u32 i, j;
+
+	if (stringset != ETH_SS_STATS)
+		return;
+
+	for (i = 0; i < enet->cfg.rx_ring_num; i++) {
+		for (j = 0; j < EEA_RX_STATS_LEN; j++)
+			ethtool_sprintf(&p, "rx%u_%s", i,
+					eea_rx_stats_desc[j].desc);
+	}
+
+	for (i = 0; i < enet->cfg.tx_ring_num; i++) {
+		for (j = 0; j < EEA_TX_STATS_LEN; j++)
+			ethtool_sprintf(&p, "tx%u_%s", i,
+					eea_tx_stats_desc[j].desc);
+	}
+}
+
+static int eea_get_sset_count(struct net_device *netdev, int sset)
+{
+	struct eea_net *enet = netdev_priv(netdev);
+
+	if (sset != ETH_SS_STATS)
+		return -EOPNOTSUPP;
+
+	return enet->cfg.rx_ring_num * EEA_RX_STATS_LEN +
+		enet->cfg.tx_ring_num * EEA_TX_STATS_LEN;
+}
+
+static void eea_stats_fill_for_q(struct u64_stats_sync *syncp, u32 num,
+				 const struct eea_stat_desc *desc,
+				 u64 *data, u32 idx)
+{
+	void *stats_base = syncp;
+	u32 start, i;
+
+	do {
+		start = u64_stats_fetch_begin(syncp);
+		for (i = 0; i < num; i++)
+			data[idx + i] =
+				u64_stats_read(stats_base + desc[i].offset);
+
+	} while (u64_stats_fetch_retry(syncp, start));
+}
+
+static void eea_get_ethtool_stats(struct net_device *netdev,
+				  struct ethtool_stats *stats, u64 *data)
+{
+	struct eea_net *enet = netdev_priv(netdev);
+	u32 i, idx = 0;
+
+	ASSERT_RTNL();
+
+	if (enet->rx) {
+		for (i = 0; i < enet->cfg.rx_ring_num; i++) {
+			struct eea_net_rx *rx = enet->rx[i];
+
+			eea_stats_fill_for_q(&rx->stats.syncp, EEA_RX_STATS_LEN,
+					     eea_rx_stats_desc, data, idx);
+
+			idx += EEA_RX_STATS_LEN;
+		}
+	}
+
+	if (enet->tx) {
+		for (i = 0; i < enet->cfg.tx_ring_num; i++) {
+			struct eea_net_tx *tx = &enet->tx[i];
+
+			eea_stats_fill_for_q(&tx->stats.syncp, EEA_TX_STATS_LEN,
+					     eea_tx_stats_desc, data, idx);
+
+			idx += EEA_TX_STATS_LEN;
+		}
+	}
+}
+
+void eea_update_rx_stats(struct eea_rx_stats *rx_stats,
+			 struct eea_rx_ctx_stats *stats)
+{
+	u64_stats_update_begin(&rx_stats->syncp);
+	u64_stats_add(&rx_stats->descs,             stats->descs);
+	u64_stats_add(&rx_stats->packets,           stats->packets);
+	u64_stats_add(&rx_stats->bytes,             stats->bytes);
+	u64_stats_add(&rx_stats->drops,             stats->drops);
+	u64_stats_add(&rx_stats->split_hdr_bytes,   stats->split_hdr_bytes);
+	u64_stats_add(&rx_stats->split_hdr_packets, stats->split_hdr_packets);
+	u64_stats_add(&rx_stats->length_errors,     stats->length_errors);
+	u64_stats_update_end(&rx_stats->syncp);
+}
+
+static int eea_get_link_ksettings(struct net_device *netdev,
+				  struct ethtool_link_ksettings *cmd)
+{
+	struct eea_net *enet = netdev_priv(netdev);
+
+	cmd->base.speed  = enet->speed;
+	cmd->base.duplex = enet->duplex;
+	cmd->base.port   = PORT_OTHER;
+
+	return 0;
+}
+
+const struct ethtool_ops eea_ethtool_ops = {
+	.supported_ring_params = ETHTOOL_RING_USE_TCP_DATA_SPLIT,
+	.get_drvinfo        = eea_get_drvinfo,
+	.get_link           = ethtool_op_get_link,
+	.get_ringparam      = eea_get_ringparam,
+	.set_ringparam      = eea_set_ringparam,
+	.set_channels       = eea_set_channels,
+	.get_channels       = eea_get_channels,
+	.get_strings        = eea_get_strings,
+	.get_sset_count     = eea_get_sset_count,
+	.get_ethtool_stats  = eea_get_ethtool_stats,
+	.get_link_ksettings = eea_get_link_ksettings,
+};
diff --git a/drivers/net/ethernet/alibaba/eea/eea_ethtool.h b/drivers/net/ethernet/alibaba/eea/eea_ethtool.h
new file mode 100644
index 000000000000..a437065d1cab
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_ethtool.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#ifndef __EEA_ETHTOOL_H__
+#define __EEA_ETHTOOL_H__
+
+struct eea_tx_stats {
+	struct u64_stats_sync syncp;
+	u64_stats_t descs;
+	u64_stats_t packets;
+	u64_stats_t bytes;
+	u64_stats_t drops;
+	u64_stats_t kicks;
+};
+
+struct eea_rx_ctx_stats {
+	u64 descs;
+	u64 packets;
+	u64 bytes;
+	u64 drops;
+	u64 split_hdr_bytes;
+	u64 split_hdr_packets;
+
+	u64 length_errors;
+};
+
+struct eea_rx_stats {
+	struct u64_stats_sync syncp;
+	u64_stats_t descs;
+	u64_stats_t packets;
+	u64_stats_t bytes;
+	u64_stats_t drops;
+	u64_stats_t kicks;
+	u64_stats_t split_hdr_bytes;
+	u64_stats_t split_hdr_packets;
+
+	u64_stats_t length_errors;
+};
+
+void eea_update_rx_stats(struct eea_rx_stats *rx_stats,
+			 struct eea_rx_ctx_stats *stats);
+
+extern const struct ethtool_ops eea_ethtool_ops;
+
+#endif
diff --git a/drivers/net/ethernet/alibaba/eea/eea_net.c b/drivers/net/ethernet/alibaba/eea/eea_net.c
index 1dc93722b44b..bb147a0866bc 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_net.c
+++ b/drivers/net/ethernet/alibaba/eea/eea_net.c
@@ -583,6 +583,7 @@ static struct eea_net *eea_netdev_alloc(struct eea_device *edev, u32 pairs)
 	}
 
 	netdev->netdev_ops = &eea_netdev;
+	netdev->ethtool_ops = &eea_ethtool_ops;
 	SET_NETDEV_DEV(netdev, edev->dma_dev);
 
 	enet = netdev_priv(netdev);
diff --git a/drivers/net/ethernet/alibaba/eea/eea_net.h b/drivers/net/ethernet/alibaba/eea/eea_net.h
index a3a7cc304327..037585410ad1 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_net.h
+++ b/drivers/net/ethernet/alibaba/eea/eea_net.h
@@ -12,6 +12,7 @@
 #include <linux/netdevice.h>
 
 #include "eea_adminq.h"
+#include "eea_ethtool.h"
 #include "eea_ring.h"
 
 #define EEA_VER_MAJOR		1
@@ -33,6 +34,8 @@ struct eea_net_tx {
 	u32 index;
 
 	char name[16];
+
+	struct eea_tx_stats stats;
 };
 
 struct eea_rx_meta {
@@ -84,6 +87,8 @@ struct eea_net_rx {
 
 	struct napi_struct *napi;
 
+	struct eea_rx_stats stats;
+
 	char name[16];
 
 	struct eea_net_rx_pkt_ctx pkt;
diff --git a/drivers/net/ethernet/alibaba/eea/eea_rx.c b/drivers/net/ethernet/alibaba/eea/eea_rx.c
index a395bac1a881..55abea92de58 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_rx.c
+++ b/drivers/net/ethernet/alibaba/eea/eea_rx.c
@@ -30,6 +30,8 @@ struct eea_rx_ctx {
 	u32 frame_sz;
 
 	struct eea_rx_meta *meta;
+
+	struct eea_rx_ctx_stats stats;
 };
 
 static struct eea_rx_meta *eea_rx_meta_get(struct eea_net_rx *rx)
@@ -199,6 +201,7 @@ static int eea_harden_check_overflow(struct eea_rx_ctx *ctx,
 	if (unlikely(ctx->len > max_len)) {
 		pr_debug("%s: rx error: len %u exceeds truesize %u\n",
 			 enet->netdev->name, ctx->len, max_len);
+		++ctx->stats.length_errors;
 		return -EINVAL;
 	}
 
@@ -215,6 +218,7 @@ static int eea_harden_check_size(struct eea_rx_ctx *ctx, struct eea_net *enet)
 
 	if (unlikely(ctx->hdr_len + ctx->len < ETH_HLEN)) {
 		pr_debug("%s: short packet %u\n", enet->netdev->name, ctx->len);
+		++ctx->stats.length_errors;
 		return -EINVAL;
 	}
 
@@ -356,6 +360,7 @@ static int process_remain_buf(struct eea_net_rx *rx, struct eea_rx_ctx *ctx)
 
 err:
 	dev_kfree_skb(rx->pkt.head_skb);
+	++ctx->stats.drops;
 	rx->pkt.do_drop = true;
 	rx->pkt.head_skb = NULL;
 	return 0;
@@ -384,6 +389,7 @@ static int process_first_buf(struct eea_net_rx *rx, struct eea_rx_ctx *ctx)
 	return 0;
 
 err:
+	++ctx->stats.drops;
 	rx->pkt.do_drop = true;
 	return 0;
 }
@@ -418,6 +424,8 @@ static void eea_rx_desc_to_ctx(struct eea_net_rx *rx,
 	if (ctx->flags & EEA_DESC_F_SPLIT_HDR) {
 		ctx->hdr_len = le16_to_cpu(desc->len_ex) &
 			EEA_RX_CDESC_HDR_LEN_MASK;
+		ctx->stats.split_hdr_bytes += ctx->hdr_len;
+		++ctx->stats.split_hdr_packets;
 	}
 
 	ctx->more = ctx->flags & EEA_RING_DESC_F_MORE;
@@ -446,6 +454,8 @@ static int eea_cleanrx(struct eea_net_rx *rx, int budget,
 
 		eea_rx_meta_dma_sync_for_cpu(rx, meta, ctx->len);
 
+		ctx->stats.bytes += ctx->len;
+
 		if (!rx->pkt.idx)
 			process_first_buf(rx, ctx);
 		else
@@ -463,17 +473,20 @@ static int eea_cleanrx(struct eea_net_rx *rx, int budget,
 skip:
 		eea_rx_meta_put(rx, meta);
 		ering_cq_ack_desc(rx->ering, 1);
+		++ctx->stats.descs;
 
 		if (!ctx->more)
 			memset(&rx->pkt, 0, sizeof(rx->pkt));
 	}
 
+	ctx->stats.packets = packets;
+
 	return packets;
 }
 
 static bool eea_rx_post(struct eea_net *enet, struct eea_net_rx *rx)
 {
-	u32 tailroom, headroom, room, len;
+	u32 tailroom, headroom, room, flags, len;
 	struct eea_rx_meta *meta;
 	struct eea_rx_desc *desc;
 	int err = 0, num = 0;
@@ -513,9 +526,14 @@ static bool eea_rx_post(struct eea_net *enet, struct eea_net_rx *rx)
 		++num;
 	}
 
-	if (num)
+	if (num) {
 		ering_kick(rx->ering);
 
+		flags = u64_stats_update_begin_irqsave(&rx->stats.syncp);
+		u64_stats_inc(&rx->stats.kicks);
+		u64_stats_update_end_irqrestore(&rx->stats.syncp, flags);
+	}
+
 	/* true means busy, napi should be called again. */
 	return !!err;
 }
@@ -537,6 +555,8 @@ static int eea_poll(struct napi_struct *napi, int budget)
 	if (rx->ering->num_free > budget)
 		busy |= eea_rx_post(enet, rx);
 
+	eea_update_rx_stats(&rx->stats, &ctx.stats);
+
 	busy |= received >= budget;
 
 	if (busy)
@@ -667,6 +687,8 @@ struct eea_net_rx *eea_alloc_rx(struct eea_net_init_ctx *ctx, u32 idx)
 	rx->index = idx;
 	sprintf(rx->name, "rx.%u", idx);
 
+	u64_stats_init(&rx->stats.syncp);
+
 	/* ering */
 	ering = ering_alloc(idx * 2, ctx->cfg.rx_ring_depth, ctx->edev,
 			    ctx->cfg.rx_sq_desc_size,
diff --git a/drivers/net/ethernet/alibaba/eea/eea_tx.c b/drivers/net/ethernet/alibaba/eea/eea_tx.c
index 2cdaff9645f8..7c1fc499fe7f 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_tx.c
+++ b/drivers/net/ethernet/alibaba/eea/eea_tx.c
@@ -115,6 +115,13 @@ static u32 eea_clean_tx(struct eea_net_tx *tx, int budget)
 		ering_cq_ack_desc(tx->ering, desc_n);
 	}
 
+	if (stats.packets) {
+		u64_stats_update_begin(&tx->stats.syncp);
+		u64_stats_add(&tx->stats.bytes, stats.bytes);
+		u64_stats_add(&tx->stats.packets, stats.packets);
+		u64_stats_update_end(&tx->stats.syncp);
+	}
+
 	return stats.packets;
 }
 
@@ -248,6 +255,10 @@ static int eea_tx_post_skb(struct eea_net_tx *tx, struct sk_buff *skb)
 	meta->num = shinfo->nr_frags + 1;
 	ering_sq_commit_desc(tx->ering);
 
+	u64_stats_update_begin(&tx->stats.syncp);
+	u64_stats_add(&tx->stats.descs, meta->num);
+	u64_stats_update_end(&tx->stats.syncp);
+
 	return 0;
 
 err_cancel:
@@ -260,6 +271,10 @@ static int eea_tx_post_skb(struct eea_net_tx *tx, struct sk_buff *skb)
 static void eea_tx_kick(struct eea_net_tx *tx)
 {
 	ering_kick(tx->ering);
+
+	u64_stats_update_begin(&tx->stats.syncp);
+	u64_stats_inc(&tx->stats.kicks);
+	u64_stats_update_end(&tx->stats.syncp);
 }
 
 netdev_tx_t eea_tx_xmit(struct sk_buff *skb, struct net_device *netdev)
@@ -275,8 +290,13 @@ netdev_tx_t eea_tx_xmit(struct sk_buff *skb, struct net_device *netdev)
 	skb_tx_timestamp(skb);
 
 	err = eea_tx_post_skb(tx, skb);
-	if (unlikely(err))
+	if (unlikely(err)) {
+		u64_stats_update_begin(&tx->stats.syncp);
+		u64_stats_inc(&tx->stats.drops);
+		u64_stats_update_end(&tx->stats.syncp);
+
 		dev_kfree_skb_any(skb);
+	}
 
 	/* NETDEV_TX_BUSY is expensive. So stop advancing the TX queue. */
 	n = MAX_SKB_FRAGS + 1;
@@ -346,6 +366,8 @@ int eea_alloc_tx(struct eea_net_init_ctx *ctx, struct eea_net_tx *tx, u32 idx)
 	struct eea_ring *ering;
 	u32 i;
 
+	u64_stats_init(&tx->stats.syncp);
+
 	sprintf(tx->name, "tx.%u", idx);
 
 	ering = ering_alloc(idx * 2 + 1, ctx->cfg.tx_ring_depth, ctx->edev,
-- 
2.32.0.3.g01195cf9f


^ permalink raw reply related

* [PATCH net-next v26 6/8] eea: implement packet transmit logic
From: Xuan Zhuo @ 2026-02-08  8:46 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Xuan Zhuo, Wen Gu, Philo Lu, Lorenzo Bianconi,
	Vadim Fedorenko, Dong Yibo, Heiner Kallweit, Lukas Bulwahn,
	Dust Li
In-Reply-To: <20260208084613.2658-1-xuanzhuo@linux.alibaba.com>

Implement the core logic for transmitting packets in the EEA TX path,
including packet preparation and submission to the underlying transport.

Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Reviewed-by: Philo Lu <lulie@linux.alibaba.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 drivers/net/ethernet/alibaba/eea/eea_tx.c | 270 +++++++++++++++++++++-
 1 file changed, 268 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/alibaba/eea/eea_tx.c b/drivers/net/ethernet/alibaba/eea/eea_tx.c
index 1475fca44b6e..2cdaff9645f8 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_tx.c
+++ b/drivers/net/ethernet/alibaba/eea/eea_tx.c
@@ -11,6 +11,11 @@
 #include "eea_pci.h"
 #include "eea_ring.h"
 
+struct eea_sq_free_stats {
+	u64 packets;
+	u64 bytes;
+};
+
 struct eea_tx_meta {
 	struct eea_tx_meta *next;
 
@@ -28,20 +33,281 @@ struct eea_tx_meta {
 	u16 dma_len;
 };
 
+static struct eea_tx_meta *eea_tx_meta_get(struct eea_net_tx *tx)
+{
+	struct eea_tx_meta *meta;
+
+	if (!tx->free)
+		return NULL;
+
+	meta = tx->free;
+	tx->free = meta->next;
+
+	return meta;
+}
+
+static void eea_tx_meta_put_and_unmap(struct eea_net_tx *tx,
+				      struct eea_tx_meta *meta)
+{
+	struct eea_tx_meta *head;
+
+	head = meta;
+
+	while (true) {
+		dma_unmap_single(tx->dma_dev, meta->dma_addr,
+				 meta->dma_len, DMA_TO_DEVICE);
+
+		if (meta->next) {
+			meta = meta->next;
+			continue;
+		}
+
+		break;
+	}
+
+	meta->next = tx->free;
+	tx->free = head;
+}
+
+static void eea_meta_free_xmit(struct eea_net_tx *tx,
+			       struct eea_tx_meta *meta,
+			       int budget,
+			       struct eea_tx_cdesc *desc,
+			       struct eea_sq_free_stats *stats)
+{
+	struct sk_buff *skb = meta->skb;
+
+	if (unlikely((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) && desc)) {
+		struct skb_shared_hwtstamps ts = {};
+
+		ts.hwtstamp = EEA_DESC_TS(desc) + tx->enet->hw_ts_offset;
+		skb_tstamp_tx(skb, &ts);
+	}
+
+	++stats->packets;
+	stats->bytes += meta->skb->len;
+	napi_consume_skb(meta->skb, budget);
+
+	meta->data = NULL;
+}
+
+static u32 eea_clean_tx(struct eea_net_tx *tx, int budget)
+{
+	struct eea_sq_free_stats stats = {0};
+	struct eea_tx_cdesc *desc;
+	struct eea_tx_meta *meta;
+	int desc_n;
+
+	while ((desc = ering_cq_get_desc(tx->ering))) {
+		meta = &tx->meta[le16_to_cpu(desc->id)];
+
+		if (meta->data) {
+			eea_tx_meta_put_and_unmap(tx, meta);
+			eea_meta_free_xmit(tx, meta, budget, desc, &stats);
+			desc_n = meta->num;
+		} else {
+			netdev_err(tx->enet->netdev,
+				   "tx meta->data is null. id %d num: %d\n",
+				   meta->id, meta->num);
+			desc_n = 1;
+		}
+
+		ering_cq_ack_desc(tx->ering, desc_n);
+	}
+
+	return stats.packets;
+}
+
 int eea_poll_tx(struct eea_net_tx *tx, int budget)
 {
-	/* Empty function; will be implemented in a subsequent commit. */
+	struct eea_net *enet = tx->enet;
+	u32 index = tx - enet->tx;
+	struct netdev_queue *txq;
+	u32 cleaned;
+
+	txq = netdev_get_tx_queue(enet->netdev, index);
+
+	__netif_tx_lock(txq, smp_processor_id());
+
+	cleaned = eea_clean_tx(tx, budget);
+
+	if (netif_tx_queue_stopped(txq) && cleaned > 0)
+		netif_tx_wake_queue(txq);
+
+	__netif_tx_unlock(txq);
+
+	return 0;
+}
+
+static int eea_fill_desc_from_skb(const struct sk_buff *skb,
+				  struct eea_ring *ering,
+				  struct eea_tx_desc *desc)
+{
+	if (skb_is_gso(skb)) {
+		struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+		desc->gso_size = cpu_to_le16(sinfo->gso_size);
+		if (sinfo->gso_type & SKB_GSO_TCPV4)
+			desc->gso_type = EEA_TX_GSO_TCPV4;
+
+		else if (sinfo->gso_type & SKB_GSO_TCPV6)
+			desc->gso_type = EEA_TX_GSO_TCPV6;
+
+		else if (sinfo->gso_type & SKB_GSO_UDP_L4)
+			desc->gso_type = EEA_TX_GSO_UDP_L4;
+
+		else
+			return -EINVAL;
+
+		if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+			desc->gso_type |= EEA_TX_GSO_ECN;
+	} else {
+		desc->gso_type = EEA_TX_GSO_NONE;
+	}
+
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		desc->csum_start = cpu_to_le16(skb_checksum_start_offset(skb));
+		desc->csum_offset = cpu_to_le16(skb->csum_offset);
+	}
+
+	return 0;
+}
+
+static struct eea_tx_meta *eea_tx_desc_fill(struct eea_net_tx *tx,
+					    dma_addr_t addr, u32 len,
+					    bool is_last, void *data, u16 flags)
+{
+	struct eea_tx_meta *meta;
+	struct eea_tx_desc *desc;
+
+	meta = eea_tx_meta_get(tx);
+
+	desc = ering_sq_alloc_desc(tx->ering, meta->id, is_last, flags);
+	desc->addr = cpu_to_le64(addr);
+	desc->len = cpu_to_le16(len);
+
+	meta->next     = NULL;
+	meta->dma_len  = len;
+	meta->dma_addr = addr;
+	meta->data     = data;
+	meta->num      = 1;
+	meta->desc     = desc;
+
+	return meta;
+}
+
+static int eea_tx_add_skb_frag(struct eea_net_tx *tx,
+			       struct eea_tx_meta *head_meta,
+			       const skb_frag_t *frag, bool is_last)
+{
+	u32 len = skb_frag_size(frag);
+	struct eea_tx_meta *meta;
+	dma_addr_t addr;
+
+	addr = skb_frag_dma_map(tx->dma_dev, frag, 0, len, DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(tx->dma_dev, addr)))
+		return -ENOMEM;
+
+	meta = eea_tx_desc_fill(tx, addr, len, is_last, NULL, 0);
+
+	meta->next = head_meta->next;
+	head_meta->next = meta;
+
 	return 0;
 }
 
+static int eea_tx_post_skb(struct eea_net_tx *tx, struct sk_buff *skb)
+{
+	const struct skb_shared_info *shinfo = skb_shinfo(skb);
+	u32 hlen = skb_headlen(skb);
+	struct eea_tx_meta *meta;
+	dma_addr_t addr;
+	int i, err;
+	u16 flags;
+
+	addr = dma_map_single(tx->dma_dev, skb->data, hlen, DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(tx->dma_dev, addr)))
+		return -ENOMEM;
+
+	flags = skb->ip_summed == CHECKSUM_PARTIAL ? EEA_DESC_F_DO_CSUM : 0;
+
+	meta = eea_tx_desc_fill(tx, addr, hlen, !shinfo->nr_frags, skb, flags);
+
+	if (eea_fill_desc_from_skb(skb, tx->ering, meta->desc))
+		goto err_cancel;
+
+	for (i = 0; i < shinfo->nr_frags; i++) {
+		const skb_frag_t *frag = &shinfo->frags[i];
+		bool is_last = i == (shinfo->nr_frags - 1);
+
+		err = eea_tx_add_skb_frag(tx, meta, frag, is_last);
+		if (err)
+			goto err_cancel;
+	}
+
+	meta->num = shinfo->nr_frags + 1;
+	ering_sq_commit_desc(tx->ering);
+
+	return 0;
+
+err_cancel:
+	ering_sq_cancel(tx->ering);
+	eea_tx_meta_put_and_unmap(tx, meta);
+	meta->data = NULL;
+	return -ENOMEM;
+}
+
+static void eea_tx_kick(struct eea_net_tx *tx)
+{
+	ering_kick(tx->ering);
+}
+
 netdev_tx_t eea_tx_xmit(struct sk_buff *skb, struct net_device *netdev)
 {
-	/* Empty function; will be implemented in a subsequent commit. */
+	struct eea_net *enet = netdev_priv(netdev);
+	int qnum = skb_get_queue_mapping(skb);
+	struct eea_net_tx *tx = &enet->tx[qnum];
+	struct netdev_queue *txq;
+	int err, n;
+
+	txq = netdev_get_tx_queue(netdev, qnum);
+
+	skb_tx_timestamp(skb);
+
+	err = eea_tx_post_skb(tx, skb);
+	if (unlikely(err))
+		dev_kfree_skb_any(skb);
+
+	/* NETDEV_TX_BUSY is expensive. So stop advancing the TX queue. */
+	n = MAX_SKB_FRAGS + 1;
+	netif_txq_maybe_stop(txq, tx->ering->num_free, n, n);
+
+	if (!netdev_xmit_more() || netif_xmit_stopped(txq))
+		eea_tx_kick(tx);
+
 	return NETDEV_TX_OK;
 }
 
 static void eea_free_meta(struct eea_net_tx *tx, struct eea_net_cfg *cfg)
 {
+	struct eea_sq_free_stats stats = {0};
+	struct eea_tx_meta *meta;
+	int i;
+
+	while ((meta = eea_tx_meta_get(tx)))
+		meta->skb = NULL;
+
+	for (i = 0; i < cfg->tx_ring_depth; i++) {
+		meta = &tx->meta[i];
+
+		if (!meta->skb)
+			continue;
+
+		eea_tx_meta_put_and_unmap(tx, meta);
+
+		eea_meta_free_xmit(tx, meta, 0, NULL, &stats);
+	}
+
 	kvfree(tx->meta);
 	tx->meta = NULL;
 }
-- 
2.32.0.3.g01195cf9f


^ permalink raw reply related

* [PATCH net-next v26 5/8] eea: implement packet receive logic
From: Xuan Zhuo @ 2026-02-08  8:46 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Xuan Zhuo, Wen Gu, Philo Lu, Lorenzo Bianconi,
	Vadim Fedorenko, Dong Yibo, Heiner Kallweit, Lukas Bulwahn,
	Dust Li
In-Reply-To: <20260208084613.2658-1-xuanzhuo@linux.alibaba.com>

Implement the core logic for receiving packets in the EEA RX path,
including packet buffering and basic validation.

Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Reviewed-by: Philo Lu <lulie@linux.alibaba.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 drivers/net/ethernet/alibaba/eea/eea_net.h |   1 +
 drivers/net/ethernet/alibaba/eea/eea_rx.c  | 458 ++++++++++++++++++++-
 2 files changed, 457 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/alibaba/eea/eea_net.h b/drivers/net/ethernet/alibaba/eea/eea_net.h
index 6f9c89c180de..a3a7cc304327 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_net.h
+++ b/drivers/net/ethernet/alibaba/eea/eea_net.h
@@ -166,6 +166,7 @@ void eea_init_ctx(struct eea_net *enet, struct eea_net_init_ctx *ctx);
 int eea_queues_check_and_reset(struct eea_device *edev);
 
 /* rx apis */
+
 void enet_rx_stop(struct eea_net_rx *rx);
 void enet_rx_start(struct eea_net_rx *rx);
 
diff --git a/drivers/net/ethernet/alibaba/eea/eea_rx.c b/drivers/net/ethernet/alibaba/eea/eea_rx.c
index 7c9bb513191b..a395bac1a881 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_rx.c
+++ b/drivers/net/ethernet/alibaba/eea/eea_rx.c
@@ -16,6 +16,41 @@
 
 #define EEA_PAGE_FRAGS_NUM 1024
 
+#define EEA_RX_BUF_ALIGN 128
+
+struct eea_rx_ctx {
+	void *buf;
+
+	u32 len;
+	u32 hdr_len;
+
+	u16 flags;
+	bool more;
+
+	u32 frame_sz;
+
+	struct eea_rx_meta *meta;
+};
+
+static struct eea_rx_meta *eea_rx_meta_get(struct eea_net_rx *rx)
+{
+	struct eea_rx_meta *meta;
+
+	if (!rx->free)
+		return NULL;
+
+	meta = rx->free;
+	rx->free = meta->next;
+
+	return meta;
+}
+
+static void eea_rx_meta_put(struct eea_net_rx *rx, struct eea_rx_meta *meta)
+{
+	meta->next = rx->free;
+	rx->free = meta;
+}
+
 static void eea_free_rx_buffer(struct eea_net_rx *rx, struct eea_rx_meta *meta)
 {
 	u32 drain_count;
@@ -28,6 +63,63 @@ static void eea_free_rx_buffer(struct eea_net_rx *rx, struct eea_rx_meta *meta)
 	meta->page = NULL;
 }
 
+static void meta_align_offset(struct eea_net_rx *rx, struct eea_rx_meta *meta)
+{
+	int h, b;
+
+	h = rx->headroom;
+	b = meta->offset + h;
+
+	/* For better performance, we align the buffer address to
+	 * EEA_RX_BUF_ALIGN, as required by the device design.
+	 */
+	b = ALIGN(b, EEA_RX_BUF_ALIGN);
+
+	meta->offset = b - h;
+}
+
+static int eea_alloc_rx_buffer(struct eea_net_rx *rx, struct eea_rx_meta *meta)
+{
+	struct page *page;
+
+	if (meta->page)
+		return 0;
+
+	page = page_pool_dev_alloc_pages(rx->pp);
+	if (!page)
+		return -ENOMEM;
+
+	page_pool_fragment_page(page, EEA_PAGE_FRAGS_NUM);
+
+	meta->page = page;
+	meta->dma = page_pool_get_dma_addr(page);
+	meta->offset = 0;
+	meta->frags = 0;
+
+	meta_align_offset(rx, meta);
+
+	return 0;
+}
+
+static void eea_consume_rx_buffer(struct eea_net_rx *rx,
+				  struct eea_rx_meta *meta,
+				  u32 consumed)
+{
+	int min;
+
+	meta->offset += consumed;
+	++meta->frags;
+
+	min = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	min += rx->headroom;
+	min += ETH_DATA_LEN;
+
+	meta_align_offset(rx, meta);
+
+	if (min + meta->offset > PAGE_SIZE)
+		eea_free_rx_buffer(rx, meta);
+}
+
 static void eea_free_rx_hdr(struct eea_net_rx *rx, struct eea_net_cfg *cfg)
 {
 	struct eea_rx_meta *meta;
@@ -88,12 +180,374 @@ static int eea_alloc_rx_hdr(struct eea_net_init_ctx *ctx, struct eea_net_rx *rx)
 	return 0;
 }
 
-static int eea_poll(struct napi_struct *napi, int budget)
+static void eea_rx_meta_dma_sync_for_cpu(struct eea_net_rx *rx,
+					 struct eea_rx_meta *meta, u32 len)
+{
+	dma_sync_single_for_cpu(rx->enet->edev->dma_dev,
+				meta->dma + meta->offset + meta->headroom,
+				len, DMA_FROM_DEVICE);
+}
+
+static int eea_harden_check_overflow(struct eea_rx_ctx *ctx,
+				     struct eea_net *enet)
 {
-	/* Empty function; will be implemented in a subsequent commit. */
+	u32 max_len;
+
+	max_len = ctx->meta->truesize - ctx->meta->headroom -
+		ctx->meta->tailroom;
+
+	if (unlikely(ctx->len > max_len)) {
+		pr_debug("%s: rx error: len %u exceeds truesize %u\n",
+			 enet->netdev->name, ctx->len, max_len);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int eea_harden_check_size(struct eea_rx_ctx *ctx, struct eea_net *enet)
+{
+	int err;
+
+	err = eea_harden_check_overflow(ctx, enet);
+	if (err)
+		return err;
+
+	if (unlikely(ctx->hdr_len + ctx->len < ETH_HLEN)) {
+		pr_debug("%s: short packet %u\n", enet->netdev->name, ctx->len);
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
+static struct sk_buff *eea_build_skb(void *buf, u32 buflen, u32 headroom,
+				     u32 len)
+{
+	struct sk_buff *skb;
+
+	skb = build_skb(buf, buflen);
+	if (unlikely(!skb))
+		return NULL;
+
+	skb_reserve(skb, headroom);
+	skb_put(skb, len);
+
+	return skb;
+}
+
+static struct sk_buff *eea_rx_build_split_hdr_skb(struct eea_net_rx *rx,
+						  struct eea_rx_ctx *ctx)
+{
+	struct eea_rx_meta *meta = ctx->meta;
+	struct sk_buff *skb;
+	u32 truesize;
+
+	dma_sync_single_for_cpu(rx->enet->edev->dma_dev, meta->hdr_dma,
+				ctx->hdr_len, DMA_FROM_DEVICE);
+
+	skb = napi_alloc_skb(rx->napi, ctx->hdr_len);
+	if (unlikely(!skb))
+		return NULL;
+
+	truesize = meta->headroom + ctx->len;
+
+	skb_put_data(skb, ctx->meta->hdr_addr, ctx->hdr_len);
+
+	if (ctx->len) {
+		skb_add_rx_frag(skb, 0, meta->page,
+				meta->offset + meta->headroom,
+				ctx->len, truesize);
+
+		eea_consume_rx_buffer(rx, meta, truesize);
+	}
+
+	skb_mark_for_recycle(skb);
+
+	return skb;
+}
+
+static struct sk_buff *eea_rx_build_skb(struct eea_net_rx *rx,
+					struct eea_rx_ctx *ctx)
+{
+	struct eea_rx_meta *meta = ctx->meta;
+	u32 len, shinfo_size, truesize;
+	struct sk_buff *skb;
+	struct page *page;
+	void *buf, *pkt;
+
+	page = meta->page;
+	if (!page)
+		return NULL;
+
+	shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	buf = page_address(page) + meta->offset;
+	pkt = buf + meta->headroom;
+	len = ctx->len;
+	truesize = meta->headroom + ctx->len + shinfo_size;
+
+	skb = eea_build_skb(buf, truesize, pkt - buf, len);
+	if (unlikely(!skb))
+		return NULL;
+
+	eea_consume_rx_buffer(rx, meta, truesize);
+	skb_mark_for_recycle(skb);
+
+	return skb;
+}
+
+static int eea_skb_append_buf(struct eea_net_rx *rx, struct eea_rx_ctx *ctx)
+{
+	struct sk_buff *curr_skb = rx->pkt.curr_skb;
+	struct sk_buff *head_skb = rx->pkt.head_skb;
+	int num_skb_frags;
+	int offset;
+
+	if (!curr_skb)
+		curr_skb = head_skb;
+
+	num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
+	if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
+		struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
+
+		if (unlikely(!nskb))
+			return -ENOMEM;
+
+		if (curr_skb == head_skb)
+			skb_shinfo(curr_skb)->frag_list = nskb;
+		else
+			curr_skb->next = nskb;
+
+		curr_skb = nskb;
+		head_skb->truesize += nskb->truesize;
+		num_skb_frags = 0;
+
+		rx->pkt.curr_skb = curr_skb;
+	}
+
+	if (curr_skb != head_skb) {
+		head_skb->data_len += ctx->len;
+		head_skb->len += ctx->len;
+		head_skb->truesize += ctx->meta->truesize;
+	}
+
+	offset = ctx->meta->offset + ctx->meta->headroom;
+
+	skb_add_rx_frag(curr_skb, num_skb_frags, ctx->meta->page,
+			offset, ctx->len, ctx->meta->truesize);
+
+	eea_consume_rx_buffer(rx, ctx->meta, ctx->meta->headroom + ctx->len);
+
+	return 0;
+}
+
+static int process_remain_buf(struct eea_net_rx *rx, struct eea_rx_ctx *ctx)
+{
+	struct eea_net *enet = rx->enet;
+
+	if (eea_harden_check_overflow(ctx, enet))
+		goto err;
+
+	if (eea_skb_append_buf(rx, ctx))
+		goto err;
+
+	return 0;
+
+err:
+	dev_kfree_skb(rx->pkt.head_skb);
+	rx->pkt.do_drop = true;
+	rx->pkt.head_skb = NULL;
+	return 0;
+}
+
+static int process_first_buf(struct eea_net_rx *rx, struct eea_rx_ctx *ctx)
+{
+	struct eea_net *enet = rx->enet;
+	struct sk_buff *skb = NULL;
+
+	if (eea_harden_check_size(ctx, enet))
+		goto err;
+
+	rx->pkt.data_valid = ctx->flags & EEA_DESC_F_DATA_VALID;
+
+	if (ctx->hdr_len)
+		skb = eea_rx_build_split_hdr_skb(rx, ctx);
+	else
+		skb = eea_rx_build_skb(rx, ctx);
+
+	if (unlikely(!skb))
+		goto err;
+
+	rx->pkt.head_skb = skb;
+
+	return 0;
+
+err:
+	rx->pkt.do_drop = true;
+	return 0;
+}
+
+static void eea_submit_skb(struct eea_net_rx *rx, struct sk_buff *skb,
+			   struct eea_rx_cdesc *desc)
+{
+	struct eea_net *enet = rx->enet;
+
+	if (rx->pkt.data_valid)
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	if (enet->cfg.ts_cfg.rx_filter == HWTSTAMP_FILTER_ALL)
+		skb_hwtstamps(skb)->hwtstamp = EEA_DESC_TS(desc) +
+			enet->hw_ts_offset;
+
+	skb_record_rx_queue(skb, rx->index);
+	skb->protocol = eth_type_trans(skb, enet->netdev);
+
+	napi_gro_receive(rx->napi, skb);
+}
+
+static void eea_rx_desc_to_ctx(struct eea_net_rx *rx,
+			       struct eea_rx_ctx *ctx,
+			       struct eea_rx_cdesc *desc)
+{
+	ctx->meta = &rx->meta[le16_to_cpu(desc->id)];
+	ctx->len = le16_to_cpu(desc->len);
+	ctx->flags = le16_to_cpu(desc->flags);
+
+	ctx->hdr_len = 0;
+	if (ctx->flags & EEA_DESC_F_SPLIT_HDR) {
+		ctx->hdr_len = le16_to_cpu(desc->len_ex) &
+			EEA_RX_CDESC_HDR_LEN_MASK;
+	}
+
+	ctx->more = ctx->flags & EEA_RING_DESC_F_MORE;
+}
+
+static int eea_cleanrx(struct eea_net_rx *rx, int budget,
+		       struct eea_rx_ctx *ctx)
+{
+	struct eea_rx_cdesc *desc;
+	struct eea_rx_meta *meta;
+	int packets;
+
+	for (packets = 0; packets < budget; ) {
+		desc = ering_cq_get_desc(rx->ering);
+		if (!desc)
+			break;
+
+		eea_rx_desc_to_ctx(rx, ctx, desc);
+
+		meta = ctx->meta;
+		ctx->buf = page_address(meta->page) + meta->offset +
+			meta->headroom;
+
+		if (unlikely(rx->pkt.do_drop))
+			goto skip;
+
+		eea_rx_meta_dma_sync_for_cpu(rx, meta, ctx->len);
+
+		if (!rx->pkt.idx)
+			process_first_buf(rx, ctx);
+		else
+			process_remain_buf(rx, ctx);
+
+		++rx->pkt.idx;
+
+		if (!ctx->more) {
+			if (likely(rx->pkt.head_skb))
+				eea_submit_skb(rx, rx->pkt.head_skb, desc);
+
+			++packets;
+		}
+
+skip:
+		eea_rx_meta_put(rx, meta);
+		ering_cq_ack_desc(rx->ering, 1);
+
+		if (!ctx->more)
+			memset(&rx->pkt, 0, sizeof(rx->pkt));
+	}
+
+	return packets;
+}
+
+static bool eea_rx_post(struct eea_net *enet, struct eea_net_rx *rx)
+{
+	u32 tailroom, headroom, room, len;
+	struct eea_rx_meta *meta;
+	struct eea_rx_desc *desc;
+	int err = 0, num = 0;
+	dma_addr_t addr;
+
+	tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	headroom = rx->headroom;
+	room = headroom + tailroom;
+
+	while (true) {
+		meta = eea_rx_meta_get(rx);
+		if (!meta)
+			break;
+
+		err = eea_alloc_rx_buffer(rx, meta);
+		if (err) {
+			eea_rx_meta_put(rx, meta);
+			break;
+		}
+
+		len = PAGE_SIZE - meta->offset - room;
+		addr = meta->dma + meta->offset + headroom;
+
+		desc = ering_sq_alloc_desc(rx->ering, meta->id, true, 0);
+		desc->addr = cpu_to_le64(addr);
+		desc->len = cpu_to_le16(len);
+
+		if (meta->hdr_addr)
+			desc->hdr_addr = cpu_to_le64(meta->hdr_dma);
+
+		ering_sq_commit_desc(rx->ering);
+
+		meta->truesize = len + room;
+		meta->headroom = headroom;
+		meta->tailroom = tailroom;
+		meta->len = len;
+		++num;
+	}
+
+	if (num)
+		ering_kick(rx->ering);
+
+	/* true means busy, napi should be called again. */
+	return !!err;
+}
+
+static int eea_poll(struct napi_struct *napi, int budget)
+{
+	struct eea_irq_blk *blk = container_of(napi, struct eea_irq_blk, napi);
+	struct eea_net_rx *rx = blk->rx;
+	struct eea_net_tx *tx = &rx->enet->tx[rx->index];
+	struct eea_net *enet = rx->enet;
+	struct eea_rx_ctx ctx = {};
+	bool busy = false;
+	u32 received;
+
+	eea_poll_tx(tx, budget);
+
+	received = eea_cleanrx(rx, budget, &ctx);
+
+	if (rx->ering->num_free > budget)
+		busy |= eea_rx_post(enet, rx);
+
+	busy |= received >= budget;
+
+	if (busy)
+		return budget;
+
+	if (napi_complete_done(napi, received))
+		ering_irq_active(rx->ering, tx->ering);
+
+	return received;
+}
+
 static void eea_free_rx_buffers(struct eea_net_rx *rx, struct eea_net_cfg *cfg)
 {
 	struct eea_rx_meta *meta;
-- 
2.32.0.3.g01195cf9f


^ permalink raw reply related

* [PATCH net-next v26 4/8] eea: create/destroy rx,tx queues for netdevice open and stop
From: Xuan Zhuo @ 2026-02-08  8:46 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Xuan Zhuo, Wen Gu, Philo Lu, Lorenzo Bianconi,
	Vadim Fedorenko, Dong Yibo, Heiner Kallweit, Lukas Bulwahn,
	Dust Li
In-Reply-To: <20260208084613.2658-1-xuanzhuo@linux.alibaba.com>

Add basic driver framework for the Alibaba Elastic Ethernet Adapter (EEA).

This commit implements the netdevice open and stop operations, creating
the RX/TX queues on open and destroying them on stop.

Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Reviewed-by: Philo Lu <lulie@linux.alibaba.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 drivers/net/ethernet/alibaba/eea/Makefile  |   4 +-
 drivers/net/ethernet/alibaba/eea/eea_net.c | 531 ++++++++++++++++++++-
 drivers/net/ethernet/alibaba/eea/eea_net.h |  47 ++
 drivers/net/ethernet/alibaba/eea/eea_pci.c | 182 ++++++-
 drivers/net/ethernet/alibaba/eea/eea_pci.h |  13 +
 drivers/net/ethernet/alibaba/eea/eea_rx.c  | 254 ++++++++++
 drivers/net/ethernet/alibaba/eea/eea_tx.c  | 114 +++++
 7 files changed, 1140 insertions(+), 5 deletions(-)
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_rx.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_tx.c

diff --git a/drivers/net/ethernet/alibaba/eea/Makefile b/drivers/net/ethernet/alibaba/eea/Makefile
index 91f318e8e046..fa34a005fa01 100644
--- a/drivers/net/ethernet/alibaba/eea/Makefile
+++ b/drivers/net/ethernet/alibaba/eea/Makefile
@@ -3,4 +3,6 @@ obj-$(CONFIG_EEA) += eea.o
 eea-y := eea_ring.o \
 	eea_net.o \
 	eea_pci.o \
-	eea_adminq.o
+	eea_adminq.o \
+	eea_tx.o \
+	eea_rx.o
diff --git a/drivers/net/ethernet/alibaba/eea/eea_net.c b/drivers/net/ethernet/alibaba/eea/eea_net.c
index 31cb9ca5b408..1dc93722b44b 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_net.c
+++ b/drivers/net/ethernet/alibaba/eea/eea_net.c
@@ -18,6 +18,451 @@
 
 #define EEA_SPLIT_HDR_SIZE 128
 
+static irqreturn_t eea_irq_handler(int irq, void *data)
+{
+	struct eea_irq_blk *blk = data;
+
+	napi_schedule_irqoff(&blk->napi);
+
+	return IRQ_HANDLED;
+}
+
+static void eea_free_irq_blk(struct eea_net *enet)
+{
+	struct eea_irq_blk *blk;
+	u32 num;
+	int i;
+
+	if (!enet->irq_blks)
+		return;
+
+	num = enet->edev->rx_num;
+
+	for (i = 0; i < num; i++) {
+		blk = &enet->irq_blks[i];
+
+		if (blk->ready)
+			eea_pci_free_irq(blk);
+
+		blk->ready = false;
+	}
+
+	kvfree(enet->irq_blks);
+	enet->irq_blks = NULL;
+}
+
+static int eea_alloc_irq_blks(struct eea_net *enet, u32 num)
+{
+	struct eea_device *edev = enet->edev;
+	struct eea_irq_blk *blk, *irq_blks;
+	int i, err;
+
+	irq_blks = kvcalloc(num, sizeof(*blk), GFP_KERNEL);
+	if (!irq_blks)
+		return -ENOMEM;
+
+	for (i = 0; i < num; i++) {
+		blk = &irq_blks[i];
+		blk->idx = i;
+
+		/* vec 0 is for error notify. */
+		blk->msix_vec = i + 1;
+
+		err = eea_pci_request_irq(edev, blk, eea_irq_handler);
+		if (err)
+			goto err_free_irq_blk;
+
+		blk->ready = true;
+	}
+
+	enet->irq_blks = irq_blks;
+	return 0;
+
+err_free_irq_blk:
+	eea_free_irq_blk(enet);
+	return err;
+}
+
+static int eea_update_queues(struct eea_net *enet)
+{
+	return netif_set_real_num_queues(enet->netdev, enet->cfg.tx_ring_num,
+					 enet->cfg.rx_ring_num);
+}
+
+void eea_init_ctx(struct eea_net *enet, struct eea_net_init_ctx *ctx)
+{
+	memset(ctx, 0, sizeof(*ctx));
+
+	ctx->netdev = enet->netdev;
+	ctx->edev = enet->edev;
+	ctx->cfg = enet->cfg;
+}
+
+static void eea_bind_q_and_cfg(struct eea_net *enet,
+			       struct eea_net_init_ctx *ctx)
+{
+	struct eea_irq_blk *blk;
+	struct eea_net_rx *rx;
+	struct eea_net_tx *tx;
+	int i;
+
+	enet->cfg = ctx->cfg;
+	enet->rx = ctx->rx;
+	enet->tx = ctx->tx;
+
+	for (i = 0; i < ctx->cfg.rx_ring_num; i++) {
+		blk = &enet->irq_blks[i];
+
+		rx = ctx->rx[i];
+		tx = &ctx->tx[i];
+
+		rx->enet = enet;
+		rx->napi = &blk->napi;
+		rx->ering->msix_vec = blk->msix_vec;
+
+		tx->enet = enet;
+		tx->ering->msix_vec = blk->msix_vec;
+
+		blk->rx = rx;
+	}
+}
+
+static void eea_unbind_q_and_cfg(struct eea_net *enet,
+				 struct eea_net_init_ctx *ctx)
+{
+	struct eea_irq_blk *blk;
+	struct eea_net_rx *rx;
+	int i;
+
+	ctx->cfg = enet->cfg;
+	ctx->rx = enet->rx;
+	ctx->tx = enet->tx;
+
+	enet->rx = NULL;
+	enet->tx = NULL;
+
+	for (i = 0; i < ctx->cfg.rx_ring_num; i++) {
+		blk = &enet->irq_blks[i];
+
+		rx = ctx->rx[i];
+
+		rx->napi = NULL;
+
+		blk->rx = NULL;
+	}
+}
+
+static void eea_free_rxtx_q_mem(struct eea_net_init_ctx *ctx)
+{
+	struct eea_net_rx *rx;
+	struct eea_net_tx *tx;
+	int i;
+
+	for (i = 0; i < ctx->cfg.rx_ring_num; i++) {
+		rx = ctx->rx[i];
+		tx = &ctx->tx[i];
+
+		eea_free_rx(rx, &ctx->cfg);
+		eea_free_tx(tx, &ctx->cfg);
+	}
+
+	kvfree(ctx->rx);
+	kvfree(ctx->tx);
+}
+
+/* alloc tx/rx: struct, ring, meta, pp, napi */
+static int eea_alloc_rxtx_q_mem(struct eea_net_init_ctx *ctx)
+{
+	struct eea_net_rx *rx;
+	struct eea_net_tx *tx;
+	int err, i;
+
+	ctx->tx = kvcalloc(ctx->cfg.tx_ring_num, sizeof(*ctx->tx), GFP_KERNEL);
+	if (!ctx->tx)
+		return -ENOMEM;
+
+	ctx->rx = kvcalloc(ctx->cfg.rx_ring_num, sizeof(*ctx->rx), GFP_KERNEL);
+	if (!ctx->rx)
+		goto err_free_tx;
+
+	ctx->cfg.rx_sq_desc_size = sizeof(struct eea_rx_desc);
+	ctx->cfg.rx_cq_desc_size = sizeof(struct eea_rx_cdesc);
+	ctx->cfg.tx_sq_desc_size = sizeof(struct eea_tx_desc);
+	ctx->cfg.tx_cq_desc_size = sizeof(struct eea_tx_cdesc);
+
+	ctx->cfg.tx_cq_desc_size /= 2;
+
+	if (!ctx->cfg.split_hdr)
+		ctx->cfg.rx_sq_desc_size /= 2;
+
+	for (i = 0; i < ctx->cfg.rx_ring_num; i++) {
+		rx = eea_alloc_rx(ctx, i);
+		if (!rx)
+			goto err_free;
+
+		ctx->rx[i] = rx;
+
+		tx = ctx->tx + i;
+		err = eea_alloc_tx(ctx, tx, i);
+		if (err)
+			goto err_free;
+	}
+
+	return 0;
+
+err_free:
+	for (i = 0; i < ctx->cfg.rx_ring_num; i++) {
+		rx = ctx->rx[i];
+		tx = ctx->tx + i;
+
+		eea_free_rx(rx, &ctx->cfg);
+		eea_free_tx(tx, &ctx->cfg);
+	}
+
+	kvfree(ctx->rx);
+
+err_free_tx:
+	kvfree(ctx->tx);
+	return -ENOMEM;
+}
+
+static int eea_hw_active_ring(struct eea_net *enet)
+{
+	return eea_adminq_create_q(enet, /* qidx = */ 0,
+				   enet->cfg.rx_ring_num +
+				   enet->cfg.tx_ring_num, 0);
+}
+
+static int eea_hw_unactive_ring(struct eea_net *enet)
+{
+	int err;
+
+	err = eea_adminq_destroy_all_q(enet);
+	if (err)
+		netdev_warn(enet->netdev, "unactive rxtx ring failed.\n");
+
+	return err;
+}
+
+/* stop rx napi, stop tx queue. */
+static void eea_stop_rxtx(struct net_device *netdev)
+{
+	struct eea_net *enet = netdev_priv(netdev);
+	int i;
+
+	netif_tx_disable(netdev);
+
+	for (i = 0; i < enet->cfg.rx_ring_num; i++)
+		enet_rx_stop(enet->rx[i]);
+
+	netif_carrier_off(netdev);
+}
+
+static void eea_start_rxtx(struct eea_net *enet)
+{
+	int i;
+
+	for (i = 0; i < enet->cfg.rx_ring_num; i++)
+		enet_rx_start(enet->rx[i]);
+
+	netif_tx_start_all_queues(enet->netdev);
+	netif_carrier_on(enet->netdev);
+
+	enet->started = true;
+}
+
+static int eea_netdev_stop(struct net_device *netdev)
+{
+	struct eea_net *enet = netdev_priv(netdev);
+	struct eea_net_init_ctx ctx;
+
+	/* This function can be called during device anomaly recovery. To
+	 * prevent duplicate stop operations, the `started` flag is introduced
+	 * for checking.
+	 */
+
+	if (!enet->started) {
+		netdev_warn(netdev, "eea netdev stop: but dev is not started.\n");
+		return 0;
+	}
+
+	eea_stop_rxtx(netdev);
+	eea_hw_unactive_ring(enet);
+	eea_unbind_q_and_cfg(enet, &ctx);
+	eea_free_rxtx_q_mem(&ctx);
+
+	enet->started = false;
+
+	return 0;
+}
+
+static int eea_netdev_open(struct net_device *netdev)
+{
+	struct eea_net *enet = netdev_priv(netdev);
+	struct eea_net_init_ctx ctx;
+	int err;
+
+	if (enet->link_err) {
+		netdev_err(netdev, "netdev open err, because link error: %d\n",
+			   enet->link_err);
+		return -EBUSY;
+	}
+
+	eea_init_ctx(enet, &ctx);
+
+	err = eea_alloc_rxtx_q_mem(&ctx);
+	if (err)
+		goto err_done;
+
+	eea_bind_q_and_cfg(enet, &ctx);
+
+	err = eea_update_queues(enet);
+	if (err)
+		goto err_free_q;
+
+	err = eea_hw_active_ring(enet);
+	if (err)
+		goto err_free_q;
+
+	eea_start_rxtx(enet);
+
+	return 0;
+
+err_free_q:
+	eea_unbind_q_and_cfg(enet, &ctx);
+	eea_free_rxtx_q_mem(&ctx);
+
+err_done:
+	return err;
+}
+
+/* resources: ring, buffers, irq */
+int eea_reset_hw_resources(struct eea_net *enet, struct eea_net_init_ctx *ctx)
+{
+	struct eea_net_init_ctx ctx_old = {0};
+	int err;
+
+	if (!netif_running(enet->netdev) || !enet->started) {
+		enet->cfg = ctx->cfg;
+		return 0;
+	}
+
+	err = eea_alloc_rxtx_q_mem(ctx);
+	if (err) {
+		netdev_warn(enet->netdev,
+			    "eea reset: alloc q failed. stop reset. err %d\n",
+			    err);
+		return err;
+	}
+
+	eea_stop_rxtx(enet->netdev);
+	eea_hw_unactive_ring(enet);
+
+	eea_unbind_q_and_cfg(enet, &ctx_old);
+	eea_bind_q_and_cfg(enet, ctx);
+
+	err = eea_update_queues(enet);
+	if (err) {
+		netdev_err(enet->netdev,
+			   "eea reset: set real num queues failed. err %d\n",
+			   err);
+		goto err_bind_old;
+	}
+
+	err = eea_hw_active_ring(enet);
+	if (err) {
+		netdev_err(enet->netdev, "eea reset: active new ring. err %d\n",
+			   err);
+		eea_unbind_q_and_cfg(enet, ctx);
+		goto err_free_q;
+	}
+
+	eea_start_rxtx(enet);
+	eea_free_rxtx_q_mem(&ctx_old);
+	return 0;
+
+err_bind_old:
+	eea_unbind_q_and_cfg(enet, ctx);
+	eea_bind_q_and_cfg(enet, &ctx_old);
+	err = eea_hw_active_ring(enet);
+	if (err) {
+		netdev_err(enet->netdev, "eea reset: active old ring. err %d\n",
+			   err);
+		eea_unbind_q_and_cfg(enet, &ctx_old);
+		goto err_free_q;
+	}
+
+	eea_start_rxtx(enet);
+	eea_free_rxtx_q_mem(ctx);
+	return 0;
+
+err_free_q:
+
+	/* An exception occurred at the hardware level, and there's not much we
+	 * can do about it -- we can only release the resources first.
+	 */
+	eea_free_rxtx_q_mem(ctx);
+	eea_free_rxtx_q_mem(&ctx_old);
+	enet->started = false;
+	return err;
+}
+
+int eea_queues_check_and_reset(struct eea_device *edev)
+{
+	struct eea_aq_queue_status *qstatus;
+	struct eea_aq_dev_status *dstatus;
+	struct eea_aq_queue_status *qs;
+	struct eea_net_init_ctx ctx;
+	bool need_reset = false;
+	int num, i, err = 0;
+
+	rtnl_lock();
+
+	if (!netif_running(edev->enet->netdev))
+		goto err_unlock;
+
+	num = edev->enet->cfg.tx_ring_num * 2 + 1;
+
+	dstatus = eea_adminq_dev_status(edev->enet);
+	if (!dstatus) {
+		netdev_warn(edev->enet->netdev, "query queue status failed.\n");
+		err = -ENOMEM;
+		goto err_unlock;
+	}
+
+	if (le16_to_cpu(dstatus->link_status) == EEA_LINK_DOWN_STATUS) {
+		eea_netdev_stop(edev->enet->netdev);
+		edev->enet->link_err = EEA_LINK_ERR_LINK_DOWN;
+		netdev_warn(edev->enet->netdev, "device link is down. stop device.\n");
+		goto err_free;
+	}
+
+	qstatus = dstatus->q_status;
+
+	for (i = 0; i < num; ++i) {
+		qs = &qstatus[i];
+
+		if (le16_to_cpu(qs->status) == EEA_QUEUE_STATUS_NEED_RESET) {
+			netdev_warn(edev->enet->netdev,
+				    "queue status: queue %u needs to reset\n",
+				    le16_to_cpu(qs->qidx));
+			need_reset = true;
+		}
+	}
+
+	if (need_reset) {
+		eea_init_ctx(edev->enet, &ctx);
+		err = eea_reset_hw_resources(edev->enet, &ctx);
+	}
+
+err_free:
+	kfree(dstatus);
+
+err_unlock:
+	rtnl_unlock();
+	return err;
+}
+
 static void eea_update_cfg(struct eea_net *enet,
 			   struct eea_device *edev,
 			   struct eea_aq_cfg *hwcfg)
@@ -116,14 +561,19 @@ static int eea_netdev_init_features(struct net_device *netdev,
 }
 
 static const struct net_device_ops eea_netdev = {
+	.ndo_open           = eea_netdev_open,
+	.ndo_stop           = eea_netdev_stop,
+	.ndo_start_xmit     = eea_tx_xmit,
 	.ndo_validate_addr  = eth_validate_addr,
 	.ndo_features_check = passthru_features_check,
+	.ndo_tx_timeout     = eea_tx_timeout,
 };
 
 static struct eea_net *eea_netdev_alloc(struct eea_device *edev, u32 pairs)
 {
 	struct net_device *netdev;
 	struct eea_net *enet;
+	int err;
 
 	netdev = alloc_etherdev_mq(sizeof(struct eea_net), pairs);
 	if (!netdev) {
@@ -140,14 +590,65 @@ static struct eea_net *eea_netdev_alloc(struct eea_device *edev, u32 pairs)
 	enet->edev = edev;
 	edev->enet = enet;
 
+	err = eea_alloc_irq_blks(enet, pairs);
+	if (err) {
+		dev_err(edev->dma_dev,
+			"eea_alloc_irq_blks failed with pairs %d\n", pairs);
+		free_netdev(netdev);
+		return NULL;
+	}
+
 	return enet;
 }
 
+static void eea_update_ts_off(struct eea_device *edev, struct eea_net *enet)
+{
+	u64 ts;
+
+	ts = eea_pci_device_ts(edev);
+
+	enet->hw_ts_offset = ktime_get_real() - ts;
+}
+
+static int eea_net_reprobe(struct eea_device *edev)
+{
+	struct eea_net *enet = edev->enet;
+	int err = 0;
+
+	enet->edev = edev;
+
+	if (!enet->adminq.ring) {
+		err = eea_create_adminq(enet, edev->rx_num + edev->tx_num);
+		if (err)
+			return err;
+	}
+
+	err = eea_alloc_irq_blks(enet, edev->rx_num);
+	if (err) {
+		eea_destroy_adminq(enet);
+		return -ENOMEM;
+	}
+
+	eea_update_ts_off(edev, enet);
+
+	if (edev->ha_reset_netdev_running) {
+		rtnl_lock();
+		enet->link_err = 0;
+		err = eea_netdev_open(enet->netdev);
+		rtnl_unlock();
+	}
+
+	return err;
+}
+
 int eea_net_probe(struct eea_device *edev)
 {
 	struct eea_net *enet;
 	int err = -ENOMEM;
 
+	if (edev->ha_reset)
+		return eea_net_reprobe(edev);
+
 	enet = eea_netdev_alloc(edev, edev->rx_num);
 	if (!enet)
 		return -ENOMEM;
@@ -168,6 +669,7 @@ int eea_net_probe(struct eea_device *edev)
 	if (err)
 		goto err_reset_dev;
 
+	eea_update_ts_off(edev, enet);
 	netif_carrier_off(enet->netdev);
 
 	netdev_dbg(enet->netdev, "eea probe success.\n");
@@ -179,10 +681,31 @@ int eea_net_probe(struct eea_device *edev)
 	eea_destroy_adminq(enet);
 
 err_free_netdev:
+	eea_free_irq_blk(enet);
 	free_netdev(enet->netdev);
 	return err;
 }
 
+static void eea_net_ha_reset_remove(struct eea_net *enet,
+				    struct eea_device *edev,
+				    struct net_device *netdev)
+{
+	rtnl_lock();
+	edev->ha_reset_netdev_running = false;
+	if (netif_running(enet->netdev)) {
+		eea_netdev_stop(enet->netdev);
+		enet->link_err = EEA_LINK_ERR_HA_RESET_DEV;
+		edev->ha_reset_netdev_running = true;
+	}
+	rtnl_unlock();
+
+	eea_device_reset(edev);
+	eea_destroy_adminq(enet);
+	eea_free_irq_blk(enet);
+
+	enet->edev = NULL;
+}
+
 void eea_net_remove(struct eea_device *edev)
 {
 	struct net_device *netdev;
@@ -191,12 +714,16 @@ void eea_net_remove(struct eea_device *edev)
 	enet = edev->enet;
 	netdev = enet->netdev;
 
+	if (edev->ha_reset) {
+		eea_net_ha_reset_remove(enet, edev, netdev);
+		return;
+	}
+
 	unregister_netdev(netdev);
-	netdev_dbg(enet->netdev, "eea removed.\n");
 
 	eea_device_reset(edev);
-
 	eea_destroy_adminq(enet);
+	eea_free_irq_blk(enet);
 
 	free_netdev(netdev);
 }
diff --git a/drivers/net/ethernet/alibaba/eea/eea_net.h b/drivers/net/ethernet/alibaba/eea/eea_net.h
index ab487bc88af2..6f9c89c180de 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_net.h
+++ b/drivers/net/ethernet/alibaba/eea/eea_net.h
@@ -18,6 +18,8 @@
 #define EEA_VER_MINOR		0
 #define EEA_VER_SUB_MINOR	0
 
+struct eea_tx_meta;
+
 struct eea_net_tx {
 	struct eea_net *enet;
 
@@ -101,6 +103,18 @@ struct eea_net_cfg {
 	u8 tx_cq_desc_size;
 
 	u32 split_hdr;
+
+	struct hwtstamp_config ts_cfg;
+};
+
+struct eea_net_init_ctx {
+	struct eea_net_cfg cfg;
+
+	struct eea_net_tx *tx;
+	struct eea_net_rx **rx;
+
+	struct net_device *netdev;
+	struct eea_device *edev;
 };
 
 enum {
@@ -109,6 +123,17 @@ enum {
 	EEA_LINK_ERR_LINK_DOWN,
 };
 
+struct eea_irq_blk {
+	struct napi_struct napi;
+	u16 msix_vec;
+	bool ready;
+	struct eea_net_rx *rx;
+	char irq_name[32];
+	int irq;
+	int idx;
+
+};
+
 struct eea_net {
 	struct eea_device *edev;
 	struct net_device *netdev;
@@ -121,6 +146,8 @@ struct eea_net {
 	struct eea_net_cfg cfg;
 	struct eea_net_cfg cfg_hw;
 
+	struct eea_irq_blk *irq_blks;
+
 	u32 link_err;
 
 	bool started;
@@ -134,4 +161,24 @@ struct eea_net {
 int eea_net_probe(struct eea_device *edev);
 void eea_net_remove(struct eea_device *edev);
 
+int eea_reset_hw_resources(struct eea_net *enet, struct eea_net_init_ctx *ctx);
+void eea_init_ctx(struct eea_net *enet, struct eea_net_init_ctx *ctx);
+int eea_queues_check_and_reset(struct eea_device *edev);
+
+/* rx apis */
+void enet_rx_stop(struct eea_net_rx *rx);
+void enet_rx_start(struct eea_net_rx *rx);
+
+void eea_free_rx(struct eea_net_rx *rx, struct eea_net_cfg *cfg);
+struct eea_net_rx *eea_alloc_rx(struct eea_net_init_ctx *ctx, u32 idx);
+
+/* tx apis */
+int eea_poll_tx(struct eea_net_tx *tx, int budget);
+netdev_tx_t eea_tx_xmit(struct sk_buff *skb, struct net_device *netdev);
+
+void eea_tx_timeout(struct net_device *netdev, unsigned int txqueue);
+
+void eea_free_tx(struct eea_net_tx *tx, struct eea_net_cfg *cfg);
+int eea_alloc_tx(struct eea_net_init_ctx *ctx, struct eea_net_tx *tx, u32 idx);
+
 #endif
diff --git a/drivers/net/ethernet/alibaba/eea/eea_pci.c b/drivers/net/ethernet/alibaba/eea/eea_pci.c
index 97efac753cfb..e42b0298eebb 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_pci.c
+++ b/drivers/net/ethernet/alibaba/eea/eea_pci.c
@@ -13,6 +13,9 @@
 
 #define EEA_PCI_DB_OFFSET 4096
 
+#define EEA_PCI_CAP_RESET_DEVICE 0xFA
+#define EEA_PCI_CAP_RESET_FLAG BIT(1)
+
 struct eea_pci_cfg {
 	__le32 reserve0;
 	__le32 reserve1;
@@ -51,6 +54,7 @@ struct eea_pci_device {
 	void __iomem *reg;
 	void __iomem *db_base;
 
+	struct work_struct ha_handle_work;
 	char ha_irq_name[32];
 	u8 reset_pos;
 };
@@ -67,6 +71,11 @@ struct eea_pci_device {
 #define cfg_read32(reg, item) ioread32(cfg_pointer(reg, item))
 #define cfg_readq(reg, item) readq(cfg_pointer(reg, item))
 
+/* Due to circular references, we have to add forward declarations here. */
+static int __eea_pci_probe(struct pci_dev *pci_dev,
+			   struct eea_pci_device *ep_dev);
+static void __eea_pci_remove(struct pci_dev *pci_dev, bool flush_ha_work);
+
 const char *eea_pci_name(struct eea_device *edev)
 {
 	return pci_name(edev->ep_dev->pci_dev);
@@ -250,6 +259,150 @@ void eea_pci_active_aq(struct eea_ring *ering, int msix_vec)
 				    cfg_read32(ep_dev->reg, aq_db_off));
 }
 
+void eea_pci_free_irq(struct eea_irq_blk *blk)
+{
+	irq_update_affinity_hint(blk->irq, NULL);
+	free_irq(blk->irq, blk);
+}
+
+int eea_pci_request_irq(struct eea_device *edev, struct eea_irq_blk *blk,
+			irqreturn_t (*callback)(int irq, void *data))
+{
+	struct eea_pci_device *ep_dev = edev->ep_dev;
+	int irq;
+
+	snprintf(blk->irq_name, sizeof(blk->irq_name), "eea-q%d@%s", blk->idx,
+		 pci_name(ep_dev->pci_dev));
+
+	irq = pci_irq_vector(ep_dev->pci_dev, blk->msix_vec);
+
+	blk->irq = irq;
+
+	return request_irq(irq, callback, IRQF_NO_AUTOEN, blk->irq_name, blk);
+}
+
+static void eea_ha_handle_reset(struct eea_pci_device *ep_dev)
+{
+	struct eea_device *edev;
+	struct pci_dev *pci_dev;
+	u16 reset;
+	int err;
+
+	if (!ep_dev->reset_pos) {
+		eea_queues_check_and_reset(&ep_dev->edev);
+		return;
+	}
+
+	edev = &ep_dev->edev;
+
+	pci_read_config_word(ep_dev->pci_dev, ep_dev->reset_pos, &reset);
+
+	/* clear bit */
+	pci_write_config_word(ep_dev->pci_dev, ep_dev->reset_pos, 0xFFFF);
+
+	if (reset & EEA_PCI_CAP_RESET_FLAG) {
+		dev_warn(&ep_dev->pci_dev->dev, "recv device reset request.\n");
+
+		pci_dev = ep_dev->pci_dev;
+
+		/* The pci remove callback may hold this lock. If the
+		 * pci remove callback is called, then we can ignore the
+		 * ha interrupt.
+		 */
+		if (mutex_trylock(&edev->ha_lock)) {
+			edev->ha_reset = true;
+
+			__eea_pci_remove(pci_dev, false);
+			err = __eea_pci_probe(pci_dev, ep_dev);
+			if (err)
+				dev_err(&ep_dev->pci_dev->dev,
+					"ha: re-setup failed.\n");
+
+			edev->ha_reset = false;
+			mutex_unlock(&edev->ha_lock);
+		} else {
+			dev_warn(&ep_dev->pci_dev->dev,
+				 "ha device reset: trylock failed.\n");
+		}
+		return;
+	}
+
+	eea_queues_check_and_reset(&ep_dev->edev);
+}
+
+/* ha handle code */
+static void eea_ha_handle_work(struct work_struct *work)
+{
+	struct eea_pci_device *ep_dev;
+
+	ep_dev = container_of(work, struct eea_pci_device, ha_handle_work);
+
+	/* Ha interrupt is triggered, so there maybe some error, we may need to
+	 * reset the device or reset some queues.
+	 */
+	dev_warn(&ep_dev->pci_dev->dev, "recv ha interrupt.\n");
+
+	eea_ha_handle_reset(ep_dev);
+}
+
+static irqreturn_t eea_pci_ha_handle(int irq, void *data)
+{
+	struct eea_device *edev = data;
+
+	schedule_work(&edev->ep_dev->ha_handle_work);
+
+	return IRQ_HANDLED;
+}
+
+static void eea_pci_free_ha_irq(struct eea_device *edev)
+{
+	struct eea_pci_device *ep_dev = edev->ep_dev;
+	int irq = pci_irq_vector(ep_dev->pci_dev, 0);
+
+	free_irq(irq, edev);
+}
+
+static int eea_pci_ha_init(struct eea_device *edev, struct pci_dev *pci_dev)
+{
+	u8 pos, cfg_type_off, type, cfg_drv_off, cfg_dev_off;
+	struct eea_pci_device *ep_dev = edev->ep_dev;
+	int irq;
+
+	cfg_type_off = offsetof(struct eea_pci_cap, cfg_type);
+	cfg_drv_off = offsetof(struct eea_pci_reset_reg, driver);
+	cfg_dev_off = offsetof(struct eea_pci_reset_reg, device);
+
+	for (pos = pci_find_capability(pci_dev, PCI_CAP_ID_VNDR);
+	     pos > 0;
+	     pos = pci_find_next_capability(pci_dev, pos, PCI_CAP_ID_VNDR)) {
+		pci_read_config_byte(pci_dev, pos + cfg_type_off, &type);
+
+		if (type == EEA_PCI_CAP_RESET_DEVICE) {
+			/* notify device, driver support this feature. */
+			pci_write_config_word(pci_dev, pos + cfg_drv_off,
+					      EEA_PCI_CAP_RESET_FLAG);
+			pci_write_config_word(pci_dev, pos + cfg_dev_off,
+					      0xFFFF);
+
+			edev->ep_dev->reset_pos = pos + cfg_dev_off;
+			goto found;
+		}
+	}
+
+	dev_warn(&edev->ep_dev->pci_dev->dev, "Not Found reset cap.\n");
+
+found:
+	snprintf(ep_dev->ha_irq_name, sizeof(ep_dev->ha_irq_name), "eea-ha@%s",
+		 pci_name(ep_dev->pci_dev));
+
+	irq = pci_irq_vector(ep_dev->pci_dev, 0);
+
+	INIT_WORK(&ep_dev->ha_handle_work, eea_ha_handle_work);
+
+	return request_irq(irq, eea_pci_ha_handle, 0,
+			   ep_dev->ha_irq_name, edev);
+}
+
 u64 eea_pci_device_ts(struct eea_device *edev)
 {
 	struct eea_pci_device *ep_dev = edev->ep_dev;
@@ -284,10 +437,13 @@ static int eea_init_device(struct eea_device *edev)
 static int __eea_pci_probe(struct pci_dev *pci_dev,
 			   struct eea_pci_device *ep_dev)
 {
+	struct eea_device *edev;
 	int err;
 
 	pci_set_drvdata(pci_dev, ep_dev);
 
+	edev = &ep_dev->edev;
+
 	err = eea_pci_setup(pci_dev, ep_dev);
 	if (err)
 		return err;
@@ -296,19 +452,31 @@ static int __eea_pci_probe(struct pci_dev *pci_dev,
 	if (err)
 		goto err_pci_rel;
 
+	err = eea_pci_ha_init(edev, pci_dev);
+	if (err)
+		goto err_net_rm;
+
 	return 0;
 
+err_net_rm:
+	eea_net_remove(edev);
+
 err_pci_rel:
 	eea_pci_release_resource(ep_dev);
 	return err;
 }
 
-static void __eea_pci_remove(struct pci_dev *pci_dev)
+static void __eea_pci_remove(struct pci_dev *pci_dev, bool flush_ha_work)
 {
 	struct eea_pci_device *ep_dev = pci_get_drvdata(pci_dev);
 	struct device *dev = get_device(&ep_dev->pci_dev->dev);
 	struct eea_device *edev = &ep_dev->edev;
 
+	eea_pci_free_ha_irq(edev);
+
+	if (flush_ha_work)
+		flush_work(&ep_dev->ha_handle_work);
+
 	eea_net_remove(edev);
 
 	pci_disable_sriov(pci_dev);
@@ -336,8 +504,11 @@ static int eea_pci_probe(struct pci_dev *pci_dev,
 
 	ep_dev->pci_dev = pci_dev;
 
+	mutex_init(&edev->ha_lock);
+
 	err = __eea_pci_probe(pci_dev, ep_dev);
 	if (err) {
+		mutex_destroy(&edev->ha_lock);
 		pci_set_drvdata(pci_dev, NULL);
 		kfree(ep_dev);
 	}
@@ -348,10 +519,17 @@ static int eea_pci_probe(struct pci_dev *pci_dev,
 static void eea_pci_remove(struct pci_dev *pci_dev)
 {
 	struct eea_pci_device *ep_dev = pci_get_drvdata(pci_dev);
+	struct eea_device *edev;
+
+	edev = &ep_dev->edev;
 
-	__eea_pci_remove(pci_dev);
+	mutex_lock(&edev->ha_lock);
+	__eea_pci_remove(pci_dev, true);
+	mutex_unlock(&edev->ha_lock);
 
 	pci_set_drvdata(pci_dev, NULL);
+
+	mutex_destroy(&edev->ha_lock);
 	kfree(ep_dev);
 }
 
diff --git a/drivers/net/ethernet/alibaba/eea/eea_pci.h b/drivers/net/ethernet/alibaba/eea/eea_pci.h
index d240dc2dae9b..8dfb860165dc 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_pci.h
+++ b/drivers/net/ethernet/alibaba/eea/eea_pci.h
@@ -10,8 +10,11 @@
 
 #include <linux/pci.h>
 
+#include "eea_net.h"
 #include "eea_ring.h"
 
+struct eea_irq_blk;
+
 struct eea_pci_cap {
 	__u8 cap_vndr;
 	__u8 cap_next;
@@ -34,6 +37,12 @@ struct eea_device {
 
 	u64 features;
 
+	bool ha_reset;
+	bool ha_reset_netdev_running;
+
+	/* ha lock for the race between ha work and pci remove */
+	struct mutex ha_lock;
+
 	u32 rx_num;
 	u32 tx_num;
 	u32 db_blk_size;
@@ -47,6 +56,10 @@ int eea_device_reset(struct eea_device *dev);
 void eea_device_ready(struct eea_device *dev);
 void eea_pci_active_aq(struct eea_ring *ering, int msix_vec);
 
+int eea_pci_request_irq(struct eea_device *edev, struct eea_irq_blk *blk,
+			irqreturn_t (*callback)(int irq, void *data));
+void eea_pci_free_irq(struct eea_irq_blk *blk);
+
 u64 eea_pci_device_ts(struct eea_device *edev);
 
 void __iomem *eea_pci_db_addr(struct eea_device *edev, u32 off);
diff --git a/drivers/net/ethernet/alibaba/eea/eea_rx.c b/drivers/net/ethernet/alibaba/eea/eea_rx.c
new file mode 100644
index 000000000000..7c9bb513191b
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_rx.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#include <net/netdev_rx_queue.h>
+#include <net/page_pool/helpers.h>
+
+#include "eea_adminq.h"
+#include "eea_net.h"
+#include "eea_ring.h"
+
+#define EEA_ENABLE_F_NAPI        BIT(0)
+
+#define EEA_PAGE_FRAGS_NUM 1024
+
+static void eea_free_rx_buffer(struct eea_net_rx *rx, struct eea_rx_meta *meta)
+{
+	u32 drain_count;
+
+	drain_count = EEA_PAGE_FRAGS_NUM - meta->frags;
+
+	if (page_pool_unref_page(meta->page, drain_count) == 0)
+		page_pool_put_unrefed_page(rx->pp, meta->page, -1, true);
+
+	meta->page = NULL;
+}
+
+static void eea_free_rx_hdr(struct eea_net_rx *rx, struct eea_net_cfg *cfg)
+{
+	struct eea_rx_meta *meta;
+	int i;
+
+	for (i = 0; i < cfg->rx_ring_depth; ++i) {
+		meta = &rx->meta[i];
+		meta->hdr_addr = NULL;
+
+		if (!meta->hdr_page)
+			continue;
+
+		dma_unmap_page(rx->dma_dev, meta->hdr_dma, PAGE_SIZE,
+			       DMA_FROM_DEVICE);
+		put_page(meta->hdr_page);
+
+		meta->hdr_page = NULL;
+	}
+}
+
+static int eea_alloc_rx_hdr(struct eea_net_init_ctx *ctx, struct eea_net_rx *rx)
+{
+	struct page *hdr_page = NULL;
+	struct eea_rx_meta *meta;
+	u32 offset = 0, hdrsize;
+	struct device *dmadev;
+	dma_addr_t dma;
+	int i;
+
+	dmadev = ctx->edev->dma_dev;
+	hdrsize = ctx->cfg.split_hdr;
+
+	for (i = 0; i < ctx->cfg.rx_ring_depth; ++i) {
+		meta = &rx->meta[i];
+
+		if (!hdr_page || offset + hdrsize > PAGE_SIZE) {
+			hdr_page = dev_alloc_page();
+			if (!hdr_page)
+				return -ENOMEM;
+
+			dma = dma_map_page(dmadev, hdr_page, 0, PAGE_SIZE,
+					   DMA_FROM_DEVICE);
+
+			if (unlikely(dma_mapping_error(dmadev, dma))) {
+				put_page(hdr_page);
+				return -ENOMEM;
+			}
+
+			offset = 0;
+			meta->hdr_page = hdr_page;
+		}
+
+		meta->hdr_dma = dma + offset;
+		meta->hdr_addr = page_address(hdr_page) + offset;
+		offset += hdrsize;
+	}
+
+	return 0;
+}
+
+static int eea_poll(struct napi_struct *napi, int budget)
+{
+	/* Empty function; will be implemented in a subsequent commit. */
+	return 0;
+}
+
+static void eea_free_rx_buffers(struct eea_net_rx *rx, struct eea_net_cfg *cfg)
+{
+	struct eea_rx_meta *meta;
+	u32 i;
+
+	for (i = 0; i < cfg->rx_ring_depth; ++i) {
+		meta = &rx->meta[i];
+		if (!meta->page)
+			continue;
+
+		eea_free_rx_buffer(rx, meta);
+	}
+}
+
+static struct page_pool *eea_create_pp(struct eea_net_rx *rx,
+				       struct eea_net_init_ctx *ctx, u32 idx)
+{
+	struct page_pool_params pp_params = {0};
+
+	pp_params.order     = 0;
+	pp_params.flags     = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
+	pp_params.pool_size = ctx->cfg.rx_ring_depth;
+	pp_params.nid       = dev_to_node(ctx->edev->dma_dev);
+	pp_params.dev       = ctx->edev->dma_dev;
+	pp_params.napi      = rx->napi;
+	pp_params.netdev    = ctx->netdev;
+	pp_params.dma_dir   = DMA_FROM_DEVICE;
+	pp_params.max_len   = PAGE_SIZE;
+
+	return page_pool_create(&pp_params);
+}
+
+static void eea_destroy_page_pool(struct eea_net_rx *rx)
+{
+	if (rx->pp)
+		page_pool_destroy(rx->pp);
+}
+
+void enet_rx_stop(struct eea_net_rx *rx)
+{
+	if (rx->flags & EEA_ENABLE_F_NAPI) {
+		rx->flags &= ~EEA_ENABLE_F_NAPI;
+		disable_irq(rx->enet->irq_blks[rx->index].irq);
+		napi_disable(rx->napi);
+		netif_napi_del(rx->napi);
+	}
+}
+
+void enet_rx_start(struct eea_net_rx *rx)
+{
+	netif_napi_add(rx->enet->netdev, rx->napi, eea_poll);
+
+	napi_enable(rx->napi);
+
+	rx->flags |= EEA_ENABLE_F_NAPI;
+
+	local_bh_disable();
+	napi_schedule(rx->napi);
+	local_bh_enable();
+
+	enable_irq(rx->enet->irq_blks[rx->index].irq);
+}
+
+/* May be called before enet_bind_new_q_and_cfg(), so the cfg must be
+ * passed explicitly.
+ */
+void eea_free_rx(struct eea_net_rx *rx, struct eea_net_cfg *cfg)
+{
+	if (!rx)
+		return;
+
+	if (rx->ering) {
+		ering_free(rx->ering);
+		rx->ering = NULL;
+	}
+
+	if (rx->meta) {
+		eea_free_rx_buffers(rx, cfg);
+		eea_free_rx_hdr(rx, cfg);
+		kvfree(rx->meta);
+		rx->meta = NULL;
+	}
+
+	if (rx->pp) {
+		eea_destroy_page_pool(rx);
+		rx->pp = NULL;
+	}
+
+	kfree(rx);
+}
+
+static void eea_rx_meta_init(struct eea_net_rx *rx, u32 num)
+{
+	struct eea_rx_meta *meta;
+	int i;
+
+	rx->free = NULL;
+
+	for (i = 0; i < num; ++i) {
+		meta = &rx->meta[i];
+		meta->id = i;
+		meta->next = rx->free;
+		rx->free = meta;
+	}
+}
+
+struct eea_net_rx *eea_alloc_rx(struct eea_net_init_ctx *ctx, u32 idx)
+{
+	struct eea_ring *ering;
+	struct eea_net_rx *rx;
+	int err;
+
+	rx = kzalloc(sizeof(*rx), GFP_KERNEL);
+	if (!rx)
+		return rx;
+
+	rx->index = idx;
+	sprintf(rx->name, "rx.%u", idx);
+
+	/* ering */
+	ering = ering_alloc(idx * 2, ctx->cfg.rx_ring_depth, ctx->edev,
+			    ctx->cfg.rx_sq_desc_size,
+			    ctx->cfg.rx_cq_desc_size,
+			    rx->name);
+	if (!ering)
+		goto err_free_rx;
+
+	rx->ering = ering;
+
+	rx->dma_dev = ctx->edev->dma_dev;
+
+	/* meta */
+	rx->meta = kvcalloc(ctx->cfg.rx_ring_depth,
+			    sizeof(*rx->meta), GFP_KERNEL);
+	if (!rx->meta)
+		goto err_free_rx;
+
+	eea_rx_meta_init(rx, ctx->cfg.rx_ring_depth);
+
+	if (ctx->cfg.split_hdr) {
+		err = eea_alloc_rx_hdr(ctx, rx);
+		if (err)
+			goto err_free_rx;
+	}
+
+	rx->pp = eea_create_pp(rx, ctx, idx);
+	if (IS_ERR(rx->pp)) {
+		err = PTR_ERR(rx->pp);
+		rx->pp = NULL;
+		goto err_free_rx;
+	}
+
+	return rx;
+
+err_free_rx:
+	eea_free_rx(rx, &ctx->cfg);
+	return NULL;
+}
diff --git a/drivers/net/ethernet/alibaba/eea/eea_tx.c b/drivers/net/ethernet/alibaba/eea/eea_tx.c
new file mode 100644
index 000000000000..1475fca44b6e
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_tx.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#include <net/netdev_queues.h>
+
+#include "eea_net.h"
+#include "eea_pci.h"
+#include "eea_ring.h"
+
+struct eea_tx_meta {
+	struct eea_tx_meta *next;
+
+	u32 id;
+
+	union {
+		struct sk_buff *skb;
+		void *data;
+	};
+
+	u32 num;
+
+	dma_addr_t dma_addr;
+	struct eea_tx_desc *desc;
+	u16 dma_len;
+};
+
+int eea_poll_tx(struct eea_net_tx *tx, int budget)
+{
+	/* Empty function; will be implemented in a subsequent commit. */
+	return 0;
+}
+
+netdev_tx_t eea_tx_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+	/* Empty function; will be implemented in a subsequent commit. */
+	return NETDEV_TX_OK;
+}
+
+static void eea_free_meta(struct eea_net_tx *tx, struct eea_net_cfg *cfg)
+{
+	kvfree(tx->meta);
+	tx->meta = NULL;
+}
+
+void eea_tx_timeout(struct net_device *netdev, unsigned int txqueue)
+{
+	struct netdev_queue *txq = netdev_get_tx_queue(netdev, txqueue);
+	struct eea_net *priv = netdev_priv(netdev);
+	struct eea_net_tx *tx = &priv->tx[txqueue];
+
+	netdev_err(netdev, "TX timeout on queue: %u, tx: %s, ering: 0x%x, %u usecs ago\n",
+		   txqueue, tx->name, tx->ering->index,
+		   jiffies_to_usecs(jiffies - READ_ONCE(txq->trans_start)));
+}
+
+/* May be called before enet_bind_new_q_and_cfg(), so the cfg must be
+ * passed explicitly.
+ */
+void eea_free_tx(struct eea_net_tx *tx, struct eea_net_cfg *cfg)
+{
+	if (!tx)
+		return;
+
+	if (tx->ering) {
+		ering_free(tx->ering);
+		tx->ering = NULL;
+	}
+
+	if (tx->meta)
+		eea_free_meta(tx, cfg);
+}
+
+int eea_alloc_tx(struct eea_net_init_ctx *ctx, struct eea_net_tx *tx, u32 idx)
+{
+	struct eea_tx_meta *meta;
+	struct eea_ring *ering;
+	u32 i;
+
+	sprintf(tx->name, "tx.%u", idx);
+
+	ering = ering_alloc(idx * 2 + 1, ctx->cfg.tx_ring_depth, ctx->edev,
+			    ctx->cfg.tx_sq_desc_size,
+			    ctx->cfg.tx_cq_desc_size,
+			    tx->name);
+	if (!ering)
+		goto err_free_tx;
+
+	tx->ering = ering;
+	tx->index = idx;
+	tx->dma_dev = ctx->edev->dma_dev;
+
+	/* meta */
+	tx->meta = kvcalloc(ctx->cfg.tx_ring_depth,
+			    sizeof(*tx->meta), GFP_KERNEL);
+	if (!tx->meta)
+		goto err_free_tx;
+
+	for (i = 0; i < ctx->cfg.tx_ring_depth; ++i) {
+		meta = &tx->meta[i];
+		meta->id = i;
+		meta->next = tx->free;
+		tx->free = meta;
+	}
+
+	return 0;
+
+err_free_tx:
+	eea_free_tx(tx, &ctx->cfg);
+	return -ENOMEM;
+}
-- 
2.32.0.3.g01195cf9f


^ permalink raw reply related

* [PATCH net-next v26 3/8] eea: probe the netdevice and create adminq
From: Xuan Zhuo @ 2026-02-08  8:46 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Xuan Zhuo, Wen Gu, Philo Lu, Lorenzo Bianconi,
	Vadim Fedorenko, Dong Yibo, Heiner Kallweit, Lukas Bulwahn,
	Dust Li
In-Reply-To: <20260208084613.2658-1-xuanzhuo@linux.alibaba.com>

Add basic driver framework for the Alibaba Elastic Ethernet Adapter (EEA).

This commit creates and registers the netdevice after PCI probe,
and initializes the admin queue to send commands to the device.

Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Reviewed-by: Philo Lu <lulie@linux.alibaba.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 drivers/net/ethernet/alibaba/eea/Makefile     |   6 +-
 drivers/net/ethernet/alibaba/eea/eea_adminq.c | 442 ++++++++++++++++++
 drivers/net/ethernet/alibaba/eea/eea_adminq.h |  75 +++
 drivers/net/ethernet/alibaba/eea/eea_net.c    | 202 ++++++++
 drivers/net/ethernet/alibaba/eea/eea_net.h    | 137 ++++++
 drivers/net/ethernet/alibaba/eea/eea_pci.c    |  24 +-
 drivers/net/ethernet/alibaba/eea/eea_pci.h    |   3 +
 7 files changed, 886 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_adminq.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_adminq.h
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_net.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_net.h

diff --git a/drivers/net/ethernet/alibaba/eea/Makefile b/drivers/net/ethernet/alibaba/eea/Makefile
index e5e4007810a6..91f318e8e046 100644
--- a/drivers/net/ethernet/alibaba/eea/Makefile
+++ b/drivers/net/ethernet/alibaba/eea/Makefile
@@ -1,4 +1,6 @@
 
 obj-$(CONFIG_EEA) += eea.o
-eea-y :=  eea_ring.o \
-	eea_pci.o
+eea-y := eea_ring.o \
+	eea_net.o \
+	eea_pci.o \
+	eea_adminq.o
diff --git a/drivers/net/ethernet/alibaba/eea/eea_adminq.c b/drivers/net/ethernet/alibaba/eea/eea_adminq.c
new file mode 100644
index 000000000000..24c2896f0b9e
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_adminq.c
@@ -0,0 +1,442 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/iopoll.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
+
+#include "eea_adminq.h"
+#include "eea_net.h"
+#include "eea_pci.h"
+#include "eea_ring.h"
+
+#define EEA_AQ_CMD_CFG_QUERY         ((0 << 8) | 0)
+
+#define EEA_AQ_CMD_QUEUE_CREATE      ((1 << 8) | 0)
+#define EEA_AQ_CMD_QUEUE_DESTROY_ALL ((1 << 8) | 1)
+
+#define EEA_AQ_CMD_HOST_INFO         ((2 << 8) | 0)
+
+#define EEA_AQ_CMD_DEV_STATUS        ((3 << 8) | 0)
+
+#define EEA_RING_DESC_F_AQ_PHASE     (BIT(15) | BIT(7))
+
+#define EEA_QUEUE_FLAGS_HW_SPLIT_HDR BIT(0)
+#define EEA_QUEUE_FLAGS_SQCQ         BIT(1)
+#define EEA_QUEUE_FLAGS_HWTS         BIT(2)
+
+struct eea_aq_create {
+	__le32 flags;
+	/* queue index.
+	 * rx: 0 == qidx % 2
+	 * tx: 1 == qidx % 2
+	 */
+	__le16 qidx;
+	/* the depth of the queue */
+	__le16 depth;
+	/*  0: without SPLIT HDR
+	 *  1: 128B
+	 *  2: 256B
+	 *  3: 512B
+	 */
+	u8 hdr_buf_size;
+	u8 sq_desc_size;
+	u8 cq_desc_size;
+	u8 reserve0;
+	/* The vector for the irq. rx,tx share the same vector */
+	__le16 msix_vector;
+	__le16 reserve;
+	/* sq ring cfg. */
+	__le32 sq_addr_low;
+	__le32 sq_addr_high;
+	/* cq ring cfg. Just valid when flags include EEA_QUEUE_FLAGS_SQCQ. */
+	__le32 cq_addr_low;
+	__le32 cq_addr_high;
+};
+
+struct eea_aq_queue_drv_status {
+	__le16 qidx;
+
+	__le16 sq_head;
+	__le16 cq_head;
+	__le16 reserved;
+};
+
+#define EEA_OS_DISTRO		0
+#define EEA_DRV_TYPE		0
+#define EEA_OS_LINUX		1
+#define EEA_SPEC_VER_MAJOR	1
+#define EEA_SPEC_VER_MINOR	0
+
+struct eea_aq_host_info_cfg {
+	__le16	os_type;
+	__le16	os_dist;
+	__le16	drv_type;
+
+	__le16	kern_ver_major;
+	__le16	kern_ver_minor;
+	__le16	kern_ver_sub_minor;
+
+	__le16	drv_ver_major;
+	__le16	drv_ver_minor;
+	__le16	drv_ver_sub_minor;
+
+	__le16	spec_ver_major;
+	__le16	spec_ver_minor;
+	__le16	pci_bdf;
+	__le32	pci_domain;
+
+	u8      os_ver_str[64];
+	u8      isa_str[64];
+};
+
+#define EEA_HINFO_MAX_REP_LEN	1024
+#define EEA_HINFO_REP_REJECT	2
+
+struct eea_aq_host_info_rep {
+	u8	op_code;
+	u8	has_reply;
+	u8	reply_str[EEA_HINFO_MAX_REP_LEN];
+};
+
+static struct eea_ring *qid_to_ering(struct eea_net *enet, u32 qid)
+{
+	struct eea_ring *ering;
+
+	if (qid % 2 == 0)
+		ering = enet->rx[qid / 2]->ering;
+	else
+		ering = enet->tx[qid / 2].ering;
+
+	return ering;
+}
+
+#define EEA_AQ_TIMEOUT_US (60 * 1000 * 1000)
+
+static int eea_adminq_submit(struct eea_net *enet, u16 cmd,
+			     dma_addr_t req_addr, dma_addr_t res_addr,
+			     u32 req_size, u32 res_size)
+{
+	struct eea_aq_cdesc *cdesc;
+	struct eea_aq_desc *desc;
+	int ret;
+
+	desc = ering_aq_alloc_desc(enet->adminq.ring);
+
+	desc->classid = cmd >> 8;
+	desc->command = cmd & 0xff;
+
+	desc->data_addr = cpu_to_le64(req_addr);
+	desc->data_len = cpu_to_le32(req_size);
+
+	desc->reply_addr = cpu_to_le64(res_addr);
+	desc->reply_len = cpu_to_le32(res_size);
+
+	/* for update flags */
+	wmb();
+
+	desc->flags = cpu_to_le16(enet->adminq.phase);
+
+	ering_sq_commit_desc(enet->adminq.ring);
+
+	ering_kick(enet->adminq.ring);
+
+	++enet->adminq.num;
+
+	if ((enet->adminq.num % enet->adminq.ring->num) == 0)
+		enet->adminq.phase ^= EEA_RING_DESC_F_AQ_PHASE;
+
+	ret = read_poll_timeout(ering_cq_get_desc, cdesc, cdesc, 0,
+				EEA_AQ_TIMEOUT_US, false, enet->adminq.ring);
+	if (ret)
+		return ret;
+
+	ret = le32_to_cpu(cdesc->status);
+
+	ering_cq_ack_desc(enet->adminq.ring, 1);
+
+	if (ret)
+		netdev_err(enet->netdev,
+			   "adminq exec failed. cmd: %d ret %d\n", cmd, ret);
+
+	return ret;
+}
+
+static int eea_adminq_exec(struct eea_net *enet, u16 cmd,
+			   void *req, u32 req_size, void *res, u32 res_size)
+{
+	dma_addr_t req_addr = 0, res_addr = 0;
+	struct device *dma;
+	int ret;
+
+	dma = enet->edev->dma_dev;
+
+	if (req) {
+		req_addr = dma_map_single(dma, req, req_size, DMA_TO_DEVICE);
+		if (unlikely(dma_mapping_error(dma, req_addr)))
+			return -ENOMEM;
+	}
+
+	if (res) {
+		res_addr = dma_map_single(dma, res, res_size, DMA_FROM_DEVICE);
+		if (unlikely(dma_mapping_error(dma, res_addr))) {
+			ret = -ENOMEM;
+			goto err_unmap_req;
+		}
+	}
+
+	ret = eea_adminq_submit(enet, cmd, req_addr, res_addr,
+				req_size, res_size);
+	if (res)
+		dma_unmap_single(dma, res_addr, res_size, DMA_FROM_DEVICE);
+
+err_unmap_req:
+	if (req)
+		dma_unmap_single(dma, req_addr, req_size, DMA_TO_DEVICE);
+
+	return ret;
+}
+
+void eea_destroy_adminq(struct eea_net *enet)
+{
+	struct eea_aq *aq;
+
+	aq = &enet->adminq;
+
+	if (aq->ring) {
+		ering_free(aq->ring);
+		aq->ring = NULL;
+		aq->phase = 0;
+	}
+
+	kfree(aq->q_req_buf);
+	kfree(aq->q_res_buf);
+
+	aq->q_req_buf = NULL;
+	aq->q_res_buf = NULL;
+}
+
+int eea_create_adminq(struct eea_net *enet, u32 qid)
+{
+	u32 db_size, q_size, num;
+	struct eea_ring *ering;
+	struct eea_aq *aq;
+
+	num = enet->edev->rx_num + enet->edev->tx_num;
+	aq = &enet->adminq;
+
+	ering = ering_alloc(qid, 64, enet->edev, sizeof(struct eea_aq_desc),
+			    sizeof(struct eea_aq_desc), "adminq");
+	if (!ering)
+		return -ENOMEM;
+
+	eea_pci_active_aq(ering, qid / 2 + 1);
+
+	aq->ring = ering;
+	aq->phase = BIT(7);
+	aq->num = 0;
+
+	q_size = sizeof(struct eea_aq_create) * num;
+	db_size = sizeof(int) * num;
+
+	aq->q_req_size = q_size;
+	aq->q_res_size = db_size;
+
+	aq->q_req_buf = kmalloc(q_size, GFP_KERNEL);
+	if (!aq->q_req_buf)
+		goto err;
+
+	aq->q_res_buf = kmalloc(db_size, GFP_KERNEL);
+	if (!aq->q_res_buf)
+		goto err;
+
+	/* set device ready to active adminq */
+	eea_device_ready(enet->edev);
+
+	return 0;
+
+err:
+	eea_destroy_adminq(enet);
+	return -ENOMEM;
+}
+
+int eea_adminq_query_cfg(struct eea_net *enet, struct eea_aq_cfg *cfg)
+{
+	return eea_adminq_exec(enet, EEA_AQ_CMD_CFG_QUERY, NULL, 0, cfg,
+			       sizeof(*cfg));
+}
+
+static void qcfg_fill(struct eea_aq_create *qcfg, struct eea_ring *ering,
+		      u32 flags)
+{
+	qcfg->flags = cpu_to_le32(flags);
+	qcfg->qidx = cpu_to_le16(ering->index);
+	qcfg->depth = cpu_to_le16(ering->num);
+
+	qcfg->hdr_buf_size = flags & EEA_QUEUE_FLAGS_HW_SPLIT_HDR ? 1 : 0;
+	qcfg->sq_desc_size = ering->sq.desc_size;
+	qcfg->cq_desc_size = ering->cq.desc_size;
+	qcfg->msix_vector = cpu_to_le16(ering->msix_vec);
+
+	qcfg->sq_addr_low = cpu_to_le32(ering->sq.dma_addr);
+	qcfg->sq_addr_high = cpu_to_le32(ering->sq.dma_addr >> 32);
+
+	qcfg->cq_addr_low = cpu_to_le32(ering->cq.dma_addr);
+	qcfg->cq_addr_high = cpu_to_le32(ering->cq.dma_addr >> 32);
+}
+
+int eea_adminq_create_q(struct eea_net *enet, u32 qidx, u32 num, u32 flags)
+{
+	int i, db_size, q_size, qid, err = -ENOMEM;
+	struct eea_net_cfg *cfg;
+	struct eea_ring *ering;
+	struct eea_aq *aq;
+
+	cfg = &enet->cfg;
+	aq = &enet->adminq;
+
+	if (cfg->split_hdr)
+		flags |= EEA_QUEUE_FLAGS_HW_SPLIT_HDR;
+
+	flags |= EEA_QUEUE_FLAGS_SQCQ;
+	flags |= EEA_QUEUE_FLAGS_HWTS;
+
+	db_size = sizeof(int) * num;
+	q_size = sizeof(struct eea_aq_create) * num;
+
+	qid = qidx;
+	for (i = 0; i < num; i++, qid++) {
+		ering = qid_to_ering(enet, qid);
+		qcfg_fill(aq->q_req_buf + i, ering, flags);
+	}
+
+	err = eea_adminq_exec(enet, EEA_AQ_CMD_QUEUE_CREATE,
+			      aq->q_req_buf, q_size, aq->q_res_buf, db_size);
+	if (err)
+		return err;
+
+	qid = qidx;
+	for (i = 0; i < num; i++, qid++) {
+		ering = qid_to_ering(enet, qid);
+		ering->db = eea_pci_db_addr(ering->edev,
+					    le32_to_cpu(aq->q_res_buf[i]));
+	}
+
+	return err;
+}
+
+int eea_adminq_destroy_all_q(struct eea_net *enet)
+{
+	return eea_adminq_exec(enet, EEA_AQ_CMD_QUEUE_DESTROY_ALL, NULL, 0,
+			       NULL, 0);
+}
+
+struct eea_aq_dev_status *eea_adminq_dev_status(struct eea_net *enet)
+{
+	struct eea_aq_queue_drv_status *drv_status;
+	struct eea_aq_dev_status *dev_status;
+	int err, i, io_num, size, q_num;
+	struct eea_ring *ering;
+	void *rep, *req;
+
+	q_num = enet->cfg.rx_ring_num + enet->cfg.tx_ring_num + 1;
+	io_num = enet->cfg.rx_ring_num + enet->cfg.tx_ring_num;
+
+	req = kcalloc(q_num, sizeof(struct eea_aq_queue_drv_status),
+		      GFP_KERNEL);
+	if (!req)
+		return NULL;
+
+	size = struct_size(dev_status, q_status, q_num);
+
+	rep = kmalloc(size, GFP_KERNEL);
+	if (!rep) {
+		kfree(req);
+		return NULL;
+	}
+
+	drv_status = req;
+	for (i = 0; i < io_num; ++i, ++drv_status) {
+		ering = qid_to_ering(enet, i);
+		drv_status->qidx = cpu_to_le16(i);
+		drv_status->cq_head = cpu_to_le16(ering->cq.head);
+		drv_status->sq_head = cpu_to_le16(ering->sq.head);
+	}
+
+	drv_status->qidx = cpu_to_le16(i);
+	drv_status->cq_head = cpu_to_le16(enet->adminq.ring->cq.head);
+	drv_status->sq_head = cpu_to_le16(enet->adminq.ring->sq.head);
+
+	err = eea_adminq_exec(enet, EEA_AQ_CMD_DEV_STATUS, req,
+			      q_num * sizeof(struct eea_aq_queue_drv_status),
+			      rep, size);
+	kfree(req);
+	if (err) {
+		kfree(rep);
+		return NULL;
+	}
+
+	return rep;
+}
+
+int eea_adminq_config_host_info(struct eea_net *enet)
+{
+	struct device *dev = enet->edev->dma_dev;
+	struct eea_aq_host_info_cfg *cfg;
+	struct eea_aq_host_info_rep *rep;
+	int rc = -ENOMEM;
+
+	cfg = kzalloc(sizeof(*cfg), GFP_KERNEL);
+	if (!cfg)
+		return rc;
+
+	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
+	if (!rep)
+		goto err_free_cfg;
+
+	cfg->os_type            = cpu_to_le16(EEA_OS_LINUX);
+	cfg->os_dist            = cpu_to_le16(EEA_OS_DISTRO);
+	cfg->drv_type           = cpu_to_le16(EEA_DRV_TYPE);
+
+	cfg->kern_ver_major     = cpu_to_le16(LINUX_VERSION_MAJOR);
+	cfg->kern_ver_minor     = cpu_to_le16(LINUX_VERSION_PATCHLEVEL);
+	cfg->kern_ver_sub_minor = cpu_to_le16(LINUX_VERSION_SUBLEVEL);
+
+	cfg->drv_ver_major      = cpu_to_le16(EEA_VER_MAJOR);
+	cfg->drv_ver_minor      = cpu_to_le16(EEA_VER_MINOR);
+	cfg->drv_ver_sub_minor  = cpu_to_le16(EEA_VER_SUB_MINOR);
+
+	cfg->spec_ver_major     = cpu_to_le16(EEA_SPEC_VER_MAJOR);
+	cfg->spec_ver_minor     = cpu_to_le16(EEA_SPEC_VER_MINOR);
+
+	cfg->pci_bdf            = cpu_to_le16(eea_pci_dev_id(enet->edev));
+	cfg->pci_domain         = cpu_to_le32(eea_pci_domain_nr(enet->edev));
+
+	strscpy(cfg->os_ver_str, utsname()->release, sizeof(cfg->os_ver_str));
+	strscpy(cfg->isa_str, utsname()->machine, sizeof(cfg->isa_str));
+
+	rc = eea_adminq_exec(enet, EEA_AQ_CMD_HOST_INFO,
+			     cfg, sizeof(*cfg), rep, sizeof(*rep));
+
+	if (!rc) {
+		if (rep->op_code == EEA_HINFO_REP_REJECT) {
+			dev_err(dev, "Device has refused the initialization due to provided host information\n");
+			rc = -ENODEV;
+		}
+		if (rep->has_reply) {
+			rep->reply_str[EEA_HINFO_MAX_REP_LEN - 1] = '\0';
+			dev_warn(dev, "Device replied: %s\n",
+				 rep->reply_str);
+		}
+	}
+
+	kfree(rep);
+err_free_cfg:
+	kfree(cfg);
+	return rc;
+}
diff --git a/drivers/net/ethernet/alibaba/eea/eea_adminq.h b/drivers/net/ethernet/alibaba/eea/eea_adminq.h
new file mode 100644
index 000000000000..bb0d97238537
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_adminq.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#include "eea_pci.h"
+
+#ifndef __EEA_ADMINQ_H__
+#define __EEA_ADMINQ_H__
+
+/* Device configuration reported via the admin queue; filled in by
+ * eea_adminq_query_cfg(). All multi-byte fields are little-endian.
+ */
+struct eea_aq_cfg {
+	__le32 rx_depth_max;
+	__le32 rx_depth_def;
+
+	__le32 tx_depth_max;
+	__le32 tx_depth_def;
+
+	__le32 max_tso_size;
+	__le32 max_tso_segs;
+
+	u8 mac[ETH_ALEN];
+	__le16 status;
+
+	__le16 mtu;
+	__le16 reserved0;
+	__le16 reserved1;
+	u8 reserved2;
+	u8 reserved3;
+
+	__le16 reserved4;
+	__le16 reserved5;
+	__le16 reserved6;
+};
+
+/* Per-queue status entry inside struct eea_aq_dev_status. */
+struct eea_aq_queue_status {
+	__le16 qidx;
+#define EEA_QUEUE_STATUS_OK 0
+#define EEA_QUEUE_STATUS_NEED_RESET 1
+	__le16 status;
+};
+
+/* Device status: link state plus a trailing per-queue status array. */
+struct eea_aq_dev_status {
+#define EEA_LINK_DOWN_STATUS  0
+#define EEA_LINK_UP_STATUS    1
+	__le16 link_status;
+	__le16 reserved;
+
+	struct eea_aq_queue_status q_status[];
+};
+
+/* Admin queue context: the backing ring plus pre-allocated
+ * request/response buffers for queue-create commands.
+ */
+struct eea_aq {
+	struct eea_ring *ring;
+	u32 num;
+	u16 phase;
+
+	u32 q_req_size;
+	u32 q_res_size;
+	struct eea_aq_create *q_req_buf;
+	__le32 *q_res_buf;
+};
+
+struct eea_net;
+
+int eea_create_adminq(struct eea_net *enet, u32 qid);
+void eea_destroy_adminq(struct eea_net *enet);
+
+int eea_adminq_query_cfg(struct eea_net *enet, struct eea_aq_cfg *cfg);
+
+int eea_adminq_create_q(struct eea_net *enet, u32 qidx, u32 num, u32 flags);
+int eea_adminq_destroy_all_q(struct eea_net *enet);
+struct eea_aq_dev_status *eea_adminq_dev_status(struct eea_net *enet);
+int eea_adminq_config_host_info(struct eea_net *enet);
+#endif
diff --git a/drivers/net/ethernet/alibaba/eea/eea_net.c b/drivers/net/ethernet/alibaba/eea/eea_net.c
new file mode 100644
index 000000000000..31cb9ca5b408
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_net.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/netdev_queues.h>
+
+#include "eea_adminq.h"
+#include "eea_net.h"
+#include "eea_pci.h"
+#include "eea_ring.h"
+
+#define EEA_SPLIT_HDR_SIZE 128
+
+/* Cache the queried hardware configuration: cfg_hw records the device
+ * maxima, cfg records the working defaults used for ring setup.
+ */
+static void eea_update_cfg(struct eea_net *enet,
+			   struct eea_device *edev,
+			   struct eea_aq_cfg *hwcfg)
+{
+	enet->cfg_hw.rx_ring_depth = le32_to_cpu(hwcfg->rx_depth_max);
+	enet->cfg_hw.tx_ring_depth = le32_to_cpu(hwcfg->tx_depth_max);
+
+	enet->cfg_hw.rx_ring_num = edev->rx_num;
+	enet->cfg_hw.tx_ring_num = edev->tx_num;
+
+	enet->cfg.rx_ring_depth = le32_to_cpu(hwcfg->rx_depth_def);
+	enet->cfg.tx_ring_depth = le32_to_cpu(hwcfg->tx_depth_def);
+
+	enet->cfg.rx_ring_num = edev->rx_num;
+	enet->cfg.tx_ring_num = edev->tx_num;
+
+	enet->cfg_hw.split_hdr = EEA_SPLIT_HDR_SIZE;
+}
+
+/* Query the device configuration and program netdev features, MAC
+ * address and MTU limits accordingly.
+ *
+ * Return: 0 on success, -ENOMEM on allocation failure, -EINVAL when
+ * the device reports an invalid MTU or mismatched rx/tx ring counts,
+ * or the error from eea_adminq_query_cfg().
+ */
+static int eea_netdev_init_features(struct net_device *netdev,
+				    struct eea_net *enet,
+				    struct eea_device *edev)
+{
+	struct eea_aq_cfg *cfg;
+	int err;
+	u32 mtu;
+
+	cfg = kmalloc(sizeof(*cfg), GFP_KERNEL);
+	if (!cfg)
+		return -ENOMEM;
+
+	err = eea_adminq_query_cfg(enet, cfg);
+	if (err)
+		goto err_free;
+
+	mtu = le16_to_cpu(cfg->mtu);
+	if (mtu < ETH_MIN_MTU) {
+		dev_err(edev->dma_dev, "The device gave us an invalid MTU. Here we can only exit the initialization. %u < %u\n",
+			mtu, ETH_MIN_MTU);
+		err = -EINVAL;
+		goto err_free;
+	}
+
+	eea_update_cfg(enet, edev, cfg);
+
+	/* Now, we assert that the rx ring num is equal to the tx ring num. */
+	if (enet->cfg.rx_ring_num != enet->cfg.tx_ring_num) {
+		dev_err(edev->dma_dev, "The device gave us an rx,tx ring num. Here we can only exit the initialization. %d %d\n",
+			enet->cfg.rx_ring_num,
+			enet->cfg.tx_ring_num);
+		err = -EINVAL;
+		goto err_free;
+	}
+
+	netdev->priv_flags |= IFF_UNICAST_FLT;
+	netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+
+	/* hw_features are toggleable via ethtool; features is the
+	 * currently-enabled set.
+	 */
+	netdev->hw_features |= NETIF_F_HW_CSUM;
+	netdev->hw_features |= NETIF_F_GRO_HW;
+	netdev->hw_features |= NETIF_F_SG;
+	netdev->hw_features |= NETIF_F_TSO;
+	netdev->hw_features |= NETIF_F_TSO_ECN;
+	netdev->hw_features |= NETIF_F_TSO6;
+	netdev->hw_features |= NETIF_F_GSO_UDP_L4;
+
+	netdev->features |= NETIF_F_HIGHDMA;
+	netdev->features |= NETIF_F_HW_CSUM;
+	netdev->features |= NETIF_F_SG;
+	netdev->features |= NETIF_F_GSO_ROBUST;
+	netdev->features |= netdev->hw_features & NETIF_F_ALL_TSO;
+	netdev->features |= NETIF_F_RXCSUM;
+	netdev->features |= NETIF_F_GRO_HW;
+
+	netdev->vlan_features = netdev->features;
+
+	/* MAC address comes from the device configuration. */
+	eth_hw_addr_set(netdev, cfg->mac);
+
+	enet->speed = SPEED_UNKNOWN;
+	enet->duplex = DUPLEX_UNKNOWN;
+
+	netdev->min_mtu = ETH_MIN_MTU;
+
+	netdev->mtu = mtu;
+
+	/* If jumbo frames are already enabled, then the returned MTU will be a
+	 * jumbo MTU, and the driver will automatically enable jumbo frame
+	 * support by default.
+	 */
+	netdev->max_mtu = mtu;
+
+	netif_carrier_on(netdev);
+
+err_free:
+	kfree(cfg);
+	return err;
+}
+
+/* Only generic ops are wired up at this stage of the series. */
+static const struct net_device_ops eea_netdev = {
+	.ndo_validate_addr  = eth_validate_addr,
+	.ndo_features_check = passthru_features_check,
+};
+
+/* Allocate the net_device with an eea_net private area for the given
+ * number of queue pairs, and cross-link it with the eea device.
+ *
+ * Return: the new eea_net, or NULL on allocation failure.
+ */
+static struct eea_net *eea_netdev_alloc(struct eea_device *edev, u32 pairs)
+{
+	struct net_device *netdev;
+	struct eea_net *enet;
+
+	netdev = alloc_etherdev_mq(sizeof(struct eea_net), pairs);
+	if (!netdev) {
+		dev_err(edev->dma_dev,
+			"alloc_etherdev_mq failed with pairs %d\n", pairs);
+		return NULL;
+	}
+
+	netdev->netdev_ops = &eea_netdev;
+	SET_NETDEV_DEV(netdev, edev->dma_dev);
+
+	enet = netdev_priv(netdev);
+	enet->netdev = netdev;
+	enet->edev = edev;
+	edev->enet = enet;
+
+	return enet;
+}
+
+/* Bring up the net device: allocate the netdev, create the admin
+ * queue, exchange host info and device config, then register.
+ *
+ * Return: 0 on success or a negative error code; on failure the
+ * device is reset and all allocated resources are released.
+ */
+int eea_net_probe(struct eea_device *edev)
+{
+	struct eea_net *enet;
+	int err = -ENOMEM;
+
+	enet = eea_netdev_alloc(edev, edev->rx_num);
+	if (!enet)
+		return -ENOMEM;
+
+	err = eea_create_adminq(enet, edev->rx_num + edev->tx_num);
+	if (err)
+		goto err_free_netdev;
+
+	err = eea_adminq_config_host_info(enet);
+	if (err)
+		goto err_reset_dev;
+
+	err = eea_netdev_init_features(enet->netdev, enet, edev);
+	if (err)
+		goto err_reset_dev;
+
+	err = register_netdev(enet->netdev);
+	if (err)
+		goto err_reset_dev;
+
+	/* Carrier stays off until the link state is known. */
+	netif_carrier_off(enet->netdev);
+
+	netdev_dbg(enet->netdev, "eea probe success.\n");
+
+	return 0;
+
+err_reset_dev:
+	eea_device_reset(edev);
+	eea_destroy_adminq(enet);
+
+err_free_netdev:
+	free_netdev(enet->netdev);
+	return err;
+}
+
+/* Tear down the net device: unregister first so no new operations can
+ * start, then reset the device, destroy the admin queue and free the
+ * netdev.
+ */
+void eea_net_remove(struct eea_device *edev)
+{
+	struct net_device *netdev;
+	struct eea_net *enet;
+
+	enet = edev->enet;
+	netdev = enet->netdev;
+
+	unregister_netdev(netdev);
+	netdev_dbg(enet->netdev, "eea removed.\n");
+
+	eea_device_reset(edev);
+
+	eea_destroy_adminq(enet);
+
+	free_netdev(netdev);
+}
diff --git a/drivers/net/ethernet/alibaba/eea/eea_net.h b/drivers/net/ethernet/alibaba/eea/eea_net.h
new file mode 100644
index 000000000000..ab487bc88af2
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_net.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#ifndef __EEA_NET_H__
+#define __EEA_NET_H__
+
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+
+#include "eea_adminq.h"
+#include "eea_ring.h"
+
+#define EEA_VER_MAJOR		1
+#define EEA_VER_MINOR		0
+#define EEA_VER_SUB_MINOR	0
+
+/* Per-queue TX context. */
+struct eea_net_tx {
+	struct eea_net *enet;
+
+	struct eea_ring *ering;
+
+	struct eea_tx_meta *meta;
+	struct eea_tx_meta *free;
+
+	struct device *dma_dev;
+
+	u32 index;
+
+	char name[16];
+};
+
+/* Per-buffer RX metadata; chained through @next on the free list. */
+struct eea_rx_meta {
+	struct eea_rx_meta *next;
+
+	struct page *page;
+	dma_addr_t dma;
+	u32 offset;
+	u32 frags;
+
+	/* Separate page/mapping for split-header placement. */
+	struct page *hdr_page;
+	void *hdr_addr;
+	dma_addr_t hdr_dma;
+
+	u32 id;
+
+	u32 truesize;
+	u32 headroom;
+	u32 tailroom;
+
+	u32 len;
+};
+
+/* State carried across descriptors of one in-flight RX packet. */
+struct eea_net_rx_pkt_ctx {
+	u16 idx;
+
+	bool data_valid;
+	bool do_drop;
+
+	struct sk_buff *head_skb;
+	struct sk_buff *curr_skb;
+};
+
+/* Per-queue RX context. */
+struct eea_net_rx {
+	struct eea_net *enet;
+
+	struct eea_ring *ering;
+
+	struct eea_rx_meta *meta;
+	struct eea_rx_meta *free;
+
+	struct device *dma_dev;
+
+	u32 index;
+
+	u32 flags;
+
+	u32 headroom;
+
+	struct napi_struct *napi;
+
+	char name[16];
+
+	struct eea_net_rx_pkt_ctx pkt;
+
+	struct page_pool *pp;
+};
+
+/* Ring geometry and descriptor sizing; one working copy (cfg) and one
+ * holding the hardware limits (cfg_hw) live in struct eea_net.
+ */
+struct eea_net_cfg {
+	u32 rx_ring_depth;
+	u32 tx_ring_depth;
+	u32 rx_ring_num;
+	u32 tx_ring_num;
+
+	u8 rx_sq_desc_size;
+	u8 rx_cq_desc_size;
+	u8 tx_sq_desc_size;
+	u8 tx_cq_desc_size;
+
+	u32 split_hdr;
+};
+
+/* Values for eea_net.link_err. */
+enum {
+	EEA_LINK_ERR_NONE,
+	EEA_LINK_ERR_HA_RESET_DEV,
+	EEA_LINK_ERR_LINK_DOWN,
+};
+
+/* Driver-private netdev state (netdev_priv of the net_device). */
+struct eea_net {
+	struct eea_device *edev;
+	struct net_device *netdev;
+
+	struct eea_aq adminq;
+
+	struct eea_net_tx *tx;
+	struct eea_net_rx **rx;
+
+	struct eea_net_cfg cfg;
+	struct eea_net_cfg cfg_hw;
+
+	u32 link_err;
+
+	bool started;
+
+	u8 duplex;
+	u32 speed;
+
+	u64 hw_ts_offset;
+};
+
+int eea_net_probe(struct eea_device *edev);
+void eea_net_remove(struct eea_device *edev);
+
+#endif
diff --git a/drivers/net/ethernet/alibaba/eea/eea_pci.c b/drivers/net/ethernet/alibaba/eea/eea_pci.c
index 88dfe5eef73f..97efac753cfb 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_pci.c
+++ b/drivers/net/ethernet/alibaba/eea/eea_pci.c
@@ -8,6 +8,7 @@
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/iopoll.h>
 
+#include "eea_net.h"
 #include "eea_pci.h"
 
 #define EEA_PCI_DB_OFFSET 4096
@@ -58,7 +59,9 @@ struct eea_pci_device {
 	((void __iomem *)((reg) + offsetof(struct eea_pci_cfg, item)))
 
 #define cfg_write8(reg, item, val) iowrite8(val, cfg_pointer(reg, item))
+#define cfg_write16(reg, item, val) iowrite16(val, cfg_pointer(reg, item))
 #define cfg_write32(reg, item, val) iowrite32(val, cfg_pointer(reg, item))
+#define cfg_write64(reg, item, val) iowrite64_lo_hi(val, cfg_pointer(reg, item))
 
 #define cfg_read8(reg, item) ioread8(cfg_pointer(reg, item))
 #define cfg_read32(reg, item) ioread32(cfg_pointer(reg, item))
@@ -233,6 +236,20 @@ void __iomem *eea_pci_db_addr(struct eea_device *edev, u32 off)
 	return edev->ep_dev->db_base + off;
 }
 
+/* Activate the admin queue: program its size, MSI-X vector and SQ/CQ
+ * DMA addresses into the config registers, then resolve the queue's
+ * doorbell address from the device-provided offset.
+ */
+void eea_pci_active_aq(struct eea_ring *ering, int msix_vec)
+{
+	struct eea_pci_device *ep_dev = ering->edev->ep_dev;
+
+	cfg_write16(ep_dev->reg, aq_size, ering->num);
+	cfg_write16(ep_dev->reg, aq_msix_vector, msix_vec);
+
+	cfg_write64(ep_dev->reg, aq_sq_addr, ering->sq.dma_addr);
+	cfg_write64(ep_dev->reg, aq_cq_addr, ering->cq.dma_addr);
+
+	ering->db = eea_pci_db_addr(ering->edev,
+				    cfg_read32(ep_dev->reg, aq_db_off));
+}
+
 u64 eea_pci_device_ts(struct eea_device *edev)
 {
 	struct eea_pci_device *ep_dev = edev->ep_dev;
@@ -254,7 +271,9 @@ static int eea_init_device(struct eea_device *edev)
 	if (err)
 		goto err;
 
-	/* do net device probe ... */
+	err = eea_net_probe(edev);
+	if (err)
+		goto err;
 
 	return 0;
 err:
@@ -288,6 +307,9 @@ static void __eea_pci_remove(struct pci_dev *pci_dev)
 {
 	struct eea_pci_device *ep_dev = pci_get_drvdata(pci_dev);
 	struct device *dev = get_device(&ep_dev->pci_dev->dev);
+	struct eea_device *edev = &ep_dev->edev;
+
+	eea_net_remove(edev);
 
 	pci_disable_sriov(pci_dev);
 
diff --git a/drivers/net/ethernet/alibaba/eea/eea_pci.h b/drivers/net/ethernet/alibaba/eea/eea_pci.h
index 126704a207d5..d240dc2dae9b 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_pci.h
+++ b/drivers/net/ethernet/alibaba/eea/eea_pci.h
@@ -10,6 +10,8 @@
 
 #include <linux/pci.h>
 
+#include "eea_ring.h"
+
 struct eea_pci_cap {
 	__u8 cap_vndr;
 	__u8 cap_next;
@@ -43,6 +45,7 @@ u16 eea_pci_dev_id(struct eea_device *edev);
 
 int eea_device_reset(struct eea_device *dev);
 void eea_device_ready(struct eea_device *dev);
+void eea_pci_active_aq(struct eea_ring *ering, int msix_vec);
 
 u64 eea_pci_device_ts(struct eea_device *edev);
 
-- 
2.32.0.3.g01195cf9f


^ permalink raw reply related

* [PATCH net-next v26 2/8] eea: introduce ring and descriptor structures
From: Xuan Zhuo @ 2026-02-08  8:46 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Xuan Zhuo, Wen Gu, Philo Lu, Lorenzo Bianconi,
	Vadim Fedorenko, Dong Yibo, Heiner Kallweit, Lukas Bulwahn,
	Dust Li
In-Reply-To: <20260208084613.2658-1-xuanzhuo@linux.alibaba.com>

Add basic driver framework for the Alibaba Elastic Ethernet Adapter (EEA).

This commit introduces the ring and descriptor implementations.

These structures and ring APIs are used by the RX, TX, and admin queues.

Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Reviewed-by: Philo Lu <lulie@linux.alibaba.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 drivers/net/ethernet/alibaba/eea/Makefile   |   3 +-
 drivers/net/ethernet/alibaba/eea/eea_desc.h | 134 ++++++++++
 drivers/net/ethernet/alibaba/eea/eea_ring.c | 265 ++++++++++++++++++++
 drivers/net/ethernet/alibaba/eea/eea_ring.h |  89 +++++++
 4 files changed, 490 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_desc.h
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_ring.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_ring.h

diff --git a/drivers/net/ethernet/alibaba/eea/Makefile b/drivers/net/ethernet/alibaba/eea/Makefile
index cf2acf1733fd..e5e4007810a6 100644
--- a/drivers/net/ethernet/alibaba/eea/Makefile
+++ b/drivers/net/ethernet/alibaba/eea/Makefile
@@ -1,3 +1,4 @@
 
 obj-$(CONFIG_EEA) += eea.o
-eea-y := eea_pci.o
+eea-y :=  eea_ring.o \
+	eea_pci.o
diff --git a/drivers/net/ethernet/alibaba/eea/eea_desc.h b/drivers/net/ethernet/alibaba/eea/eea_desc.h
new file mode 100644
index 000000000000..90c402bd6c36
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_desc.h
@@ -0,0 +1,134 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#ifndef __EEA_DESC_H__
+#define __EEA_DESC_H__
+
+/* Hardware timestamps occupy bits [47:0] of the 64-bit ts field. */
+#define EEA_DESC_TS_MASK GENMASK(47, 0)
+#define EEA_DESC_TS(desc) (le64_to_cpu((desc)->ts) & EEA_DESC_TS_MASK)
+
+/* Admin queue submission descriptor. */
+struct eea_aq_desc {
+	__le16 flags;
+	__le16 id;
+	__le16 reserved;
+	u8 classid;
+	u8 command;
+	__le64 data_addr;
+	__le64 reply_addr;
+	__le32 data_len;
+	__le32 reply_len;
+};
+
+/* Admin queue completion descriptor. */
+struct eea_aq_cdesc {
+	__le16 flags;
+	__le16 id;
+#define EEA_OK     0
+#define EEA_ERR    0xffffffff
+	__le32 status;
+	__le32 reply_len;
+	__le32 reserved1;
+
+	__le64 reserved2;
+	__le64 reserved3;
+};
+
+/* RX submission descriptor: data buffer plus an optional split-header
+ * buffer address.
+ */
+struct eea_rx_desc {
+	__le16 flags;
+	__le16 id;
+	__le16 len;
+	__le16 reserved1;
+
+	__le64 addr;
+
+	__le64 hdr_addr;
+	__le32 reserved2;
+	__le32 reserved3;
+};
+
+#define EEA_RX_CDESC_HDR_LEN_MASK GENMASK(9, 0)
+
+/* RX completion descriptor. */
+struct eea_rx_cdesc {
+#define EEA_DESC_F_DATA_VALID	BIT(6)
+#define EEA_DESC_F_SPLIT_HDR	BIT(5)
+	__le16 flags;
+	__le16 id;
+	__le16 len;
+#define EEA_NET_PT_NONE      0
+#define EEA_NET_PT_IPv4      1
+#define EEA_NET_PT_TCPv4     2
+#define EEA_NET_PT_UDPv4     3
+#define EEA_NET_PT_IPv6      4
+#define EEA_NET_PT_TCPv6     5
+#define EEA_NET_PT_UDPv6     6
+#define EEA_NET_PT_IPv6_EX   7
+#define EEA_NET_PT_TCPv6_EX  8
+#define EEA_NET_PT_UDPv6_EX  9
+	/* [9:0] is packet type. */
+	__le16 type;
+
+	/* hw timestamp [0:47]: ts */
+	__le64 ts;
+
+	__le32 hash;
+
+	/* 0-9: hdr_len  split header
+	 * 10-15: reserved1
+	 */
+	__le16 len_ex;
+	__le16 reserved2;
+
+	__le32 reserved3;
+	__le32 reserved4;
+};
+
+/* GSO types for eea_tx_desc.gso_type; ECN is OR-ed in as a flag. */
+#define EEA_TX_GSO_NONE   0
+#define EEA_TX_GSO_TCPV4  1
+#define EEA_TX_GSO_TCPV6  4
+#define EEA_TX_GSO_UDP_L4 5
+#define EEA_TX_GSO_ECN    0x80
+
+/* TX submission descriptor with checksum-offload and GSO fields. */
+struct eea_tx_desc {
+#define EEA_DESC_F_DO_CSUM	BIT(6)
+	__le16 flags;
+	__le16 id;
+	__le16 len;
+	__le16 reserved1;
+
+	__le64 addr;
+
+	__le16 csum_start;
+	__le16 csum_offset;
+	u8 gso_type;
+	u8 reserved2;
+	__le16 gso_size;
+	__le64 reserved3;
+};
+
+/* TX completion descriptor. */
+struct eea_tx_cdesc {
+	__le16 flags;
+	__le16 id;
+	__le16 len;
+	__le16 reserved1;
+
+	/* hw timestamp [0:47]: ts */
+	__le64 ts;
+	__le64 reserved2;
+	__le64 reserved3;
+};
+
+/* Doorbell layout; written to the device as one 64-bit value. */
+struct eea_db {
+#define EEA_IDX_PRESENT   BIT(0)
+#define EEA_IRQ_MASK      BIT(1)
+#define EEA_IRQ_UNMASK    BIT(2)
+	u8 kick_flags;
+	u8 reserved;
+	__le16 idx;
+
+	__le16 tx_cq_head;
+	__le16 rx_cq_head;
+};
+#endif
diff --git a/drivers/net/ethernet/alibaba/eea/eea_ring.c b/drivers/net/ethernet/alibaba/eea/eea_ring.c
new file mode 100644
index 000000000000..c8269dbf0d76
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_ring.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#include "eea_pci.h"
+#include "eea_ring.h"
+
+/* Mask this ring's interrupt by posting an IRQ-mask doorbell.
+ * No-op if the ring is already masked.
+ */
+void ering_irq_unactive(struct eea_ring *ering)
+{
+	/* Union lets the doorbell be issued as one 64-bit write. */
+	union {
+		u64 data;
+		struct eea_db db;
+	} val;
+
+	if (ering->mask == EEA_IRQ_MASK)
+		return;
+
+	ering->mask = EEA_IRQ_MASK;
+
+	val.data = 0;
+
+	val.db.kick_flags = EEA_IRQ_MASK;
+
+	writeq(val.data, (void __iomem *)ering->db);
+}
+
+/* Unmask this ring's interrupt, reporting the current rx and tx CQ
+ * heads in the same doorbell (@tx_ering supplies the tx head).
+ * No-op if the ring is already unmasked.
+ */
+void ering_irq_active(struct eea_ring *ering, struct eea_ring *tx_ering)
+{
+	union {
+		u64 data;
+		struct eea_db db;
+	} val;
+
+	if (ering->mask == EEA_IRQ_UNMASK)
+		return;
+
+	ering->mask = EEA_IRQ_UNMASK;
+
+	val.data = 0;
+
+	val.db.kick_flags = EEA_IRQ_UNMASK;
+
+	val.db.tx_cq_head = cpu_to_le16(tx_ering->cq.hw_idx);
+	val.db.rx_cq_head = cpu_to_le16(ering->cq.hw_idx);
+
+	writeq(val.data, (void __iomem *)ering->db);
+}
+
+/* Return the next completion descriptor if the device has published
+ * one (its phase bit matches ours), else NULL. The dma_rmb() orders
+ * the phase-bit read before reads of the descriptor body.
+ */
+void *ering_cq_get_desc(const struct eea_ring *ering)
+{
+	u8 phase;
+	u8 *desc;
+
+	desc = ering->cq.desc + (ering->cq.head << ering->cq.desc_size_shift);
+
+	/* The phase bit lives in the descriptor's last byte. */
+	phase = *(u8 *)(desc + ering->cq.desc_size - 1);
+
+	if ((phase & EEA_RING_DESC_F_CQ_PHASE) == ering->cq.phase) {
+		dma_rmb();
+		return desc;
+	}
+
+	return NULL;
+}
+
+/* sq api */
+/* Allocate the next SQ descriptor of a (possibly multi-descriptor)
+ * request. All descriptors of one request share the id captured at
+ * the first allocation; EEA_RING_DESC_F_MORE is set on all but the
+ * last. Allocations are shadow-only until ering_sq_commit_desc().
+ */
+void *ering_sq_alloc_desc(struct eea_ring *ering, u16 id, bool is_last,
+			  u16 flags)
+{
+	struct eea_ring_sq *sq = &ering->sq;
+	struct eea_common_desc *desc;
+
+	if (!sq->shadow_num) {
+		sq->shadow_idx = sq->head;
+		sq->shadow_id = cpu_to_le16(id);
+	}
+
+	if (!is_last)
+		flags |= EEA_RING_DESC_F_MORE;
+
+	desc = sq->desc + (sq->shadow_idx << sq->desc_size_shift);
+
+	desc->flags = cpu_to_le16(flags);
+	desc->id = sq->shadow_id;
+
+	if (unlikely(++sq->shadow_idx >= ering->num))
+		sq->shadow_idx = 0;
+
+	++sq->shadow_num;
+
+	return desc;
+}
+
+/* alloc desc for adminq */
+void *ering_aq_alloc_desc(struct eea_ring *ering)
+{
+	struct eea_ring_sq *sq = &ering->sq;
+	struct eea_common_desc *desc;
+
+	/* Unlike ering_sq_alloc_desc(), flags/id are left for the
+	 * caller to fill in.
+	 */
+	sq->shadow_idx = sq->head;
+
+	desc = sq->desc + (sq->shadow_idx << sq->desc_size_shift);
+
+	if (unlikely(++sq->shadow_idx >= ering->num))
+		sq->shadow_idx = 0;
+
+	++sq->shadow_num;
+
+	return desc;
+}
+
+/* Publish all shadow-allocated descriptors: consume free slots and
+ * advance the SQ head and hardware index past them.
+ */
+void ering_sq_commit_desc(struct eea_ring *ering)
+{
+	struct eea_ring_sq *sq = &ering->sq;
+	int num;
+
+	num = sq->shadow_num;
+
+	ering->num_free -= num;
+
+	sq->head       = sq->shadow_idx;
+	sq->hw_idx     += num;
+	sq->shadow_num = 0;
+}
+
+/* Drop all shadow-allocated (not yet committed) descriptors. */
+void ering_sq_cancel(struct eea_ring *ering)
+{
+	ering->sq.shadow_num = 0;
+}
+
+/* cq api */
+/* Acknowledge @num consumed completion descriptors: advance the CQ
+ * head (toggling the phase on wrap) and return the slots to the
+ * ring's free count.
+ */
+void ering_cq_ack_desc(struct eea_ring *ering, u32 num)
+{
+	struct eea_ring_cq *cq = &ering->cq;
+
+	cq->head += num;
+	cq->hw_idx += num;
+
+	if (unlikely(cq->head >= ering->num)) {
+		cq->head -= ering->num;
+		cq->phase ^= EEA_RING_DESC_F_CQ_PHASE;
+	}
+
+	ering->num_free += num;
+}
+
+/* notify */
+/* Ring the doorbell with the current SQ hardware index. */
+bool ering_kick(struct eea_ring *ering)
+{
+	union {
+		struct eea_db db;
+		u64 data;
+	} val;
+
+	val.data = 0;
+
+	val.db.kick_flags = EEA_IDX_PRESENT;
+	val.db.idx = cpu_to_le16(ering->sq.hw_idx);
+
+	writeq(val.data, (void __iomem *)ering->db);
+
+	return true;
+}
+
+/* ering alloc/free */
+static void ering_free_queue(struct eea_device *edev, size_t size,
+			     void *queue, dma_addr_t dma_handle)
+{
+	dma_free_coherent(edev->dma_dev, size, queue, dma_handle);
+}
+
+static void *ering_alloc_queue(struct eea_device *edev, size_t size,
+			       dma_addr_t *dma_handle)
+{
+	gfp_t flags = GFP_KERNEL | __GFP_NOWARN;
+
+	return dma_alloc_coherent(edev->dma_dev, size, dma_handle, flags);
+}
+
+/* Allocate the DMA-coherent SQ and CQ descriptor arrays for a ring.
+ * On CQ allocation failure the SQ is freed again.
+ *
+ * Return: 0 on success, -ENOMEM on failure.
+ */
+static int ering_alloc_queues(struct eea_ring *ering, struct eea_device *edev,
+			      u32 num, u8 sq_desc_size, u8 cq_desc_size)
+{
+	dma_addr_t addr;
+	size_t size;
+	void *ring;
+
+	size = num * sq_desc_size;
+
+	ring = ering_alloc_queue(edev, size, &addr);
+	if (!ring)
+		return -ENOMEM;
+
+	ering->sq.desc     = ring;
+	ering->sq.dma_addr = addr;
+	ering->sq.dma_size = size;
+	ering->sq.desc_size = sq_desc_size;
+	/* fls()-1 == log2 only for power-of-two sizes; desc sizes are
+	 * assumed to be powers of two so a shift can replace multiply.
+	 */
+	ering->sq.desc_size_shift = fls(sq_desc_size) - 1;
+
+	size = num * cq_desc_size;
+
+	ring = ering_alloc_queue(edev, size, &addr);
+	if (!ring)
+		goto err_free_sq;
+
+	ering->cq.desc     = ring;
+	ering->cq.dma_addr = addr;
+	ering->cq.dma_size = size;
+	ering->cq.desc_size = cq_desc_size;
+	ering->cq.desc_size_shift = fls(cq_desc_size) - 1;
+
+	ering->num = num;
+
+	return 0;
+
+err_free_sq:
+	ering_free_queue(ering->edev, ering->sq.dma_size,
+			 ering->sq.desc, ering->sq.dma_addr);
+	return -ENOMEM;
+}
+
+static void ering_init(struct eea_ring *ering)
+{
+	/* The CQ phase starts at the flag value and is toggled on each
+	 * wrap (see ering_cq_ack_desc()).
+	 */
+	ering->cq.phase = EEA_RING_DESC_F_CQ_PHASE;
+	ering->num_free = ering->num;
+}
+
+/* Allocate and initialize a ring (SQ + CQ). @name is stored by
+ * reference, not copied, so it must outlive the ring.
+ *
+ * Return: the new ring, or NULL on failure.
+ */
+struct eea_ring *ering_alloc(u32 index, u32 num, struct eea_device *edev,
+			     u8 sq_desc_size, u8 cq_desc_size,
+			     const char *name)
+{
+	struct eea_ring *ering;
+
+	ering = kzalloc(sizeof(*ering), GFP_KERNEL);
+	if (!ering)
+		return NULL;
+
+	ering->edev = edev;
+	ering->name = name;
+	ering->index = index;
+
+	if (ering_alloc_queues(ering, edev, num, sq_desc_size, cq_desc_size))
+		goto err_free;
+
+	ering_init(ering);
+
+	return ering;
+
+err_free:
+	kfree(ering);
+	return NULL;
+}
+
+void ering_free(struct eea_ring *ering)
+{
+	ering_free_queue(ering->edev, ering->cq.dma_size,
+			 ering->cq.desc, ering->cq.dma_addr);
+
+	ering_free_queue(ering->edev, ering->sq.dma_size,
+			 ering->sq.desc, ering->sq.dma_addr);
+
+	kfree(ering);
+}
diff --git a/drivers/net/ethernet/alibaba/eea/eea_ring.h b/drivers/net/ethernet/alibaba/eea/eea_ring.h
new file mode 100644
index 000000000000..89ec7e2a1398
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_ring.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#ifndef __EEA_RING_H__
+#define __EEA_RING_H__
+
+#include <linux/dma-mapping.h>
+#include "eea_desc.h"
+
+/* Common descriptor flag bits shared by SQ and CQ descriptors. */
+#define EEA_RING_DESC_F_MORE		BIT(0)
+#define EEA_RING_DESC_F_CQ_PHASE	BIT(7)
+
+/* Header shared by all descriptor layouts (see eea_desc.h). */
+struct eea_common_desc {
+	__le16 flags;
+	__le16 id;
+};
+
+struct eea_device;
+
+/* Submission queue: head/hw_idx track committed state, the shadow_*
+ * fields track descriptors allocated but not yet committed.
+ */
+struct eea_ring_sq {
+	void *desc;
+
+	u16 head;
+	u16 hw_idx;
+
+	u16 shadow_idx;
+	__le16 shadow_id;
+	u16 shadow_num;
+
+	u8 desc_size;
+	u8 desc_size_shift;
+
+	dma_addr_t dma_addr;
+	u32 dma_size;
+};
+
+/* Completion queue; phase toggles on each wrap of head. */
+struct eea_ring_cq {
+	void *desc;
+
+	u16 head;
+	u16 hw_idx;
+
+	u8 phase;
+	u8 desc_size_shift;
+	u8 desc_size;
+
+	dma_addr_t dma_addr;
+	u32 dma_size;
+};
+
+/* One hardware ring: an SQ/CQ pair plus its doorbell and bookkeeping. */
+struct eea_ring {
+	const char *name;
+	struct eea_device *edev;
+	u32 index;
+	void __iomem *db;
+	u16 msix_vec;
+
+	u8 mask;
+
+	u32 num;
+
+	u32 num_free;
+
+	struct eea_ring_sq sq;
+	struct eea_ring_cq cq;
+};
+
+struct eea_ring *ering_alloc(u32 index, u32 num, struct eea_device *edev,
+			     u8 sq_desc_size, u8 cq_desc_size,
+			     const char *name);
+void ering_free(struct eea_ring *ering);
+bool ering_kick(struct eea_ring *ering);
+
+void *ering_sq_alloc_desc(struct eea_ring *ering, u16 id,
+			  bool is_last, u16 flags);
+void *ering_aq_alloc_desc(struct eea_ring *ering);
+void ering_sq_commit_desc(struct eea_ring *ering);
+void ering_sq_cancel(struct eea_ring *ering);
+
+void ering_cq_ack_desc(struct eea_ring *ering, u32 num);
+
+void ering_irq_unactive(struct eea_ring *ering);
+void ering_irq_active(struct eea_ring *ering, struct eea_ring *tx_ering);
+void *ering_cq_get_desc(const struct eea_ring *ering);
+#endif
-- 
2.32.0.3.g01195cf9f


^ permalink raw reply related

* [PATCH net-next v26 1/8] eea: introduce PCI framework
From: Xuan Zhuo @ 2026-02-08  8:46 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Xuan Zhuo, Wen Gu, Philo Lu, Lorenzo Bianconi,
	Vadim Fedorenko, Dong Yibo, Heiner Kallweit, Lukas Bulwahn,
	Dust Li
In-Reply-To: <20260208084613.2658-1-xuanzhuo@linux.alibaba.com>

Add basic driver framework for the Alibaba Elastic Ethernet Adapter (EEA).

This commit implements the EEA PCI probe functionality.

Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Reviewed-by: Philo Lu <lulie@linux.alibaba.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 MAINTAINERS                                |   8 +
 drivers/net/ethernet/Kconfig               |   1 +
 drivers/net/ethernet/Makefile              |   1 +
 drivers/net/ethernet/alibaba/Kconfig       |  28 ++
 drivers/net/ethernet/alibaba/Makefile      |   5 +
 drivers/net/ethernet/alibaba/eea/Makefile  |   3 +
 drivers/net/ethernet/alibaba/eea/eea_pci.c | 390 +++++++++++++++++++++
 drivers/net/ethernet/alibaba/eea/eea_pci.h |  50 +++
 8 files changed, 486 insertions(+)
 create mode 100644 drivers/net/ethernet/alibaba/Kconfig
 create mode 100644 drivers/net/ethernet/alibaba/Makefile
 create mode 100644 drivers/net/ethernet/alibaba/eea/Makefile
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_pci.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_pci.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 34c2ed4da1f9..f6c1aef47ba7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -808,6 +808,14 @@ S:	Maintained
 F:	Documentation/i2c/busses/i2c-ali1563.rst
 F:	drivers/i2c/busses/i2c-ali1563.c
 
+ALIBABA ELASTIC ETHERNET ADAPTER DRIVER
+M:	Xuan Zhuo <xuanzhuo@linux.alibaba.com>
+M:	Wen Gu <guwen@linux.alibaba.com>
+R:	Philo Lu <lulie@linux.alibaba.com>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	drivers/net/ethernet/alibaba/eea
+
 ALIBABA ELASTIC RDMA DRIVER
 M:	Cheng Xu <chengyou@linux.alibaba.com>
 M:	Kai Shen <kaishen@linux.alibaba.com>
diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig
index aa7103e7f47f..9ead9c49e6c6 100644
--- a/drivers/net/ethernet/Kconfig
+++ b/drivers/net/ethernet/Kconfig
@@ -22,6 +22,7 @@ source "drivers/net/ethernet/aeroflex/Kconfig"
 source "drivers/net/ethernet/agere/Kconfig"
 source "drivers/net/ethernet/airoha/Kconfig"
 source "drivers/net/ethernet/alacritech/Kconfig"
+source "drivers/net/ethernet/alibaba/Kconfig"
 source "drivers/net/ethernet/allwinner/Kconfig"
 source "drivers/net/ethernet/alteon/Kconfig"
 source "drivers/net/ethernet/altera/Kconfig"
diff --git a/drivers/net/ethernet/Makefile b/drivers/net/ethernet/Makefile
index 6615a67a63d5..9e6d740f4cf7 100644
--- a/drivers/net/ethernet/Makefile
+++ b/drivers/net/ethernet/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_NET_VENDOR_ADI) += adi/
 obj-$(CONFIG_NET_VENDOR_AGERE) += agere/
 obj-$(CONFIG_NET_VENDOR_AIROHA) += airoha/
 obj-$(CONFIG_NET_VENDOR_ALACRITECH) += alacritech/
+obj-$(CONFIG_NET_VENDOR_ALIBABA) += alibaba/
 obj-$(CONFIG_NET_VENDOR_ALLWINNER) += allwinner/
 obj-$(CONFIG_NET_VENDOR_ALTEON) += alteon/
 obj-$(CONFIG_ALTERA_TSE) += altera/
diff --git a/drivers/net/ethernet/alibaba/Kconfig b/drivers/net/ethernet/alibaba/Kconfig
new file mode 100644
index 000000000000..9bd8cc9fd203
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/Kconfig
@@ -0,0 +1,28 @@
+#
+# Alibaba network device configuration
+#
+
+config NET_VENDOR_ALIBABA
+	bool "Alibaba Devices"
+	default y
+	help
+	  If you have a network (Ethernet) device belonging to this class, say Y.
+
+	  Note that the answer to this question doesn't directly affect the
+	  kernel: saying N will just cause the configurator to skip all
+	  the questions about Alibaba devices. If you say Y, you will be asked
+	  for your specific device in the following questions.
+
+if NET_VENDOR_ALIBABA
+
+config EEA
+	tristate "Alibaba Elastic Ethernet Adapter support"
+	depends on PCI_MSI
+	depends on 64BIT
+	select PAGE_POOL
+	help
+	  This driver supports Alibaba Elastic Ethernet Adapter.
+
+	  To compile this driver as a module, choose M here.
+
+endif #NET_VENDOR_ALIBABA
diff --git a/drivers/net/ethernet/alibaba/Makefile b/drivers/net/ethernet/alibaba/Makefile
new file mode 100644
index 000000000000..7980525cb086
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the Alibaba network device drivers.
+#
+
+obj-$(CONFIG_EEA) += eea/
diff --git a/drivers/net/ethernet/alibaba/eea/Makefile b/drivers/net/ethernet/alibaba/eea/Makefile
new file mode 100644
index 000000000000..cf2acf1733fd
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/Makefile
@@ -0,0 +1,3 @@
+
+obj-$(CONFIG_EEA) += eea.o
+eea-y := eea_pci.o
diff --git a/drivers/net/ethernet/alibaba/eea/eea_pci.c b/drivers/net/ethernet/alibaba/eea/eea_pci.c
new file mode 100644
index 000000000000..88dfe5eef73f
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_pci.c
@@ -0,0 +1,390 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/iopoll.h>
+
+#include "eea_pci.h"
+
+#define EEA_PCI_DB_OFFSET 4096
+
+struct eea_pci_cfg {
+	__le32 reserve0;
+	__le32 reserve1;
+	__le32 drv_f_idx;
+	__le32 drv_f;
+
+#define EEA_S_OK           BIT(2)
+#define EEA_S_FEATURE_DONE BIT(3)
+#define EEA_S_FAILED       BIT(7)
+	u8   device_status;
+	u8   reserved[7];
+
+	__le32 rx_num_max;
+	__le32 tx_num_max;
+	__le32 db_blk_size;
+
+	/* admin queue cfg */
+	__le16 aq_size;
+	__le16 aq_msix_vector;
+	__le32 aq_db_off;
+
+	__le32 aq_sq_addr;
+	__le32 aq_sq_addr_hi;
+	__le32 aq_cq_addr;
+	__le32 aq_cq_addr_hi;
+
+	__le64 hw_ts;
+};
+
+struct eea_pci_device {
+	struct eea_device edev;
+	struct pci_dev *pci_dev;
+
+	u32 msix_vec_n;
+
+	void __iomem *reg;
+	void __iomem *db_base;
+
+	char ha_irq_name[32];
+	u8 reset_pos;
+};
+
+#define cfg_pointer(reg, item) \
+	((void __iomem *)((reg) + offsetof(struct eea_pci_cfg, item)))
+
+#define cfg_write8(reg, item, val) iowrite8(val, cfg_pointer(reg, item))
+#define cfg_write32(reg, item, val) iowrite32(val, cfg_pointer(reg, item))
+
+#define cfg_read8(reg, item) ioread8(cfg_pointer(reg, item))
+#define cfg_read32(reg, item) ioread32(cfg_pointer(reg, item))
+#define cfg_readq(reg, item) readq(cfg_pointer(reg, item))
+
+const char *eea_pci_name(struct eea_device *edev)
+{
+	return pci_name(edev->ep_dev->pci_dev);
+}
+
+int eea_pci_domain_nr(struct eea_device *edev)
+{
+	return pci_domain_nr(edev->ep_dev->pci_dev->bus);
+}
+
+u16 eea_pci_dev_id(struct eea_device *edev)
+{
+	return pci_dev_id(edev->ep_dev->pci_dev);
+}
+
+static void eea_pci_io_set_status(struct eea_device *edev, u8 status)
+{
+	struct eea_pci_device *ep_dev = edev->ep_dev;
+
+	cfg_write8(ep_dev->reg, device_status, status);
+}
+
+static u8 eea_pci_io_get_status(struct eea_device *edev)
+{
+	struct eea_pci_device *ep_dev = edev->ep_dev;
+
+	return cfg_read8(ep_dev->reg, device_status);
+}
+
+static void eea_add_status(struct eea_device *dev, u32 status)
+{
+	eea_pci_io_set_status(dev, eea_pci_io_get_status(dev) | status);
+}
+
+#define EEA_RESET_TIMEOUT_US (1000 * 1000 * 1000)
+
+int eea_device_reset(struct eea_device *edev)
+{
+	struct eea_pci_device *ep_dev = edev->ep_dev;
+	int i, err;
+	u8 val;
+
+	eea_pci_io_set_status(edev, 0);
+
+	err = read_poll_timeout(cfg_read8, val, !val, 20, EEA_RESET_TIMEOUT_US,
+				false, ep_dev->reg, device_status);
+	if (err)
+		return -EBUSY;
+
+	for (i = 0; i < ep_dev->msix_vec_n; ++i)
+		synchronize_irq(pci_irq_vector(ep_dev->pci_dev, i));
+
+	return 0;
+}
+
+void eea_device_ready(struct eea_device *dev)
+{
+	u8 status = eea_pci_io_get_status(dev);
+
+	WARN_ON(status & EEA_S_OK);
+
+	eea_pci_io_set_status(dev, status | EEA_S_OK);
+}
+
+static int eea_negotiate(struct eea_device *edev)
+{
+	struct eea_pci_device *ep_dev;
+	u32 status;
+
+	ep_dev = edev->ep_dev;
+
+	edev->features = 0;
+
+	cfg_write32(ep_dev->reg, drv_f_idx, 0);
+	cfg_write32(ep_dev->reg, drv_f, (u32)edev->features);
+	cfg_write32(ep_dev->reg, drv_f_idx, 1);
+	cfg_write32(ep_dev->reg, drv_f, edev->features >> 32);
+
+	eea_add_status(edev, EEA_S_FEATURE_DONE);
+	status = eea_pci_io_get_status(edev);
+	if (!(status & EEA_S_FEATURE_DONE))
+		return -ENODEV;
+
+	return 0;
+}
+
+static void eea_pci_release_resource(struct eea_pci_device *ep_dev)
+{
+	struct pci_dev *pci_dev = ep_dev->pci_dev;
+
+	if (ep_dev->reg) {
+		pci_iounmap(pci_dev, ep_dev->reg);
+		ep_dev->reg = NULL;
+	}
+
+	if (ep_dev->msix_vec_n) {
+		ep_dev->msix_vec_n = 0;
+		pci_free_irq_vectors(ep_dev->pci_dev);
+	}
+
+	pci_release_regions(pci_dev);
+	pci_disable_device(pci_dev);
+}
+
+static int eea_pci_setup(struct pci_dev *pci_dev, struct eea_pci_device *ep_dev)
+{
+	int err, n, ret;
+
+	ep_dev->pci_dev = pci_dev;
+
+	err = pci_enable_device(pci_dev);
+	if (err)
+		return err;
+
+	err = pci_request_regions(pci_dev, "EEA");
+	if (err)
+		goto err_disable_dev;
+
+	pci_set_master(pci_dev);
+
+	err = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pci_dev->dev, "Failed to enable 64-bit DMA.\n");
+		goto err_release_regions;
+	}
+
+	ep_dev->reg = pci_iomap(pci_dev, 0, 0);
+	if (!ep_dev->reg) {
+		dev_err(&pci_dev->dev, "Failed to map pci bar!\n");
+		err = -ENOMEM;
+		goto err_release_regions;
+	}
+
+	ep_dev->edev.rx_num = cfg_read32(ep_dev->reg, rx_num_max);
+	ep_dev->edev.tx_num = cfg_read32(ep_dev->reg, tx_num_max);
+
+	/* 2: adminq, error handle*/
+	n = ep_dev->edev.rx_num + ep_dev->edev.tx_num + 2;
+	ret = pci_alloc_irq_vectors(ep_dev->pci_dev, n, n, PCI_IRQ_MSIX);
+	if (ret != n) {
+		err = ret;
+		goto err_unmap_reg;
+	}
+
+	ep_dev->msix_vec_n = ret;
+
+	ep_dev->db_base = ep_dev->reg + EEA_PCI_DB_OFFSET;
+	ep_dev->edev.db_blk_size = cfg_read32(ep_dev->reg, db_blk_size);
+
+	return 0;
+
+err_unmap_reg:
+	pci_iounmap(pci_dev, ep_dev->reg);
+	ep_dev->reg = NULL;
+
+err_release_regions:
+	pci_release_regions(pci_dev);
+
+err_disable_dev:
+	pci_disable_device(pci_dev);
+
+	return err;
+}
+
+void __iomem *eea_pci_db_addr(struct eea_device *edev, u32 off)
+{
+	return edev->ep_dev->db_base + off;
+}
+
+u64 eea_pci_device_ts(struct eea_device *edev)
+{
+	struct eea_pci_device *ep_dev = edev->ep_dev;
+
+	return cfg_readq(ep_dev->reg, hw_ts);
+}
+
+static int eea_init_device(struct eea_device *edev)
+{
+	int err;
+
+	err = eea_device_reset(edev);
+	if (err)
+		return err;
+
+	eea_pci_io_set_status(edev, BIT(0) | BIT(1));
+
+	err = eea_negotiate(edev);
+	if (err)
+		goto err;
+
+	/* do net device probe ... */
+
+	return 0;
+err:
+	eea_add_status(edev, EEA_S_FAILED);
+	return err;
+}
+
+static int __eea_pci_probe(struct pci_dev *pci_dev,
+			   struct eea_pci_device *ep_dev)
+{
+	int err;
+
+	pci_set_drvdata(pci_dev, ep_dev);
+
+	err = eea_pci_setup(pci_dev, ep_dev);
+	if (err)
+		return err;
+
+	err = eea_init_device(&ep_dev->edev);
+	if (err)
+		goto err_pci_rel;
+
+	return 0;
+
+err_pci_rel:
+	eea_pci_release_resource(ep_dev);
+	return err;
+}
+
+static void __eea_pci_remove(struct pci_dev *pci_dev)
+{
+	struct eea_pci_device *ep_dev = pci_get_drvdata(pci_dev);
+	struct device *dev = get_device(&ep_dev->pci_dev->dev);
+
+	pci_disable_sriov(pci_dev);
+
+	eea_pci_release_resource(ep_dev);
+
+	put_device(dev);
+}
+
+static int eea_pci_probe(struct pci_dev *pci_dev,
+			 const struct pci_device_id *id)
+{
+	struct eea_pci_device *ep_dev;
+	struct eea_device *edev;
+	int err;
+
+	ep_dev = kzalloc(sizeof(*ep_dev), GFP_KERNEL);
+	if (!ep_dev)
+		return -ENOMEM;
+
+	edev = &ep_dev->edev;
+
+	edev->ep_dev = ep_dev;
+	edev->dma_dev = &pci_dev->dev;
+
+	ep_dev->pci_dev = pci_dev;
+
+	err = __eea_pci_probe(pci_dev, ep_dev);
+	if (err) {
+		pci_set_drvdata(pci_dev, NULL);
+		kfree(ep_dev);
+	}
+
+	return err;
+}
+
+static void eea_pci_remove(struct pci_dev *pci_dev)
+{
+	struct eea_pci_device *ep_dev = pci_get_drvdata(pci_dev);
+
+	__eea_pci_remove(pci_dev);
+
+	pci_set_drvdata(pci_dev, NULL);
+	kfree(ep_dev);
+}
+
+static int eea_pci_sriov_configure(struct pci_dev *pci_dev, int num_vfs)
+{
+	struct eea_pci_device *ep_dev = pci_get_drvdata(pci_dev);
+	struct eea_device *edev = &ep_dev->edev;
+	int ret;
+
+	if (!(eea_pci_io_get_status(edev) & EEA_S_OK))
+		return -EBUSY;
+
+	if (pci_vfs_assigned(pci_dev))
+		return -EPERM;
+
+	if (num_vfs == 0) {
+		pci_disable_sriov(pci_dev);
+		return 0;
+	}
+
+	ret = pci_enable_sriov(pci_dev, num_vfs);
+	if (ret < 0)
+		return ret;
+
+	return num_vfs;
+}
+
+static const struct pci_device_id eea_pci_id_table[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_ALIBABA, 0x500B) },
+	{ 0 }
+};
+
+MODULE_DEVICE_TABLE(pci, eea_pci_id_table);
+
+static struct pci_driver eea_pci_driver = {
+	.name            = "eea",
+	.id_table        = eea_pci_id_table,
+	.probe           = eea_pci_probe,
+	.remove          = eea_pci_remove,
+	.sriov_configure = eea_pci_sriov_configure,
+};
+
+static __init int eea_pci_init(void)
+{
+	return pci_register_driver(&eea_pci_driver);
+}
+
+static __exit void eea_pci_exit(void)
+{
+	pci_unregister_driver(&eea_pci_driver);
+}
+
+module_init(eea_pci_init);
+module_exit(eea_pci_exit);
+
+MODULE_DESCRIPTION("Driver for Alibaba Elastic Ethernet Adapter");
+MODULE_AUTHOR("Xuan Zhuo <xuanzhuo@linux.alibaba.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/ethernet/alibaba/eea/eea_pci.h b/drivers/net/ethernet/alibaba/eea/eea_pci.h
new file mode 100644
index 000000000000..126704a207d5
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_pci.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#ifndef __EEA_PCI_H__
+#define __EEA_PCI_H__
+
+#include <linux/pci.h>
+
+struct eea_pci_cap {
+	__u8 cap_vndr;
+	__u8 cap_next;
+	__u8 cap_len;
+	__u8 cfg_type;
+};
+
+struct eea_pci_reset_reg {
+	struct eea_pci_cap cap;
+	__le16 driver;
+	__le16 device;
+};
+
+struct eea_pci_device;
+
+struct eea_device {
+	struct eea_pci_device *ep_dev;
+	struct device         *dma_dev;
+	struct eea_net        *enet;
+
+	u64 features;
+
+	u32 rx_num;
+	u32 tx_num;
+	u32 db_blk_size;
+};
+
+const char *eea_pci_name(struct eea_device *edev);
+int eea_pci_domain_nr(struct eea_device *edev);
+u16 eea_pci_dev_id(struct eea_device *edev);
+
+int eea_device_reset(struct eea_device *dev);
+void eea_device_ready(struct eea_device *dev);
+
+u64 eea_pci_device_ts(struct eea_device *edev);
+
+void __iomem *eea_pci_db_addr(struct eea_device *edev, u32 off);
+#endif
-- 
2.32.0.3.g01195cf9f


^ permalink raw reply related

* [PATCH net-next v26 0/8] eea: Add basic driver framework for Alibaba Elastic Ethernet Adaptor
From: Xuan Zhuo @ 2026-02-08  8:46 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Xuan Zhuo, Wen Gu, Philo Lu, Lorenzo Bianconi,
	Vadim Fedorenko, Dong Yibo, Heiner Kallweit, Lukas Bulwahn,
	Dust Li

Add a driver framework for EEA that will be available in the future.

This driver is currently quite minimal, implementing only fundamental
core functionalities. Key features include: I/O queue management via
adminq, basic PCI-layer operations, and essential RX/TX data
communication capabilities. It also supports the creation,
initialization, and management of network devices (netdev). Furthermore,
the ring structures for both I/O queues and adminq have been abstracted
into a simple, unified, and reusable library implementation,
facilitating future extension and maintenance.

v26:
    Adopting suggestions from the previous AI review, another significant
    change is the introduction of an IRQ block to implement IRQ proxying. With
    this design, when an IRQ occurs, we no longer invoke the RX data structure
    directly -- instead, the IRQ block serves as an intermediary proxy. This
    approach offers several advantages: IRQ resources no longer need to be
    reallocated during reset operations, and IRQs are decoupled from RX
    structures. Consequently, when certain errors occur, we can fall back and
    safely reuse the original memory resources.

v25:
    I have adopted most of the suggestions from the AI's feedback. However, I
    believe some of the feedback is incorrect. I have already replied in the
    previous thread. http://lore.kernel.org/all/1770002612.3297296-2-xuanzhuo@linux.alibaba.com

v24:
    1. Add null checks for enet->rx and enet->tx in eea_get_ethtool_stat to
       prevent errors when reading rx = enet->rx[i] in case enet->rx is null.
       tx is similar. With rtnl protection in place, this check is sufficient.
    2. Use 'received' as the return value in eea_poll.

v23:
    I have moved netif_set_real_num_queues() out of eea_start_rxtx(), so
    eea_start_rxtx() is now a void function. I believe enet_bind_new_q_and_cfg()
    is a more suitable place to include netif_set_real_num_queues(). In
    eea_active_ring_and_irq(), I first execute request_irq() before interacting
    with the hardware to create queues. Therefore, during the NIC setup process,
    all driver-internal operations (memory allocation, IRQ initialization, sysfs
    configuration, etc.) will be completed before the final notification to the
    hardware.

v22:
    1. Use the budget from the NAPI poll function as the parameter for
       napi_consume_skb.
    2. Stop the TX queue when the remaining ring slots cannot hold an SKB.

v21:
    Fix two issues from the previous version:
    1, a DMA unmap operation was missing.
    2, RCU APIs were not used in eea_stats. Although the standard practice when
        using RCU would require adding the __rcu annotation to both the rx and
        tx fields, in many cases these fields are read without needing RCU
        protection.  Therefore, I do not want to add the __rcu annotation.
        Instead, I use a spin lock to protect modifications to rx and tx.

v20:
    Fix the partially initialized structure passed to db. @Jakub
    http://lore.kernel.org/all/20260113172353.2ae6ef81@kernel.org

v19:
    fix the comments from @Simon Horman

v18:
    v17 with [PATCH] prefix.

v17:
    1. In `eea_adminq_dev_status`, uniformly use `enet->cfg.rx_ring_num`.
    2. Add a `struct eea_net_cfg *cfg` parameter to `eea_free_rx` and
        `eea_free_tx`. When called in the normal path, pass `enet->cfg` as
        the argument; when called during initialization, pass the temporary
        `cfg` instead.
    3. Move the `.ndo_get_stats64` callback into `eea_net.c`.
    4. In the `.ndo_get_stats64` callback, add a comment explaining how the TX
        and RX statistics are protected by RCU.

       /* This function is protected by RCU. Here uses enet->tx and enet->rx
        * to check whether the TX and RX structures are safe to access. In
        * eea_free_rxtx_q_mem, before freeing the TX and RX resources, enet->rx
        * and enet->tx are set to NULL, and synchronize_net is called.
        */


v16:
    1. follow the advices from @ALOK TIWARI
       http://lore.kernel.org/all/5ff95a71-69e5-4cb6-9b2a-5224c983bdc2@oracle.com

v15:
    1. remove 'default m' from eea kconfig
    2. free the resources when open failed.

v14:
    1. some tiny fixes

v13:
    1. fix some tiny fixes @Simon

v12:
    I encountered some issues with sending the v11 patches, as they were quite
    messy. Therefore, I'm resending them as v12.

v11:
    1. remove auto clean __free(kfree)
    2. some tiny fixes

v10:
    1. name the jump labels after the target @Jakub
    2. rm __GFP_ZERO from dma_alloc_coherent @Jakub
v9:
    1. some fixes for ethtool from http://lore.kernel.org/all/20251027183754.52fe2a2c@kernel.org

v8: 1. rename eea_net_tmp to eea_net_init_ctx
    2. rm code that allocs memory to destroy queues
    3. some other minor changes

v7: 1. remove the irrelevant code from the ethtool commit
    2. build every commits with W12

v6: Split the big one commit to five commits
v5: Thanks for the comments from Kalesh Anakkur Purayil, ALOK TIWARI
v4: Thanks for the comments from Troy Mitchell, Przemek Kitszel, Andrew Lunn, Kalesh Anakkur Purayil
v3: Thanks for the comments from Paolo Abeni
v2: Thanks for the comments from Simon Horman and Andrew Lunn
v1: Thanks for the comments from Simon Horman and Andrew Lunn





















Xuan Zhuo (8):
  eea: introduce PCI framework
  eea: introduce ring and descriptor structures
  eea: probe the netdevice and create adminq
  eea: create/destroy rx,tx queues for netdevice open and stop
  eea: implement packet receive logic
  eea: implement packet transmit logic
  eea: introduce ethtool support
  eea: introduce callback for ndo_get_stats64

 MAINTAINERS                                   |   8 +
 drivers/net/ethernet/Kconfig                  |   1 +
 drivers/net/ethernet/Makefile                 |   1 +
 drivers/net/ethernet/alibaba/Kconfig          |  28 +
 drivers/net/ethernet/alibaba/Makefile         |   5 +
 drivers/net/ethernet/alibaba/eea/Makefile     |   9 +
 drivers/net/ethernet/alibaba/eea/eea_adminq.c | 442 ++++++++++
 drivers/net/ethernet/alibaba/eea/eea_adminq.h |  75 ++
 drivers/net/ethernet/alibaba/eea/eea_desc.h   | 134 +++
 .../net/ethernet/alibaba/eea/eea_ethtool.c    | 243 ++++++
 .../net/ethernet/alibaba/eea/eea_ethtool.h    |  49 ++
 drivers/net/ethernet/alibaba/eea/eea_net.c    | 785 ++++++++++++++++++
 drivers/net/ethernet/alibaba/eea/eea_net.h    | 195 +++++
 drivers/net/ethernet/alibaba/eea/eea_pci.c    | 590 +++++++++++++
 drivers/net/ethernet/alibaba/eea/eea_pci.h    |  66 ++
 drivers/net/ethernet/alibaba/eea/eea_ring.c   | 265 ++++++
 drivers/net/ethernet/alibaba/eea/eea_ring.h   |  89 ++
 drivers/net/ethernet/alibaba/eea/eea_rx.c     | 730 ++++++++++++++++
 drivers/net/ethernet/alibaba/eea/eea_tx.c     | 402 +++++++++
 19 files changed, 4117 insertions(+)
 create mode 100644 drivers/net/ethernet/alibaba/Kconfig
 create mode 100644 drivers/net/ethernet/alibaba/Makefile
 create mode 100644 drivers/net/ethernet/alibaba/eea/Makefile
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_adminq.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_adminq.h
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_desc.h
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_ethtool.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_ethtool.h
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_net.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_net.h
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_pci.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_pci.h
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_ring.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_ring.h
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_rx.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_tx.c

--
2.32.0.3.g01195cf9f


^ permalink raw reply

* [PATCH net] net: macb: Fix tx/rx malfunction after phy link down and up
From: Kevin Hao @ 2026-02-08  8:45 UTC (permalink / raw)
  To: netdev
  Cc: Kevin Hao, stable, Nicolas Ferre, Claudiu Beznea, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Xiaolei Wang

In commit 99537d5c476c ("net: macb: Relocate mog_init_rings() callback
from macb_mac_link_up() to macb_open()"), the mog_init_rings() callback
was moved from macb_mac_link_up() to macb_open() to resolve a deadlock
issue. However, this change introduced a tx/rx malfunction following
phy link down and up events. The issue arises from a mismatch between
the software queue->tx_head, queue->tx_tail, queue->rx_prepared_head,
and queue->rx_tail values and the hardware's internal tx/rx queue
pointers.

According to the Zynq UltraScale TRM [1], when tx/rx is disabled, the
internal tx queue pointer resets to the value in the tx queue base
address register, while the internal rx queue pointer remains unchanged.
The following is quoted from the Zynq UltraScale TRM:
  When transmit is disabled, with bit [3] of the network control register
  set low, the transmit-buffer queue pointer resets to point to the address
  indicated by the transmit-buffer queue base address register. Disabling
  receive does not have the same effect on the receive-buffer queue
  pointer.

Additionally, there is no need to reset the RBQP and TBQP registers in a
phy event callback. Therefore, move macb_init_buffers() to macb_open().
In a phy link up event, the only required action is to reset the tx
software head and tail pointers to align with the hardware's behavior.

[1] https://docs.amd.com/v/u/en-US/ug1085-zynq-ultrascale-trm

Fixes: 99537d5c476c ("net: macb: Relocate mog_init_rings() callback from macb_mac_link_up() to macb_open()")
Signed-off-by: Kevin Hao <haokexin@gmail.com>
Cc: stable@vger.kernel.org
---
Cc: Nicolas Ferre <nicolas.ferre@microchip.com>
Cc: Claudiu Beznea <claudiu.beznea@tuxon.dev>
Cc: Andrew Lunn <andrew+netdev@lunn.ch>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Xiaolei Wang <xiaolei.wang@windriver.com>
---
 drivers/net/ethernet/cadence/macb_main.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
index effef67d80731e5cc795fcef5adc280ad931eda9..43cd013bb70e6bd08a31a0826364e4f34c0e0b89 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -705,14 +705,12 @@ static void macb_mac_link_up(struct phylink_config *config,
 		if (rx_pause)
 			ctrl |= MACB_BIT(PAE);
 
-		/* Initialize rings & buffers as clearing MACB_BIT(TE) in link down
-		 * cleared the pipeline and control registers.
-		 */
-		macb_init_buffers(bp);
-
-		for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue)
+		for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
+			queue->tx_head = 0;
+			queue->tx_tail = 0;
 			queue_writel(queue, IER,
 				     bp->rx_intr_mask | MACB_TX_INT_FLAGS | MACB_BIT(HRESP));
+		}
 	}
 
 	macb_or_gem_writel(bp, NCFGR, ctrl);
@@ -2954,6 +2952,7 @@ static int macb_open(struct net_device *dev)
 	}
 
 	bp->macbgem_ops.mog_init_rings(bp);
+	macb_init_buffers(bp);
 
 	for (q = 0, queue = bp->queues; q < bp->num_queues; ++q, ++queue) {
 		napi_enable(&queue->napi_rx);

---
base-commit: 9845cf73f7db6094c0d8419d6adb848028f4a921
change-id: 20260207-macb-init-ring-b0e37b3a3755

Best regards,
-- 
Kevin Hao <haokexin@gmail.com>


^ permalink raw reply related

* [PATCH net-next v2 4/4] net/rds: rds_sendmsg should not discard payload_len
From: Allison Henderson @ 2026-02-08  5:37 UTC (permalink / raw)
  To: netdev
  Cc: linux-kselftest, pabeni, edumazet, rds-devel, kuba, horms,
	linux-rdma, allison.henderson
In-Reply-To: <20260208053716.1617809-1-achender@kernel.org>

From: Allison Henderson <allison.henderson@oracle.com>

Commit 3db6e0d172c9 ("rds: use RCU to synchronize work-enqueue with
connection teardown") modifies rds_sendmsg to avoid enqueueing work
while a tear down is in progress. However, it also changed the return
value of rds_sendmsg to that of rds_send_xmit instead of the
payload_len. This means the user may incorrectly receive errno values
when it should have simply received a payload of 0 while the peer
attempts a reconnection.  So this patch corrects the teardown handling
code to only use the out error path in that case, thus restoring the
original payload_len return value.

Signed-off-by: Allison Henderson <achender@kernel.org>
---
 net/rds/send.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/rds/send.c b/net/rds/send.c
index 6e96f108473e..a1039e422a38 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1431,9 +1431,11 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
 		else
 			queue_delayed_work(cpath->cp_wq, &cpath->cp_send_w, 1);
 		rcu_read_unlock();
+
+		if (ret)
+			goto out;
 	}
-	if (ret)
-		goto out;
+
 	rds_message_put(rm);
 
 	for (ind = 0; ind < vct.indx; ind++)
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 3/4] net/rds: Use proper peer port number even when not connected
From: Allison Henderson @ 2026-02-08  5:37 UTC (permalink / raw)
  To: netdev
  Cc: linux-kselftest, pabeni, edumazet, rds-devel, kuba, horms,
	linux-rdma, allison.henderson
In-Reply-To: <20260208053716.1617809-1-achender@kernel.org>

From: Greg Jumper <greg.jumper@oracle.com>

The function rds_tcp_get_peer_sport() should return the peer port of a
socket, even when the socket is not currently connected, so that RDS
can reliably determine the MPRDS "lane" corresponding to the port.

rds_tcp_get_peer_sport() calls kernel_getpeername() to get the port
number; however, when paths between endpoints frequently drop and
reconnect, kernel_getpeername() can return -ENOTCONN, causing
rds_tcp_get_peer_sport() to return an error, and ultimately causing
RDS to use the wrong lane for a port when reconnecting to a peer.

This patch modifies rds_tcp_get_peer_sport() to directly call the
socket-specific get-name function (inet_getname() in this case) that
kernel_getpeername() also calls.  The socket-specific function offers
an additional argument which, when set to a value greater than 1,
causes the function to return the socket's peer name even when the
socket is not connected, which in turn allows rds_tcp_get_peer_sport()
to return the correct port number.

Signed-off-by: Greg Jumper <greg.jumper@oracle.com>
Signed-off-by: Allison Henderson <achender@kernel.org>
---
 net/rds/tcp_listen.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 8fb8f7d26683..db4938fd1672 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -67,7 +67,14 @@ rds_tcp_get_peer_sport(struct socket *sock)
 	} saddr;
 	int sport;
 
-	if (kernel_getpeername(sock, &saddr.addr) >= 0) {
+	/* Call the socket's getname() function (inet_getname() in this case)
+	 * with a final argument greater than 1 to get the peer's port
+	 * regardless of whether the socket is currently connected.
+	 * Using peer=2 will get the peer port even during reconnection states
+	 * (TCPF_CLOSE, TCPF_SYN_SENT). This avoids -ENOTCONN while
+	 * inet_dport still contains the correct peer port.
+	 */
+	if (sock->ops->getname(sock, &saddr.addr, 2) >= 0) {
 		switch (saddr.addr.sa_family) {
 		case AF_INET:
 			sport = ntohs(saddr.sin.sin_port);
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 2/4] net/rds: Delegate fan-out to a background worker
From: Allison Henderson @ 2026-02-08  5:37 UTC (permalink / raw)
  To: netdev
  Cc: linux-kselftest, pabeni, edumazet, rds-devel, kuba, horms,
	linux-rdma, allison.henderson
In-Reply-To: <20260208053716.1617809-1-achender@kernel.org>

From: Gerd Rausch <gerd.rausch@oracle.com>

Delegate fan-out to a background worker in order to allow
kernel_getpeername() to acquire a lock on the socket.

This has become necessary since the introduction of
commit 9dfc685e0262d ("inet: remove races in inet{6}_getname()").

The socket is already locked in the context in which
"kernel_getpeername" used to get called, by either
"rds_tcp_recv_path" or "tcp_v{4,6}_rcv",
therefore causing a deadlock.

Luckily, the fan-out need not happen in-context nor fast,
so we can easily just do the same in a background worker.

Also, while we're doing this, we get rid of the unused
struct members "t_conn_w", "t_send_w", "t_down_w" & "t_recv_w".

Reported-by: syzbot+ci858e84e8400d24b3@syzkaller.appspotmail.com
Link: https://ci.syzbot.org/series/1a5ef180-c02c-401d-9df7-670b18570a55
Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com>
Signed-off-by: Allison Henderson <achender@kernel.org>
---
 net/rds/tcp.c         |  3 +++
 net/rds/tcp.h         |  7 ++----
 net/rds/tcp_connect.c |  2 ++
 net/rds/tcp_listen.c  | 54 +++++++++++++++++++++++++++++++------------
 4 files changed, 46 insertions(+), 20 deletions(-)

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 45484a93d75f..02f8f928c20b 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -358,6 +358,8 @@ static void rds_tcp_conn_free(void *arg)
 
 	rdsdebug("freeing tc %p\n", tc);
 
+	cancel_work_sync(&tc->t_fan_out_w);
+
 	spin_lock_irqsave(&rds_tcp_conn_lock, flags);
 	if (!tc->t_tcp_node_detached)
 		list_del(&tc->t_tcp_node);
@@ -384,6 +386,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 		tc->t_tinc = NULL;
 		tc->t_tinc_hdr_rem = sizeof(struct rds_header);
 		tc->t_tinc_data_rem = 0;
+		INIT_WORK(&tc->t_fan_out_w, rds_tcp_fan_out_w);
 		init_waitqueue_head(&tc->t_recv_done_waitq);
 
 		conn->c_path[i].cp_transport_data = tc;
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 39c86347188c..9ecb0b6b658a 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -44,11 +44,7 @@ struct rds_tcp_connection {
 	size_t			t_tinc_hdr_rem;
 	size_t			t_tinc_data_rem;
 
-	/* XXX error report? */
-	struct work_struct	t_conn_w;
-	struct work_struct	t_send_w;
-	struct work_struct	t_down_w;
-	struct work_struct	t_recv_w;
+	struct work_struct	t_fan_out_w;
 
 	/* for info exporting only */
 	struct list_head	t_list_item;
@@ -90,6 +86,7 @@ void rds_tcp_state_change(struct sock *sk);
 struct socket *rds_tcp_listen_init(struct net *net, bool isv6);
 void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
 void rds_tcp_listen_data_ready(struct sock *sk);
+void rds_tcp_fan_out_w(struct work_struct *work);
 void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out);
 int rds_tcp_accept_one(struct rds_tcp_net *rtn);
 void rds_tcp_keepalive(struct socket *sock);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index b77c88ffb199..6954b8c479f1 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -115,6 +115,8 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
 	if (cp->cp_index > 0 && cp->cp_conn->c_npaths < 2)
 		return -EAGAIN;
 
+	cancel_work_sync(&tc->t_fan_out_w);
+
 	mutex_lock(&tc->t_conn_path_lock);
 
 	if (rds_conn_path_up(cp)) {
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 6fb5c928b8fd..8fb8f7d26683 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -123,27 +123,20 @@ rds_tcp_accept_one_path(struct rds_connection *conn, struct socket *sock)
 	return NULL;
 }
 
-void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out)
+void rds_tcp_fan_out_w(struct work_struct *work)
 {
-	struct rds_tcp_connection *tc;
-	struct rds_tcp_net *rtn;
-	struct socket *sock;
+	struct rds_tcp_connection *tc = container_of(work,
+						     struct rds_tcp_connection,
+						     t_fan_out_w);
+	struct rds_connection *conn = tc->t_cpath->cp_conn;
+	struct rds_tcp_net *rtn = tc->t_rtn;
+	struct socket *sock = tc->t_sock;
 	int sport, npaths;
 
-	if (rds_destroy_pending(conn))
-		return;
-
-	tc = conn->c_path->cp_transport_data;
-	rtn = tc->t_rtn;
-	if (!rtn)
-		return;
-
-	sock = tc->t_sock;
-
 	/* During fan-out, check that the connection we already
 	 * accepted in slot#0 carried the proper source port modulo.
 	 */
-	if (fan_out && conn->c_with_sport_idx && sock &&
+	if (conn->c_with_sport_idx && sock &&
 	    rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) > 0) {
 		/* cp->cp_index is encoded in lowest bits of source-port */
 		sport = rds_tcp_get_peer_sport(sock);
@@ -167,6 +160,37 @@ void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out)
 	rds_tcp_accept_work(rtn);
 }
 
+void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out)
+{
+	struct rds_conn_path *cp0;
+	struct rds_tcp_connection *tc;
+	struct rds_tcp_net *rtn;
+
+	if (rds_destroy_pending(conn))
+		return;
+
+	cp0 = conn->c_path;
+	tc = cp0->cp_transport_data;
+	rtn = tc->t_rtn;
+	if (!rtn)
+		return;
+
+	if (fan_out)
+		/* Delegate fan-out to a background worker in order
+		 * to allow "kernel_getpeername" to acquire a lock
+		 * on the socket.
+		 * The socket is already locked in this context
+		 * by either "rds_tcp_recv_path" or "tcp_v{4,6}_rcv",
+		 * depending on the origin of the dequeue-request.
+		 */
+		queue_work(cp0->cp_wq, &tc->t_fan_out_w);
+	else
+		/* Fan-out either already happened or is unnecessary.
+		 * Just go ahead and attempt to accept more connections
+		 */
+		rds_tcp_accept_work(rtn);
+}
+
 int rds_tcp_accept_one(struct rds_tcp_net *rtn)
 {
 	struct socket *listen_sock = rtn->rds_tcp_listen_sock;
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 1/4] net/rds: Refactor __rds_conn_create for blocking transport cleanup
From: Allison Henderson @ 2026-02-08  5:37 UTC (permalink / raw)
  To: netdev
  Cc: linux-kselftest, pabeni, edumazet, rds-devel, kuba, horms,
	linux-rdma, allison.henderson
In-Reply-To: <20260208053716.1617809-1-achender@kernel.org>

The next patch will delegate fanout operations to a background worker,
which requires cancel_work_sync() during connection cleanup.  However,
the error path of __rds_conn_create() currently calls
trans->conn_free() while holding rds_conn_lock (spinlock) and
rcu_read_lock, which creates an atomic context where cancel_work_sync()
cannot sleep.

To avoid this, refactor the error/race paths to defer
trans->conn_free() calls until after locks are released. This allows
transport cleanup functions to perform blocking operations safely.

This patch moves the cp_transport_data cleanup to the 'out:' label
where it runs outside the critical section, after the connection has
been freed from the slab and cannot be accessed by racing threads.

Signed-off-by: Allison Henderson <achender@kernel.org>
---
 net/rds/connection.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 185f73b01694..695ab7446178 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -170,6 +170,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 	struct hlist_head *head = rds_conn_bucket(laddr, faddr);
 	struct rds_transport *loop_trans;
 	struct rds_conn_path *free_cp = NULL;
+	struct rds_transport *free_trans = NULL;
 	unsigned long flags;
 	int ret, i;
 	int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
@@ -305,7 +306,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 	if (parent) {
 		/* Creating passive conn */
 		if (parent->c_passive) {
-			trans->conn_free(conn->c_path[0].cp_transport_data);
+			free_trans = trans;
 			free_cp = conn->c_path;
 			kmem_cache_free(rds_conn_slab, conn);
 			conn = parent->c_passive;
@@ -321,18 +322,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 		found = rds_conn_lookup(net, head, laddr, faddr, trans,
 					tos, dev_if);
 		if (found) {
-			struct rds_conn_path *cp;
-			int i;
-
-			for (i = 0; i < npaths; i++) {
-				cp = &conn->c_path[i];
-				/* The ->conn_alloc invocation may have
-				 * allocated resource for all paths, so all
-				 * of them may have to be freed here.
-				 */
-				if (cp->cp_transport_data)
-					trans->conn_free(cp->cp_transport_data);
-			}
+			free_trans = trans;
 			free_cp = conn->c_path;
 			kmem_cache_free(rds_conn_slab, conn);
 			conn = found;
@@ -349,9 +339,23 @@ static struct rds_connection *__rds_conn_create(struct net *net,
 
 out:
 	if (free_cp) {
-		for (i = 0; i < npaths; i++)
+		for (i = 0; i < npaths; i++) {
+			/*
+			 * The trans->conn_alloc call may have allocated
+			 * resources for the cp paths, which will need to
+			 * be freed before freeing cp itself.  We do this here
+			 * so it runs outside the rds_conn_lock spinlock
+			 * and rcu_read_lock section, because conn_free()
+			 * may call cancel_work_sync() which
+			 * can sleep.  free_trans is only set in the
+			 * race-loss paths where conn_alloc() succeeded.
+			 */
+			if (free_trans && free_cp[i].cp_transport_data)
+				free_trans->conn_free
+					(free_cp[i].cp_transport_data);
 			if (free_cp[i].cp_wq != rds_wq)
 				destroy_workqueue(free_cp[i].cp_wq);
+		}
 		kfree(free_cp);
 	}
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v2 0/4] net/rds: RDS-TCP reconnect and fanout improvements
From: Allison Henderson @ 2026-02-08  5:37 UTC (permalink / raw)
  To: netdev
  Cc: linux-kselftest, pabeni, edumazet, rds-devel, kuba, horms,
	linux-rdma, allison.henderson

Hi all,

This is subset 4 of the larger RDS-TCP patch series I posted last
Oct.  The greater series aims to correct multiple rds-tcp issues that
can cause dropped or out-of-sequence messages.  I've broken it down into
smaller sets to make reviews more manageable.

In this set, we address some reconnect issues occurring during connection
teardowns, and also move connection fanout operations to a background
worker.

The entire set can be viewed in the rfc here:
https://lore.kernel.org/netdev/20251022191715.157755-1-achender@kernel.org/

Questions, comments, flames appreciated!

Thanks,
Allison

Change Log
v2:
   [PATCH net-next v2 1/4] net/rds: Refactor __rds_conn_create for
   blocking transport cleanup
      - NEW

   [PATCH net-next v2 2/4] net/rds: Delegate fan-out to a background
    worker
      - Added syzbot report link

Allison Henderson (2):
  net/rds: Refactor __rds_conn_create for blocking transport cleanup
  net/rds: rds_sendmsg should not discard payload_len

Gerd Rausch (1):
  net/rds: Delegate fan-out to a background worker

Greg Jumper (1):
  net/rds: Use proper peer port number even when not connected

 net/rds/connection.c  | 32 ++++++++++++----------
 net/rds/send.c        |  6 +++--
 net/rds/tcp.c         |  3 +++
 net/rds/tcp.h         |  7 ++---
 net/rds/tcp_connect.c |  2 ++
 net/rds/tcp_listen.c  | 63 ++++++++++++++++++++++++++++++++-----------
 6 files changed, 76 insertions(+), 37 deletions(-)

-- 
2.43.0


^ permalink raw reply

* Re: [PATCH net-next v2 03/12] dpll: Add helpers to find DPLL pin fwnode
From: Saravana Kannan @ 2026-02-08  1:46 UTC (permalink / raw)
  To: Ivan Vecera
  Cc: netdev, Alexander Lobakin, Andrew Lunn, Arkadiusz Kubalewski,
	Conor Dooley, David S. Miller, Eric Dumazet, Grzegorz Nitka,
	Jakub Kicinski, Jiri Pirko, Jonathan Lemon, Krzysztof Kozlowski,
	Leon Romanovsky, Mark Bloch, Michal Schmidt, Paolo Abeni,
	Petr Oros, Prathosh Satish, Przemek Kitszel, Richard Cochran,
	Rob Herring, Saeed Mahameed, Saravana Kannan, Tariq Toukan,
	Tony Nguyen, Vadim Fedorenko, devicetree, intel-wired-lan,
	linux-kernel, linux-rdma
In-Reply-To: <20260116184610.147591-4-ivecera@redhat.com>

On Fri, Jan 16, 2026 at 10:46 AM Ivan Vecera <ivecera@redhat.com> wrote:
>
> dpll: core: add helpers to find DPLL pin fwnode
>
> Add helper functions to the DPLL core to retrieve a DPLL pin's firmware
> node handle based on the 'dpll-pins' and 'dpll-pin-names' properties.
>
> Unlike simple phandle arrays, 'dpll-pins' entries typically contain
> a pin specifier (index and direction) as defined by '#dpll-pin-cells'.
> The new helper fwnode_dpll_pin_node_get() parses these specifiers
> using fwnode_property_get_reference_args(). It resolves the target
> pin by:
> 1. Identifying the DPLL device node from the phandle.
> 2. Selecting the correct sub-node ('input-pins' or 'output-pins') based
>    on the direction argument.
> 3. Matching the pin index argument against the 'reg' property of
>    the child nodes.
>
> Additionally, register 'dpll-pins' in drivers/of/property.c to enable
> proper parsing of the supplier bindings by the OF core.
>
> Signed-off-by: Ivan Vecera <ivecera@redhat.com>
> ---
> v2:
> * added check for fwnode_property_match_string() return value
> * reworked searching for the pin using dpll device phandle and
>   pin specifier
> * added dpll-pins into OF core supplier_bindings
> ---
>  drivers/dpll/dpll_core.c | 74 ++++++++++++++++++++++++++++++++++++++++
>  drivers/of/property.c    |  2 ++
>  include/linux/dpll.h     | 15 ++++++++
>  3 files changed, 91 insertions(+)
>
> diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c
> index fb68b5e19b480..b0083b5c10aa4 100644
> --- a/drivers/dpll/dpll_core.c
> +++ b/drivers/dpll/dpll_core.c
> @@ -13,6 +13,7 @@
>  #include <linux/property.h>
>  #include <linux/slab.h>
>  #include <linux/string.h>
> +#include <dt-bindings/dpll/dpll.h>
>
>  #include "dpll_core.h"
>  #include "dpll_netlink.h"
> @@ -654,6 +655,79 @@ struct dpll_pin *fwnode_dpll_pin_find(struct fwnode_handle *fwnode)
>  }
>  EXPORT_SYMBOL_GPL(fwnode_dpll_pin_find);
>
> +/**
> + * fwnode_dpll_pin_node_get - get dpll pin node from given fw node and pin name
> + * @fwnode: firmware node that uses the dpll pin
> + * @name: dpll pin name from dpll-pin-names property
> + *
> + * Return: ERR_PTR() on error or a valid firmware node handle on success.
> + */
> +struct fwnode_handle *fwnode_dpll_pin_node_get(struct fwnode_handle *fwnode,
> +                                              const char *name)
> +{
> +       struct fwnode_handle *parent_node, *pin_node;
> +       struct fwnode_reference_args args;
> +       const char *parent_name;
> +       int ret, index = 0;
> +
> +       if (name) {
> +               index = fwnode_property_match_string(fwnode, "dpll-pin-names",
> +                                                    name);
> +               if (index < 0)
> +                       return ERR_PTR(-ENOENT);
> +       }
> +
> +       ret = fwnode_property_get_reference_args(fwnode, "dpll-pins",
> +                                                "#dpll-pin-cells", 2, index,
> +                                                &args);
> +       if (ret)
> +               return ERR_PTR(ret);
> +
> +       /* We support only 2 cell DPLL bindings in the kernel currently. */
> +       if (args.nargs != 2) {
> +               fwnode_handle_put(args.fwnode);
> +               return ERR_PTR(-ENOENT);
> +       }
> +
> +       /* Resolve parent node name according pin direction type */
> +       switch (args.args[1]) {
> +       case DPLL_PIN_INPUT:
> +               parent_name = "input-pins";
> +               break;
> +       case DPLL_PIN_OUTPUT:
> +               parent_name = "output-pins";
> +               break;
> +       default:
> +               fwnode_handle_put(args.fwnode);
> +               return ERR_PTR(-EINVAL);
> +       }
> +
> +       /* Get pin's parent sub-node */
> +       parent_node = fwnode_get_named_child_node(args.fwnode, parent_name);
> +       if (!parent_node) {
> +               fwnode_handle_put(args.fwnode);
> +               return ERR_PTR(-ENOENT);
> +       }
> +
> +       /* Enumerate child pin nodes and find the requested one */
> +       fwnode_for_each_child_node(parent_node, pin_node) {
> +               u32 reg;
> +
> +               if (fwnode_property_read_u32(pin_node, "reg", &reg))
> +                       continue;
> +
> +               if (reg == args.args[0])
> +                       break;
> +       }
> +
> +       /* Release pin's parent and dpll device node */
> +       fwnode_handle_put(parent_node);
> +       fwnode_handle_put(args.fwnode);
> +
> +       return pin_node ? pin_node : ERR_PTR(-ENOENT);
> +}
> +EXPORT_SYMBOL_GPL(fwnode_dpll_pin_node_get);
> +
>  static int
>  __dpll_pin_register(struct dpll_device *dpll, struct dpll_pin *pin,
>                     const struct dpll_pin_ops *ops, void *priv, void *cookie)
> diff --git a/drivers/of/property.c b/drivers/of/property.c
> index 4e3524227720a..8571c8bb71ade 100644
> --- a/drivers/of/property.c
> +++ b/drivers/of/property.c
> @@ -1410,6 +1410,7 @@ DEFINE_SIMPLE_PROP(post_init_providers, "post-init-providers", NULL)
>  DEFINE_SIMPLE_PROP(access_controllers, "access-controllers", "#access-controller-cells")
>  DEFINE_SIMPLE_PROP(pses, "pses", "#pse-cells")
>  DEFINE_SIMPLE_PROP(power_supplies, "power-supplies", NULL)
> +DEFINE_SIMPLE_PROP(dpll_pins, "dpll-pins", "#dpll-pin-cells")
>  DEFINE_SUFFIX_PROP(regulators, "-supply", NULL)
>  DEFINE_SUFFIX_PROP(gpio, "-gpio", "#gpio-cells")
>
> @@ -1568,6 +1569,7 @@ static const struct supplier_bindings of_supplier_bindings[] = {
>                 .parse_prop = parse_post_init_providers,
>                 .fwlink_flags = FWLINK_FLAG_IGNORE,
>         },
> +       { .parse_prop = parse_dpll_pins, },

Keep the same order as the other table please.

-Saravana
>         {}
>  };
>
> diff --git a/include/linux/dpll.h b/include/linux/dpll.h
> index f0c31a111c304..755c36d1ef45a 100644
> --- a/include/linux/dpll.h
> +++ b/include/linux/dpll.h
> @@ -11,6 +11,7 @@
>  #include <linux/device.h>
>  #include <linux/netlink.h>
>  #include <linux/netdevice.h>
> +#include <linux/property.h>
>  #include <linux/rtnetlink.h>
>
>  struct dpll_device;
> @@ -176,6 +177,8 @@ int dpll_netdev_add_pin_handle(struct sk_buff *msg,
>                                const struct net_device *dev);
>
>  struct dpll_pin *fwnode_dpll_pin_find(struct fwnode_handle *fwnode);
> +struct fwnode_handle *fwnode_dpll_pin_node_get(struct fwnode_handle *fwnode,
> +                                              const char *name);
>  #else
>  static inline void
>  dpll_netdev_pin_set(struct net_device *dev, struct dpll_pin *dpll_pin) { }
> @@ -197,8 +200,20 @@ fwnode_dpll_pin_find(struct fwnode_handle *fwnode)
>  {
>         return NULL;
>  }
> +
> +static inline struct fwnode_handle *
> +fwnode_dpll_pin_node_get(struct fwnode_handle *fwnode, const char *name)
> +{
> +       return NULL;
> +}
>  #endif
>
> +static inline struct fwnode_handle *
> +device_dpll_pin_node_get(struct device *dev, const char *name)
> +{
> +       return fwnode_dpll_pin_node_get(dev_fwnode(dev), name);
> +}
> +
>  struct dpll_device *
>  dpll_device_get(u64 clock_id, u32 dev_driver_id, struct module *module);
>
> --
> 2.52.0
>

^ permalink raw reply

* Re: [PATCH net-next v2 1/9] eth: bnxt: gather and report HW-GRO stats
From: Michael Chan @ 2026-02-08  0:09 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: davem, netdev, edumazet, pabeni, andrew+netdev, horms, shuah,
	willemb, petrm, donald.hunter, pavan.chebbi, linux-kselftest
In-Reply-To: <20260207003509.3927744-2-kuba@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 1305 bytes --]

On Fri, Feb 6, 2026 at 4:35 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> Count and report HW-GRO stats as seen by the kernel.
> The device stats for GRO seem to not reflect the reality,
> perhaps they count sessions which did not actually result
> in any aggregation. Also they count wire packets, so we
> have to count super-frames, anyway.
>
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Reviewed-by: Michael Chan <michael.chan@broadcom.com>

I'm suggesting a minor naming change below if you need to do v3.

> @@ -13492,6 +13497,8 @@ static void bnxt_get_one_ring_err_stats(struct bnxt *bp,

With the new GRO counters, this function is no longer limited to error
stats.  So maybe rename it to something like
bnxt_get_one_ring_misc_stats()?

>         stats->rx_total_netpoll_discards += sw_stats->rx.rx_netpoll_discards;
>         stats->rx_total_ring_discards +=
>                 BNXT_GET_RING_STATS64(hw_stats, rx_discard_pkts);
> +       stats->rx_total_hw_gro_packets += sw_stats->rx.rx_hw_gro_packets;
> +       stats->rx_total_hw_gro_wire_packets += sw_stats->rx.rx_hw_gro_wire_packets;
>         stats->tx_total_resets += sw_stats->tx.tx_resets;
>         stats->tx_total_ring_discards +=
>                 BNXT_GET_RING_STATS64(hw_stats, tx_discard_pkts);

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 5469 bytes --]

^ permalink raw reply

* Re: [PATCH net-next v14 4/4] net: dsa: add basic initial driver for MxL862xx switches
From: Daniel Golle @ 2026-02-08  0:06 UTC (permalink / raw)
  To: Vladimir Oltean
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
	Heiner Kallweit, Russell King, Simon Horman, netdev, devicetree,
	linux-kernel, Frank Wunderlich, Chad Monroe, Cezary Wilmanski,
	Liang Xu, John Crispin
In-Reply-To: <20260207215902.mtsg43zeoadqqfz5@skbuf>

On Sat, Feb 07, 2026 at 11:59:02PM +0200, Vladimir Oltean wrote:
> On Sat, Feb 07, 2026 at 03:07:27AM +0000, Daniel Golle wrote:
> > +/* PHY access via firmware relay */
> > +static int mxl862xx_phy_read_mmd(struct mxl862xx_priv *priv, int port,
> > +				 int devadd, int reg)
> > +{
> > +	struct mdio_relay_data param = {
> > +		.phy = port,
> > +		.mmd = devadd,
> > +		.reg = cpu_to_le16(reg),
> > +	};
> > +	int ret;
> > +
> > +	ret = MXL862XX_API_READ(priv, INT_GPHY_READ, param);
> > +	if (ret)
> > +		return ret;
> > +
> > +	return le16_to_cpu(param.data);
> > +}
> > +
> > +static int mxl862xx_phy_write_mmd(struct mxl862xx_priv *priv, int port,
> > +				  int devadd, int reg, u16 data)
> > +{
> > +	struct mdio_relay_data param = {
> > +		.phy = port,
> > +		.mmd = devadd,
> > +		.reg = cpu_to_le16(reg),
> > +		.data = cpu_to_le16(data),
> > +	};
> > +
> > +	return MXL862XX_API_WRITE(priv, INT_GPHY_WRITE, param);
> > +}
> > +
> > +static int mxl862xx_phy_read_mii_bus(struct mii_bus *bus, int port, int regnum)
> > +{
> > +	return mxl862xx_phy_read_mmd(bus->priv, port, 0, regnum);
> > +}
> > +
> > +static int mxl862xx_phy_write_mii_bus(struct mii_bus *bus, int port,
> > +				      int regnum, u16 val)
> > +{
> > +	return mxl862xx_phy_write_mmd(bus->priv, port, 0, regnum, val);
> > +}
> > +
> > +static int mxl862xx_phy_read_c45_mii_bus(struct mii_bus *bus, int port,
> > +					 int devadd, int regnum)
> > +{
> > +	return mxl862xx_phy_read_mmd(bus->priv, port, devadd, regnum);
> > +}
> > +
> > +static int mxl862xx_phy_write_c45_mii_bus(struct mii_bus *bus, int port,
> > +					  int devadd, int regnum, u16 val)
> > +{
> > +	return mxl862xx_phy_write_mmd(bus->priv, port, devadd, regnum, val);
> > +}
> 
> You took inspiration from the wrong place with the mii_bus ops prototypes,
> specifically with the "int port" argument.
> 
> The second argument does not hold the port, it holds the PHY address.
> I.e. in this case:
>                 port@6 {
>                     reg = <6>;
>                     phy-handle = <&phy5>;
>                     phy-mode = "internal";
>                 };
>                 phy5: ethernet-phy@5 {
>                     reg = <5>;
>                 };
> 
> "int port" is 5, not 6.
> 
> Your source of inspiration are the prototypes of an mii_bus used as
> ds->user_mii_bus. We have a different set of requirements there, because
> ds->user_mii_bus exists for the case where the PHY is not described in
> the device tree, so the port index is given as argument and the
> user_mii_bus is responsible for internally translating the port index to
> a PHY address.
> 
> So while the use of "int port" as argument name for these operations is
> justifiable in some cases, it is not applicable to this driver, and will
> be a pitfall for anyone who has to modify or debug this code.

Ack. While not completely correct from the beginning, I should have
addressed that and changed the parameter to 'int addr' when I started to
count physical ports from 0 and stopped hiding the microcontroller --
from that moment on, the PHY address and the port address are no longer
equal.

I will post a follow-up series to address this and also the removal of
the 'label' property from the DT binding example once this has been
merged. Neither case justifies a "Fixes:" tag, but I'll just
try to be fast, so both can still be applied before net-next closes.

^ permalink raw reply

* [PATCH net-next 2/2] bnxt_en: Check RSS contexts in bnxt_need_reserve_rings()
From: Michael Chan @ 2026-02-07 23:51 UTC (permalink / raw)
  To: davem
  Cc: netdev, edumazet, kuba, pabeni, andrew+netdev, pavan.chebbi,
	andrew.gospodarek, Kalesh AP
In-Reply-To: <20260207235118.1987301-1-michael.chan@broadcom.com>

bnxt_need_reserve_rings() checks all resources except HW RSS contexts
to determine if a new reservation is required.  For completeness, add
the check for HW RSS contexts.  This makes the code more complete after
the recent commit to increase the number of RSS contexts for a larger
RSS indirection table:

51b9d3f948b8 ("bnxt_en: Use a larger RSS indirection table on P5_PLUS chips")

Reviewed-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 4745063d2f5c..59f0ae745446 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -7956,6 +7956,7 @@ static void bnxt_get_total_resources(struct bnxt *bp, struct bnxt_hw_rings *hwr)
 	hwr->rx = bp->rx_nr_rings;
 	hwr->grp = hwr->rx;
 	hwr->vnic = bnxt_get_total_vnics(bp, hwr->rx);
+	hwr->rss_ctx = bnxt_get_total_rss_ctxs(bp, hwr);
 	if (bp->flags & BNXT_FLAG_AGG_RINGS)
 		hwr->rx <<= 1;
 	hwr->stat = bnxt_get_func_stat_ctxs(bp);
@@ -7985,6 +7986,7 @@ static bool bnxt_need_reserve_rings(struct bnxt *bp)
 	if (hw_resc->resv_rx_rings != hwr.rx ||
 	    hw_resc->resv_vnics != hwr.vnic ||
 	    hw_resc->resv_stat_ctxs != hwr.stat ||
+	    hw_resc->resv_rsscos_ctxs != hwr.rss_ctx ||
 	    (hw_resc->resv_hw_ring_grps != hwr.grp &&
 	     !(bp->flags & BNXT_FLAG_CHIP_P5_PLUS)))
 		return true;
-- 
2.51.0


^ permalink raw reply related

* [PATCH net-next 1/2] bnxt_en: Refactor bnxt_need_reserve_rings()
From: Michael Chan @ 2026-02-07 23:51 UTC (permalink / raw)
  To: davem
  Cc: netdev, edumazet, kuba, pabeni, andrew+netdev, pavan.chebbi,
	andrew.gospodarek
In-Reply-To: <20260207235118.1987301-1-michael.chan@broadcom.com>

bnxt_need_reserve_rings() checks 6 ring resources against the reserved
values to determine if a new reservation is needed.  Factor out the code
to collect the total resources into a new helper function
bnxt_get_total_resources() to make the code cleaner and easier to read.
Instead of individual scalar variables, use the struct bnxt_hw_rings to
hold all the ring resources.  Using the struct, hwr.cp replaces the nq
variable and the chip specific hwr.cp_p5 replaces cp on newer chips.

There is no change in behavior.  This will make it easier to check the
RSS context resource in the next patch.

Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 45 +++++++++++++++--------
 1 file changed, 30 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 7d63d6b0d2c2..4745063d2f5c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -7946,13 +7946,27 @@ static int bnxt_get_total_vnics(struct bnxt *bp, int rx_rings)
 	return 1;
 }
 
+static void bnxt_get_total_resources(struct bnxt *bp, struct bnxt_hw_rings *hwr)
+{
+	hwr->cp = bnxt_nq_rings_in_use(bp);
+	hwr->cp_p5 = 0;
+	if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS)
+		hwr->cp_p5 = bnxt_cp_rings_in_use(bp);
+	hwr->tx = bp->tx_nr_rings;
+	hwr->rx = bp->rx_nr_rings;
+	hwr->grp = hwr->rx;
+	hwr->vnic = bnxt_get_total_vnics(bp, hwr->rx);
+	if (bp->flags & BNXT_FLAG_AGG_RINGS)
+		hwr->rx <<= 1;
+	hwr->stat = bnxt_get_func_stat_ctxs(bp);
+}
+
 static bool bnxt_need_reserve_rings(struct bnxt *bp)
 {
 	struct bnxt_hw_resc *hw_resc = &bp->hw_resc;
-	int cp = bnxt_cp_rings_in_use(bp);
-	int nq = bnxt_nq_rings_in_use(bp);
-	int rx = bp->rx_nr_rings, stat;
-	int vnic, grp = rx;
+	struct bnxt_hw_rings hwr;
+
+	bnxt_get_total_resources(bp, &hwr);
 
 	/* Old firmware does not need RX ring reservations but we still
 	 * need to setup a default RSS map when needed.  With new firmware
@@ -7962,25 +7976,26 @@ static bool bnxt_need_reserve_rings(struct bnxt *bp)
 	if (!BNXT_NEW_RM(bp))
 		bnxt_check_rss_tbl_no_rmgr(bp);
 
-	if (hw_resc->resv_tx_rings != bp->tx_nr_rings &&
-	    bp->hwrm_spec_code >= 0x10601)
+	if (hw_resc->resv_tx_rings != hwr.tx && bp->hwrm_spec_code >= 0x10601)
 		return true;
 
 	if (!BNXT_NEW_RM(bp))
 		return false;
 
-	vnic = bnxt_get_total_vnics(bp, rx);
-
-	if (bp->flags & BNXT_FLAG_AGG_RINGS)
-		rx <<= 1;
-	stat = bnxt_get_func_stat_ctxs(bp);
-	if (hw_resc->resv_rx_rings != rx || hw_resc->resv_cp_rings != cp ||
-	    hw_resc->resv_vnics != vnic || hw_resc->resv_stat_ctxs != stat ||
-	    (hw_resc->resv_hw_ring_grps != grp &&
+	if (hw_resc->resv_rx_rings != hwr.rx ||
+	    hw_resc->resv_vnics != hwr.vnic ||
+	    hw_resc->resv_stat_ctxs != hwr.stat ||
+	    (hw_resc->resv_hw_ring_grps != hwr.grp &&
 	     !(bp->flags & BNXT_FLAG_CHIP_P5_PLUS)))
 		return true;
+	if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) {
+		if (hw_resc->resv_cp_rings != hwr.cp_p5)
+			return true;
+	} else if (hw_resc->resv_cp_rings != hwr.cp) {
+		return true;
+	}
 	if ((bp->flags & BNXT_FLAG_CHIP_P5_PLUS) && BNXT_PF(bp) &&
-	    hw_resc->resv_irqs != nq)
+	    hw_resc->resv_irqs != hwr.cp)
 		return true;
 	return false;
 }
-- 
2.51.0


^ permalink raw reply related

* [PATCH net-next 0/2] bnxt_en: Add RSS context resource check
From: Michael Chan @ 2026-02-07 23:51 UTC (permalink / raw)
  To: davem
  Cc: netdev, edumazet, kuba, pabeni, andrew+netdev, pavan.chebbi,
	andrew.gospodarek

Add missing logic to check that we have enough RSS contexts.  This
will make the recent change to increase the use of RSS contexts for
a larger RSS indirection table more complete.

Michael Chan (2):
  bnxt_en: Refactor bnxt_need_reserve_rings()
  bnxt_en: Check RSS contexts in bnxt_need_reserve_rings()

 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 47 +++++++++++++++--------
 1 file changed, 32 insertions(+), 15 deletions(-)

-- 
2.51.0


^ permalink raw reply

* [PATCH v1 net] af_unix: Fix memleak of newsk in unix_stream_connect().
From: Kuniyuki Iwashima @ 2026-02-07 23:22 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Christian Brauner, Kuniyuki Iwashima,
	Kuniyuki Iwashima, netdev

When prepare_peercred() fails in unix_stream_connect(),
unix_release_sock() is not called for newsk, and the memory
is leaked.

Let's move prepare_peercred() before unix_create1().

Fixes: fd0a109a0f6b ("net, pidfs: prepare for handing out pidfds for reaped sk->sk_peer_pid")
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/unix/af_unix.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index d0511225799b..f6d56e70c7a2 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1650,10 +1650,9 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr_unsized *uad
 
 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
 
-	/* First of all allocate resources.
-	 * If we will make it after state is locked,
-	 * we will have to recheck all again in any case.
-	 */
+	err = prepare_peercred(&peercred);
+	if (err)
+		goto out;
 
 	/* create new sock for complete connection */
 	newsk = unix_create1(net, NULL, 0, sock->type);
@@ -1662,10 +1661,6 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr_unsized *uad
 		goto out;
 	}
 
-	err = prepare_peercred(&peercred);
-	if (err)
-		goto out;
-
 	/* Allocate skb for sending to listening sock */
 	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
 	if (!skb) {
-- 
2.53.0.rc2.204.g2597b5adb4-goog


^ permalink raw reply related

* [PATCH v1 bpf 2/2] bpf: Reject access to unix_sk(sk)->listener.
From: Kuniyuki Iwashima @ 2026-02-07 23:07 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau
  Cc: John Fastabend, Eduard Zingerman, Song Liu, Yonghong Song,
	KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa, Michal Luczaj,
	Kuniyuki Iwashima, Kuniyuki Iwashima, bpf, netdev
In-Reply-To: <20260207230720.2542943-1-kuniyu@google.com>

With the previous patch, bpf prog cannot access unix_sk(sk)->peer.

struct unix_sock has two pointers to struct sock, and another
pointer unix_sk(sk)->listener also has the same problem mentioned
in the previous patch.

unix_sk(sk)->listener is set by unix_stream_connect() and
cleared by unix_update_edges() during accept(), and both are
done under unix_state_lock().

There are some functions where unix_sk(sk)->peer is passed and
bpf prog can access unix_sk(unix_sk(sk)->peer)->listener locklessly,
which is unsafe.  (e.g. unix_maybe_add_creds())

Let's reject bpf access to unix_sk(sk)->listener too.

Fixes: aed6ecef55d7 ("af_unix: Save listener for embryo socket.")
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 kernel/bpf/verifier.c                         |  1 +
 .../selftests/bpf/progs/verifier_sock.c       | 24 +++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b328a1640c82..2ffc6eff5584 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7157,6 +7157,7 @@ BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct vm_area_struct) {
 
 BTF_TYPE_SAFE_UNTRUSTED(struct unix_sock) {
 	struct sock *peer;
+	struct sock *listener;
 };
 
 static bool type_is_rcu(struct bpf_verifier_env *env,
diff --git a/tools/testing/selftests/bpf/progs/verifier_sock.c b/tools/testing/selftests/bpf/progs/verifier_sock.c
index 8de4d3ed98d4..730850e93d6d 100644
--- a/tools/testing/selftests/bpf/progs/verifier_sock.c
+++ b/tools/testing/selftests/bpf/progs/verifier_sock.c
@@ -1191,4 +1191,28 @@ int BPF_PROG(trace_unix_dgram_sendmsg, struct socket *sock, struct msghdr *msg,
 	return 0;
 }
 
+SEC("fentry/unix_maybe_add_creds")
+__failure __msg("R1 type=untrusted_ptr_ expected=sock_common, sock, tcp_sock, xdp_sock, ptr_, trusted_ptr_")
+int BPF_PROG(trace_unix_maybe_add_creds, struct sk_buff *skb,
+	     const struct sock *sk, struct sock *other)
+{
+	struct unix_sock *u_other, *u_listener;
+
+	if (!other)
+		return 0;
+
+	u_other = bpf_skc_to_unix_sock(other);
+	if (!u_other)
+		return 0;
+
+	/* unix_accept() could clear u_other->listener
+	 * and the listener could be close()d.
+	 */
+	u_listener = bpf_skc_to_unix_sock(u_other->listener);
+	if (!u_listener)
+		return 0;
+
+	return 0;
+}
+
 char _license[] SEC("license") = "GPL";
-- 
2.53.0.rc2.204.g2597b5adb4-goog


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox