Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v40 3/8] eea: probe the netdevice and create adminq
From: Xuan Zhuo @ 2026-04-09 12:21 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Xuan Zhuo, Wen Gu, Philo Lu, Vadim Fedorenko,
	Dong Yibo, Jes Sorensen, Heiner Kallweit, Dust Li
In-Reply-To: <20260409122130.129416-1-xuanzhuo@linux.alibaba.com>

Add basic driver framework for the Alibaba Elastic Ethernet Adapter (EEA).

This commit creates the netdevice after PCI probe,
and initializes the admin queue to send commands to the device.

Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Reviewed-by: Philo Lu <lulie@linux.alibaba.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 drivers/net/ethernet/alibaba/eea/Makefile     |   6 +-
 drivers/net/ethernet/alibaba/eea/eea_adminq.c | 482 ++++++++++++++++++
 drivers/net/ethernet/alibaba/eea/eea_adminq.h |  74 +++
 drivers/net/ethernet/alibaba/eea/eea_net.c    | 230 +++++++++
 drivers/net/ethernet/alibaba/eea/eea_net.h    | 136 +++++
 drivers/net/ethernet/alibaba/eea/eea_pci.c    |  29 +-
 drivers/net/ethernet/alibaba/eea/eea_pci.h    |   3 +
 7 files changed, 957 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_adminq.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_adminq.h
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_net.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_net.h

diff --git a/drivers/net/ethernet/alibaba/eea/Makefile b/drivers/net/ethernet/alibaba/eea/Makefile
index e5e4007810a6..91f318e8e046 100644
--- a/drivers/net/ethernet/alibaba/eea/Makefile
+++ b/drivers/net/ethernet/alibaba/eea/Makefile
@@ -1,4 +1,6 @@
 
 obj-$(CONFIG_EEA) += eea.o
-eea-y :=  eea_ring.o \
-	eea_pci.o
+eea-y := eea_ring.o \
+	eea_net.o \
+	eea_pci.o \
+	eea_adminq.o
diff --git a/drivers/net/ethernet/alibaba/eea/eea_adminq.c b/drivers/net/ethernet/alibaba/eea/eea_adminq.c
new file mode 100644
index 000000000000..c36714a932eb
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_adminq.c
@@ -0,0 +1,482 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/iopoll.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
+
+#include "eea_adminq.h"
+#include "eea_net.h"
+#include "eea_pci.h"
+#include "eea_ring.h"
+
+/* Admin queue command codes. eea_adminq_submit() splits each code into a
+ * class id (bits 15:8) and a command id (bits 7:0).
+ */
+#define EEA_AQ_CMD_CFG_QUERY         ((0 << 8) | 0)
+
+#define EEA_AQ_CMD_QUEUE_CREATE      ((1 << 8) | 0)
+#define EEA_AQ_CMD_QUEUE_DESTROY_ALL ((1 << 8) | 1)
+
+#define EEA_AQ_CMD_HOST_INFO         ((2 << 8) | 0)
+
+#define EEA_AQ_CMD_DEV_STATUS        ((3 << 8) | 0)
+
+/* Bits XOR-ed into the adminq phase value each time the submission counter
+ * wraps the ring size (see eea_adminq_submit()).
+ */
+#define EEA_RING_DESC_F_AQ_PHASE     (BIT(15) | BIT(7))
+
+/* Bits for eea_aq_create.flags, set up by eea_adminq_create_q(). */
+#define EEA_QUEUE_FLAGS_HW_SPLIT_HDR BIT(0)
+#define EEA_QUEUE_FLAGS_SQCQ         BIT(1)
+#define EEA_QUEUE_FLAGS_HWTS         BIT(2)
+
+/* One queue-creation request entry. eea_adminq_create_q() sends an array of
+ * these (one per queue, filled in by qcfg_fill()) as the request payload of
+ * EEA_AQ_CMD_QUEUE_CREATE. All multi-byte fields are little-endian.
+ */
+struct eea_aq_create {
+	/* EEA_QUEUE_FLAGS_* bits */
+	__le32 flags;
+	/* queue index.
+	 * rx: 0 == qidx % 2
+	 * tx: 1 == qidx % 2
+	 */
+	__le16 qidx;
+	/* the depth of the queue */
+	__le16 depth;
+	/*  0: without SPLIT HDR
+	 *  1: 128B
+	 *  2: 256B
+	 *  3: 512B
+	 */
+	u8 hdr_buf_size;
+	/* descriptor sizes, copied from the ring's sq/cq desc_size */
+	u8 sq_desc_size;
+	u8 cq_desc_size;
+	/* never written by qcfg_fill() */
+	u8 reserve0;
+	/* The vector for the irq. rx,tx share the same vector */
+	__le16 msix_vector;
+	/* never written by qcfg_fill() */
+	__le16 reserve;
+	/* sq ring cfg. */
+	__le32 sq_addr_low;
+	__le32 sq_addr_high;
+	/* cq ring cfg. Just valid when flags include EEA_QUEUE_FLAGS_SQCQ. */
+	__le32 cq_addr_low;
+	__le32 cq_addr_high;
+};
+
+/* Driver-side ring positions for one queue. eea_adminq_dev_status() sends an
+ * array of these (all I/O queues followed by the adminq itself) as the
+ * request payload of EEA_AQ_CMD_DEV_STATUS.
+ */
+struct eea_aq_queue_drv_status {
+	/* index of the queue this entry describes */
+	__le16 qidx;
+
+	/* copied from the ring's sq.head and cq.head */
+	__le16 sq_head;
+	__le16 cq_head;
+	/* never written by the driver; buffer is zero-allocated */
+	__le16 reserved;
+};
+
+/* Constant values reported to the device in struct eea_aq_host_info_cfg. */
+#define EEA_OS_DISTRO		0
+#define EEA_DRV_TYPE		0
+#define EEA_OS_LINUX		1
+#define EEA_SPEC_VER_MAJOR	1
+#define EEA_SPEC_VER_MINOR	0
+
+/* Request payload of EEA_AQ_CMD_HOST_INFO, filled in by
+ * eea_adminq_config_host_info(). All multi-byte fields are little-endian.
+ */
+struct eea_aq_host_info_cfg {
+	/* EEA_OS_LINUX / EEA_OS_DISTRO / EEA_DRV_TYPE */
+	__le16	os_type;
+	__le16	os_dist;
+	__le16	drv_type;
+
+	/* LINUX_VERSION_MAJOR / PATCHLEVEL / SUBLEVEL of the running build */
+	__le16	kern_ver_major;
+	__le16	kern_ver_minor;
+	__le16	kern_ver_sub_minor;
+
+	/* EEA_VER_MAJOR / MINOR / SUB_MINOR */
+	__le16	drv_ver_major;
+	__le16	drv_ver_minor;
+	__le16	drv_ver_sub_minor;
+
+	/* EEA_SPEC_VER_MAJOR / MINOR */
+	__le16	spec_ver_major;
+	__le16	spec_ver_minor;
+	/* values of eea_pci_dev_id() and eea_pci_domain_nr() */
+	__le16	pci_bdf;
+	__le32	pci_domain;
+
+	/* NUL-terminated copies of utsname()->release and ->machine */
+	u8      os_ver_str[64];
+	u8      isa_str[64];
+};
+
+/* Size of the reply text buffer, and the op_code value that means the device
+ * rejected the host information.
+ */
+#define EEA_HINFO_MAX_REP_LEN	1024
+#define EEA_HINFO_REP_REJECT	2
+
+/* Reply payload of EEA_AQ_CMD_HOST_INFO. */
+struct eea_aq_host_info_rep {
+	/* EEA_HINFO_REP_REJECT makes the driver fail with -ENODEV */
+	u8	op_code;
+	/* non-zero: reply_str carries a message that the driver logs */
+	u8	has_reply;
+	/* the driver forces NUL termination before printing */
+	u8	reply_str[EEA_HINFO_MAX_REP_LEN];
+};
+
+/* Translate a flat queue id into its ring.
+ *
+ * Even ids select the RX queue and odd ids the TX queue of pair qid / 2.
+ * No bounds check is done; the caller must pass an id that is valid for the
+ * currently allocated rx/tx arrays.
+ */
+static struct eea_ring *qid_to_ering(struct eea_net *enet, u32 qid)
+{
+	u32 pair = qid / 2;
+
+	if (qid & 1)
+		return enet->tx[pair].ering;
+
+	return enet->rx[pair]->ering;
+}
+
+/* Upper bound for one admin command: 60 seconds, in microseconds. */
+#define EEA_AQ_TIMEOUT_US (60 * 1000 * 1000)
+
+/* Post one command on the admin queue and wait for its completion.
+ *
+ * @cmd is split into class id (high byte) and command id (low byte).
+ * @req_addr/@req_size and @res_addr/@res_size are the already DMA-mapped
+ * request and reply buffers (0 when absent).
+ *
+ * Operations are fully serialized under the protection of a lock; the
+ * original note names "rtlock", presumably the RTNL lock.
+ *
+ * If the hardware fails to complete the command within EEA_AQ_TIMEOUT_US, the
+ * device is reset, the adminq is marked broken and the poll error is
+ * returned. Otherwise the status word of the completion descriptor is
+ * returned (0 on success).
+ */
+static int eea_adminq_submit(struct eea_net *enet, u16 cmd,
+			     dma_addr_t req_addr, dma_addr_t res_addr,
+			     u32 req_size, u32 res_size)
+{
+	struct eea_aq_cdesc *cdesc;
+	struct eea_aq_desc *desc;
+	int ret;
+
+	desc = ering_aq_alloc_desc(enet->adminq.ring);
+
+	desc->classid = cmd >> 8;
+	desc->command = cmd & 0xff;
+
+	desc->data_addr = cpu_to_le64(req_addr);
+	desc->data_len = cpu_to_le32(req_size);
+
+	desc->reply_addr = cpu_to_le64(res_addr);
+	desc->reply_len = cpu_to_le32(res_size);
+
+	/* for update flags */
+	/* The descriptor body must be visible before the flags (phase) word. */
+	dma_wmb();
+
+	desc->flags = cpu_to_le16(enet->adminq.phase);
+
+	ering_sq_commit_desc(enet->adminq.ring);
+
+	ering_kick(enet->adminq.ring);
+
+	++enet->adminq.num;
+
+	/* Flip the phase bits after every full ring's worth of submissions. */
+	if ((enet->adminq.num % enet->adminq.ring->num) == 0)
+		enet->adminq.phase ^= EEA_RING_DESC_F_AQ_PHASE;
+
+	/* Poll for a completion descriptor: 10us sleep between reads. */
+	ret = read_poll_timeout(ering_cq_get_desc, cdesc, cdesc, 10,
+				EEA_AQ_TIMEOUT_US, false, enet->adminq.ring);
+	if (ret) {
+		netdev_err(enet->netdev,
+			   "adminq exec timeout. cmd: %d reset device.\n",
+			   cmd);
+		/* The device must be reset before unmapping buffers to avoid
+		 * potential DMA writes after the memory is freed.
+		 */
+		eea_device_reset(enet->edev);
+		/* eea_adminq_exec() rejects further commands with -EIO. */
+		enet->adminq.broken = true;
+		return ret;
+	}
+
+	/* Returns 0 on success, or a negative error code on failure. */
+	ret = le32_to_cpu(cdesc->status);
+
+	ering_cq_ack_desc(enet->adminq.ring, 1);
+
+	if (ret)
+		netdev_err(enet->netdev,
+			   "adminq exec failed. cmd: %d ret %d\n", cmd, ret);
+
+	return ret;
+}
+
+/* Run one admin command synchronously.
+ *
+ * The optional request buffer @req (@req_size bytes) is mapped for
+ * device reads and the optional reply buffer @res (@res_size bytes) for
+ * device writes; both mappings are released before returning.
+ *
+ * Returns -EIO when the adminq is already marked broken, -ENOMEM when a DMA
+ * mapping fails, otherwise the result of eea_adminq_submit().
+ */
+static int eea_adminq_exec(struct eea_net *enet, u16 cmd,
+			   void *req, u32 req_size, void *res, u32 res_size)
+{
+	dma_addr_t req_dma = 0;
+	dma_addr_t res_dma = 0;
+	struct device *dev;
+	int rc = -ENOMEM;
+
+	if (enet->adminq.broken)
+		return -EIO;
+
+	dev = enet->edev->dma_dev;
+
+	if (req) {
+		req_dma = dma_map_single(dev, req, req_size, DMA_TO_DEVICE);
+		if (unlikely(dma_mapping_error(dev, req_dma)))
+			return rc;
+	}
+
+	if (res) {
+		res_dma = dma_map_single(dev, res, res_size, DMA_FROM_DEVICE);
+		if (unlikely(dma_mapping_error(dev, res_dma)))
+			goto out_unmap_req;
+	}
+
+	rc = eea_adminq_submit(enet, cmd, req_dma, res_dma,
+			       req_size, res_size);
+
+	if (res)
+		dma_unmap_single(dev, res_dma, res_size, DMA_FROM_DEVICE);
+
+out_unmap_req:
+	if (req)
+		dma_unmap_single(dev, req_dma, req_size, DMA_TO_DEVICE);
+
+	return rc;
+}
+
+/* Release the adminq ring and the queue-create scratch buffers.
+ *
+ * Every pointer is cleared after it is freed, so calling this again on the
+ * same adminq is harmless.
+ */
+void eea_destroy_adminq(struct eea_net *enet)
+{
+	struct eea_aq *aq = &enet->adminq;
+
+	if (aq->ring) {
+		ering_free(aq->ring);
+		aq->ring = NULL;
+		aq->phase = 0;
+	}
+
+	kfree(aq->q_req_buf);
+	aq->q_req_buf = NULL;
+
+	kfree(aq->q_res_buf);
+	aq->q_res_buf = NULL;
+}
+
+/* Allocate the admin queue, hand it to the device and bring it up.
+ *
+ * @qid is the ring index used for the adminq; qid / 2 + 1 is passed to
+ * eea_pci_active_aq() as the MSI-X vector. The request/reply scratch buffers
+ * used by eea_adminq_create_q() are sized for rx_num + tx_num queues.
+ *
+ * Returns 0 on success or a negative errno. On failure everything allocated
+ * here has been released again through eea_destroy_adminq().
+ */
+int eea_create_adminq(struct eea_net *enet, u32 qid)
+{
+	u32 db_size, q_size, num;
+	struct eea_ring *ering;
+	struct eea_aq *aq;
+	int err;
+
+	num = enet->edev->rx_num + enet->edev->tx_num;
+	aq = &enet->adminq;
+
+	ering = ering_alloc(qid, 64, enet->edev, sizeof(struct eea_aq_desc),
+			    sizeof(struct eea_aq_cdesc), "adminq");
+	if (!ering)
+		return -ENOMEM;
+
+	aq->ring = ering;
+
+	err = eea_pci_active_aq(ering, qid / 2 + 1);
+	if (err)
+		goto err;
+
+	aq->phase = BIT(7);
+	aq->num = 0;
+
+	q_size = sizeof(*aq->q_req_buf) * num;
+	db_size = sizeof(*aq->q_res_buf) * num;
+
+	aq->q_req_size = q_size;
+	aq->q_res_size = db_size;
+
+	/* err is 0 at this point. Without this reset an allocation failure
+	 * below would tear the adminq down and still report success.
+	 */
+	err = -ENOMEM;
+
+	aq->q_req_buf = kzalloc(q_size, GFP_KERNEL);
+	if (!aq->q_req_buf)
+		goto err;
+
+	aq->q_res_buf = kzalloc(db_size, GFP_KERNEL);
+	if (!aq->q_res_buf)
+		goto err;
+
+	err = eea_pci_set_aq_up(enet->edev);
+	if (err)
+		goto err;
+
+	aq->broken = false;
+
+	return 0;
+
+err:
+	/* For the adminq, we can safely free the ring before setting it up. */
+	eea_destroy_adminq(enet);
+	return err;
+}
+
+/* Ask the device for its configuration (EEA_AQ_CMD_CFG_QUERY).
+ *
+ * The reply is written into @cfg. Returns the result of eea_adminq_exec().
+ */
+int eea_adminq_query_cfg(struct eea_net *enet, struct eea_aq_cfg *cfg)
+{
+	u32 reply_len = sizeof(*cfg);
+
+	return eea_adminq_exec(enet, EEA_AQ_CMD_CFG_QUERY, NULL, 0,
+			       cfg, reply_len);
+}
+
+/* Describe @ering in one queue-creation entry.
+ *
+ * @flags is stored as is; hdr_buf_size is set to 1 when
+ * EEA_QUEUE_FLAGS_HW_SPLIT_HDR is set and 0 otherwise. The reserved fields
+ * of @qcfg are left untouched.
+ */
+static void qcfg_fill(struct eea_aq_create *qcfg, struct eea_ring *ering,
+		      u32 flags)
+{
+	u64 sq_dma = ering->sq.dma_addr;
+	u64 cq_dma = ering->cq.dma_addr;
+	u8 hdr_buf = 0;
+
+	if (flags & EEA_QUEUE_FLAGS_HW_SPLIT_HDR)
+		hdr_buf = 1;
+
+	qcfg->flags = cpu_to_le32(flags);
+	qcfg->qidx = cpu_to_le16(ering->index);
+	qcfg->depth = cpu_to_le16(ering->num);
+	qcfg->msix_vector = cpu_to_le16(ering->msix_vec);
+
+	qcfg->hdr_buf_size = hdr_buf;
+	qcfg->sq_desc_size = ering->sq.desc_size;
+	qcfg->cq_desc_size = ering->cq.desc_size;
+
+	/* 64-bit ring addresses are passed as two little-endian halves. */
+	qcfg->sq_addr_low = cpu_to_le32(lower_32_bits(sq_dma));
+	qcfg->sq_addr_high = cpu_to_le32(upper_32_bits(sq_dma));
+
+	qcfg->cq_addr_low = cpu_to_le32(lower_32_bits(cq_dma));
+	qcfg->cq_addr_high = cpu_to_le32(upper_32_bits(cq_dma));
+}
+
+/* Create the first @num I/O queues on the device with one admin command.
+ *
+ * @flags is extended with EEA_QUEUE_FLAGS_SQCQ and EEA_QUEUE_FLAGS_HWTS, and
+ * with EEA_QUEUE_FLAGS_HW_SPLIT_HDR when cfg.split_hdr is non-zero. On
+ * success the doorbell offset returned for each queue is resolved into
+ * ering->db.
+ *
+ * Returns 0 on success, -EINVAL when @num exceeds the capacity of the scratch
+ * buffers allocated by eea_create_adminq(), the eea_adminq_exec() error when
+ * the command fails, or -EIO when the device returns an invalid doorbell
+ * offset (all queues are destroyed again in that case).
+ */
+int eea_adminq_create_q(struct eea_net *enet, u32 num, u32 flags)
+{
+	struct eea_net_cfg *cfg = &enet->cfg;
+	struct eea_aq *aq = &enet->adminq;
+	struct eea_ring *ering;
+	u32 db_size, q_size, i;
+	int err;
+
+	/* The scratch buffers have a fixed capacity; never write or DMA past
+	 * them. Dividing avoids an overflow in num * sizeof().
+	 */
+	if (num > aq->q_req_size / sizeof(*aq->q_req_buf) ||
+	    num > aq->q_res_size / sizeof(*aq->q_res_buf))
+		return -EINVAL;
+
+	if (cfg->split_hdr)
+		flags |= EEA_QUEUE_FLAGS_HW_SPLIT_HDR;
+
+	flags |= EEA_QUEUE_FLAGS_SQCQ;
+	flags |= EEA_QUEUE_FLAGS_HWTS;
+
+	q_size = sizeof(*aq->q_req_buf) * num;
+	db_size = sizeof(*aq->q_res_buf) * num;
+
+	for (i = 0; i < num; i++) {
+		ering = qid_to_ering(enet, i);
+		qcfg_fill(aq->q_req_buf + i, ering, flags);
+	}
+
+	err = eea_adminq_exec(enet, EEA_AQ_CMD_QUEUE_CREATE,
+			      aq->q_req_buf, q_size, aq->q_res_buf, db_size);
+	if (err)
+		return err;
+
+	for (i = 0; i < num; i++) {
+		ering = qid_to_ering(enet, i);
+		ering->db = eea_pci_db_addr(ering->edev,
+					    le32_to_cpu(aq->q_res_buf[i]));
+		if (!ering->db) {
+			netdev_err(enet->netdev, "invalid db off %u\n",
+				   le32_to_cpu(aq->q_res_buf[i]));
+			goto err_destroy;
+		}
+	}
+
+	return 0;
+
+err_destroy:
+	/* The device created the queues; undo that and drop every doorbell
+	 * pointer, including the ones resolved before the bad entry.
+	 */
+	eea_adminq_destroy_all_q(enet);
+	for (i = 0; i < num; i++) {
+		ering = qid_to_ering(enet, i);
+		ering->db = NULL;
+	}
+
+	return -EIO;
+}
+
+/* Tell the device to destroy every I/O queue (EEA_AQ_CMD_QUEUE_DESTROY_ALL).
+ *
+ * The command carries no request or reply payload. Returns the result of
+ * eea_adminq_exec().
+ */
+int eea_adminq_destroy_all_q(struct eea_net *enet)
+{
+	int rc;
+
+	rc = eea_adminq_exec(enet, EEA_AQ_CMD_QUEUE_DESTROY_ALL,
+			     NULL, 0, NULL, 0);
+
+	return rc;
+}
+
+/* Exchange ring positions for the device status (EEA_AQ_CMD_DEV_STATUS).
+ *
+ * The request lists sq/cq head of every I/O queue followed by one entry for
+ * the adminq itself. The reply holds the link status and one status entry
+ * per reported queue.
+ *
+ * The caller must ensure that both the 'rx' and 'tx' arrays are valid.
+ *
+ * Returns a kzalloc()ed reply that the caller must kfree(), or NULL on
+ * allocation or command failure.
+ */
+struct eea_aq_dev_status *eea_adminq_dev_status(struct eea_net *enet)
+{
+	struct eea_aq_queue_drv_status *slots, *slot;
+	struct eea_aq_dev_status *reply;
+	int io_cnt, total, rep_len, qid, rc;
+	struct eea_ring *ring;
+
+	total = enet->cfg.rx_ring_num + enet->cfg.tx_ring_num + 1;
+	io_cnt = enet->cfg.rx_ring_num + enet->cfg.tx_ring_num;
+
+	slots = kcalloc(total, sizeof(*slots), GFP_KERNEL);
+	if (!slots)
+		return NULL;
+
+	rep_len = struct_size(reply, q_status, total);
+
+	reply = kzalloc(rep_len, GFP_KERNEL);
+	if (!reply)
+		goto out_free_req;
+
+	for (qid = 0; qid < io_cnt; qid++) {
+		ring = qid_to_ering(enet, qid);
+		slot = &slots[qid];
+
+		slot->qidx = cpu_to_le16(qid);
+		slot->cq_head = cpu_to_le16(ring->cq.head);
+		slot->sq_head = cpu_to_le16(ring->sq.head);
+	}
+
+	/* The entry after the last I/O queue describes the adminq. */
+	ring = enet->adminq.ring;
+	slot = &slots[qid];
+
+	slot->qidx = cpu_to_le16(qid);
+	slot->cq_head = cpu_to_le16(ring->cq.head);
+	slot->sq_head = cpu_to_le16(ring->sq.head);
+
+	rc = eea_adminq_exec(enet, EEA_AQ_CMD_DEV_STATUS, slots,
+			     total * sizeof(*slots), reply, rep_len);
+	if (rc) {
+		kfree(reply);
+		reply = NULL;
+	}
+
+out_free_req:
+	kfree(slots);
+	return reply;
+}
+
+/* Report host, kernel and driver identification to the device
+ * (EEA_AQ_CMD_HOST_INFO) and evaluate its answer.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, the
+ * eea_adminq_exec() error when the command fails, or -ENODEV when the device
+ * rejects the host information. Any reply text from the device is logged.
+ */
+int eea_adminq_config_host_info(struct eea_net *enet)
+{
+	struct device *dev = enet->edev->dma_dev;
+	struct eea_aq_host_info_cfg *cfg;
+	struct eea_aq_host_info_rep *rep;
+	int rc;
+
+	cfg = kzalloc(sizeof(*cfg), GFP_KERNEL);
+	if (!cfg)
+		return -ENOMEM;
+
+	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
+	if (!rep) {
+		rc = -ENOMEM;
+		goto out_free_cfg;
+	}
+
+	cfg->os_type = cpu_to_le16(EEA_OS_LINUX);
+	cfg->os_dist = cpu_to_le16(EEA_OS_DISTRO);
+	cfg->drv_type = cpu_to_le16(EEA_DRV_TYPE);
+
+	cfg->kern_ver_major = cpu_to_le16(LINUX_VERSION_MAJOR);
+	cfg->kern_ver_minor = cpu_to_le16(LINUX_VERSION_PATCHLEVEL);
+	cfg->kern_ver_sub_minor = cpu_to_le16(LINUX_VERSION_SUBLEVEL);
+
+	cfg->drv_ver_major = cpu_to_le16(EEA_VER_MAJOR);
+	cfg->drv_ver_minor = cpu_to_le16(EEA_VER_MINOR);
+	cfg->drv_ver_sub_minor = cpu_to_le16(EEA_VER_SUB_MINOR);
+
+	cfg->spec_ver_major = cpu_to_le16(EEA_SPEC_VER_MAJOR);
+	cfg->spec_ver_minor = cpu_to_le16(EEA_SPEC_VER_MINOR);
+
+	cfg->pci_bdf = cpu_to_le16(eea_pci_dev_id(enet->edev));
+	cfg->pci_domain = cpu_to_le32(eea_pci_domain_nr(enet->edev));
+
+	strscpy(cfg->os_ver_str, utsname()->release, sizeof(cfg->os_ver_str));
+	strscpy(cfg->isa_str, utsname()->machine, sizeof(cfg->isa_str));
+
+	rc = eea_adminq_exec(enet, EEA_AQ_CMD_HOST_INFO,
+			     cfg, sizeof(*cfg), rep, sizeof(*rep));
+	if (rc)
+		goto out_free_rep;
+
+	if (rep->op_code == EEA_HINFO_REP_REJECT) {
+		dev_err(dev, "Device has refused the initialization due to provided host information\n");
+		rc = -ENODEV;
+	}
+
+	if (rep->has_reply) {
+		/* The text comes from the device: force termination. */
+		rep->reply_str[EEA_HINFO_MAX_REP_LEN - 1] = '\0';
+		dev_warn(dev, "Device replied: %s\n",
+			 rep->reply_str);
+	}
+
+out_free_rep:
+	kfree(rep);
+out_free_cfg:
+	kfree(cfg);
+	return rc;
+}
diff --git a/drivers/net/ethernet/alibaba/eea/eea_adminq.h b/drivers/net/ethernet/alibaba/eea/eea_adminq.h
new file mode 100644
index 000000000000..816059193b3b
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_adminq.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#ifndef __EEA_ADMINQ_H__
+#define __EEA_ADMINQ_H__
+
+/* Reply payload of the config query command, filled in by the device for
+ * eea_adminq_query_cfg(). All multi-byte fields are little-endian.
+ */
+struct eea_aq_cfg {
+	/* maximum and default RX ring depth */
+	__le32 rx_depth_max;
+	__le32 rx_depth_def;
+
+	/* maximum and default TX ring depth */
+	__le32 tx_depth_max;
+	__le32 tx_depth_def;
+
+	/* NOTE(review): not read anywhere in this patch */
+	__le32 max_tso_size;
+	__le32 max_tso_segs;
+
+	/* device MAC address; must pass is_valid_ether_addr() */
+	u8 mac[ETH_ALEN];
+	/* NOTE(review): not read anywhere in this patch */
+	__le16 status;
+
+	/* device MTU; used as both current and maximum MTU */
+	__le16 mtu;
+	__le16 reserved0;
+	__le16 reserved1;
+	u8 reserved2;
+	u8 reserved3;
+
+	__le16 reserved4;
+	__le16 reserved5;
+	__le16 reserved6;
+};
+
+/* Status of one queue, as an element of eea_aq_dev_status.q_status[]. */
+struct eea_aq_queue_status {
+	/* index of the queue this entry describes */
+	__le16 qidx;
+#define EEA_QUEUE_STATUS_OK 0
+#define EEA_QUEUE_STATUS_NEED_RESET 1
+	/* one of the EEA_QUEUE_STATUS_* values above */
+	__le16 status;
+};
+
+/* Reply payload of the device status command; allocated and returned by
+ * eea_adminq_dev_status().
+ */
+struct eea_aq_dev_status {
+#define EEA_LINK_DOWN_STATUS  0
+#define EEA_LINK_UP_STATUS    1
+	/* one of the EEA_LINK_*_STATUS values above */
+	__le16 link_status;
+	__le16 reserved;
+
+	/* one entry per queue listed in the request */
+	struct eea_aq_queue_status q_status[];
+};
+
+/* Driver state of the admin queue, embedded in struct eea_net. */
+struct eea_aq {
+	/* the adminq ring; NULL when the adminq is not created */
+	struct eea_ring *ring;
+	/* number of commands submitted since the adminq was created */
+	u32 num;
+	/* set after a command timeout; further commands fail with -EIO */
+	bool broken;
+	/* value written to the flags word of the next descriptor */
+	u16 phase;
+
+	/* sizes in bytes of q_req_buf and q_res_buf */
+	u32 q_req_size;
+	u32 q_res_size;
+	/* scratch buffers for the queue-create command: request entries and
+	 * the doorbell offsets returned by the device
+	 */
+	struct eea_aq_create *q_req_buf;
+	__le32 *q_res_buf;
+};
+
+struct eea_net;
+
+int eea_create_adminq(struct eea_net *enet, u32 qid);
+void eea_destroy_adminq(struct eea_net *enet);
+
+int eea_adminq_query_cfg(struct eea_net *enet, struct eea_aq_cfg *cfg);
+
+int eea_adminq_create_q(struct eea_net *enet, u32 num, u32 flags);
+int eea_adminq_destroy_all_q(struct eea_net *enet);
+struct eea_aq_dev_status *eea_adminq_dev_status(struct eea_net *enet);
+int eea_adminq_config_host_info(struct eea_net *enet);
+#endif
diff --git a/drivers/net/ethernet/alibaba/eea/eea_net.c b/drivers/net/ethernet/alibaba/eea/eea_net.c
new file mode 100644
index 000000000000..6df65908a215
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_net.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/netdev_queues.h>
+
+#include "eea_adminq.h"
+#include "eea_net.h"
+#include "eea_pci.h"
+#include "eea_ring.h"
+
+#define EEA_SPLIT_HDR_SIZE ALIGN(128, L1_CACHE_BYTES)
+#define EEA_NET_IO_RING_DEPTH_MAX (32 * 1024)
+#define EEA_NET_IO_RING_DEPTH_MIN 128
+
+/* True when @depth lies within [EEA_NET_IO_RING_DEPTH_MIN, @limit]. */
+static bool eea_ring_depth_ok(u32 depth, u32 limit)
+{
+	return depth >= EEA_NET_IO_RING_DEPTH_MIN && depth <= limit;
+}
+
+/* Validate the ring parameters reported by the device and store them.
+ *
+ * enet->cfg_hw receives the hardware limits (maximum depths, ring counts,
+ * split header size) and enet->cfg the defaults currently in use.
+ *
+ * Returns 0 on success, or -EINVAL when the RX and TX ring counts differ or
+ * a depth is out of range or not a power of two.
+ */
+static int eea_update_cfg(struct eea_net *enet,
+			  struct eea_device *edev,
+			  struct eea_aq_cfg *hwcfg)
+{
+	struct eea_net_cfg *limits = &enet->cfg_hw;
+	struct eea_net_cfg *active = &enet->cfg;
+	u32 rx_max, tx_max, rx_def, tx_def;
+
+	rx_max = le32_to_cpu(hwcfg->rx_depth_max);
+	tx_max = le32_to_cpu(hwcfg->tx_depth_max);
+	rx_def = le32_to_cpu(hwcfg->rx_depth_def);
+	tx_def = le32_to_cpu(hwcfg->tx_depth_def);
+
+	/* Now, we assert that the rx ring num is equal to the tx ring num. */
+	if (edev->rx_num != edev->tx_num) {
+		dev_err(edev->dma_dev, "Inconsistent ring num: RX %u, TX %u\n",
+			edev->rx_num, edev->tx_num);
+		return -EINVAL;
+	}
+
+	if (!eea_ring_depth_ok(rx_max, EEA_NET_IO_RING_DEPTH_MAX) ||
+	    !eea_ring_depth_ok(tx_max, EEA_NET_IO_RING_DEPTH_MAX)) {
+		dev_err(edev->dma_dev, "Invalid HW max depth: RX %u, TX %u\n",
+			rx_max, tx_max);
+		return -EINVAL;
+	}
+
+	if (!eea_ring_depth_ok(rx_def, rx_max) ||
+	    !eea_ring_depth_ok(tx_def, tx_max)) {
+		dev_err(edev->dma_dev, "Invalid default depth: RX %u (max %u), TX %u (max %u)\n",
+			rx_def, rx_max, tx_def, tx_max);
+		return -EINVAL;
+	}
+
+	if (!(is_power_of_2(rx_max) && is_power_of_2(tx_max) &&
+	      is_power_of_2(rx_def) && is_power_of_2(tx_def))) {
+		dev_err(edev->dma_dev, "Ring depth must be power of 2\n");
+		return -EINVAL;
+	}
+
+	limits->rx_ring_depth = rx_max;
+	limits->tx_ring_depth = tx_max;
+	limits->rx_ring_num = edev->rx_num;
+	limits->tx_ring_num = edev->tx_num;
+	limits->split_hdr = EEA_SPLIT_HDR_SIZE;
+
+	active->rx_ring_depth = rx_def;
+	active->tx_ring_depth = tx_def;
+	active->rx_ring_num = edev->rx_num;
+	active->tx_ring_num = edev->tx_num;
+
+	return 0;
+}
+
+/* Query the device configuration and initialise @netdev from it.
+ *
+ * Sets the ring configuration (via eea_update_cfg()), the offload feature
+ * flags, the MAC address and the MTU limits.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, the adminq error when
+ * the query fails, or -EINVAL when the device reports an invalid MTU, ring
+ * configuration or MAC address.
+ */
+static int eea_netdev_init_features(struct net_device *netdev,
+				    struct eea_net *enet,
+				    struct eea_device *edev)
+{
+	struct eea_aq_cfg *hwcfg;
+	u32 dev_mtu;
+	int rc;
+
+	hwcfg = kzalloc(sizeof(*hwcfg), GFP_KERNEL);
+	if (!hwcfg)
+		return -ENOMEM;
+
+	rc = eea_adminq_query_cfg(enet, hwcfg);
+	if (rc)
+		goto out;
+
+	dev_mtu = le16_to_cpu(hwcfg->mtu);
+	if (dev_mtu < ETH_MIN_MTU) {
+		dev_err(edev->dma_dev, "The device gave us an invalid MTU. Here we can only exit the initialization. %u < %u\n",
+			dev_mtu, ETH_MIN_MTU);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	rc = eea_update_cfg(enet, edev, hwcfg);
+	if (rc)
+		goto out;
+
+	netdev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
+
+	/* Offloads the user may toggle. */
+	netdev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_GRO_HW | NETIF_F_SG |
+			       NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6 |
+			       NETIF_F_GSO_UDP_L4;
+
+	/* Offloads enabled by default. */
+	netdev->features |= NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_SG |
+			    NETIF_F_GSO_ROBUST | NETIF_F_RXCSUM |
+			    NETIF_F_GRO_HW |
+			    (netdev->hw_features & NETIF_F_ALL_TSO);
+
+	netdev->vlan_features = netdev->features;
+
+	if (!is_valid_ether_addr(hwcfg->mac)) {
+		dev_err(edev->dma_dev, "The device gave invalid mac %pM\n",
+			hwcfg->mac);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	eth_hw_addr_set(netdev, hwcfg->mac);
+
+	enet->speed = SPEED_UNKNOWN;
+	enet->duplex = DUPLEX_UNKNOWN;
+
+	/* If jumbo frames are already enabled, then the returned MTU will be a
+	 * jumbo MTU, and the driver will automatically enable jumbo frame
+	 * support by default.
+	 */
+	netdev->min_mtu = ETH_MIN_MTU;
+	netdev->mtu = dev_mtu;
+	netdev->max_mtu = dev_mtu;
+
+out:
+	kfree(hwcfg);
+	return rc;
+}
+
+/* netdev callbacks; only address validation and the features check are wired
+ * up in this patch.
+ */
+static const struct net_device_ops eea_netdev = {
+	.ndo_validate_addr  = eth_validate_addr,
+	.ndo_features_check = passthru_features_check,
+};
+
+/* Allocate a multi-queue Ethernet netdev with @pairs queues and link it with
+ * @edev.
+ *
+ * The driver private area (struct eea_net) lives inside the netdev. The new
+ * instance is published in edev->enet.
+ *
+ * Returns the private area, or NULL when the allocation fails.
+ */
+static struct eea_net *eea_netdev_alloc(struct eea_device *edev, u32 pairs)
+{
+	struct net_device *netdev;
+	struct eea_net *enet;
+
+	netdev = alloc_etherdev_mq(sizeof(struct eea_net), pairs);
+	if (!netdev) {
+		/* pairs is u32: print it as unsigned */
+		dev_err(edev->dma_dev,
+			"alloc_etherdev_mq failed with pairs %u\n", pairs);
+		return NULL;
+	}
+
+	netdev->netdev_ops = &eea_netdev;
+	SET_NETDEV_DEV(netdev, edev->dma_dev);
+
+	enet = netdev_priv(netdev);
+	enet->netdev = netdev;
+	enet->edev = edev;
+	edev->enet = enet;
+
+	return enet;
+}
+
+/* Create the netdev for @edev and bring up its admin queue.
+ *
+ * Steps: allocate the netdev, create the adminq, report host information to
+ * the device, then query the device configuration and initialise the netdev
+ * from it. The netdev is not registered here.
+ *
+ * Returns 0 on success or a negative errno. On failure the device is reset
+ * where needed, the adminq and netdev are released and edev->enet is
+ * cleared.
+ */
+int eea_net_probe(struct eea_device *edev)
+{
+	struct eea_net *enet;
+	int err;
+
+	enet = eea_netdev_alloc(edev, edev->rx_num);
+	if (!enet)
+		return -ENOMEM;
+
+	err = eea_create_adminq(enet, edev->rx_num + edev->tx_num);
+	if (err)
+		goto err_free_netdev;
+
+	err = eea_adminq_config_host_info(enet);
+	if (err)
+		goto err_reset_dev;
+
+	err = eea_netdev_init_features(enet->netdev, enet, edev);
+	if (err)
+		goto err_reset_dev;
+
+	netdev_dbg(enet->netdev, "eea probe success.\n");
+
+	return 0;
+
+err_reset_dev:
+	/* Stop the device before the adminq ring memory is released. */
+	eea_device_reset(edev);
+	eea_destroy_adminq(enet);
+
+err_free_netdev:
+	/* enet lives inside the netdev: do not leave edev->enet pointing at
+	 * freed memory.
+	 */
+	edev->enet = NULL;
+	free_netdev(enet->netdev);
+	return err;
+}
+
+/* Tear down what eea_net_probe() set up for @edev.
+ *
+ * The device is reset first so that it no longer touches the adminq ring,
+ * then the adminq and the netdev are released and edev->enet is cleared.
+ */
+void eea_net_remove(struct eea_device *edev)
+{
+	struct net_device *netdev;
+	struct eea_net *enet;
+
+	enet = edev->enet;
+	netdev = enet->netdev;
+
+	netdev_dbg(enet->netdev, "eea removed.\n");
+
+	eea_device_reset(edev);
+
+	eea_destroy_adminq(enet);
+
+	/* enet lives inside the netdev: drop the reference before freeing so
+	 * that edev->enet never dangles.
+	 */
+	edev->enet = NULL;
+	free_netdev(netdev);
+}
diff --git a/drivers/net/ethernet/alibaba/eea/eea_net.h b/drivers/net/ethernet/alibaba/eea/eea_net.h
new file mode 100644
index 000000000000..239312456c5b
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_net.h
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#ifndef __EEA_NET_H__
+#define __EEA_NET_H__
+
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+
+#include "eea_adminq.h"
+#include "eea_ring.h"
+
+#define EEA_VER_MAJOR		1
+#define EEA_VER_MINOR		0
+#define EEA_VER_SUB_MINOR	0
+
+/* Per-TX-queue state; enet->tx is an array of these.
+ * NOTE(review): only ->ering is referenced by the code in this patch; the
+ * remaining fields are presumably used by the TX data path added later.
+ */
+struct eea_net_tx {
+	struct eea_net *enet;
+
+	/* the ring backing this queue (see qid_to_ering()) */
+	struct eea_ring *ering;
+
+	struct eea_tx_meta *meta;
+	struct eea_tx_meta *free;
+
+	struct device *dma_dev;
+
+	u32 index;
+
+	char name[16];
+};
+
+/* Bookkeeping for one RX buffer.
+ * NOTE(review): not used by the code in this patch; field meanings below are
+ * taken from the names only and should be confirmed against the RX path.
+ */
+struct eea_rx_meta {
+	/* presumably links entries of a free list */
+	struct eea_rx_meta *next;
+
+	struct page *page;
+	dma_addr_t dma;
+	u32 offset;
+	u32 frags;
+
+	/* presumably the split-header buffer */
+	struct page *hdr_page;
+	void *hdr_addr;
+	dma_addr_t hdr_dma;
+
+	u32 id;
+
+	u32 truesize;
+	u32 headroom;
+	u32 tailroom;
+
+	u32 len;
+};
+
+/* State of the packet currently being received on an RX queue (embedded in
+ * struct eea_net_rx as ->pkt).
+ * NOTE(review): not used by the code in this patch.
+ */
+struct eea_net_rx_pkt_ctx {
+	u16 idx;
+
+	bool data_valid;
+	bool do_drop;
+
+	struct sk_buff *head_skb;
+};
+
+/* Per-RX-queue state; enet->rx is an array of pointers to these.
+ * NOTE(review): only ->ering is referenced by the code in this patch; the
+ * remaining fields are presumably used by the RX data path added later.
+ */
+struct eea_net_rx {
+	struct eea_net *enet;
+
+	/* the ring backing this queue (see qid_to_ering()) */
+	struct eea_ring *ering;
+
+	struct eea_rx_meta *meta;
+	struct eea_rx_meta *free;
+
+	struct device *dma_dev;
+
+	u32 index;
+
+	u32 flags;
+
+	u32 headroom;
+
+	struct napi_struct *napi;
+
+	char name[16];
+
+	struct eea_net_rx_pkt_ctx pkt;
+
+	struct page_pool *pp;
+};
+
+/* Ring configuration. struct eea_net keeps two instances: cfg_hw holds the
+ * hardware limits and cfg the values currently in use (see eea_update_cfg()).
+ */
+struct eea_net_cfg {
+	/* descriptors per RX / TX ring */
+	u32 rx_ring_depth;
+	u32 tx_ring_depth;
+	/* number of RX / TX rings */
+	u32 rx_ring_num;
+	u32 tx_ring_num;
+
+	/* NOTE(review): descriptor sizes are not set in this patch */
+	u8 rx_sq_desc_size;
+	u8 rx_cq_desc_size;
+	u8 tx_sq_desc_size;
+	u8 tx_cq_desc_size;
+
+	/* cfg_hw: split header size in bytes; cfg: non-zero enables the
+	 * EEA_QUEUE_FLAGS_HW_SPLIT_HDR queue flag
+	 */
+	u32 split_hdr;
+};
+
+/* Values for eea_net.link_err.
+ * NOTE(review): not used by the code in this patch.
+ */
+enum {
+	EEA_LINK_ERR_NONE,
+	EEA_LINK_ERR_HA_RESET_DEV,
+	EEA_LINK_ERR_LINK_DOWN,
+};
+
+/* Driver private data of the netdev (returned by netdev_priv()). */
+struct eea_net {
+	/* back pointers set up by eea_netdev_alloc() */
+	struct eea_device *edev;
+	struct net_device *netdev;
+
+	/* admin queue used to send commands to the device */
+	struct eea_aq adminq;
+
+	/* TX queue array and array of RX queue pointers, indexed by queue
+	 * pair (see qid_to_ering())
+	 */
+	struct eea_net_tx *tx;
+	struct eea_net_rx **rx;
+
+	/* current configuration and hardware limits */
+	struct eea_net_cfg cfg;
+	struct eea_net_cfg cfg_hw;
+
+	/* NOTE(review): presumably one of the EEA_LINK_ERR_* values */
+	u32 link_err;
+
+	/* NOTE(review): not used by the code in this patch */
+	bool started;
+
+	/* initialised to DUPLEX_UNKNOWN / SPEED_UNKNOWN at probe time */
+	u8 duplex;
+	u32 speed;
+
+	/* NOTE(review): not used by the code in this patch */
+	u64 hw_ts_offset;
+};
+
+int eea_net_probe(struct eea_device *edev);
+void eea_net_remove(struct eea_device *edev);
+
+#endif
diff --git a/drivers/net/ethernet/alibaba/eea/eea_pci.c b/drivers/net/ethernet/alibaba/eea/eea_pci.c
index c7306a299ab2..7d8ba2785754 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_pci.c
+++ b/drivers/net/ethernet/alibaba/eea/eea_pci.c
@@ -8,6 +8,7 @@
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/iopoll.h>
 
+#include "eea_net.h"
 #include "eea_pci.h"
 
 #define EEA_PCI_DB_OFFSET 4096
@@ -64,7 +65,9 @@ struct eea_pci_device {
 	((void __iomem *)((reg) + offsetof(struct eea_pci_cfg, item)))
 
 #define cfg_write8(reg, item, val) iowrite8(val, cfg_pointer(reg, item))
+#define cfg_write16(reg, item, val) iowrite16(val, cfg_pointer(reg, item))
 #define cfg_write32(reg, item, val) iowrite32(val, cfg_pointer(reg, item))
+#define cfg_write64(reg, item, val) iowrite64_lo_hi(val, cfg_pointer(reg, item))
 
 #define cfg_read8(reg, item) ioread8(cfg_pointer(reg, item))
 #define cfg_read32(reg, item) ioread32(cfg_pointer(reg, item))
@@ -313,6 +316,25 @@ void __iomem *eea_pci_db_addr(struct eea_device *edev, u32 off)
 	return edev->ep_dev->db_base + off;
 }
 
+/* Program the adminq ring into the device configuration registers.
+ *
+ * Writes the ring size, the MSI-X vector @msix_vec and the sq/cq DMA
+ * addresses, then resolves the doorbell offset read back from the device
+ * into ering->db.
+ *
+ * Returns 0 on success or -EIO when the doorbell address cannot be resolved.
+ */
+int eea_pci_active_aq(struct eea_ring *ering, int msix_vec)
+{
+	struct eea_device *edev = ering->edev;
+	struct eea_pci_device *ep_dev = edev->ep_dev;
+	u32 db_off;
+
+	cfg_write16(ep_dev->reg, aq_size, ering->num);
+	cfg_write16(ep_dev->reg, aq_msix_vector, msix_vec);
+
+	cfg_write64(ep_dev->reg, aq_sq_addr, ering->sq.dma_addr);
+	cfg_write64(ep_dev->reg, aq_cq_addr, ering->cq.dma_addr);
+
+	db_off = cfg_read32(ep_dev->reg, aq_db_off);
+	ering->db = eea_pci_db_addr(edev, db_off);
+
+	return ering->db ? 0 : -EIO;
+}
+
 u64 eea_pci_device_ts(struct eea_device *edev)
 {
 	struct eea_pci_device *ep_dev = edev->ep_dev;
@@ -334,7 +356,9 @@ static int eea_init_device(struct eea_device *edev)
 	if (err)
 		goto err;
 
-	/* do net device probe ... */
+	err = eea_net_probe(edev);
+	if (err)
+		goto err;
 
 	return 0;
 err:
@@ -368,6 +392,9 @@ static void __eea_pci_remove(struct pci_dev *pci_dev)
 {
 	struct eea_pci_device *ep_dev = pci_get_drvdata(pci_dev);
 	struct device *dev = get_device(&ep_dev->pci_dev->dev);
+	struct eea_device *edev = &ep_dev->edev;
+
+	eea_net_remove(edev);
 
 	pci_disable_sriov(pci_dev);
 
diff --git a/drivers/net/ethernet/alibaba/eea/eea_pci.h b/drivers/net/ethernet/alibaba/eea/eea_pci.h
index be4e75b4ed2f..d0094c419f59 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_pci.h
+++ b/drivers/net/ethernet/alibaba/eea/eea_pci.h
@@ -10,6 +10,8 @@
 
 #include <linux/pci.h>
 
+#include "eea_ring.h"
+
 struct eea_pci_cap {
 	__u8 cap_vndr;
 	__u8 cap_next;
@@ -43,6 +45,7 @@ u16 eea_pci_dev_id(struct eea_device *edev);
 
 int eea_device_reset(struct eea_device *dev);
 int eea_pci_set_aq_up(struct eea_device *dev);
+int eea_pci_active_aq(struct eea_ring *ering, int msix_vec);
 
 u64 eea_pci_device_ts(struct eea_device *edev);
 
-- 
2.32.0.3.g01195cf9f


^ permalink raw reply related

* [PATCH net-next v40 0/8] eea: Add basic driver framework for Alibaba Elastic Ethernet Adaptor
From: Xuan Zhuo @ 2026-04-09 12:21 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Xuan Zhuo, Wen Gu, Philo Lu, Vadim Fedorenko,
	Dong Yibo, Jes Sorensen, Heiner Kallweit, Dust Li

Add a driver framework for EEA that will be available in the future.

This driver is currently quite minimal, implementing only fundamental
core functionalities. Key features include: I/O queue management via
adminq, basic PCI-layer operations, and essential RX/TX data
communication capabilities. It also supports the creation,
initialization, and management of network devices (netdev). Furthermore,
the ring structures for both I/O queues and adminq have been abstracted
into a simple, unified, and reusable library implementation,
facilitating future extension and maintenance.

v40:
    1. fix comments from https://sashiko.dev/

v39:
    1. fix comments from https://sashiko.dev/#/patchset/20260407122001.22265-1-xuanzhuo%40linux.alibaba.com

v38:
    1. fix comments from https://sashiko.dev/#/patchset/20260404135144.33166-1-xuanzhuo%40linux.alibaba.com

v36:
    1. fix comments from https://sashiko.dev/#/patchset/20260323074441.91691-1-xuanzhuo%40linux.alibaba.com

v35:
    1. fix comments from https://netdev-ai.bots.linux.dev/ai-review.html?id=24371ffc-a1ea-46e1-a6b3-b8cbcbb52efb

v34:
    1. fix the comments from https://sashiko.dev/#/patchset/20260317015257.79494-1-xuanzhuo%40linux.alibaba.com

v33:
    1. fix the comments. The old version refers to enet_bind_new_q_and_cfg, but
       that has been changed to eea_bind_q_and_cfg.

v32:
    1. remove unused parameters from eea_net_ha_reset_remove and eea_create_pp
    2. call skb_mark_for_recycle for new skb for frag_list skbs

v31:
    1. remove unused parameter "enet" from eex_rx_post()

v30:
   1. fix some small problems

v29:
    Address AI-generated review suggestions from the previous version.

v28:
    Address AI-generated review suggestions from the previous version.

v27:
    Address AI-generated review suggestions from the previous version.

v26:
    Adopting suggestions from the previous AI review, another significant
    change is the introduction of an IRQ block to implement IRQ proxying. With
    this design, when an IRQ occurs, we no longer invoke the RX data structure
    directly -- instead, the IRQ block serves as an intermediary proxy. This
    approach offers several advantages: IRQ resources no longer need to be
    reallocated during reset operations, and IRQs are decoupled from RX
    structures. Consequently, when certain errors occur, we can fall back and
    safely reuse the original memory resources.

v25:
    I have adopted most of the suggestions from the AI's feedback. However, I
    believe some of the feedback is incorrect. I have already replied in the
    previous thread. http://lore.kernel.org/all/1770002612.3297296-2-xuanzhuo@linux.alibaba.com

v24:
    1. Add null checks for enet->rx and enet->tx in eea_get_ethtool_stat to
       prevent errors when reading rx = enet->rx[i] in case enet->rx is null.
       tx is similar. With rtnl protection in place, this check is sufficient.
    2. Use 'received' as the return value in eea_poll.

v23:
    I have moved netif_set_real_num_queues() out of eea_start_rxtx(), so
    eea_start_rxtx() is now a void function. I believe enet_bind_new_q_and_cfg()
    is a more suitable place to include netif_set_real_num_queues(). In
    eea_active_ring_and_irq(), I first execute request_irq() before interacting
    with the hardware to create queues. Therefore, during the NIC setup process,
    all driver-internal operations (memory allocation, IRQ initialization, sysfs
    configuration, etc.) will be completed before the final notification to the
    hardware.

v22:
    1. Use the budget from the NAPI poll function as the parameter for
       napi_consume_skb.
    2. Stop the TX queue when the remaining ring slots cannot hold an SKB.

v21:
    Fix two issues from the previous version:
    1, a DMA unmap operation was missing.
    2, RCU APIs were not used in eea_stats. Although the standard practice when
        using RCU would require adding the __rcu annotation to both the rx and
        tx fields, in many cases these fields are read without needing RCU
        protection.  Therefore, I do not want to add the __rcu annotation.
        Instead, I use a spin lock to protect modifications to rx and tx.

v20:
    Fix the partially initialized structure passed to db. @Jakub
    http://lore.kernel.org/all/20260113172353.2ae6ef81@kernel.org

v19:
    fix the comments from @Simon Horman

v18:
    v17 with [PATCH] prefix.

v17:
    1. In `eea_adminq_dev_status`, uniformly use `enet->cfg.rx_ring_num`.
    2. Add a `struct eea_net_cfg *cfg` parameter to `eea_free_rx` and
        `eea_free_tx`. When called in the normal path, pass `enet->cfg` as
        the argument; when called during initialization, pass the temporary
        `cfg` instead.
    3. Move the `.ndo_get_stats64` callback into `eea_net.c`.
    4. In the `.ndo_get_stats64` callback, add a comment explaining how the TX
        and RX statistics are protected by RCU.

       /* This function is protected by RCU. Here uses enet->tx and enet->rx
        * to check whether the TX and RX structures are safe to access. In
        * eea_free_rxtx_q_mem, before freeing the TX and RX resources, enet->rx
        * and enet->tx are set to NULL, and synchronize_net is called.
        */


v16:
    1. follow the advice from @ALOK TIWARI
       http://lore.kernel.org/all/5ff95a71-69e5-4cb6-9b2a-5224c983bdc2@oracle.com

v15:
    1. remove 'default m' from eea kconfig
    2. free the resources when open failed.

v14:
    1. some tiny fixes

v13:
    1. some tiny fixes @Simon

v12:
    I encountered some issues with sending the v11 patches, as they were quite
    messy. Therefore, I'm resending them as v12.

v11:
    1. remove auto clean __free(kfree)
    2. some tiny fixes

v10:
    1. name the jump labels after the target @Jakub
    2. rm __GFP_ZERO from dma_alloc_coherent @Jakub
v9:
    1. some fixes for ethtool from http://lore.kernel.org/all/20251027183754.52fe2a2c@kernel.org

v8: 1. rename eea_net_tmp to eea_net_init_ctx
    2. rm code that allocs memory to destroy queues
    3. some other minor changes

v7: 1. remove the irrelevant code from the ethtool commit
    2. build every commit with W12

v6: Split the big one commit to five commits
v5: Thanks for the comments from Kalesh Anakkur Purayil, ALOK TIWARI
v4: Thanks for the comments from Troy Mitchell, Przemek Kitszel, Andrew Lunn, Kalesh Anakkur Purayil
v3: Thanks for the comments from Paolo Abeni
v2: Thanks for the comments from Simon Horman and Andrew Lunn
v1: Thanks for the comments from Simon Horman and Andrew Lunn



































Xuan Zhuo (8):
  eea: introduce PCI framework
  eea: introduce ring and descriptor structures
  eea: probe the netdevice and create adminq
  eea: create/destroy rx,tx queues for netdevice open and stop
  eea: implement packet receive logic
  eea: implement packet transmit logic
  eea: introduce ethtool support
  eea: introduce callback for ndo_get_stats64

 MAINTAINERS                                   |   8 +
 drivers/net/ethernet/Kconfig                  |   1 +
 drivers/net/ethernet/Makefile                 |   1 +
 drivers/net/ethernet/alibaba/Kconfig          |  28 +
 drivers/net/ethernet/alibaba/Makefile         |   5 +
 drivers/net/ethernet/alibaba/eea/Makefile     |   9 +
 drivers/net/ethernet/alibaba/eea/eea_adminq.c | 482 ++++++++++
 drivers/net/ethernet/alibaba/eea/eea_adminq.h |  74 ++
 drivers/net/ethernet/alibaba/eea/eea_desc.h   | 131 +++
 .../net/ethernet/alibaba/eea/eea_ethtool.c    | 266 ++++++
 .../net/ethernet/alibaba/eea/eea_ethtool.h    |  49 +
 drivers/net/ethernet/alibaba/eea/eea_net.c    | 870 ++++++++++++++++++
 drivers/net/ethernet/alibaba/eea/eea_net.h    | 197 ++++
 drivers/net/ethernet/alibaba/eea/eea_pci.c    | 710 ++++++++++++++
 drivers/net/ethernet/alibaba/eea/eea_pci.h    |  73 ++
 drivers/net/ethernet/alibaba/eea/eea_ring.c   | 243 +++++
 drivers/net/ethernet/alibaba/eea/eea_ring.h   |  86 ++
 drivers/net/ethernet/alibaba/eea/eea_rx.c     | 810 ++++++++++++++++
 drivers/net/ethernet/alibaba/eea/eea_tx.c     | 455 +++++++++
 19 files changed, 4498 insertions(+)
 create mode 100644 drivers/net/ethernet/alibaba/Kconfig
 create mode 100644 drivers/net/ethernet/alibaba/Makefile
 create mode 100644 drivers/net/ethernet/alibaba/eea/Makefile
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_adminq.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_adminq.h
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_desc.h
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_ethtool.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_ethtool.h
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_net.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_net.h
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_pci.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_pci.h
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_ring.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_ring.h
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_rx.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_tx.c

--
2.32.0.3.g01195cf9f


^ permalink raw reply

* [PATCH net-next v40 4/8] eea: create/destroy rx,tx queues for netdevice open and stop
From: Xuan Zhuo @ 2026-04-09 12:21 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Xuan Zhuo, Wen Gu, Philo Lu, Vadim Fedorenko,
	Dong Yibo, Jes Sorensen, Heiner Kallweit, Dust Li
In-Reply-To: <20260409122130.129416-1-xuanzhuo@linux.alibaba.com>

Add basic driver framework for the Alibaba Elastic Ethernet Adapter (EEA).

This commit introduces the implementation for the netdevice open and
stop.

This commit introduces HA to restore the device when an error occurs,
but in HA scenarios the driver cannot guarantee that the status is
restored correctly.

Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Reviewed-by: Philo Lu <lulie@linux.alibaba.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 drivers/net/ethernet/alibaba/eea/Makefile  |   4 +-
 drivers/net/ethernet/alibaba/eea/eea_net.c | 570 ++++++++++++++++++++-
 drivers/net/ethernet/alibaba/eea/eea_net.h |  48 +-
 drivers/net/ethernet/alibaba/eea/eea_pci.c | 228 ++++++++-
 drivers/net/ethernet/alibaba/eea/eea_pci.h |  20 +
 drivers/net/ethernet/alibaba/eea/eea_rx.c  | 264 ++++++++++
 drivers/net/ethernet/alibaba/eea/eea_tx.c  | 102 ++++
 7 files changed, 1225 insertions(+), 11 deletions(-)
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_rx.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_tx.c

diff --git a/drivers/net/ethernet/alibaba/eea/Makefile b/drivers/net/ethernet/alibaba/eea/Makefile
index 91f318e8e046..fa34a005fa01 100644
--- a/drivers/net/ethernet/alibaba/eea/Makefile
+++ b/drivers/net/ethernet/alibaba/eea/Makefile
@@ -3,4 +3,6 @@ obj-$(CONFIG_EEA) += eea.o
 eea-y := eea_ring.o \
 	eea_net.o \
 	eea_pci.o \
-	eea_adminq.o
+	eea_adminq.o \
+	eea_tx.o \
+	eea_rx.o
diff --git a/drivers/net/ethernet/alibaba/eea/eea_net.c b/drivers/net/ethernet/alibaba/eea/eea_net.c
index 6df65908a215..6bbf3f01a473 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_net.c
+++ b/drivers/net/ethernet/alibaba/eea/eea_net.c
@@ -20,6 +20,462 @@
 #define EEA_NET_IO_RING_DEPTH_MAX (32 * 1024)
 #define EEA_NET_IO_RING_DEPTH_MIN 128
 
+static irqreturn_t eea_irq_handler(int irq, void *data)
+{
+	struct eea_irq_blk *blk = data;
+
+	napi_schedule_irqoff(&blk->napi);
+
+	return IRQ_HANDLED;
+}
+
+static void eea_free_irq_blk(struct eea_net *enet)
+{
+	struct eea_irq_blk *blk;
+	u32 num;
+	int i;
+
+	if (!enet->irq_blks)
+		return;
+
+	num = enet->edev->rx_num;
+
+	for (i = 0; i < num; i++) {
+		blk = &enet->irq_blks[i];
+
+		if (blk->ready)
+			eea_pci_free_irq(blk);
+
+		blk->ready = false;
+	}
+
+	kvfree(enet->irq_blks);
+	enet->irq_blks = NULL;
+}
+
+/* The driver will always attempt to allocate IRQ blocks based on the maximum
+ * possible queue num.
+ */
+static int eea_alloc_irq_blks(struct eea_net *enet)
+{
+	struct eea_device *edev = enet->edev;
+	struct eea_irq_blk *blk, *irq_blks;
+	int i, err, num;
+
+	num = enet->edev->rx_num;
+
+	irq_blks = kvcalloc(num, sizeof(*blk), GFP_KERNEL);
+	if (!irq_blks)
+		return -ENOMEM;
+
+	enet->irq_blks = irq_blks;
+
+	for (i = 0; i < num; i++) {
+		blk = &irq_blks[i];
+		blk->idx = i;
+
+		/* vec 0 is for error notify. */
+		blk->msix_vec = i + 1;
+
+		err = eea_pci_request_irq(edev, blk, eea_irq_handler);
+		if (err)
+			goto err_free_irq_blk;
+
+		blk->ready = true;
+	}
+
+	return 0;
+
+err_free_irq_blk:
+	eea_free_irq_blk(enet);
+	return err;
+}
+
+static int eea_update_queues(struct eea_net *enet)
+{
+	return netif_set_real_num_queues(enet->netdev, enet->cfg.tx_ring_num,
+					 enet->cfg.rx_ring_num);
+}
+
+void eea_init_ctx(struct eea_net *enet, struct eea_net_init_ctx *ctx)
+{
+	memset(ctx, 0, sizeof(*ctx));
+
+	ctx->netdev = enet->netdev;
+	ctx->edev = enet->edev;
+	ctx->cfg = enet->cfg;
+}
+
+static void eea_bind_q_and_cfg(struct eea_net *enet,
+			       struct eea_net_init_ctx *ctx)
+{
+	struct eea_irq_blk *blk;
+	struct eea_net_rx *rx;
+	struct eea_net_tx *tx;
+	int i;
+
+	enet->cfg = ctx->cfg;
+	enet->rx = ctx->rx;
+	enet->tx = ctx->tx;
+
+	for (i = 0; i < ctx->cfg.rx_ring_num; i++) {
+		blk = &enet->irq_blks[i];
+
+		rx = ctx->rx[i];
+		tx = &ctx->tx[i];
+
+		rx->enet = enet;
+		rx->napi = &blk->napi;
+		rx->ering->msix_vec = blk->msix_vec;
+
+		tx->enet = enet;
+		tx->ering->msix_vec = blk->msix_vec;
+
+		blk->rx = rx;
+	}
+}
+
+static void eea_unbind_q_and_cfg(struct eea_net *enet,
+				 struct eea_net_init_ctx *ctx)
+{
+	struct eea_irq_blk *blk;
+	struct eea_net_rx *rx;
+	int i;
+
+	ctx->cfg = enet->cfg;
+	ctx->rx = enet->rx;
+	ctx->tx = enet->tx;
+
+	enet->rx = NULL;
+	enet->tx = NULL;
+
+	for (i = 0; i < ctx->cfg.rx_ring_num; i++) {
+		blk = &enet->irq_blks[i];
+
+		rx = ctx->rx[i];
+
+		rx->napi = NULL;
+
+		blk->rx = NULL;
+	}
+}
+
+static void eea_free_rxtx_q_mem(struct eea_net_init_ctx *ctx)
+{
+	struct eea_net_rx *rx;
+	struct eea_net_tx *tx;
+	int i;
+
+	for (i = 0; i < ctx->cfg.rx_ring_num; i++) {
+		rx = ctx->rx[i];
+		tx = &ctx->tx[i];
+
+		eea_free_rx(rx, &ctx->cfg);
+		eea_free_tx(tx, &ctx->cfg);
+	}
+
+	kvfree(ctx->rx);
+	kvfree(ctx->tx);
+}
+
+/* alloc tx/rx: struct, ring, meta, pp, napi */
+static int eea_alloc_rxtx_q_mem(struct eea_net_init_ctx *ctx)
+{
+	struct eea_net_rx *rx;
+	struct eea_net_tx *tx;
+	int err, i;
+
+	ctx->tx = kvcalloc(ctx->cfg.tx_ring_num, sizeof(*ctx->tx), GFP_KERNEL);
+	if (!ctx->tx)
+		return -ENOMEM;
+
+	ctx->rx = kvcalloc(ctx->cfg.rx_ring_num, sizeof(*ctx->rx), GFP_KERNEL);
+	if (!ctx->rx)
+		goto err_free_tx;
+
+	ctx->cfg.rx_sq_desc_size = sizeof(struct eea_rx_desc);
+	ctx->cfg.rx_cq_desc_size = sizeof(struct eea_rx_cdesc);
+	ctx->cfg.tx_sq_desc_size = sizeof(struct eea_tx_desc);
+	ctx->cfg.tx_cq_desc_size = sizeof(struct eea_tx_cdesc);
+
+	ctx->cfg.tx_cq_desc_size /= 2;
+
+	if (!ctx->cfg.split_hdr)
+		ctx->cfg.rx_sq_desc_size /= 2;
+
+	for (i = 0; i < ctx->cfg.rx_ring_num; i++) {
+		rx = eea_alloc_rx(ctx, i);
+		if (!rx)
+			goto err_free;
+
+		ctx->rx[i] = rx;
+
+		tx = ctx->tx + i;
+		err = eea_alloc_tx(ctx, tx, i);
+		if (err)
+			goto err_free;
+	}
+
+	return 0;
+
+err_free:
+	for (i = 0; i < ctx->cfg.rx_ring_num; i++) {
+		rx = ctx->rx[i];
+		tx = ctx->tx + i;
+
+		eea_free_rx(rx, &ctx->cfg);
+		eea_free_tx(tx, &ctx->cfg);
+	}
+
+	kvfree(ctx->rx);
+
+err_free_tx:
+	kvfree(ctx->tx);
+	return -ENOMEM;
+}
+
+static int eea_hw_active_ring(struct eea_net *enet)
+{
+	return eea_adminq_create_q(enet, enet->cfg.rx_ring_num
+				   + enet->cfg.tx_ring_num, 0);
+}
+
+static int eea_hw_unactive_ring(struct eea_net *enet)
+{
+	int err;
+
+	err = eea_adminq_destroy_all_q(enet);
+	if (err)
+		netdev_warn(enet->netdev, "unactive rxtx ring failed.\n");
+
+	return err;
+}
+
+/* stop rx napi, stop tx queue. */
+static void eea_stop_rxtx(struct net_device *netdev)
+{
+	struct eea_net *enet = netdev_priv(netdev);
+	int i;
+
+	netif_tx_disable(netdev);
+
+	for (i = 0; i < enet->cfg.rx_ring_num; i++)
+		enet_rx_stop(enet->rx[i]);
+
+	netif_carrier_off(netdev);
+}
+
+static void eea_start_rxtx(struct eea_net *enet)
+{
+	int i;
+
+	for (i = 0; i < enet->cfg.rx_ring_num; i++)
+		enet_rx_start(enet->rx[i]);
+
+	netif_tx_start_all_queues(enet->netdev);
+	netif_carrier_on(enet->netdev);
+
+	enet->started = true;
+}
+
+static int eea_netdev_stop(struct net_device *netdev)
+{
+	struct eea_net *enet = netdev_priv(netdev);
+	struct eea_net_init_ctx ctx;
+
+	/* This function can be called during device anomaly recovery. To
+	 * prevent duplicate stop operations, the `started` flag is introduced
+	 * for checking.
+	 */
+
+	if (!enet->started) {
+		netdev_warn(netdev, "eea netdev stop: but dev is not started.\n");
+		return 0;
+	}
+
+	eea_stop_rxtx(netdev);
+	eea_hw_unactive_ring(enet);
+	eea_unbind_q_and_cfg(enet, &ctx);
+	eea_free_rxtx_q_mem(&ctx);
+
+	enet->started = false;
+
+	return 0;
+}
+
+static int eea_netdev_open(struct net_device *netdev)
+{
+	struct eea_net *enet = netdev_priv(netdev);
+	struct eea_net_init_ctx ctx;
+	int err;
+
+	if (enet->link_err) {
+		netdev_err(netdev, "netdev open err, because link error: %d\n",
+			   enet->link_err);
+		return -EBUSY;
+	}
+
+	eea_init_ctx(enet, &ctx);
+
+	err = eea_alloc_rxtx_q_mem(&ctx);
+	if (err)
+		goto err_done;
+
+	eea_bind_q_and_cfg(enet, &ctx);
+
+	err = eea_update_queues(enet);
+	if (err)
+		goto err_free_q;
+
+	err = eea_hw_active_ring(enet);
+	if (err)
+		goto err_free_q;
+
+	eea_start_rxtx(enet);
+
+	return 0;
+
+err_free_q:
+	eea_unbind_q_and_cfg(enet, &ctx);
+	eea_free_rxtx_q_mem(&ctx);
+
+err_done:
+	return err;
+}
+
+/* resources: ring, buffers, irq */
+int eea_reset_hw_resources(struct eea_net *enet, struct eea_net_init_ctx *ctx)
+{
+	struct eea_net_init_ctx ctx_old = {0};
+	int err, error;
+
+	if (!netif_running(enet->netdev) || !enet->started) {
+		enet->cfg = ctx->cfg;
+		return 0;
+	}
+
+	err = eea_alloc_rxtx_q_mem(ctx);
+	if (err) {
+		netdev_warn(enet->netdev,
+			    "eea reset: alloc q failed. stop reset. err %d\n",
+			    err);
+		return err;
+	}
+
+	eea_stop_rxtx(enet->netdev);
+	eea_hw_unactive_ring(enet);
+
+	eea_unbind_q_and_cfg(enet, &ctx_old);
+	eea_bind_q_and_cfg(enet, ctx);
+
+	err = eea_update_queues(enet);
+	if (err) {
+		netdev_err(enet->netdev,
+			   "eea reset: set real num queues failed. err %d\n",
+			   err);
+		goto err_bind_old;
+	}
+
+	err = eea_hw_active_ring(enet);
+	if (err) {
+		netdev_err(enet->netdev, "eea reset: active new ring. err %d\n",
+			   err);
+		eea_unbind_q_and_cfg(enet, ctx);
+		goto err_free_q;
+	}
+
+	eea_start_rxtx(enet);
+	eea_free_rxtx_q_mem(&ctx_old);
+	return 0;
+
+err_bind_old:
+	eea_unbind_q_and_cfg(enet, ctx);
+	eea_bind_q_and_cfg(enet, &ctx_old);
+	error = eea_hw_active_ring(enet);
+	if (error) {
+		netdev_err(enet->netdev, "eea reset: active old ring. err %d\n",
+			   error);
+		eea_unbind_q_and_cfg(enet, &ctx_old);
+		err = error;
+		goto err_free_q;
+	}
+
+	eea_start_rxtx(enet);
+	eea_free_rxtx_q_mem(ctx);
+	return err;
+
+err_free_q:
+
+	/* An exception occurred at the hardware level, and there's not much we
+	 * can do about it -- we can only release the resources first.
+	 */
+	eea_free_rxtx_q_mem(ctx);
+	eea_free_rxtx_q_mem(&ctx_old);
+	enet->started = false;
+	return err;
+}
+
+int eea_queues_check_and_reset(struct eea_device *edev)
+{
+	struct eea_aq_queue_status *qstatus;
+	struct eea_aq_dev_status *dstatus;
+	struct eea_aq_queue_status *qs;
+	struct eea_net_init_ctx ctx;
+	bool need_reset = false;
+	int num, i, err = 0;
+
+	rtnl_lock();
+
+	if (!netif_running(edev->enet->netdev))
+		goto err_unlock;
+
+	/* Maybe stopped by ha. */
+	if (!edev->enet->started || edev->enet->link_err)
+		goto err_unlock;
+
+	num = edev->enet->cfg.rx_ring_num + edev->enet->cfg.tx_ring_num + 1;
+
+	dstatus = eea_adminq_dev_status(edev->enet);
+	if (!dstatus) {
+		netdev_warn(edev->enet->netdev, "query queue status failed.\n");
+		err = -ENOMEM;
+		goto err_unlock;
+	}
+
+	if (le16_to_cpu(dstatus->link_status) == EEA_LINK_DOWN_STATUS) {
+		/* The device is broken, can not be up. */
+		eea_netdev_stop(edev->enet->netdev);
+		edev->enet->link_err = EEA_LINK_ERR_LINK_DOWN;
+		netdev_warn(edev->enet->netdev, "device link is down. stop device.\n");
+		goto err_free;
+	}
+
+	qstatus = dstatus->q_status;
+
+	for (i = 0; i < num; ++i) {
+		qs = &qstatus[i];
+
+		if (le16_to_cpu(qs->status) == EEA_QUEUE_STATUS_NEED_RESET) {
+			netdev_warn(edev->enet->netdev,
+				    "queue status: queue %u needs to reset\n",
+				    le16_to_cpu(qs->qidx));
+			need_reset = true;
+		}
+	}
+
+	if (need_reset) {
+		eea_init_ctx(edev->enet, &ctx);
+		err = eea_reset_hw_resources(edev->enet, &ctx);
+	}
+
+err_free:
+	kfree(dstatus);
+
+err_unlock:
+	rtnl_unlock();
+	return err;
+}
+
 static int eea_update_cfg(struct eea_net *enet,
 			  struct eea_device *edev,
 			  struct eea_aq_cfg *hwcfg)
@@ -151,6 +607,9 @@ static int eea_netdev_init_features(struct net_device *netdev,
 }
 
 static const struct net_device_ops eea_netdev = {
+	.ndo_open           = eea_netdev_open,
+	.ndo_stop           = eea_netdev_stop,
+	.ndo_start_xmit     = eea_tx_xmit,
 	.ndo_validate_addr  = eth_validate_addr,
 	.ndo_features_check = passthru_features_check,
 };
@@ -159,6 +618,7 @@ static struct eea_net *eea_netdev_alloc(struct eea_device *edev, u32 pairs)
 {
 	struct net_device *netdev;
 	struct eea_net *enet;
+	int err;
 
 	netdev = alloc_etherdev_mq(sizeof(struct eea_net), pairs);
 	if (!netdev) {
@@ -175,14 +635,83 @@ static struct eea_net *eea_netdev_alloc(struct eea_device *edev, u32 pairs)
 	enet->edev = edev;
 	edev->enet = enet;
 
+	err = eea_alloc_irq_blks(enet);
+	if (err) {
+		dev_err(edev->dma_dev,
+			"eea_alloc_irq_blks failed with pairs %d\n", pairs);
+		free_netdev(netdev);
+		return NULL;
+	}
+
 	return enet;
 }
 
+static void eea_update_ts_off(struct eea_device *edev, struct eea_net *enet)
+{
+	u64 ts;
+
+	ts = eea_pci_device_ts(edev);
+
+	enet->hw_ts_offset = ktime_get_real() - ts;
+}
+
+static int eea_net_reprobe(struct eea_device *edev)
+{
+	struct eea_net *enet = edev->enet;
+	int err = 0;
+
+	enet->edev = edev;
+
+	if (!enet->adminq.ring) {
+		err = eea_create_adminq(enet, edev->rx_num + edev->tx_num);
+		if (err)
+			return err;
+	}
+
+	err = eea_alloc_irq_blks(enet);
+	if (err)
+		goto err_destroy_aq;
+
+	eea_update_ts_off(edev, enet);
+
+	rtnl_lock();
+
+	enet->link_err = 0;
+	if (edev->ha_reset_netdev_running &&
+	    netif_running(edev->enet->netdev)) {
+		err = eea_netdev_open(enet->netdev);
+		if (err) {
+			enet->link_err = EEA_LINK_ERR_HA_RESET_DEV;
+			rtnl_unlock();
+			goto err_free_irq_blks;
+		}
+	}
+
+	rtnl_unlock();
+
+	enet->wait_pci_ready = false;
+	return 0;
+
+err_free_irq_blks:
+	eea_free_irq_blk(enet);
+
+err_destroy_aq:
+	eea_destroy_adminq(enet);
+
+	return err;
+}
+
 int eea_net_probe(struct eea_device *edev)
 {
 	struct eea_net *enet;
 	int err = -ENOMEM;
 
+	/* If edev->enet is not null, then this is called from ha reset worker.
+	 * Call eea_net_reprobe() directly.
+	 */
+	if (edev->enet)
+		return eea_net_reprobe(edev);
+
 	enet = eea_netdev_alloc(edev, edev->rx_num);
 	if (!enet)
 		return -ENOMEM;
@@ -199,6 +728,8 @@ int eea_net_probe(struct eea_device *edev)
 	if (err)
 		goto err_reset_dev;
 
+	eea_update_ts_off(edev, enet);
+
 	netdev_dbg(enet->netdev, "eea probe success.\n");
 
 	return 0;
@@ -208,11 +739,34 @@ int eea_net_probe(struct eea_device *edev)
 	eea_destroy_adminq(enet);
 
 err_free_netdev:
+	eea_free_irq_blk(enet);
 	free_netdev(enet->netdev);
 	return err;
 }
 
-void eea_net_remove(struct eea_device *edev)
+static void eea_net_ha_reset_remove(struct eea_net *enet,
+				    struct eea_device *edev)
+{
+	rtnl_lock();
+	edev->ha_reset_netdev_running = false;
+	if (netif_running(enet->netdev)) {
+		eea_netdev_stop(enet->netdev);
+		edev->ha_reset_netdev_running = true;
+	}
+
+	/* Prevent that the user set up the net device. */
+	enet->link_err = EEA_LINK_ERR_HA_RESET_DEV;
+
+	rtnl_unlock();
+
+	eea_device_reset(edev);
+	eea_destroy_adminq(enet);
+	eea_free_irq_blk(enet);
+
+	enet->wait_pci_ready = true;
+}
+
+void eea_net_remove(struct eea_device *edev, bool ha)
 {
 	struct net_device *netdev;
 	struct eea_net *enet;
@@ -220,11 +774,19 @@ void eea_net_remove(struct eea_device *edev)
 	enet = edev->enet;
 	netdev = enet->netdev;
 
-	netdev_dbg(enet->netdev, "eea removed.\n");
+	if (ha) {
+		if (enet->wait_pci_ready)
+			return;
 
-	eea_device_reset(edev);
+		eea_net_ha_reset_remove(enet, edev);
+		return;
+	}
 
-	eea_destroy_adminq(enet);
+	if (!enet->wait_pci_ready) {
+		eea_device_reset(edev);
+		eea_destroy_adminq(enet);
+		eea_free_irq_blk(enet);
+	}
 
 	free_netdev(netdev);
 }
diff --git a/drivers/net/ethernet/alibaba/eea/eea_net.h b/drivers/net/ethernet/alibaba/eea/eea_net.h
index 239312456c5b..ca35d28211fc 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_net.h
+++ b/drivers/net/ethernet/alibaba/eea/eea_net.h
@@ -18,6 +18,8 @@
 #define EEA_VER_MINOR		0
 #define EEA_VER_SUB_MINOR	0
 
+struct eea_tx_meta;
+
 struct eea_net_tx {
 	struct eea_net *enet;
 
@@ -100,6 +102,18 @@ struct eea_net_cfg {
 	u8 tx_cq_desc_size;
 
 	u32 split_hdr;
+
+	struct hwtstamp_config ts_cfg;
+};
+
+struct eea_net_init_ctx {
+	struct eea_net_cfg cfg;
+
+	struct eea_net_tx *tx;
+	struct eea_net_rx **rx;
+
+	struct net_device *netdev;
+	struct eea_device *edev;
 };
 
 enum {
@@ -108,6 +122,17 @@ enum {
 	EEA_LINK_ERR_LINK_DOWN,
 };
 
+struct eea_irq_blk {
+	struct napi_struct napi;
+	u16 msix_vec;
+	bool ready;
+	struct eea_net_rx *rx;
+	char irq_name[32];
+	int irq;
+	int idx;
+
+};
+
 struct eea_net {
 	struct eea_device *edev;
 	struct net_device *netdev;
@@ -120,9 +145,12 @@ struct eea_net {
 	struct eea_net_cfg cfg;
 	struct eea_net_cfg cfg_hw;
 
+	struct eea_irq_blk *irq_blks;
+
 	u32 link_err;
 
 	bool started;
+	bool wait_pci_ready;
 
 	u8 duplex;
 	u32 speed;
@@ -131,6 +159,24 @@ struct eea_net {
 };
 
 int eea_net_probe(struct eea_device *edev);
-void eea_net_remove(struct eea_device *edev);
+void eea_net_remove(struct eea_device *edev, bool ha);
+
+int eea_reset_hw_resources(struct eea_net *enet, struct eea_net_init_ctx *ctx);
+void eea_init_ctx(struct eea_net *enet, struct eea_net_init_ctx *ctx);
+int eea_queues_check_and_reset(struct eea_device *edev);
+
+/* rx apis */
+void enet_rx_stop(struct eea_net_rx *rx);
+void enet_rx_start(struct eea_net_rx *rx);
+
+void eea_free_rx(struct eea_net_rx *rx, struct eea_net_cfg *cfg);
+struct eea_net_rx *eea_alloc_rx(struct eea_net_init_ctx *ctx, u32 idx);
+
+/* tx apis */
+bool eea_poll_tx(struct eea_net_tx *tx, int budget);
+netdev_tx_t eea_tx_xmit(struct sk_buff *skb, struct net_device *netdev);
+
+void eea_free_tx(struct eea_net_tx *tx, struct eea_net_cfg *cfg);
+int eea_alloc_tx(struct eea_net_init_ctx *ctx, struct eea_net_tx *tx, u32 idx);
 
 #endif
diff --git a/drivers/net/ethernet/alibaba/eea/eea_pci.c b/drivers/net/ethernet/alibaba/eea/eea_pci.c
index 7d8ba2785754..7d2a9dd575b8 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_pci.c
+++ b/drivers/net/ethernet/alibaba/eea/eea_pci.c
@@ -16,6 +16,9 @@
 #define EEA_PCI_DB_MAX_SIZE 512
 #define EEA_PCI_Q_MAX_NUM 1000
 
+#define EEA_PCI_CAP_RESET_DEVICE 0xFA
+#define EEA_PCI_CAP_RESET_FLAG BIT(1)
+
 struct eea_pci_cfg {
 	__le32 reserve0;
 	__le32 reserve1;
@@ -57,8 +60,10 @@ struct eea_pci_device {
 	void __iomem *db_base;
 	void __iomem *db_end;
 
+	struct work_struct ha_handle_work;
 	char ha_irq_name[32];
 	u8 reset_pos;
+	bool ha_ready;
 };
 
 #define cfg_pointer(reg, item) \
@@ -73,6 +78,11 @@ struct eea_pci_device {
 #define cfg_read32(reg, item) ioread32(cfg_pointer(reg, item))
 #define cfg_read64(reg, item) ioread64(cfg_pointer(reg, item))
 
+/* Due to circular references, we have to add forward declarations here. */
+static int __eea_pci_probe(struct pci_dev *pci_dev,
+			   struct eea_pci_device *ep_dev, bool pci_probe);
+static void __eea_pci_remove(struct pci_dev *pci_dev, bool pci_remove);
+
 const char *eea_pci_name(struct eea_device *edev)
 {
 	return pci_name(edev->ep_dev->pci_dev);
@@ -179,6 +189,12 @@ static int eea_negotiate(struct eea_device *edev)
 static void eea_pci_release_resource(struct eea_pci_device *ep_dev)
 {
 	struct pci_dev *pci_dev = ep_dev->pci_dev;
+	struct eea_device *edev;
+
+	edev = &ep_dev->edev;
+
+	if (edev->status < EEA_PCI_STATUS_READY)
+		return;
 
 	if (ep_dev->reg) {
 		pci_iounmap(pci_dev, ep_dev->reg);
@@ -192,12 +208,16 @@ static void eea_pci_release_resource(struct eea_pci_device *ep_dev)
 
 	pci_release_regions(pci_dev);
 	pci_disable_device(pci_dev);
+
+	edev->status = EEA_PCI_STATUS_NONE;
 }
 
 static int eea_pci_setup(struct pci_dev *pci_dev, struct eea_pci_device *ep_dev)
 {
 	int err, n, ret, len;
 
+	ep_dev->edev.status = EEA_PCI_STATUS_ERR;
+
 	ep_dev->pci_dev = pci_dev;
 
 	err = pci_enable_device(pci_dev);
@@ -286,6 +306,8 @@ static int eea_pci_setup(struct pci_dev *pci_dev, struct eea_pci_device *ep_dev)
 
 	ep_dev->msix_vec_n = ret;
 
+	ep_dev->edev.status = EEA_PCI_STATUS_READY;
+
 	return 0;
 
 err_unmap_reg:
@@ -335,6 +357,174 @@ int eea_pci_active_aq(struct eea_ring *ering, int msix_vec)
 	return 0;
 }
 
+void eea_pci_free_irq(struct eea_irq_blk *blk)
+{
+	irq_update_affinity_hint(blk->irq, NULL);
+	free_irq(blk->irq, blk);
+}
+
+/* Request the interrupt for the MSI-X vector described by @blk.
+ *
+ * The irq name is built from the block index and the PCI device name. The
+ * irq is requested with IRQF_NO_AUTOEN and @blk is used as the dev_id.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int eea_pci_request_irq(struct eea_device *edev, struct eea_irq_blk *blk,
+			irqreturn_t (*callback)(int irq, void *data))
+{
+	struct pci_dev *pci_dev = edev->ep_dev->pci_dev;
+	int irq;
+
+	/* pci_irq_vector() returns a negative errno for an invalid vector;
+	 * do not record or request a bogus irq number.
+	 */
+	irq = pci_irq_vector(pci_dev, blk->msix_vec);
+	if (irq < 0)
+		return irq;
+
+	snprintf(blk->irq_name, sizeof(blk->irq_name), "eea-q%d@%s", blk->idx,
+		 pci_name(pci_dev));
+
+	blk->irq = irq;
+
+	return request_irq(irq, callback, IRQF_NO_AUTOEN, blk->irq_name, blk);
+}
+
+/* Handle a ha event: either re-probe the whole device when the device asked
+ * for a reset through the reset capability, or fall back to checking and
+ * resetting the queues.
+ */
+static void eea_ha_handle_reset(struct eea_pci_device *ep_dev)
+{
+	struct eea_device *edev = &ep_dev->edev;
+	struct pci_dev *pdev = ep_dev->pci_dev;
+	struct device *dev = &pdev->dev;
+	u16 reset;
+
+	/* No reset capability was found, only the queues can be checked. */
+	if (!ep_dev->reset_pos)
+		goto check_queues;
+
+	pci_read_config_word(pdev, ep_dev->reset_pos, &reset);
+
+	/* Clear bits using 0xFFFF and ignore all previous messages. */
+	pci_write_config_word(pdev, ep_dev->reset_pos, 0xFFFF);
+
+	if (!(reset & EEA_PCI_CAP_RESET_FLAG))
+		goto check_queues;
+
+	dev_warn(dev, "recv device reset request.\n");
+
+	/* The pci remove callback may hold this lock. If the pci remove
+	 * callback is running, the ha interrupt can be ignored.
+	 */
+	if (!mutex_trylock(&edev->ha_lock)) {
+		/* Device removal is in progress, so return directly. */
+		dev_warn(dev, "ha device reset: trylock failed.\n");
+		return;
+	}
+
+	if (edev->status != EEA_PCI_STATUS_DONE) {
+		dev_err(dev, "ha: reset device: pci status is %d. skip it.\n",
+			edev->status);
+		goto unlock;
+	}
+
+	__eea_pci_remove(pdev, false);
+
+	/* If PCI initialization or network device re-probing fails here,
+	 * the remaining resources are released when the PCI subsystem
+	 * calls the remove callback.
+	 */
+	if (__eea_pci_probe(pdev, ep_dev, false))
+		dev_err(dev, "ha: re-setup failed.\n");
+
+unlock:
+	mutex_unlock(&edev->ha_lock);
+	return;
+
+check_queues:
+	eea_queues_check_and_reset(edev);
+}
+
+/* Work item scheduled by the ha interrupt handler. */
+static void eea_ha_handle_work(struct work_struct *work)
+{
+	struct eea_pci_device *ep_dev;
+
+	ep_dev = container_of(work, struct eea_pci_device, ha_handle_work);
+
+	/* The ha interrupt was triggered, so there may be an error; we may
+	 * need to reset the device or reset some queues.
+	 */
+	dev_warn(&ep_dev->pci_dev->dev, "recv ha interrupt.\n");
+
+	eea_ha_handle_reset(ep_dev);
+}
+
+static irqreturn_t eea_pci_ha_handle(int irq, void *data)
+{
+	struct eea_device *edev = data;
+
+	schedule_work(&edev->ep_dev->ha_handle_work);
+
+	return IRQ_HANDLED;
+}
+
+/* Release the ha interrupt (vector 0) if it is currently requested. */
+static void eea_pci_free_ha_irq(struct eea_device *edev)
+{
+	struct eea_pci_device *ep_dev = edev->ep_dev;
+
+	if (!ep_dev->ha_ready)
+		return;
+
+	free_irq(pci_irq_vector(ep_dev->pci_dev, 0), edev);
+	ep_dev->ha_ready = false;
+}
+
+/* Request the ha interrupt (vector 0) and look up the device-reset
+ * capability.
+ *
+ * @pci_probe is true on the initial PCI probe, in which case the ha work
+ * item is initialized as well. When the vendor capability is found, the
+ * driver advertises support for it and records the config-space offset of
+ * the device field in reset_pos; otherwise reset_pos stays 0.
+ *
+ * Returns 0 on success or a negative errno if the irq cannot be obtained.
+ */
+static int eea_pci_ha_init(struct eea_device *edev, struct pci_dev *pci_dev,
+			   bool pci_probe)
+{
+	u8 pos, cfg_type_off, type, cfg_drv_off, cfg_dev_off;
+	struct eea_pci_device *ep_dev = edev->ep_dev;
+	int irq, err;
+
+	if (pci_probe)
+		INIT_WORK(&ep_dev->ha_handle_work, eea_ha_handle_work);
+
+	/* pci_irq_vector() returns a negative errno for an invalid vector. */
+	irq = pci_irq_vector(pci_dev, 0);
+	if (irq < 0)
+		return irq;
+
+	snprintf(ep_dev->ha_irq_name, sizeof(ep_dev->ha_irq_name), "eea-ha@%s",
+		 pci_name(pci_dev));
+
+	/* This irq is not used only for ha, so always request it. */
+	err = request_irq(irq, eea_pci_ha_handle, 0, ep_dev->ha_irq_name, edev);
+	if (err)
+		return err;
+
+	ep_dev->ha_ready = true;
+	ep_dev->reset_pos = 0;
+
+	cfg_type_off = offsetof(struct eea_pci_cap, cfg_type);
+	cfg_drv_off = offsetof(struct eea_pci_reset_reg, driver);
+	cfg_dev_off = offsetof(struct eea_pci_reset_reg, device);
+
+	for (pos = pci_find_capability(pci_dev, PCI_CAP_ID_VNDR);
+	     pos > 0;
+	     pos = pci_find_next_capability(pci_dev, pos, PCI_CAP_ID_VNDR)) {
+		pci_read_config_byte(pci_dev, pos + cfg_type_off, &type);
+
+		if (type != EEA_PCI_CAP_RESET_DEVICE)
+			continue;
+
+		/* Notify the device that the driver supports this feature. */
+		pci_write_config_word(pci_dev, pos + cfg_drv_off,
+				      EEA_PCI_CAP_RESET_FLAG);
+		pci_write_config_word(pci_dev, pos + cfg_dev_off, 0xFFFF);
+
+		ep_dev->reset_pos = pos + cfg_dev_off;
+		return 0;
+	}
+
+	/* Without the capability the irq is used for event notification
+	 * only.
+	 */
+	dev_warn(&pci_dev->dev, "Not Found reset cap.\n");
+	return 0;
+}
+
 u64 eea_pci_device_ts(struct eea_device *edev)
 {
 	struct eea_pci_device *ep_dev = edev->ep_dev;
@@ -367,12 +557,16 @@ static int eea_init_device(struct eea_device *edev)
 }
 
 static int __eea_pci_probe(struct pci_dev *pci_dev,
-			   struct eea_pci_device *ep_dev)
+			   struct eea_pci_device *ep_dev,
+			   bool pci_probe)
 {
+	struct eea_device *edev;
 	int err;
 
 	pci_set_drvdata(pci_dev, ep_dev);
 
+	edev = &ep_dev->edev;
+
 	err = eea_pci_setup(pci_dev, ep_dev);
 	if (err)
 		return err;
@@ -381,20 +575,34 @@ static int __eea_pci_probe(struct pci_dev *pci_dev,
 	if (err)
 		goto err_pci_rel;
 
+	err = eea_pci_ha_init(edev, pci_dev, pci_probe);
+	if (err)
+		goto err_net_rm;
+
+	edev->status = EEA_PCI_STATUS_DONE;
+
 	return 0;
 
+err_net_rm:
+	eea_net_remove(edev, !pci_probe);
+
 err_pci_rel:
 	eea_pci_release_resource(ep_dev);
 	return err;
 }
 
-static void __eea_pci_remove(struct pci_dev *pci_dev)
+static void __eea_pci_remove(struct pci_dev *pci_dev, bool pci_remove)
 {
 	struct eea_pci_device *ep_dev = pci_get_drvdata(pci_dev);
 	struct device *dev = get_device(&ep_dev->pci_dev->dev);
 	struct eea_device *edev = &ep_dev->edev;
 
-	eea_net_remove(edev);
+	eea_pci_free_ha_irq(edev);
+
+	if (pci_remove)
+		flush_work(&ep_dev->ha_handle_work);
+
+	eea_net_remove(edev, !pci_remove);
 
 	pci_disable_sriov(pci_dev);
 
@@ -421,8 +629,11 @@ static int eea_pci_probe(struct pci_dev *pci_dev,
 
 	ep_dev->pci_dev = pci_dev;
 
-	err = __eea_pci_probe(pci_dev, ep_dev);
+	mutex_init(&edev->ha_lock);
+
+	err = __eea_pci_probe(pci_dev, ep_dev, true);
 	if (err) {
+		mutex_destroy(&edev->ha_lock);
 		pci_set_drvdata(pci_dev, NULL);
 		kfree(ep_dev);
 	}
@@ -433,10 +644,17 @@ static int eea_pci_probe(struct pci_dev *pci_dev,
 static void eea_pci_remove(struct pci_dev *pci_dev)
 {
 	struct eea_pci_device *ep_dev = pci_get_drvdata(pci_dev);
+	struct eea_device *edev;
 
-	__eea_pci_remove(pci_dev);
+	edev = &ep_dev->edev;
+
+	mutex_lock(&edev->ha_lock);
+	__eea_pci_remove(pci_dev, true);
+	mutex_unlock(&edev->ha_lock);
 
 	pci_set_drvdata(pci_dev, NULL);
+
+	mutex_destroy(&edev->ha_lock);
 	kfree(ep_dev);
 }
 
diff --git a/drivers/net/ethernet/alibaba/eea/eea_pci.h b/drivers/net/ethernet/alibaba/eea/eea_pci.h
index d0094c419f59..575d0f89169d 100644
--- a/drivers/net/ethernet/alibaba/eea/eea_pci.h
+++ b/drivers/net/ethernet/alibaba/eea/eea_pci.h
@@ -10,8 +10,18 @@
 
 #include <linux/pci.h>
 
+#include "eea_net.h"
 #include "eea_ring.h"
 
+enum eea_pci_status {
+	EEA_PCI_STATUS_NONE,	/* PCI resources are released */
+	EEA_PCI_STATUS_ERR,	/* eea_pci_setup() started or failed */
+	EEA_PCI_STATUS_READY,	/* eea_pci_setup() succeeded */
+	EEA_PCI_STATUS_DONE,	/* __eea_pci_probe() completed */
+};
+
+struct eea_irq_blk;
+
 struct eea_pci_cap {
 	__u8 cap_vndr;
 	__u8 cap_next;
@@ -34,6 +44,12 @@ struct eea_device {
 
 	u64 features;
 
+	enum eea_pci_status status;
+	bool ha_reset_netdev_running;
+
+	/* ha lock for the race between ha work and pci remove */
+	struct mutex ha_lock;
+
 	u32 rx_num;
 	u32 tx_num;
 	u32 db_blk_size;
@@ -47,6 +63,10 @@ int eea_device_reset(struct eea_device *dev);
 int eea_pci_set_aq_up(struct eea_device *dev);
 int eea_pci_active_aq(struct eea_ring *ering, int msix_vec);
 
+int eea_pci_request_irq(struct eea_device *edev, struct eea_irq_blk *blk,
+			irqreturn_t (*callback)(int irq, void *data));
+void eea_pci_free_irq(struct eea_irq_blk *blk);
+
 u64 eea_pci_device_ts(struct eea_device *edev);
 
 void __iomem *eea_pci_db_addr(struct eea_device *edev, u32 off);
diff --git a/drivers/net/ethernet/alibaba/eea/eea_rx.c b/drivers/net/ethernet/alibaba/eea/eea_rx.c
new file mode 100644
index 000000000000..8019b01a4b24
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_rx.c
@@ -0,0 +1,264 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#include <net/netdev_rx_queue.h>
+#include <net/page_pool/helpers.h>
+
+#include "eea_adminq.h"
+#include "eea_net.h"
+#include "eea_pci.h"
+#include "eea_ring.h"
+
+#define EEA_ENABLE_F_NAPI        BIT(0)
+
+#define EEA_PAGE_FRAGS_NUM 1024
+
+/* Release the page attached to @meta: drop EEA_PAGE_FRAGS_NUM - meta->frags
+ * references and hand the page back to the pool once none remain.
+ */
+static void eea_free_rx_buffer(struct eea_net_rx *rx, struct eea_rx_meta *meta,
+			       bool allow_direct)
+{
+	u32 drain_count = EEA_PAGE_FRAGS_NUM - meta->frags;
+	struct page *page = meta->page;
+
+	if (!page_pool_unref_page(page, drain_count))
+		page_pool_put_unrefed_page(rx->pp, page, -1, allow_direct);
+
+	meta->page = NULL;
+}
+
+/* Undo eea_alloc_rx_hdr(): clear every header address and, for each meta
+ * that owns a header page, unmap and release that page.
+ */
+static void eea_free_rx_hdr(struct eea_net_rx *rx, struct eea_net_cfg *cfg)
+{
+	int i;
+
+	for (i = 0; i < cfg->rx_ring_depth; ++i) {
+		struct eea_rx_meta *meta = &rx->meta[i];
+
+		meta->hdr_addr = NULL;
+
+		/* Only the first meta carved from a page owns it. */
+		if (meta->hdr_page) {
+			dma_unmap_page(rx->dma_dev, meta->hdr_dma, PAGE_SIZE,
+				       DMA_FROM_DEVICE);
+			put_page(meta->hdr_page);
+			meta->hdr_page = NULL;
+		}
+	}
+}
+
+static int eea_alloc_rx_hdr(struct eea_net_init_ctx *ctx, struct eea_net_rx *rx)
+{
+	struct page *hdr_page = NULL;
+	struct eea_rx_meta *meta;
+	u32 offset = 0, hdrsize;
+	struct device *dmadev;
+	dma_addr_t dma;
+	int i;
+
+	dmadev = ctx->edev->dma_dev;
+	hdrsize = ctx->cfg.split_hdr;
+
+	for (i = 0; i < ctx->cfg.rx_ring_depth; ++i) {
+		meta = &rx->meta[i];
+		meta->hdr_page = NULL;
+
+		if (!hdr_page || offset + hdrsize > PAGE_SIZE) {
+			hdr_page = dev_alloc_page();
+			if (!hdr_page)
+				goto err;
+
+			dma = dma_map_page(dmadev, hdr_page, 0, PAGE_SIZE,
+					   DMA_FROM_DEVICE);
+
+			if (unlikely(dma_mapping_error(dmadev, dma))) {
+				put_page(hdr_page);
+				goto err;
+			}
+
+			offset = 0;
+			meta->hdr_page = hdr_page;
+		}
+
+		meta->hdr_dma = dma + offset;
+		meta->hdr_addr = page_address(hdr_page) + offset;
+		offset += hdrsize;
+	}
+
+	return 0;
+
+err:
+	eea_free_rx_hdr(rx, &ctx->cfg);
+	return -ENOMEM;
+}
+
+static int eea_poll(struct napi_struct *napi, int budget)
+{
+	/* Empty function; will be implemented in a subsequent commit. */
+	return 0;
+}
+
+/* Release every rx buffer page that is still attached to a meta entry. */
+static void eea_free_rx_buffers(struct eea_net_rx *rx, struct eea_net_cfg *cfg)
+{
+	u32 i;
+
+	for (i = 0; i < cfg->rx_ring_depth; ++i) {
+		struct eea_rx_meta *meta = &rx->meta[i];
+
+		if (meta->page)
+			eea_free_rx_buffer(rx, meta, false);
+	}
+}
+
+/* Create the page pool for rx queue @idx; the pool is sized to the rx ring
+ * depth and maps pages for DMA from the device.
+ *
+ * Returns the result of page_pool_create().
+ */
+static struct page_pool *eea_create_pp(struct eea_net_init_ctx *ctx, u32 idx)
+{
+	struct page_pool_params pp_params = {
+		.order     = 0,
+		.flags     = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
+		.pool_size = ctx->cfg.rx_ring_depth,
+		.nid       = dev_to_node(ctx->edev->dma_dev),
+		.dev       = ctx->edev->dma_dev,
+		.netdev    = ctx->netdev,
+		.dma_dir   = DMA_FROM_DEVICE,
+		.max_len   = PAGE_SIZE,
+		.queue_idx = idx,
+	};
+
+	return page_pool_create(&pp_params);
+}
+
+static void eea_destroy_page_pool(struct eea_net_rx *rx)
+{
+	if (rx->pp)
+		page_pool_destroy(rx->pp);
+}
+
+/* Stop an rx queue started by enet_rx_start(); does nothing if the queue's
+ * napi is not enabled.
+ */
+void enet_rx_stop(struct eea_net_rx *rx)
+{
+	if (!(rx->flags & EEA_ENABLE_F_NAPI))
+		return;
+
+	rx->flags &= ~EEA_ENABLE_F_NAPI;
+
+	disable_irq(rx->enet->irq_blks[rx->index].irq);
+	napi_disable(rx->napi);
+
+	page_pool_disable_direct_recycling(rx->pp);
+	netif_napi_del(rx->napi);
+}
+
+void enet_rx_start(struct eea_net_rx *rx)
+{
+	netif_napi_add(rx->enet->netdev, rx->napi, eea_poll);
+
+	page_pool_enable_direct_recycling(rx->pp, rx->napi);
+
+	napi_enable(rx->napi);
+
+	rx->flags |= EEA_ENABLE_F_NAPI;
+
+	local_bh_disable();
+	napi_schedule(rx->napi);
+	local_bh_enable();
+
+	enable_irq(rx->enet->irq_blks[rx->index].irq);
+}
+
+/* Maybe called before eea_bind_q_and_cfg. So the cfg must be passed. */
+void eea_free_rx(struct eea_net_rx *rx, struct eea_net_cfg *cfg)
+{
+	if (!rx)
+		return;
+
+	if (rx->ering) {
+		ering_free(rx->ering);
+		rx->ering = NULL;
+	}
+
+	if (rx->meta) {
+		eea_free_rx_buffers(rx, cfg);
+		eea_free_rx_hdr(rx, cfg);
+		kvfree(rx->meta);
+		rx->meta = NULL;
+	}
+
+	if (rx->pp) {
+		eea_destroy_page_pool(rx);
+		rx->pp = NULL;
+	}
+
+	kfree(rx);
+}
+
+/* Number the first @num meta entries and chain them all onto rx->free. */
+static void eea_rx_meta_init(struct eea_net_rx *rx, u32 num)
+{
+	struct eea_rx_meta *meta;
+	u32 i;
+
+	rx->free = NULL;
+
+	/* The index is unsigned to match @num. */
+	for (i = 0; i < num; ++i) {
+		meta = &rx->meta[i];
+		meta->id = i;
+		meta->next = rx->free;
+		rx->free = meta;
+	}
+}
+
+/* Allocate rx queue @idx: the ring, the meta array, the optional split
+ * header buffers and the page pool.
+ *
+ * Returns the new queue, or NULL on any failure; partially allocated
+ * resources are released through eea_free_rx().
+ */
+struct eea_net_rx *eea_alloc_rx(struct eea_net_init_ctx *ctx, u32 idx)
+{
+	struct eea_ring *ering;
+	struct eea_net_rx *rx;
+
+	rx = kzalloc(sizeof(*rx), GFP_KERNEL);
+	if (!rx)
+		return NULL;
+
+	rx->index = idx;
+	snprintf(rx->name, sizeof(rx->name), "rx.%u", idx);
+
+	/* ering */
+	ering = ering_alloc(idx * 2, ctx->cfg.rx_ring_depth, ctx->edev,
+			    ctx->cfg.rx_sq_desc_size,
+			    ctx->cfg.rx_cq_desc_size,
+			    rx->name);
+	if (!ering)
+		goto err_free_rx;
+
+	rx->ering = ering;
+
+	rx->dma_dev = ctx->edev->dma_dev;
+
+	/* meta */
+	rx->meta = kvcalloc(ctx->cfg.rx_ring_depth,
+			    sizeof(*rx->meta), GFP_KERNEL);
+	if (!rx->meta)
+		goto err_free_rx;
+
+	eea_rx_meta_init(rx, ctx->cfg.rx_ring_depth);
+
+	if (ctx->cfg.split_hdr && eea_alloc_rx_hdr(ctx, rx))
+		goto err_free_rx;
+
+	rx->pp = eea_create_pp(ctx, idx);
+	if (IS_ERR(rx->pp)) {
+		/* eea_free_rx() must not treat an ERR_PTR as a pool. */
+		rx->pp = NULL;
+		goto err_free_rx;
+	}
+
+	return rx;
+
+err_free_rx:
+	eea_free_rx(rx, &ctx->cfg);
+	return NULL;
+}
diff --git a/drivers/net/ethernet/alibaba/eea/eea_tx.c b/drivers/net/ethernet/alibaba/eea/eea_tx.c
new file mode 100644
index 000000000000..e01168d4e437
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_tx.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#include <net/netdev_queues.h>
+
+#include "eea_net.h"
+#include "eea_pci.h"
+#include "eea_ring.h"
+
+struct eea_tx_meta {
+	struct eea_tx_meta *next;
+
+	u32 id;
+
+	union {
+		struct sk_buff *skb;
+		void *data;
+	};
+
+	u32 num;
+
+	dma_addr_t dma_addr;
+	struct eea_tx_desc *desc;
+	u32 dma_len;
+};
+
+bool eea_poll_tx(struct eea_net_tx *tx, int budget)
+{
+	/* Empty function; will be implemented in a subsequent commit. */
+	return true;
+}
+
+netdev_tx_t eea_tx_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+	/* Empty function; will be implemented in a subsequent commit. */
+	dev_kfree_skb_any(skb);
+	return NETDEV_TX_OK;
+}
+
+/* Free the tx meta array. @cfg is not used here. */
+static void eea_free_meta(struct eea_net_tx *tx, struct eea_net_cfg *cfg)
+{
+	kvfree(tx->meta);
+	tx->meta = NULL;
+
+	/* The free list links entries of the array that was just freed. */
+	tx->free = NULL;
+}
+
+/* Maybe called before eea_bind_q_and_cfg. So the cfg must be passed. */
+void eea_free_tx(struct eea_net_tx *tx, struct eea_net_cfg *cfg)
+{
+	if (!tx)
+		return;
+
+	if (tx->ering) {
+		ering_free(tx->ering);
+		tx->ering = NULL;
+	}
+
+	if (tx->meta)
+		eea_free_meta(tx, cfg);
+}
+
+/* Set up the caller-provided tx queue @tx for index @idx: allocate the ring
+ * and the meta array, then chain every meta entry onto tx->free.
+ *
+ * Returns 0 on success or -ENOMEM; on failure everything allocated here is
+ * released through eea_free_tx().
+ */
+int eea_alloc_tx(struct eea_net_init_ctx *ctx, struct eea_net_tx *tx, u32 idx)
+{
+	struct eea_tx_meta *meta;
+	struct eea_ring *ering;
+	u32 i;
+
+	snprintf(tx->name, sizeof(tx->name), "tx.%u", idx);
+
+	ering = ering_alloc(idx * 2 + 1, ctx->cfg.tx_ring_depth, ctx->edev,
+			    ctx->cfg.tx_sq_desc_size,
+			    ctx->cfg.tx_cq_desc_size,
+			    tx->name);
+	if (!ering)
+		goto err_free_tx;
+
+	tx->ering = ering;
+	tx->index = idx;
+	tx->dma_dev = ctx->edev->dma_dev;
+
+	/* meta */
+	tx->meta = kvcalloc(ctx->cfg.tx_ring_depth,
+			    sizeof(*tx->meta), GFP_KERNEL);
+	if (!tx->meta)
+		goto err_free_tx;
+
+	/* Start from an empty list, as the rx side does, so a stale head
+	 * from an earlier use of @tx is never chained in.
+	 */
+	tx->free = NULL;
+
+	for (i = 0; i < ctx->cfg.tx_ring_depth; ++i) {
+		meta = &tx->meta[i];
+		meta->id = i;
+		meta->next = tx->free;
+		tx->free = meta;
+	}
+
+	return 0;
+
+err_free_tx:
+	eea_free_tx(tx, &ctx->cfg);
+	return -ENOMEM;
+}
-- 
2.32.0.3.g01195cf9f


^ permalink raw reply related

* [PATCH net-next v40 1/8] eea: introduce PCI framework
From: Xuan Zhuo @ 2026-04-09 12:21 UTC (permalink / raw)
  To: netdev
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Xuan Zhuo, Wen Gu, Philo Lu, Vadim Fedorenko,
	Dong Yibo, Jes Sorensen, Heiner Kallweit, Dust Li
In-Reply-To: <20260409122130.129416-1-xuanzhuo@linux.alibaba.com>

Add basic driver framework for the Alibaba Elastic Ethernet Adapter (EEA).

This commit implements the EEA PCI probe functionality.

Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Reviewed-by: Philo Lu <lulie@linux.alibaba.com>
Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
 MAINTAINERS                                |   8 +
 drivers/net/ethernet/Kconfig               |   1 +
 drivers/net/ethernet/Makefile              |   1 +
 drivers/net/ethernet/alibaba/Kconfig       |  28 ++
 drivers/net/ethernet/alibaba/Makefile      |   5 +
 drivers/net/ethernet/alibaba/eea/Makefile  |   3 +
 drivers/net/ethernet/alibaba/eea/eea_pci.c | 465 +++++++++++++++++++++
 drivers/net/ethernet/alibaba/eea/eea_pci.h |  50 +++
 8 files changed, 561 insertions(+)
 create mode 100644 drivers/net/ethernet/alibaba/Kconfig
 create mode 100644 drivers/net/ethernet/alibaba/Makefile
 create mode 100644 drivers/net/ethernet/alibaba/eea/Makefile
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_pci.c
 create mode 100644 drivers/net/ethernet/alibaba/eea/eea_pci.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 9d1e6d3acbac..b78a31ecd0d4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -804,6 +804,14 @@ S:	Maintained
 F:	Documentation/i2c/busses/i2c-ali1563.rst
 F:	drivers/i2c/busses/i2c-ali1563.c
 
+ALIBABA ELASTIC ETHERNET ADAPTER DRIVER
+M:	Xuan Zhuo <xuanzhuo@linux.alibaba.com>
+M:	Wen Gu <guwen@linux.alibaba.com>
+R:	Philo Lu <lulie@linux.alibaba.com>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	drivers/net/ethernet/alibaba/eea
+
 ALIBABA ELASTIC RDMA DRIVER
 M:	Cheng Xu <chengyou@linux.alibaba.com>
 M:	Kai Shen <kaishen@linux.alibaba.com>
diff --git a/drivers/net/ethernet/Kconfig b/drivers/net/ethernet/Kconfig
index bdc29d143160..63e7ddfd4aaa 100644
--- a/drivers/net/ethernet/Kconfig
+++ b/drivers/net/ethernet/Kconfig
@@ -22,6 +22,7 @@ source "drivers/net/ethernet/aeroflex/Kconfig"
 source "drivers/net/ethernet/agere/Kconfig"
 source "drivers/net/ethernet/airoha/Kconfig"
 source "drivers/net/ethernet/alacritech/Kconfig"
+source "drivers/net/ethernet/alibaba/Kconfig"
 source "drivers/net/ethernet/allwinner/Kconfig"
 source "drivers/net/ethernet/altera/Kconfig"
 source "drivers/net/ethernet/amazon/Kconfig"
diff --git a/drivers/net/ethernet/Makefile b/drivers/net/ethernet/Makefile
index 6bffb60ba644..b1eb69814333 100644
--- a/drivers/net/ethernet/Makefile
+++ b/drivers/net/ethernet/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_NET_VENDOR_ADI) += adi/
 obj-$(CONFIG_NET_VENDOR_AGERE) += agere/
 obj-$(CONFIG_NET_VENDOR_AIROHA) += airoha/
 obj-$(CONFIG_NET_VENDOR_ALACRITECH) += alacritech/
+obj-$(CONFIG_NET_VENDOR_ALIBABA) += alibaba/
 obj-$(CONFIG_NET_VENDOR_ALLWINNER) += allwinner/
 obj-$(CONFIG_ALTERA_TSE) += altera/
 obj-$(CONFIG_NET_VENDOR_AMAZON) += amazon/
diff --git a/drivers/net/ethernet/alibaba/Kconfig b/drivers/net/ethernet/alibaba/Kconfig
new file mode 100644
index 000000000000..9bd8cc9fd203
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/Kconfig
@@ -0,0 +1,28 @@
+#
+# Alibaba network device configuration
+#
+
+config NET_VENDOR_ALIBABA
+	bool "Alibaba Devices"
+	default y
+	help
+	  If you have a network (Ethernet) device belonging to this class, say Y.
+
+	  Note that the answer to this question doesn't directly affect the
+	  kernel: saying N will just cause the configurator to skip all
+	  the questions about Alibaba devices. If you say Y, you will be asked
+	  for your specific device in the following questions.
+
+if NET_VENDOR_ALIBABA
+
+config EEA
+	tristate "Alibaba Elastic Ethernet Adapter support"
+	depends on PCI_MSI
+	depends on 64BIT
+	select PAGE_POOL
+	help
+	  This driver supports Alibaba Elastic Ethernet Adapter.
+
+	  To compile this driver as a module, choose M here.
+
+endif #NET_VENDOR_ALIBABA
diff --git a/drivers/net/ethernet/alibaba/Makefile b/drivers/net/ethernet/alibaba/Makefile
new file mode 100644
index 000000000000..7980525cb086
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the Alibaba network device drivers.
+#
+
+obj-$(CONFIG_EEA) += eea/
diff --git a/drivers/net/ethernet/alibaba/eea/Makefile b/drivers/net/ethernet/alibaba/eea/Makefile
new file mode 100644
index 000000000000..cf2acf1733fd
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/Makefile
@@ -0,0 +1,3 @@
+
+obj-$(CONFIG_EEA) += eea.o
+eea-y := eea_pci.o
diff --git a/drivers/net/ethernet/alibaba/eea/eea_pci.c b/drivers/net/ethernet/alibaba/eea/eea_pci.c
new file mode 100644
index 000000000000..c7306a299ab2
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_pci.c
@@ -0,0 +1,465 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/iopoll.h>
+
+#include "eea_pci.h"
+
+#define EEA_PCI_DB_OFFSET 4096
+#define EEA_PCI_DB_MIN_SIZE 8
+#define EEA_PCI_DB_MAX_SIZE 512
+#define EEA_PCI_Q_MAX_NUM 1000
+
+struct eea_pci_cfg {
+	__le32 reserve0;
+	__le32 reserve1;
+	__le32 drv_f_idx;
+	__le32 drv_f;
+
+#define EEA_S_OK           BIT(2)
+#define EEA_S_FEATURE_DONE BIT(3)
+#define EEA_S_FAILED       BIT(7)
+	u8   device_status;
+	u8   reserved[7];
+
+	__le32 rx_num_max;
+	__le32 tx_num_max;
+	__le32 db_blk_size;
+
+	/* admin queue cfg */
+	__le16 aq_size;
+	__le16 aq_msix_vector;
+	__le32 aq_db_off;
+
+	__le32 aq_sq_addr;
+	__le32 aq_sq_addr_hi;
+	__le32 aq_cq_addr;
+	__le32 aq_cq_addr_hi;
+
+	__le32 reserved1;
+	__le64 hw_ts;
+};
+
+struct eea_pci_device {
+	struct eea_device edev;
+	struct pci_dev *pci_dev;
+
+	u32 msix_vec_n;
+	u32 db_len;
+
+	void __iomem *reg;
+	void __iomem *db_base;
+	void __iomem *db_end;
+
+	char ha_irq_name[32];
+	u8 reset_pos;
+};
+
+#define cfg_pointer(reg, item) \
+	((void __iomem *)((reg) + offsetof(struct eea_pci_cfg, item)))
+
+#define cfg_write8(reg, item, val) iowrite8(val, cfg_pointer(reg, item))
+#define cfg_write32(reg, item, val) iowrite32(val, cfg_pointer(reg, item))
+
+#define cfg_read8(reg, item) ioread8(cfg_pointer(reg, item))
+#define cfg_read32(reg, item) ioread32(cfg_pointer(reg, item))
+#define cfg_read64(reg, item) ioread64(cfg_pointer(reg, item))
+
+const char *eea_pci_name(struct eea_device *edev)
+{
+	return pci_name(edev->ep_dev->pci_dev);
+}
+
+int eea_pci_domain_nr(struct eea_device *edev)
+{
+	return pci_domain_nr(edev->ep_dev->pci_dev->bus);
+}
+
+u16 eea_pci_dev_id(struct eea_device *edev)
+{
+	return pci_dev_id(edev->ep_dev->pci_dev);
+}
+
+static void eea_pci_io_set_status(struct eea_device *edev, u8 status)
+{
+	struct eea_pci_device *ep_dev = edev->ep_dev;
+
+	cfg_write8(ep_dev->reg, device_status, status);
+}
+
+static u8 eea_pci_io_get_status(struct eea_device *edev)
+{
+	struct eea_pci_device *ep_dev = edev->ep_dev;
+
+	return cfg_read8(ep_dev->reg, device_status);
+}
+
+static void eea_add_status(struct eea_device *dev, u32 status)
+{
+	eea_pci_io_set_status(dev, eea_pci_io_get_status(dev) | status);
+}
+
+#define EEA_RESET_TIMEOUT_US (60 * 1000 * 1000)
+
+int eea_device_reset(struct eea_device *edev)
+{
+	struct eea_pci_device *ep_dev = edev->ep_dev;
+	int err;
+	u8 val;
+
+	eea_pci_io_set_status(edev, 0);
+
+	/* A longer timeout is set here to handle edge cases, though it should
+	 * return promptly in most scenarios.
+	 */
+	err = read_poll_timeout(cfg_read8, val, (!val || val == 0xFF), 20,
+				EEA_RESET_TIMEOUT_US,
+				false, ep_dev->reg, device_status);
+
+	/* Surprise PCIe Removal */
+	if (val == 0xFF)
+		return -EINVAL;
+
+	return err;
+}
+
+int eea_pci_set_aq_up(struct eea_device *edev)
+{
+	struct eea_pci_device *ep_dev = edev->ep_dev;
+	u8 status = eea_pci_io_get_status(edev);
+	int err;
+	u8 val;
+
+	eea_pci_io_set_status(edev, status | EEA_S_OK);
+
+	/* A longer timeout is set here to handle edge cases, though it should
+	 * return promptly in most scenarios.
+	 */
+	err = read_poll_timeout(cfg_read8, val, val & EEA_S_OK, 20,
+				EEA_RESET_TIMEOUT_US,
+				false, ep_dev->reg, device_status);
+
+	/* Surprise PCIe Removal */
+	if (val == 0xFF)
+		return -EINVAL;
+
+	return err;
+}
+
+static int eea_negotiate(struct eea_device *edev)
+{
+	struct eea_pci_device *ep_dev;
+	u32 status;
+
+	ep_dev = edev->ep_dev;
+
+	edev->features = 0;
+
+	cfg_write32(ep_dev->reg, drv_f_idx, 0);
+	cfg_write32(ep_dev->reg, drv_f, lower_32_bits(edev->features));
+	cfg_write32(ep_dev->reg, drv_f_idx, 1);
+	cfg_write32(ep_dev->reg, drv_f, upper_32_bits(edev->features));
+
+	eea_add_status(edev, EEA_S_FEATURE_DONE);
+	status = eea_pci_io_get_status(edev);
+	if (!(status & EEA_S_FEATURE_DONE))
+		return -ENODEV;
+
+	return 0;
+}
+
+static void eea_pci_release_resource(struct eea_pci_device *ep_dev)
+{
+	struct pci_dev *pci_dev = ep_dev->pci_dev;
+
+	if (ep_dev->reg) {
+		pci_iounmap(pci_dev, ep_dev->reg);
+		ep_dev->reg = NULL;
+	}
+
+	if (ep_dev->msix_vec_n) {
+		ep_dev->msix_vec_n = 0;
+		pci_free_irq_vectors(ep_dev->pci_dev);
+	}
+
+	pci_release_regions(pci_dev);
+	pci_disable_device(pci_dev);
+}
+
+static int eea_pci_setup(struct pci_dev *pci_dev, struct eea_pci_device *ep_dev)
+{
+	int err, n, ret, len;
+
+	ep_dev->pci_dev = pci_dev;
+
+	err = pci_enable_device(pci_dev);
+	if (err)
+		return err;
+
+	err = pci_request_regions(pci_dev, "EEA");
+	if (err)
+		goto err_disable_dev;
+
+	/* Instruct DPU to halt DMA and trigger reset. */
+	pci_clear_master(pci_dev);
+
+	/* The hardware has been notified by the 'pci_clear_master'; the DPU
+	 * will no longer initiate any DMA operations. So it is safe for
+	 * kdump/kexec.
+	 */
+	pci_set_master(pci_dev);
+
+	err = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pci_dev->dev, "Failed to enable 64-bit DMA.\n");
+		goto err_release_regions;
+	}
+
+	if (pci_resource_len(pci_dev, 0) < EEA_PCI_DB_OFFSET) {
+		dev_err(&pci_dev->dev, "Bar 0 is too small %llu\n",
+			pci_resource_len(pci_dev, 0));
+		err = -ENOMEM;
+		goto err_release_regions;
+	}
+
+	ep_dev->reg = pci_iomap(pci_dev, 0, 0);
+	if (!ep_dev->reg) {
+		dev_err(&pci_dev->dev, "Failed to map pci bar!\n");
+		err = -ENOMEM;
+		goto err_release_regions;
+	}
+
+	ep_dev->edev.rx_num = cfg_read32(ep_dev->reg, rx_num_max);
+	ep_dev->edev.tx_num = cfg_read32(ep_dev->reg, tx_num_max);
+
+	if (ep_dev->edev.rx_num > EEA_PCI_Q_MAX_NUM ||
+	    ep_dev->edev.tx_num > EEA_PCI_Q_MAX_NUM) {
+		dev_err(&pci_dev->dev, "Invalid queue num %u %u\n",
+			ep_dev->edev.rx_num,
+			ep_dev->edev.tx_num);
+		err = -EINVAL;
+		goto err_unmap_reg;
+	}
+
+	ep_dev->edev.db_blk_size = cfg_read32(ep_dev->reg, db_blk_size);
+	if (ep_dev->edev.db_blk_size > EEA_PCI_DB_MAX_SIZE ||
+	    ep_dev->edev.db_blk_size < EEA_PCI_DB_MIN_SIZE) {
+		dev_err(&pci_dev->dev, "Invalid db size %u\n",
+			ep_dev->edev.db_blk_size);
+		err = -EINVAL;
+		goto err_unmap_reg;
+	}
+
+	ep_dev->db_len = ep_dev->edev.db_blk_size * (ep_dev->edev.rx_num +
+						     ep_dev->edev.tx_num + 1);
+	ep_dev->db_base = ep_dev->reg + EEA_PCI_DB_OFFSET;
+	ep_dev->db_end = ep_dev->db_base + ep_dev->db_len;
+
+	len = ep_dev->db_end - ep_dev->reg;
+
+	if (pci_resource_len(pci_dev, 0) < len) {
+		dev_err(&pci_dev->dev, "Bar 0 is too small %llu\n",
+			pci_resource_len(pci_dev, 0));
+		err = -EINVAL;
+		goto err_unmap_reg;
+	}
+
+	/* In our design, the number of hardware interrupts matches the maximum
+	 * number of queues. If pci_alloc_irq_vectors failed, return directly.
+	 *
+	 * 2: adminq, error handle
+	 */
+	n = ep_dev->edev.rx_num + ep_dev->edev.tx_num + 2;
+	ret = pci_alloc_irq_vectors(ep_dev->pci_dev, n, n, PCI_IRQ_MSIX);
+	if (ret != n) {
+		err = ret;
+		goto err_unmap_reg;
+	}
+
+	ep_dev->msix_vec_n = ret;
+
+	return 0;
+
+err_unmap_reg:
+	pci_iounmap(pci_dev, ep_dev->reg);
+	ep_dev->reg = NULL;
+
+err_release_regions:
+	pci_release_regions(pci_dev);
+
+err_disable_dev:
+	pci_disable_device(pci_dev);
+
+	return err;
+}
+
+/* Translate doorbell offset @off into a mapped address.
+ *
+ * Returns NULL when @off is not 8-byte aligned or lies beyond the last
+ * doorbell block.
+ */
+void __iomem *eea_pci_db_addr(struct eea_device *edev, u32 off)
+{
+	struct eea_pci_device *ep_dev = edev->ep_dev;
+	u32 last_off = ep_dev->db_len - edev->db_blk_size;
+
+	if (!IS_ALIGNED(off, 8) || off > last_off)
+		return NULL;
+
+	return ep_dev->db_base + off;
+}
+
+u64 eea_pci_device_ts(struct eea_device *edev)
+{
+	struct eea_pci_device *ep_dev = edev->ep_dev;
+
+	return cfg_read64(ep_dev->reg, hw_ts);
+}
+
+static int eea_init_device(struct eea_device *edev)
+{
+	int err;
+
+	err = eea_device_reset(edev);
+	if (err)
+		return err;
+
+	eea_pci_io_set_status(edev, BIT(0) | BIT(1));
+
+	err = eea_negotiate(edev);
+	if (err)
+		goto err;
+
+	/* do net device probe ... */
+
+	return 0;
+err:
+	eea_add_status(edev, EEA_S_FAILED);
+	return err;
+}
+
+static int __eea_pci_probe(struct pci_dev *pci_dev,
+			   struct eea_pci_device *ep_dev)
+{
+	int err;
+
+	pci_set_drvdata(pci_dev, ep_dev);
+
+	err = eea_pci_setup(pci_dev, ep_dev);
+	if (err)
+		return err;
+
+	err = eea_init_device(&ep_dev->edev);
+	if (err)
+		goto err_pci_rel;
+
+	return 0;
+
+err_pci_rel:
+	eea_pci_release_resource(ep_dev);
+	return err;
+}
+
+static void __eea_pci_remove(struct pci_dev *pci_dev)
+{
+	struct eea_pci_device *ep_dev = pci_get_drvdata(pci_dev);
+	struct device *dev = get_device(&ep_dev->pci_dev->dev);
+
+	pci_disable_sriov(pci_dev);
+
+	eea_pci_release_resource(ep_dev);
+
+	put_device(dev);
+}
+
+static int eea_pci_probe(struct pci_dev *pci_dev,
+			 const struct pci_device_id *id)
+{
+	struct eea_pci_device *ep_dev;
+	struct eea_device *edev;
+	int err;
+
+	ep_dev = kzalloc(sizeof(*ep_dev), GFP_KERNEL);
+	if (!ep_dev)
+		return -ENOMEM;
+
+	edev = &ep_dev->edev;
+
+	edev->ep_dev = ep_dev;
+	edev->dma_dev = &pci_dev->dev;
+
+	ep_dev->pci_dev = pci_dev;
+
+	err = __eea_pci_probe(pci_dev, ep_dev);
+	if (err) {
+		pci_set_drvdata(pci_dev, NULL);
+		kfree(ep_dev);
+	}
+
+	return err;
+}
+
+static void eea_pci_remove(struct pci_dev *pci_dev)
+{
+	struct eea_pci_device *ep_dev = pci_get_drvdata(pci_dev);
+
+	__eea_pci_remove(pci_dev);
+
+	pci_set_drvdata(pci_dev, NULL);
+	kfree(ep_dev);
+}
+
+/* sriov_configure callback: enable @num_vfs VFs, or disable SR-IOV when
+ * @num_vfs is 0.
+ *
+ * Returns the number of VFs enabled, 0 after disabling, -EPERM while VFs
+ * are assigned, or the error from pci_enable_sriov().
+ */
+static int eea_pci_sriov_configure(struct pci_dev *pci_dev, int num_vfs)
+{
+	int err;
+
+	if (pci_vfs_assigned(pci_dev))
+		return -EPERM;
+
+	if (!num_vfs) {
+		pci_disable_sriov(pci_dev);
+		return 0;
+	}
+
+	err = pci_enable_sriov(pci_dev, num_vfs);
+
+	return err < 0 ? err : num_vfs;
+}
+
+static const struct pci_device_id eea_pci_id_table[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_ALIBABA, 0x500B) },
+	{ 0 }
+};
+
+MODULE_DEVICE_TABLE(pci, eea_pci_id_table);
+
+static struct pci_driver eea_pci_driver = {
+	.name            = "eea",
+	.id_table        = eea_pci_id_table,
+	.probe           = eea_pci_probe,
+	.remove          = eea_pci_remove,
+	.sriov_configure = eea_pci_sriov_configure,
+};
+
+/* Module init/exit only register and unregister the PCI driver, so use the
+ * standard helper instead of open-coding them.
+ */
+module_pci_driver(eea_pci_driver);
+
+MODULE_DESCRIPTION("Driver for Alibaba Elastic Ethernet Adapter");
+MODULE_AUTHOR("Xuan Zhuo <xuanzhuo@linux.alibaba.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/ethernet/alibaba/eea/eea_pci.h b/drivers/net/ethernet/alibaba/eea/eea_pci.h
new file mode 100644
index 000000000000..be4e75b4ed2f
--- /dev/null
+++ b/drivers/net/ethernet/alibaba/eea/eea_pci.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Driver for Alibaba Elastic Ethernet Adapter.
+ *
+ * Copyright (C) 2025 Alibaba Inc.
+ */
+
+#ifndef __EEA_PCI_H__
+#define __EEA_PCI_H__
+
+#include <linux/pci.h>
+
+struct eea_pci_cap {
+	__u8 cap_vndr;
+	__u8 cap_next;
+	__u8 cap_len;
+	__u8 cfg_type;
+};
+
+struct eea_pci_reset_reg {
+	struct eea_pci_cap cap;
+	__le16 driver;
+	__le16 device;
+};
+
+struct eea_pci_device;
+
+struct eea_device {
+	struct eea_pci_device *ep_dev;
+	struct device         *dma_dev;
+	struct eea_net        *enet;
+
+	u64 features;
+
+	u32 rx_num;
+	u32 tx_num;
+	u32 db_blk_size;
+};
+
+const char *eea_pci_name(struct eea_device *edev);
+int eea_pci_domain_nr(struct eea_device *edev);
+u16 eea_pci_dev_id(struct eea_device *edev);
+
+int eea_device_reset(struct eea_device *dev);
+int eea_pci_set_aq_up(struct eea_device *dev);
+
+u64 eea_pci_device_ts(struct eea_device *edev);
+
+void __iomem *eea_pci_db_addr(struct eea_device *edev, u32 off);
+#endif
-- 
2.32.0.3.g01195cf9f


^ permalink raw reply related

* Re: [net-next v1 v1 1/5] dt-bindings: net: starfive,jh7110-dwmac: Remove JH8100
From: Andrew Lunn @ 2026-04-09 12:13 UTC (permalink / raw)
  To: Minda Chen
  Cc: Alexandre Torgue, Andrew Lunn, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Maxime Coquelin,
	Emil Renner Berthing, Rob Herring, Krzysztof Kozlowski,
	Conor Dooley, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org,
	linux-stm32@st-md-mailman.stormreply.com,
	devicetree@vger.kernel.org
In-Reply-To: <SHXPR01MB086305E0C766E8BB17AF18DCE6582@SHXPR01MB0863.CHNPR01.prod.partner.outlook.cn>

> Yes. 
> We (StarFive) stop developing on JH8100 now, And do NOT release the SoC outside.

Please add that to the commit message as the justification for removing
support.

	Andrew

^ permalink raw reply

* Re: [RFC v2 1/2] vfio: add callback to get tph info for dmabuf
From: Leon Romanovsky @ 2026-04-09 12:04 UTC (permalink / raw)
  To: Keith Busch
  Cc: Zhiping Zhang, Jason Gunthorpe, Bjorn Helgaas, linux-rdma,
	linux-pci, netdev, dri-devel, Yochai Cohen, Yishai Hadas,
	Bjorn Helgaas
In-Reply-To: <acwkAo2k41xaxdTS@kbusch-mbp>

On Tue, Mar 31, 2026 at 01:44:02PM -0600, Keith Busch wrote:
> On Tue, Mar 31, 2026 at 10:02:20PM +0300, Leon Romanovsky wrote:
> > 
> > Right, what about adding TPH fields to struct vfio_region_dma_range
> > instead of struct vfio_device_feature_dma_buf?
> 
> You might have to show me with code what you're talking about because I
> can't see any way we can add fields to any struct here without breaking
> backward compatibility.
> 
> If we can't claim bits out of the unused "flags" field for this feature,
> then my initial reply is the only sane approach: we can introduce a new
> feature and struct for it that closely mirrors the existing one, but
> with the extra hint fields.

Something like that, on top of this proposal:

diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c
index 3961afa640391..70d5ee1e3ef7b 100644
--- a/drivers/vfio/pci/vfio_pci_dmabuf.c
+++ b/drivers/vfio/pci/vfio_pci_dmabuf.c
@@ -241,9 +241,7 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
 		return -EFAULT;
 
 	if (!get_dma_buf.nr_ranges ||
-	    (get_dma_buf.flags & ~(VFIO_DMABUF_FL_TPH |
-				   VFIO_DMABUF_TPH_PH_MASK |
-				   VFIO_DMABUF_TPH_ST_MASK)))
+	    (get_dma_buf.flags & ~VFIO_DMABUF_FLAG_TPH))
 		return -EINVAL;
 
 	/*
@@ -300,13 +298,10 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
 		ret = PTR_ERR(priv->dmabuf);
 		goto err_dev_put;
 	}
-	if (get_dma_buf.flags & VFIO_DMABUF_FL_TPH) {
-		priv->steering_tag = (get_dma_buf.flags &
-				      VFIO_DMABUF_TPH_ST_MASK) >>
-				     VFIO_DMABUF_TPH_ST_SHIFT;
-		priv->ph = (get_dma_buf.flags &
-			    VFIO_DMABUF_TPH_PH_MASK) >>
-			   VFIO_DMABUF_TPH_PH_SHIFT;
+	if (get_dma_buf.flags & VFIO_DMABUF_FLAG_TPH) {
+		priv->steering_tag =
+			dma_ranges[get_dma_buf.nr_ranges + 1].tph.tag;
+		priv->ph = dma_ranges[get_dma_buf.nr_ranges + 1].tph.ph;
 	}
 	/* dma_buf_put() now frees priv */
 	INIT_LIST_HEAD(&priv->dmabufs_elm);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index e2a8962641d2c..a8b8d8b1a3278 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1497,20 +1497,30 @@ struct vfio_device_feature_bus_master {
  */
 #define VFIO_DEVICE_FEATURE_DMA_BUF 11
 
+struct vfio_region_dma_tph {
+	u16 tag;
+	u8 ph;
+};
+
 struct vfio_region_dma_range {
-	__u64 offset;
-	__u64 length;
+	union {
+		__u64 offset;
+		struct vfio_region_dma_tph tph;
+	};
+	union {
+		__u64 length;
+		__u64 reserved;
+	};
+};
+
+enum {
+	VFIO_DMABUF_FLAG_TPH = 1 << 0,
 };
 
 struct vfio_device_feature_dma_buf {
 	__u32	region_index;
 	__u32	open_flags;
 	__u32	flags;
-#define VFIO_DMABUF_FL_TPH		(1U << 0) /* TPH info is present */
-#define VFIO_DMABUF_TPH_PH_SHIFT	1         /* bits 1-2: PH (2-bit) */
-#define VFIO_DMABUF_TPH_PH_MASK	0x6U
-#define VFIO_DMABUF_TPH_ST_SHIFT	16        /* bits 16-31: steering tag */
-#define VFIO_DMABUF_TPH_ST_MASK		0xffff0000U
 	__u32	nr_ranges;
 	struct vfio_region_dma_range dma_ranges[] __counted_by(nr_ranges);
 };

^ permalink raw reply related

* Re: [PATCH net v2] nfc: pn533: allocate rx skb before consuming bytes
From: patchwork-bot+netdevbpf @ 2026-04-09 12:00 UTC (permalink / raw)
  To: Pengpeng Hou
  Cc: netdev, poeschel, duoming, rikard.falkeborn, linux-kernel, stable
In-Reply-To: <20260405094003.3-pn533-v2-pengpeng@iscas.ac.cn>

Hello:

This patch was applied to netdev/net.git (main)
by Paolo Abeni <pabeni@redhat.com>:

On Sun, 5 Apr 2026 08:40:00 +0800 you wrote:
> pn532_receive_buf() reports the number of accepted bytes to the serdev
> core. The current code consumes bytes into recv_skb and may already hand
> a complete frame to pn533_recv_frame() before allocating a fresh receive
> buffer.
> 
> If that alloc_skb() fails, the callback returns 0 even though it has
> already consumed bytes, and it leaves recv_skb as NULL for the next
> receive callback. That breaks the receive_buf() accounting contract and
> can also lead to a NULL dereference on the next skb_put_u8().
> 
> [...]

Here is the summary with links:
  - [net,v2] nfc: pn533: allocate rx skb before consuming bytes
    https://git.kernel.org/netdev/net/c/c71ba669b570

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* RE: [PATCH net v2 2/2] net: phy: micrel: remove ksz9131_resume()
From: Biju Das @ 2026-04-09 11:58 UTC (permalink / raw)
  To: Russell King
  Cc: Ovidiu Panait, andrew@lunn.ch, hkallweit1@gmail.com,
	davem@davemloft.net, edumazet@google.com, kuba@kernel.org,
	pabeni@redhat.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-renesas-soc@vger.kernel.org
In-Reply-To: <adeNzh3eu9PSdEas@shell.armlinux.org.uk>

Hi Russell King,

> -----Original Message-----
> From: Russell King <linux@armlinux.org.uk>
> Sent: 09 April 2026 12:30
> Subject: Re: [PATCH net v2 2/2] net: phy: micrel: remove ksz9131_resume()
> 
> On Thu, Apr 09, 2026 at 11:19:43AM +0000, Biju Das wrote:
> > Hi Russell King,
> >
> > > -----Original Message-----
> > > From: Russell King <linux@armlinux.org.uk>
> > > Sent: 09 April 2026 12:05
> > > Subject: Re: [PATCH net v2 2/2] net: phy: micrel: remove
> > > ksz9131_resume()
> > >
> > > On Thu, Apr 09, 2026 at 10:52:35AM +0000, Biju Das wrote:
> > > > Hi Russell King,
> > > >
> > > > Thanks for the feedback.
> > > >
> > > > > -----Original Message-----
> > > > > From: Russell King <linux@armlinux.org.uk>
> > > > > Sent: 09 April 2026 11:30
> > > > > Subject: Re: [PATCH net v2 2/2] net: phy: micrel: remove
> > > > > ksz9131_resume()
> > > > >
> > > > > phy_init_hw() will also call drv->config_intr(), so that doesn't need to be done either.
> > > > >
> > > > > It will also call drv->config_init(), which will call kszphy_config_reset().
> > > > >
> > > > > So most of kszphy_resume() becomes unnecessary. I think the only
> > > > > thing that remains would be the call to kszphy_enable_clk() -
> > > > > and is it fine to call that after
> > > phy_init_hw() ?
> > > >
> > > > It just needs kszphy_enable_clk() and phydev->drv->config_intr()
> > > > to enable PHY interrupts for suspend-to-RAM to work on RZ/G3E SMARC EVK.
> > >
> > > I think you mean WoL rather than suspend-to-RAM, although I don't
> > > see anything in micrel.c that hints that WoL is supported, so please explain why and how the PHY
> interrupt impacts suspend-to-RAM.
> >
> > This is not WoL. During Suspend-to-RAM, the DDR goes into retention
> > mode while the CPU, SoC, and PHY power is cut off.
> >
> > During resume, TF-A detects WARM_RESET, brings DDR out of retention,
> > and jumps to the PSCI resume path.
> >
> > >
> > > Note that a particular interrupt should not wake the system unless
> > > enable_irq_wake() has been called for that specific interrupt.
> >
> > If PHY interrupts are not configured during resume, no link interrupt is received and the message:
> > "renesas-gbeth 11c30000.ethernet end0: Link is Up - 1Gbps/Full - flow control rx/tx"
> > is not seen, as shown in [1].
> 
> ... and why does that happen? Is it because the PHY has lost its interrupt configuration and that needs
> to be reprogrammed?

Yes, but phy_init_hw() reconfigures the PHY interrupt during resume.
This is due to phydev->interrupts = PHY_INTERRUPT_DISABLED; in the suspend path, as you mentioned below.

> 
> If you don't disable the PHY interrupt in the suspend path, then will the call to drv->config_intr()
> via phy_init_hw() before
> phy_resume() be sufficient?

Yes, I confirm that if the PHY interrupt is not disabled in the suspend path, the call to
drv->config_intr() via phy_init_hw() before phy_resume() would be sufficient.

Cheers,
Biju


^ permalink raw reply

* [PATCH net-next 7/7] net/mlx5: Add profile to auto-enable switchdev mode at device init
From: Tariq Toukan @ 2026-04-09 11:55 UTC (permalink / raw)
  To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
	David S. Miller
  Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
	Shay Drory, Or Har-Toov, Edward Srouji, Maher Sanalla,
	Simon Horman, Moshe Shemesh, Kees Cook, Patrisious Haddad,
	Gerd Bayer, Parav Pandit, Cosmin Ratiu, Carolina Jubran, netdev,
	linux-rdma, linux-kernel, Gal Pressman, Dragos Tatulea
In-Reply-To: <20260409115550.156419-1-tariqt@nvidia.com>

From: Mark Bloch <mbloch@nvidia.com>

Deployments that always operate in switchdev mode currently require
manual devlink configuration after driver probe, which complicates
automated provisioning.

Introduce MLX5_PROF_MASK_DEF_SWITCHDEV, a new profile mask bit, and
profile index 4. When a device is initialized or reloaded with this
profile, the driver automatically switches the e-switch to switchdev
mode by calling mlx5_devlink_eswitch_mode_set() immediately after
bringing the device online.

A no-op stub of mlx5_devlink_eswitch_mode_set() is added for builds
without CONFIG_MLX5_ESWITCH.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/eswitch.h |  6 +++++
 .../net/ethernet/mellanox/mlx5/core/main.c    | 26 ++++++++++++++++++-
 include/linux/mlx5/driver.h                   |  1 +
 3 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 256ac3ad37bc..5dcca59c3125 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -1047,6 +1047,12 @@ mlx5_esw_lag_demux_rule_create(struct mlx5_eswitch *esw, u16 vport_num,
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
+static inline int
+mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
+			      struct netlink_ext_ack *extack)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_MLX5_ESWITCH */
 
 #endif /* __MLX5_ESWITCH_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index dc7f20a357d9..12f39b4b6c2a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -86,7 +86,7 @@ MODULE_PARM_DESC(debug_mask, "debug mask: 1 = dump cmd data, 2 = dump cmd exec t
 
 static unsigned int prof_sel = MLX5_DEFAULT_PROF;
 module_param_named(prof_sel, prof_sel, uint, 0444);
-MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2");
+MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 4");
 
 static u32 sw_owner_id[4];
 #define MAX_SW_VHCA_ID (BIT(__mlx5_bit_sz(cmd_hca_cap_2, sw_vhca_id)) - 1)
@@ -185,6 +185,11 @@ static struct mlx5_profile profile[] = {
 		.log_max_qp	= LOG_MAX_SUPPORTED_QPS,
 		.num_cmd_caches = 0,
 	},
+	[4] = {
+		.mask = MLX5_PROF_MASK_DEF_SWITCHDEV | MLX5_PROF_MASK_QP_SIZE,
+		.log_max_qp = LOG_MAX_SUPPORTED_QPS,
+		.num_cmd_caches = MLX5_NUM_COMMAND_CACHES,
+	},
 };
 
 static int wait_fw_init(struct mlx5_core_dev *dev, u32 max_wait_mili,
@@ -1451,6 +1456,17 @@ static void mlx5_unload(struct mlx5_core_dev *dev)
 	mlx5_free_bfreg(dev, &dev->priv.bfreg);
 }
 
+static void mlx5_set_default_switchdev(struct mlx5_core_dev *dev)
+{
+	int err;
+
+	err = mlx5_devlink_eswitch_mode_set(priv_to_devlink(dev),
+					    DEVLINK_ESWITCH_MODE_SWITCHDEV,
+					    NULL);
+	if (err && err != -EOPNOTSUPP)
+		mlx5_core_warn(dev, "failed setting switchdev as default\n");
+}
+
 int mlx5_init_one_devl_locked(struct mlx5_core_dev *dev)
 {
 	bool light_probe = mlx5_dev_is_lightweight(dev);
@@ -1497,6 +1513,10 @@ int mlx5_init_one_devl_locked(struct mlx5_core_dev *dev)
 		mlx5_core_err(dev, "mlx5_hwmon_dev_register failed with error code %d\n", err);
 
 	mutex_unlock(&dev->intf_state_mutex);
+
+	if (dev->profile.mask & MLX5_PROF_MASK_DEF_SWITCHDEV)
+		mlx5_set_default_switchdev(dev);
+
 	return 0;
 
 err_register:
@@ -1598,6 +1618,10 @@ int mlx5_load_one_devl_locked(struct mlx5_core_dev *dev, bool recovery)
 		goto err_attach;
 
 	mutex_unlock(&dev->intf_state_mutex);
+
+	if (dev->profile.mask & MLX5_PROF_MASK_DEF_SWITCHDEV)
+		mlx5_set_default_switchdev(dev);
+
 	return 0;
 
 err_attach:
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 1268fcf35ec7..cfbc0ff6292a 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -706,6 +706,7 @@ struct mlx5_st;
 enum {
 	MLX5_PROF_MASK_QP_SIZE		= (u64)1 << 0,
 	MLX5_PROF_MASK_MR_CACHE		= (u64)1 << 1,
+	MLX5_PROF_MASK_DEF_SWITCHDEV    = (u64)1 << 2,
 };
 
 enum {
-- 
2.44.0


^ permalink raw reply related

* [PATCH net-next 6/7] net/mlx5: E-switch, load reps via work queue after registration
From: Tariq Toukan @ 2026-04-09 11:55 UTC (permalink / raw)
  To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
	David S. Miller
  Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
	Shay Drory, Or Har-Toov, Edward Srouji, Maher Sanalla,
	Simon Horman, Moshe Shemesh, Kees Cook, Patrisious Haddad,
	Gerd Bayer, Parav Pandit, Cosmin Ratiu, Carolina Jubran, netdev,
	linux-rdma, linux-kernel, Gal Pressman, Dragos Tatulea
In-Reply-To: <20260409115550.156419-1-tariqt@nvidia.com>

From: Mark Bloch <mbloch@nvidia.com>

mlx5_eswitch_register_vport_reps() merely sets the callbacks. The actual
representor load/unload requires devlink locking and shouldn’t run from
the registration context. Queue a work item that acquires the devlink
lock and loads all relevant reps. This lets the load happen in a context
where the needed locks can be taken.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../mellanox/mlx5/core/eswitch_offloads.c     | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 4b626ffcfa8e..279490c0074c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -4535,6 +4535,38 @@ mlx5_eswitch_register_vport_reps_blocked(struct mlx5_eswitch *esw,
 	}
 }
 
+static void mlx5_eswitch_reload_reps_blocked(struct mlx5_eswitch *esw)
+{
+	struct mlx5_vport *vport;
+	unsigned long i;
+
+	if (esw->mode != MLX5_ESWITCH_OFFLOADS)
+		return;
+
+	if (mlx5_esw_offloads_rep_load(esw, MLX5_VPORT_UPLINK))
+		return;
+
+	mlx5_esw_for_each_vport(esw, i, vport) {
+		if (!vport)
+			continue;
+		if (!vport->enabled)
+			continue;
+		if (vport->vport == MLX5_VPORT_UPLINK)
+			continue;
+		if (!mlx5_eswitch_vport_has_rep(esw, vport->vport))
+			continue;
+
+		mlx5_esw_offloads_rep_load(esw, vport->vport);
+	}
+}
+
+static void mlx5_eswitch_reload_reps(struct mlx5_eswitch *esw)
+{
+	mlx5_esw_reps_block(esw);
+	mlx5_eswitch_reload_reps_blocked(esw);
+	mlx5_esw_reps_unblock(esw);
+}
+
 void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
 				      const struct mlx5_eswitch_rep_ops *ops,
 				      u8 rep_type)
@@ -4542,6 +4574,8 @@ void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
 	mlx5_esw_reps_block(esw);
 	mlx5_eswitch_register_vport_reps_blocked(esw, ops, rep_type);
 	mlx5_esw_reps_unblock(esw);
+
+	mlx5_esw_add_work(esw, mlx5_eswitch_reload_reps);
 }
 EXPORT_SYMBOL(mlx5_eswitch_register_vport_reps);
 
-- 
2.44.0


^ permalink raw reply related

* [PATCH net-next 5/7] net/mlx5: E-Switch, block representors during reconfiguration
From: Tariq Toukan @ 2026-04-09 11:55 UTC (permalink / raw)
  To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
	David S. Miller
  Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
	Shay Drory, Or Har-Toov, Edward Srouji, Maher Sanalla,
	Simon Horman, Moshe Shemesh, Kees Cook, Patrisious Haddad,
	Gerd Bayer, Parav Pandit, Cosmin Ratiu, Carolina Jubran, netdev,
	linux-rdma, linux-kernel, Gal Pressman, Dragos Tatulea
In-Reply-To: <20260409115550.156419-1-tariqt@nvidia.com>

From: Mark Bloch <mbloch@nvidia.com>

Introduce a simple atomic block state via mlx5_esw_reps_block() and
mlx5_esw_reps_unblock(). Internally, mlx5_esw_mark_reps() spins a
cmpxchg between the UNBLOCKED and BLOCKED states. All E-Switch
reconfiguration paths (mode set, enable, disable, VF/SF add/del, LAG
reload) now bracket their work with this guard so representor changes
won't race with the ongoing E-Switch update, yet we remain
non-blocking and avoid new locks.

A spinlock is out because the protected work can sleep (RDMA ops,
devcom, netdev callbacks). A mutex won't work either: esw_mode_change()
has to drop the guard mid-flight so mlx5_rescan_drivers_locked() can
reload mlx5_ib, which calls back into mlx5_eswitch_register_vport_reps()
on the same thread. Beyond that, any real lock would create an ABBA
cycle: the LAG side holds the LAG lock when it calls reps_block(), and
the mlx5_ib side holds RDMA locks when it calls register_vport_reps(),
and those two subsystems talk to each other. The atomic CAS loop avoids
all of this - no lock ordering, no sleep restrictions, and the owner
can drop the guard and let a nested caller win the next transition
before reclaiming it.

With this infrastructure in place, downstream patches can safely tie
representor load/unload to the mlx5_ib module's lifecycle. Loading
mlx5_ib while the device is in switchdev mode has failed to bring up
the IB representors for years; those patches will finally fix that.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/eswitch.c | 13 ++++
 .../net/ethernet/mellanox/mlx5/core/eswitch.h |  6 ++
 .../mellanox/mlx5/core/eswitch_offloads.c     | 77 +++++++++++++++++--
 .../net/ethernet/mellanox/mlx5/core/lag/lag.c |  2 +
 .../ethernet/mellanox/mlx5/core/sf/devlink.c  |  5 ++
 include/linux/mlx5/eswitch.h                  |  5 ++
 6 files changed, 100 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index d315484390c8..a7701c9d776a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1700,6 +1700,7 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
 		mlx5_lag_disable_change(esw->dev);
 
 	atomic_inc(&esw->generation);
+	mlx5_esw_reps_block(esw);
 
 	if (!mlx5_esw_is_fdb_created(esw)) {
 		ret = mlx5_eswitch_enable_locked(esw, num_vfs);
@@ -1723,6 +1724,8 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
 		}
 	}
 
+	mlx5_esw_reps_unblock(esw);
+
 	if (toggle_lag)
 		mlx5_lag_enable_change(esw->dev);
 
@@ -1747,6 +1750,8 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf)
 		 esw->esw_funcs.num_vfs, esw->esw_funcs.num_ec_vfs, esw->enabled_vports);
 	atomic_inc(&esw->generation);
 
+	mlx5_esw_reps_block(esw);
+
 	if (!mlx5_core_is_ecpf(esw->dev)) {
 		mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs);
 		if (clear_vf)
@@ -1757,6 +1762,8 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf)
 			mlx5_eswitch_clear_ec_vf_vports_info(esw);
 	}
 
+	mlx5_esw_reps_unblock(esw);
+
 	if (esw->mode == MLX5_ESWITCH_OFFLOADS) {
 		struct devlink *devlink = priv_to_devlink(esw->dev);
 
@@ -1812,7 +1819,11 @@ void mlx5_eswitch_disable(struct mlx5_eswitch *esw)
 	devl_assert_locked(priv_to_devlink(esw->dev));
 	atomic_inc(&esw->generation);
 	mlx5_lag_disable_change(esw->dev);
+
+	mlx5_esw_reps_block(esw);
 	mlx5_eswitch_disable_locked(esw);
+	mlx5_esw_reps_unblock(esw);
+
 	esw->mode = MLX5_ESWITCH_LEGACY;
 	mlx5_lag_enable_change(esw->dev);
 }
@@ -2075,6 +2086,8 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 	init_rwsem(&esw->mode_lock);
 	refcount_set(&esw->qos.refcnt, 0);
 	atomic_set(&esw->generation, 0);
+	atomic_set(&esw->offloads.reps_conf_state,
+		   MLX5_ESW_OFFLOADS_REP_TYPE_UNBLOCKED);
 
 	esw->enabled_vports = 0;
 	esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index e3ab8a30c174..256ac3ad37bc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -315,6 +315,7 @@ struct mlx5_esw_offload {
 	DECLARE_HASHTABLE(termtbl_tbl, 8);
 	struct mutex termtbl_mutex; /* protects termtbl hash */
 	struct xarray vhca_map;
+	atomic_t reps_conf_state;
 	const struct mlx5_eswitch_rep_ops *rep_ops[NUM_REP_TYPES];
 	u8 inline_mode;
 	atomic64_t num_flows;
@@ -949,6 +950,8 @@ mlx5_esw_lag_demux_fg_create(struct mlx5_eswitch *esw,
 struct mlx5_flow_handle *
 mlx5_esw_lag_demux_rule_create(struct mlx5_eswitch *esw, u16 vport_num,
 			       struct mlx5_flow_table *lag_ft);
+void mlx5_esw_reps_block(struct mlx5_eswitch *esw);
+void mlx5_esw_reps_unblock(struct mlx5_eswitch *esw);
 #else  /* CONFIG_MLX5_ESWITCH */
 /* eswitch API stubs */
 static inline int  mlx5_eswitch_init(struct mlx5_core_dev *dev) { return 0; }
@@ -1026,6 +1029,9 @@ mlx5_esw_host_functions_enabled(const struct mlx5_core_dev *dev)
 	return true;
 }
 
+static inline void mlx5_esw_reps_block(struct mlx5_eswitch *esw) {}
+static inline void mlx5_esw_reps_unblock(struct mlx5_eswitch *esw) {}
+
 static inline bool
 mlx5_esw_vport_vhca_id(struct mlx5_eswitch *esw, u16 vportn, u16 *vhca_id)
 {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 988595e1b425..4b626ffcfa8e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -2410,23 +2410,56 @@ static int esw_create_restore_table(struct mlx5_eswitch *esw)
 	return err;
 }
 
+static void mlx5_esw_assert_reps_blocked(struct mlx5_eswitch *esw)
+{
+	if (atomic_read(&esw->offloads.reps_conf_state) ==
+	    MLX5_ESW_OFFLOADS_REP_TYPE_BLOCKED)
+		return;
+
+	esw_warn(esw->dev, "reps state machine violated: expected BLOCKED\n");
+}
+
+static void mlx5_esw_mark_reps(struct mlx5_eswitch *esw,
+			       enum mlx5_esw_offloads_rep_type_state old,
+			       enum mlx5_esw_offloads_rep_type_state new)
+{
+	atomic_t *reps_conf_state = &esw->offloads.reps_conf_state;
+
+	do {
+		atomic_cond_read_relaxed(reps_conf_state, VAL == old);
+	} while (atomic_cmpxchg(reps_conf_state, old, new) != old);
+}
+
+void mlx5_esw_reps_block(struct mlx5_eswitch *esw)
+{
+	mlx5_esw_mark_reps(esw, MLX5_ESW_OFFLOADS_REP_TYPE_UNBLOCKED,
+			   MLX5_ESW_OFFLOADS_REP_TYPE_BLOCKED);
+}
+
+void mlx5_esw_reps_unblock(struct mlx5_eswitch *esw)
+{
+	mlx5_esw_mark_reps(esw, MLX5_ESW_OFFLOADS_REP_TYPE_BLOCKED,
+			   MLX5_ESW_OFFLOADS_REP_TYPE_UNBLOCKED);
+}
+
 static void esw_mode_change(struct mlx5_eswitch *esw, u16 mode)
 {
+	mlx5_esw_reps_unblock(esw);
 	mlx5_devcom_comp_lock(esw->dev->priv.hca_devcom_comp);
 	if (esw->dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_IB_ADEV ||
 	    mlx5_core_mp_enabled(esw->dev)) {
 		esw->mode = mode;
-		mlx5_rescan_drivers_locked(esw->dev);
-		mlx5_devcom_comp_unlock(esw->dev->priv.hca_devcom_comp);
-		return;
+		goto out;
 	}
 
 	esw->dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
 	mlx5_rescan_drivers_locked(esw->dev);
 	esw->mode = mode;
 	esw->dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
+out:
 	mlx5_rescan_drivers_locked(esw->dev);
 	mlx5_devcom_comp_unlock(esw->dev->priv.hca_devcom_comp);
+	mlx5_esw_reps_block(esw);
 }
 
 static void mlx5_esw_fdb_drop_destroy(struct mlx5_eswitch *esw)
@@ -2761,6 +2794,8 @@ void esw_offloads_cleanup(struct mlx5_eswitch *esw)
 static int __esw_offloads_load_rep(struct mlx5_eswitch *esw,
 				   struct mlx5_eswitch_rep *rep, u8 rep_type)
 {
+	mlx5_esw_assert_reps_blocked(esw);
+
 	if (atomic_cmpxchg(&rep->rep_data[rep_type].state,
 			   REP_REGISTERED, REP_LOADED) == REP_REGISTERED)
 		return esw->offloads.rep_ops[rep_type]->load(esw->dev, rep);
@@ -2771,6 +2806,8 @@ static int __esw_offloads_load_rep(struct mlx5_eswitch *esw,
 static void __esw_offloads_unload_rep(struct mlx5_eswitch *esw,
 				      struct mlx5_eswitch_rep *rep, u8 rep_type)
 {
+	mlx5_esw_assert_reps_blocked(esw);
+
 	if (atomic_cmpxchg(&rep->rep_data[rep_type].state,
 			   REP_LOADED, REP_REGISTERED) == REP_LOADED) {
 		if (rep_type == REP_ETH)
@@ -3673,6 +3710,7 @@ static void esw_vfs_changed_event_handler(struct mlx5_eswitch *esw)
 	if (new_num_vfs == esw->esw_funcs.num_vfs || host_pf_disabled)
 		goto free;
 
+	mlx5_esw_reps_block(esw);
 	/* Number of VFs can only change from "0 to x" or "x to 0". */
 	if (esw->esw_funcs.num_vfs > 0) {
 		mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs);
@@ -3682,9 +3720,11 @@ static void esw_vfs_changed_event_handler(struct mlx5_eswitch *esw)
 		err = mlx5_eswitch_load_vf_vports(esw, new_num_vfs,
 						  MLX5_VPORT_UC_ADDR_CHANGE);
 		if (err)
-			goto free;
+			goto unblock;
 	}
 	esw->esw_funcs.num_vfs = new_num_vfs;
+unblock:
+	mlx5_esw_reps_unblock(esw);
 free:
 	kvfree(out);
 }
@@ -4164,6 +4204,7 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
 		goto unlock;
 	}
 
+	mlx5_esw_reps_block(esw);
 	esw->eswitch_operation_in_progress = true;
 	up_write(&esw->mode_lock);
 
@@ -4203,6 +4244,7 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
 		mlx5_devlink_netdev_netns_immutable_set(devlink, false);
 	down_write(&esw->mode_lock);
 	esw->eswitch_operation_in_progress = false;
+	mlx5_esw_reps_unblock(esw);
 unlock:
 	mlx5_esw_unlock(esw);
 enable_lag:
@@ -4474,9 +4516,10 @@ mlx5_eswitch_vport_has_rep(const struct mlx5_eswitch *esw, u16 vport_num)
 	return true;
 }
 
-void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
-				      const struct mlx5_eswitch_rep_ops *ops,
-				      u8 rep_type)
+static void
+mlx5_eswitch_register_vport_reps_blocked(struct mlx5_eswitch *esw,
+					 const struct mlx5_eswitch_rep_ops *ops,
+					 u8 rep_type)
 {
 	struct mlx5_eswitch_rep_data *rep_data;
 	struct mlx5_eswitch_rep *rep;
@@ -4491,9 +4534,20 @@ void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
 		}
 	}
 }
+
+void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
+				      const struct mlx5_eswitch_rep_ops *ops,
+				      u8 rep_type)
+{
+	mlx5_esw_reps_block(esw);
+	mlx5_eswitch_register_vport_reps_blocked(esw, ops, rep_type);
+	mlx5_esw_reps_unblock(esw);
+}
 EXPORT_SYMBOL(mlx5_eswitch_register_vport_reps);
 
-void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type)
+static void
+mlx5_eswitch_unregister_vport_reps_blocked(struct mlx5_eswitch *esw,
+					   u8 rep_type)
 {
 	struct mlx5_eswitch_rep *rep;
 	unsigned long i;
@@ -4504,6 +4558,13 @@ void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type)
 	mlx5_esw_for_each_rep(esw, i, rep)
 		atomic_set(&rep->rep_data[rep_type].state, REP_UNREGISTERED);
 }
+
+void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type)
+{
+	mlx5_esw_reps_block(esw);
+	mlx5_eswitch_unregister_vport_reps_blocked(esw, rep_type);
+	mlx5_esw_reps_unblock(esw);
+}
 EXPORT_SYMBOL(mlx5_eswitch_unregister_vport_reps);
 
 void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index c402a8463081..ff2e6f6caa0c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -1105,7 +1105,9 @@ int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags)
 			struct mlx5_eswitch *esw;
 
 			esw = pf->dev->priv.eswitch;
+			mlx5_esw_reps_block(esw);
 			ret = mlx5_eswitch_reload_ib_reps(esw);
+			mlx5_esw_reps_unblock(esw);
 			if (ret)
 				return ret;
 		}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
index 8503e532f423..2fc69897e35b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
@@ -245,8 +245,10 @@ static int mlx5_sf_add(struct mlx5_core_dev *dev, struct mlx5_sf_table *table,
 	if (IS_ERR(sf))
 		return PTR_ERR(sf);
 
+	mlx5_esw_reps_block(esw);
 	err = mlx5_eswitch_load_sf_vport(esw, sf->hw_fn_id, MLX5_VPORT_UC_ADDR_CHANGE,
 					 &sf->dl_port, new_attr->controller, new_attr->sfnum);
+	mlx5_esw_reps_unblock(esw);
 	if (err)
 		goto esw_err;
 	*dl_port = &sf->dl_port.dl_port;
@@ -367,7 +369,10 @@ int mlx5_devlink_sf_port_del(struct devlink *devlink,
 	struct mlx5_sf_table *table = dev->priv.sf_table;
 	struct mlx5_sf *sf = mlx5_sf_by_dl_port(dl_port);
 
+	mlx5_esw_reps_block(dev->priv.eswitch);
 	mlx5_sf_del(table, sf);
+	mlx5_esw_reps_unblock(dev->priv.eswitch);
+
 	return 0;
 }
 
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 67256e776566..786b1ea83843 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -29,6 +29,11 @@ enum {
 	REP_LOADED,
 };
 
+enum mlx5_esw_offloads_rep_type_state {
+	MLX5_ESW_OFFLOADS_REP_TYPE_UNBLOCKED,
+	MLX5_ESW_OFFLOADS_REP_TYPE_BLOCKED,
+};
+
 enum mlx5_switchdev_event {
 	MLX5_SWITCHDEV_EVENT_PAIR,
 	MLX5_SWITCHDEV_EVENT_UNPAIR,
-- 
2.44.0


^ permalink raw reply related

* [PATCH net-next 4/7] net/mlx5: E-Switch, fix deadlock between devlink lock and esw->wq
From: Tariq Toukan @ 2026-04-09 11:55 UTC (permalink / raw)
  To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
	David S. Miller
  Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
	Shay Drory, Or Har-Toov, Edward Srouji, Maher Sanalla,
	Simon Horman, Moshe Shemesh, Kees Cook, Patrisious Haddad,
	Gerd Bayer, Parav Pandit, Cosmin Ratiu, Carolina Jubran, netdev,
	linux-rdma, linux-kernel, Gal Pressman, Dragos Tatulea
In-Reply-To: <20260409115550.156419-1-tariqt@nvidia.com>

From: Mark Bloch <mbloch@nvidia.com>

mlx5_eswitch_cleanup() calls destroy_workqueue() while holding the
devlink lock (via mlx5_uninit_one()). Workers on the queue call
devl_lock() before checking whether their work is stale, which
deadlocks:

  mlx5_uninit_one (holds devlink lock)
    mlx5_eswitch_cleanup()
      destroy_workqueue()     <- waits for workers to finish
                                 worker: devl_lock() <- blocked on
                                         devlink lock held above

The same pattern affects mlx5_devlink_eswitch_mode_set(), which can
drain the queue while holding devlink lock.

Fix by making esw_wq_handler() check the generation counter BEFORE
acquiring the devlink lock, using devl_trylock() in a loop with
cond_resched(). If the work is stale the handler exits immediately
without ever contending for the lock.

To guarantee stale detection, increment the generation counter at
every E-Switch operation boundary:

- mlx5_eswitch_cleanup(): increment before destroy_workqueue() so
  any in-flight worker sees stale and drains without blocking. Also
  move mlx5_esw_qos_cleanup() to after destroy_workqueue() so it
  runs only once all workers have finished.
- mlx5_devlink_eswitch_mode_set(): increment before starting the
  mode change so workers from the previous mode are discarded.
- mlx5_eswitch_disable(): increment so workers queued before the
  disable see stale and exit.
- mlx5_eswitch_enable() and mlx5_eswitch_disable_sriov(): increment
  so in-flight work against an old VF count or mode is discarded
  when these operations begin.

Remove the conditional atomic_inc() in
mlx5_eswitch_event_handler_unregister(); the mlx5_eswitch_disable()
increment now covers it unconditionally and earlier in the call chain.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/eswitch.c  | 11 +++++++----
 .../mellanox/mlx5/core/eswitch_offloads.c      | 18 +++++++++++++++++-
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 1986d4d0e886..d315484390c8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1073,10 +1073,8 @@ static void mlx5_eswitch_event_handler_register(struct mlx5_eswitch *esw)
 static void mlx5_eswitch_event_handler_unregister(struct mlx5_eswitch *esw)
 {
 	if (esw->mode == MLX5_ESWITCH_OFFLOADS &&
-	    mlx5_eswitch_is_funcs_handler(esw->dev)) {
+	    mlx5_eswitch_is_funcs_handler(esw->dev))
 		mlx5_eq_notifier_unregister(esw->dev, &esw->esw_funcs.nb);
-		atomic_inc(&esw->generation);
-	}
 }
 
 static void mlx5_eswitch_clear_vf_vports_info(struct mlx5_eswitch *esw)
@@ -1701,6 +1699,8 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
 	if (toggle_lag)
 		mlx5_lag_disable_change(esw->dev);
 
+	atomic_inc(&esw->generation);
+
 	if (!mlx5_esw_is_fdb_created(esw)) {
 		ret = mlx5_eswitch_enable_locked(esw, num_vfs);
 	} else {
@@ -1745,6 +1745,7 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf)
 	esw_info(esw->dev, "Unload vfs: mode(%s), nvfs(%d), necvfs(%d), active vports(%d)\n",
 		 esw->mode == MLX5_ESWITCH_LEGACY ? "LEGACY" : "OFFLOADS",
 		 esw->esw_funcs.num_vfs, esw->esw_funcs.num_ec_vfs, esw->enabled_vports);
+	atomic_inc(&esw->generation);
 
 	if (!mlx5_core_is_ecpf(esw->dev)) {
 		mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs);
@@ -1809,6 +1810,7 @@ void mlx5_eswitch_disable(struct mlx5_eswitch *esw)
 		return;
 
 	devl_assert_locked(priv_to_devlink(esw->dev));
+	atomic_inc(&esw->generation);
 	mlx5_lag_disable_change(esw->dev);
 	mlx5_eswitch_disable_locked(esw);
 	esw->mode = MLX5_ESWITCH_LEGACY;
@@ -2110,8 +2112,9 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw)
 
 	esw_info(esw->dev, "cleanup\n");
 
-	mlx5_esw_qos_cleanup(esw);
+	atomic_inc(&esw->generation);
 	destroy_workqueue(esw->work_queue);
+	mlx5_esw_qos_cleanup(esw);
 	WARN_ON(refcount_read(&esw->qos.refcnt));
 	mutex_destroy(&esw->state_lock);
 	WARN_ON(!xa_empty(&esw->offloads.vhca_map));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 23af5a12dc07..988595e1b425 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -3699,7 +3699,20 @@ static void esw_wq_handler(struct work_struct *work)
 	esw = host_work->esw;
 	devlink = priv_to_devlink(esw->dev);
 
-	devl_lock(devlink);
+	/* Check for stale work BEFORE acquiring devlink lock.
+	 * mlx5_eswitch_cleanup() increments the generation counter
+	 * before destroy_workqueue() while holding devlink lock,
+	 * so acquiring devlink lock here would deadlock.
+	 */
+	for (;;) {
+		if (host_work->work_gen != atomic_read(&esw->generation))
+			goto free;
+
+		if (devl_trylock(devlink))
+			break;
+
+		cond_resched();
+	}
 
 	/* Stale work from one or more mode changes ago. Bail out. */
 	if (host_work->work_gen != atomic_read(&esw->generation))
@@ -3709,6 +3722,7 @@ static void esw_wq_handler(struct work_struct *work)
 
 unlock:
 	devl_unlock(devlink);
+free:
 	kfree(host_work);
 }
 
@@ -4161,6 +4175,8 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
 		goto skip;
 	}
 
+	atomic_inc(&esw->generation);
+
 	if (mlx5_mode == MLX5_ESWITCH_LEGACY)
 		esw->dev->priv.flags |= MLX5_PRIV_FLAGS_SWITCH_LEGACY;
 	if (mlx5_mode == MLX5_ESWITCH_OFFLOADS)
-- 
2.44.0


^ permalink raw reply related

* [PATCH net-next 3/7] net/mlx5: E-Switch, introduce generic work queue dispatch helper
From: Tariq Toukan @ 2026-04-09 11:55 UTC (permalink / raw)
  To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
	David S. Miller
  Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
	Shay Drory, Or Har-Toov, Edward Srouji, Maher Sanalla,
	Simon Horman, Moshe Shemesh, Kees Cook, Patrisious Haddad,
	Gerd Bayer, Parav Pandit, Cosmin Ratiu, Carolina Jubran, netdev,
	linux-rdma, linux-kernel, Gal Pressman, Dragos Tatulea
In-Reply-To: <20260409115550.156419-1-tariqt@nvidia.com>

From: Mark Bloch <mbloch@nvidia.com>

Each E-Switch work item requires the same boilerplate: acquire the
devlink lock, check whether the work is stale, dispatch to the
appropriate handler, and release the lock. Factor this out.

Add a func callback to mlx5_host_work so the generic handler
esw_wq_handler() can dispatch to the right function without
duplicating locking logic. Introduce mlx5_esw_add_work() as the
single enqueue point: it stamps the work item with the current
generation counter and queues it onto the E-Switch work queue.

Refactor esw_vfs_changed_event_handler() to match the new contract:
it no longer receives work_gen or out as parameters. It queries
mlx5_esw_query_functions() itself and owns the kvfree() of the
result. The devlink lock is acquired by esw_wq_handler() before
dispatching and released afterwards, so the handler runs with the lock
already held.

Update mlx5_esw_funcs_changed_handler() to use mlx5_esw_add_work().

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/eswitch.h |  1 +
 .../mellanox/mlx5/core/eswitch_offloads.c     | 77 +++++++++++--------
 2 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 0c3d2bdebf8c..e3ab8a30c174 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -336,6 +336,7 @@ struct mlx5_host_work {
 	struct work_struct	work;
 	struct mlx5_eswitch	*esw;
 	int			work_gen;
+	void (*func)(struct mlx5_eswitch *esw);
 };
 
 struct mlx5_esw_functions {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index b2e7294d3a5c..23af5a12dc07 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -3655,20 +3655,15 @@ static void esw_offloads_steering_cleanup(struct mlx5_eswitch *esw)
 	mutex_destroy(&esw->fdb_table.offloads.vports.lock);
 }
 
-static void
-esw_vfs_changed_event_handler(struct mlx5_eswitch *esw, int work_gen,
-			      const u32 *out)
+static void esw_vfs_changed_event_handler(struct mlx5_eswitch *esw)
 {
-	struct devlink *devlink;
 	bool host_pf_disabled;
 	u16 new_num_vfs;
+	const u32 *out;
 
-	devlink = priv_to_devlink(esw->dev);
-	devl_lock(devlink);
-
-	/* Stale work from one or more mode changes ago. Bail out. */
-	if (work_gen != atomic_read(&esw->generation))
-		goto unlock;
+	out = mlx5_esw_query_functions(esw->dev);
+	if (IS_ERR(out))
+		return;
 
 	new_num_vfs = MLX5_GET(query_esw_functions_out, out,
 			       host_params_context.host_num_of_vfs);
@@ -3676,7 +3671,7 @@ esw_vfs_changed_event_handler(struct mlx5_eswitch *esw, int work_gen,
 				    host_params_context.host_pf_disabled);
 
 	if (new_num_vfs == esw->esw_funcs.num_vfs || host_pf_disabled)
-		goto unlock;
+		goto free;
 
 	/* Number of VFs can only change from "0 to x" or "x to 0". */
 	if (esw->esw_funcs.num_vfs > 0) {
@@ -3686,54 +3681,70 @@ esw_vfs_changed_event_handler(struct mlx5_eswitch *esw, int work_gen,
 
 		err = mlx5_eswitch_load_vf_vports(esw, new_num_vfs,
 						  MLX5_VPORT_UC_ADDR_CHANGE);
-		if (err) {
-			devl_unlock(devlink);
-			return;
-		}
+		if (err)
+			goto free;
 	}
 	esw->esw_funcs.num_vfs = new_num_vfs;
-unlock:
-	devl_unlock(devlink);
+free:
+	kvfree(out);
 }
 
-static void esw_functions_changed_event_handler(struct work_struct *work)
+static void esw_wq_handler(struct work_struct *work)
 {
 	struct mlx5_host_work *host_work;
 	struct mlx5_eswitch *esw;
-	const u32 *out;
+	struct devlink *devlink;
 
 	host_work = container_of(work, struct mlx5_host_work, work);
 	esw = host_work->esw;
+	devlink = priv_to_devlink(esw->dev);
 
-	out = mlx5_esw_query_functions(esw->dev);
-	if (IS_ERR(out))
-		goto out;
+	devl_lock(devlink);
 
-	esw_vfs_changed_event_handler(esw, host_work->work_gen, out);
-	kvfree(out);
-out:
+	/* Stale work from one or more mode changes ago. Bail out. */
+	if (host_work->work_gen != atomic_read(&esw->generation))
+		goto unlock;
+
+	host_work->func(esw);
+
+unlock:
+	devl_unlock(devlink);
 	kfree(host_work);
 }
 
-int mlx5_esw_funcs_changed_handler(struct notifier_block *nb, unsigned long type, void *data)
+static int mlx5_esw_add_work(struct mlx5_eswitch *esw,
+			     void (*func)(struct mlx5_eswitch *esw))
 {
-	struct mlx5_esw_functions *esw_funcs;
 	struct mlx5_host_work *host_work;
-	struct mlx5_eswitch *esw;
 
 	host_work = kzalloc_obj(*host_work, GFP_ATOMIC);
 	if (!host_work)
-		return NOTIFY_DONE;
-
-	esw_funcs = mlx5_nb_cof(nb, struct mlx5_esw_functions, nb);
-	esw = container_of(esw_funcs, struct mlx5_eswitch, esw_funcs);
+		return -ENOMEM;
 
 	host_work->esw = esw;
 	host_work->work_gen = atomic_read(&esw->generation);
 
-	INIT_WORK(&host_work->work, esw_functions_changed_event_handler);
+	host_work->func = func;
+	INIT_WORK(&host_work->work, esw_wq_handler);
 	queue_work(esw->work_queue, &host_work->work);
 
+	return 0;
+}
+
+int mlx5_esw_funcs_changed_handler(struct notifier_block *nb,
+				   unsigned long type, void *data)
+{
+	struct mlx5_esw_functions *esw_funcs;
+	struct mlx5_eswitch *esw;
+	int ret;
+
+	esw_funcs = mlx5_nb_cof(nb, struct mlx5_esw_functions, nb);
+	esw = container_of(esw_funcs, struct mlx5_eswitch, esw_funcs);
+
+	ret = mlx5_esw_add_work(esw, esw_vfs_changed_event_handler);
+	if (ret)
+		return NOTIFY_DONE;
+
 	return NOTIFY_OK;
 }
 
-- 
2.44.0


^ permalink raw reply related

* [PATCH net-next 2/7] net/mlx5: E-Switch, move work queue generation counter
From: Tariq Toukan @ 2026-04-09 11:55 UTC (permalink / raw)
  To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
	David S. Miller
  Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
	Shay Drory, Or Har-Toov, Edward Srouji, Maher Sanalla,
	Simon Horman, Moshe Shemesh, Kees Cook, Patrisious Haddad,
	Gerd Bayer, Parav Pandit, Cosmin Ratiu, Carolina Jubran, netdev,
	linux-rdma, linux-kernel, Gal Pressman, Dragos Tatulea
In-Reply-To: <20260409115550.156419-1-tariqt@nvidia.com>

From: Mark Bloch <mbloch@nvidia.com>

The generation counter in mlx5_esw_functions is used to detect stale
work items on the E-Switch work queue. Move it from mlx5_esw_functions
to the top-level mlx5_eswitch struct so it can guard all work types,
not just function-change events.

This is a mechanical refactor: no behavioral change.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c          | 3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h          | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 123c96716a54..1986d4d0e886 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1075,7 +1075,7 @@ static void mlx5_eswitch_event_handler_unregister(struct mlx5_eswitch *esw)
 	if (esw->mode == MLX5_ESWITCH_OFFLOADS &&
 	    mlx5_eswitch_is_funcs_handler(esw->dev)) {
 		mlx5_eq_notifier_unregister(esw->dev, &esw->esw_funcs.nb);
-		atomic_inc(&esw->esw_funcs.generation);
+		atomic_inc(&esw->generation);
 	}
 }
 
@@ -2072,6 +2072,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 	mutex_init(&esw->state_lock);
 	init_rwsem(&esw->mode_lock);
 	refcount_set(&esw->qos.refcnt, 0);
+	atomic_set(&esw->generation, 0);
 
 	esw->enabled_vports = 0;
 	esw->offloads.inline_mode = MLX5_INLINE_MODE_NONE;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 5128f5020dae..0c3d2bdebf8c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -340,7 +340,6 @@ struct mlx5_host_work {
 
 struct mlx5_esw_functions {
 	struct mlx5_nb		nb;
-	atomic_t		generation;
 	bool			host_funcs_disabled;
 	u16			num_vfs;
 	u16			num_ec_vfs;
@@ -410,6 +409,7 @@ struct mlx5_eswitch {
 	struct mlx5_devcom_comp_dev *devcom;
 	u16 enabled_ipsec_vf_count;
 	bool eswitch_operation_in_progress;
+	atomic_t generation;
 };
 
 void esw_offloads_disable(struct mlx5_eswitch *esw);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index a078d06f4567..b2e7294d3a5c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -3667,7 +3667,7 @@ esw_vfs_changed_event_handler(struct mlx5_eswitch *esw, int work_gen,
 	devl_lock(devlink);
 
 	/* Stale work from one or more mode changes ago. Bail out. */
-	if (work_gen != atomic_read(&esw->esw_funcs.generation))
+	if (work_gen != atomic_read(&esw->generation))
 		goto unlock;
 
 	new_num_vfs = MLX5_GET(query_esw_functions_out, out,
@@ -3729,7 +3729,7 @@ int mlx5_esw_funcs_changed_handler(struct notifier_block *nb, unsigned long type
 	esw = container_of(esw_funcs, struct mlx5_eswitch, esw_funcs);
 
 	host_work->esw = esw;
-	host_work->work_gen = atomic_read(&esw_funcs->generation);
+	host_work->work_gen = atomic_read(&esw->generation);
 
 	INIT_WORK(&host_work->work, esw_functions_changed_event_handler);
 	queue_work(esw->work_queue, &host_work->work);
-- 
2.44.0


^ permalink raw reply related

* [PATCH net-next 1/7] net/mlx5: Lag: refactor representor reload handling
From: Tariq Toukan @ 2026-04-09 11:55 UTC (permalink / raw)
  To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
	David S. Miller
  Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
	Shay Drory, Or Har-Toov, Edward Srouji, Maher Sanalla,
	Simon Horman, Moshe Shemesh, Kees Cook, Patrisious Haddad,
	Gerd Bayer, Parav Pandit, Cosmin Ratiu, Carolina Jubran, netdev,
	linux-rdma, linux-kernel, Gal Pressman, Dragos Tatulea
In-Reply-To: <20260409115550.156419-1-tariqt@nvidia.com>

From: Mark Bloch <mbloch@nvidia.com>

Representor reload during LAG/MPESW transitions has to be repeated in
several flows, and each open‑coded loop was easy to get out of sync
when adding new flags or tweaking error handling. Move the sequencing
into a single helper so that all call sites share the same ordering
and checks.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Shay Drori <shayd@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
---
 .../net/ethernet/mellanox/mlx5/core/lag/lag.c | 44 +++++++++++--------
 .../net/ethernet/mellanox/mlx5/core/lag/lag.h |  1 +
 .../ethernet/mellanox/mlx5/core/lag/mpesw.c   | 12 ++---
 3 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index 449e4bd86c06..c402a8463081 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -1093,6 +1093,27 @@ void mlx5_lag_remove_devices(struct mlx5_lag *ldev)
 	}
 }
 
+int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags)
+{
+	struct lag_func *pf;
+	int ret;
+	int i;
+
+	mlx5_ldev_for_each(i, 0, ldev) {
+		pf = mlx5_lag_pf(ldev, i);
+		if (!(pf->dev->priv.flags & flags)) {
+			struct mlx5_eswitch *esw;
+
+			esw = pf->dev->priv.eswitch;
+			ret = mlx5_eswitch_reload_ib_reps(esw);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
 void mlx5_disable_lag(struct mlx5_lag *ldev)
 {
 	bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
@@ -1130,9 +1151,7 @@ void mlx5_disable_lag(struct mlx5_lag *ldev)
 		mlx5_lag_add_devices(ldev);
 
 	if (shared_fdb)
-		mlx5_ldev_for_each(i, 0, ldev)
-			if (!(mlx5_lag_pf(ldev, i)->dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV))
-				mlx5_eswitch_reload_ib_reps(mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
+		mlx5_lag_reload_ib_reps(ldev, MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV);
 }
 
 bool mlx5_lag_shared_fdb_supported(struct mlx5_lag *ldev)
@@ -1388,10 +1407,8 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
 		if (err) {
 			if (shared_fdb || roce_lag)
 				mlx5_lag_add_devices(ldev);
-			if (shared_fdb) {
-				mlx5_ldev_for_each(i, 0, ldev)
-					mlx5_eswitch_reload_ib_reps(mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
-			}
+			if (shared_fdb)
+				mlx5_lag_reload_ib_reps(ldev, 0);
 
 			return;
 		}
@@ -1409,24 +1426,15 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
 					mlx5_nic_vport_enable_roce(dev);
 			}
 		} else if (shared_fdb) {
-			int i;
-
 			dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
 			mlx5_rescan_drivers_locked(dev0);
-
-			mlx5_ldev_for_each(i, 0, ldev) {
-				err = mlx5_eswitch_reload_ib_reps(mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
-				if (err)
-					break;
-			}
-
+			err = mlx5_lag_reload_ib_reps(ldev, 0);
 			if (err) {
 				dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
 				mlx5_rescan_drivers_locked(dev0);
 				mlx5_deactivate_lag(ldev);
 				mlx5_lag_add_devices(ldev);
-				mlx5_ldev_for_each(i, 0, ldev)
-					mlx5_eswitch_reload_ib_reps(mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
+				mlx5_lag_reload_ib_reps(ldev, 0);
 				mlx5_core_err(dev0, "Failed to enable lag\n");
 				return;
 			}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
index 6c911374f409..db561e306fc7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
@@ -199,4 +199,5 @@ int mlx5_get_next_ldev_func(struct mlx5_lag *ldev, int start_idx);
 int mlx5_lag_get_dev_index_by_seq(struct mlx5_lag *ldev, int seq);
 int mlx5_lag_num_devs(struct mlx5_lag *ldev);
 int mlx5_lag_num_netdevs(struct mlx5_lag *ldev);
+int mlx5_lag_reload_ib_reps(struct mlx5_lag *ldev, u32 flags);
 #endif /* __MLX5_LAG_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
index 5eea12a6887a..4d68e3092a56 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
@@ -70,7 +70,6 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
 	int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
 	struct mlx5_core_dev *dev0;
 	int err;
-	int i;
 
 	if (ldev->mode == MLX5_LAG_MODE_MPESW)
 		return 0;
@@ -103,11 +102,9 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
 
 	dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
 	mlx5_rescan_drivers_locked(dev0);
-	mlx5_ldev_for_each(i, 0, ldev) {
-		err = mlx5_eswitch_reload_ib_reps(mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
-		if (err)
-			goto err_rescan_drivers;
-	}
+	err = mlx5_lag_reload_ib_reps(ldev, 0);
+	if (err)
+		goto err_rescan_drivers;
 
 	mlx5_lag_set_vports_agg_speed(ldev);
 
@@ -119,8 +116,7 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
 	mlx5_deactivate_lag(ldev);
 err_add_devices:
 	mlx5_lag_add_devices(ldev);
-	mlx5_ldev_for_each(i, 0, ldev)
-		mlx5_eswitch_reload_ib_reps(mlx5_lag_pf(ldev, i)->dev->priv.eswitch);
+	mlx5_lag_reload_ib_reps(ldev, 0);
 	mlx5_mpesw_metadata_cleanup(ldev);
 	return err;
 }
-- 
2.44.0


^ permalink raw reply related

* [PATCH net-next 0/7] net/mlx5: Improve representor lifecycle and fix work queue deadlock
From: Tariq Toukan @ 2026-04-09 11:55 UTC (permalink / raw)
  To: Eric Dumazet, Jakub Kicinski, Paolo Abeni, Andrew Lunn,
	David S. Miller
  Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Mark Bloch,
	Shay Drory, Or Har-Toov, Edward Srouji, Maher Sanalla,
	Simon Horman, Moshe Shemesh, Kees Cook, Patrisious Haddad,
	Gerd Bayer, Parav Pandit, Cosmin Ratiu, Carolina Jubran, netdev,
	linux-rdma, linux-kernel, Gal Pressman, Dragos Tatulea

Hi,

See detailed description by Mark below [1].

Regards,
Tariq

[1]
This series addresses three problems that have been present for years.
First, there is no coordination between E-Switch reconfiguration and
representor registration. The E-Switch can be mid-way through a mode
change or VF count update while mlx5_ib walks in and registers or
unregisters representors. Nothing stops them. The race window is small
and there is no field report, but it is clearly wrong.

A mutex is not the answer. The representor callbacks reach into RDMA,
netdev, and LAG layers that already hold their own locks, making a
new mutex in the E-Switch layer a deadlock waiting to happen.

Second, the E-Switch work queue has a deadlock of its own.
mlx5_eswitch_cleanup() drains the work queue while holding the devlink
lock. Workers on that queue acquire devlink lock before checking whether
their work is still relevant. They block. The cleanup path waits for
them to finish. Deadlock.

Third, loading mlx5_ib while the device is already in switchdev mode
does not bring up the IB representors. This has been broken for years.
mlx5_eswitch_register_vport_reps() only stores callbacks; nobody
triggers the actual load after registration.

For the work queue deadlock: introduce a generation counter in the
top-level mlx5_eswitch struct (moved from mlx5_esw_functions,
which only covered function-change events) and a generic dispatch helper
mlx5_esw_add_work(). The worker esw_wq_handler() checks the counter
before touching the devlink lock using devl_trylock() in a loop. Stale
work exits immediately without ever contending. The counter is
incremented at every E-Switch operation boundary: cleanup, disable,
mode-set, enable, disable_sriov.

For the registration race: a simple atomic block state guards all
reconfiguration paths. mlx5_esw_reps_block()/mlx5_esw_reps_unblock()
spin a cmpxchg between UNBLOCKED and BLOCKED. Every reconfiguration
path (mode set, enable, disable, VF/SF add/del, LAG reload, and the
register/unregister calls themselves) brackets its work with this guard.
No new locks, no deadlock risk.

For the missing IB representors: now that the work queue infrastructure
is in place, mlx5_eswitch_register_vport_reps() queues a work item that
acquires the devlink lock and loads all relevant representors. This is
the change that actually fixes the long-standing bug.

One thing worth calling out: the block guard is non-reentrant. A caller
that tries to transition UNBLOCKED->BLOCKED while the E-Switch is already
BLOCKED will spin forever. All call sites were audited:

 - mlx5_eswitch_enable/disable/disable_sriov hold BLOCKED only around
   low-level vport helpers that do not call register/unregister.

 - Inside mlx5_eswitch_unregister_vport_reps the unload callbacks run
   while BLOCKED is held. The one callback that calls unregister
   (mlx5_ib_vport_rep_unload in LAG shared-FDB mode) only does so on
   peer E-Switch instances, each with its own independent atomic.

 - mlx5_devlink_eswitch_mode_set acquires BLOCKED, then calls
   esw_offloads_start/stop -> esw_mode_change. esw_mode_change releases
   BLOCKED before calling rescan_drivers so that the probe/remove
   callbacks that trigger register/unregister see UNBLOCKED.
   esw_mode_change re-acquires before returning, and mode_set releases
   at the end. This is an explicit hand-off of the guard across the
   rescan window.

 - mlx5_eswitch_register_vport_reps holds BLOCKED only while storing
   callbacks and queuing the load work. The actual rep loading runs from
   the work queue after the guard is released.

Patch 1 is cleanup. LAG and MPESW had the same representor reload
sequence duplicated in several places and the copies had started to
drift. This consolidates them into one helper.

Patches 2-4 fix the work queue deadlock in three steps: first move the
generation counter from mlx5_esw_functions to mlx5_eswitch;
then introduce the generic esw_wq_handler/mlx5_esw_add_work dispatch
infrastructure; then apply the actual fix by switching to devl_trylock
and adding generation increments at all operation boundaries.

Patch 5 adds the atomic block guard for representor registration,
protecting all reconfiguration paths.

Patch 6 moves the representor load triggered by
mlx5_eswitch_register_vport_reps() onto the work queue. This is the
patch that fixes IB representors not coming up when mlx5_ib is loaded
while the device is already in switchdev mode.

Patch 7 adds a driver profile that auto-enables switchdev at device
init, for deployments that always operate in switchdev mode and want
to avoid a manual devlink command after every probe.

Mark Bloch (7):
  net/mlx5: Lag: refactor representor reload handling
  net/mlx5: E-Switch, move work queue generation counter
  net/mlx5: E-Switch, introduce generic work queue dispatch helper
  net/mlx5: E-Switch, fix deadlock between devlink lock and esw->wq
  net/mlx5: E-Switch, block representors during reconfiguration
  net/mlx5: E-switch, load reps via work queue after registration
  net/mlx5: Add profile to auto-enable switchdev mode at device init

 .../net/ethernet/mellanox/mlx5/core/eswitch.c |  25 ++-
 .../net/ethernet/mellanox/mlx5/core/eswitch.h |  15 +-
 .../mellanox/mlx5/core/eswitch_offloads.c     | 204 ++++++++++++++----
 .../net/ethernet/mellanox/mlx5/core/lag/lag.c |  46 ++--
 .../net/ethernet/mellanox/mlx5/core/lag/lag.h |   1 +
 .../ethernet/mellanox/mlx5/core/lag/mpesw.c   |  12 +-
 .../net/ethernet/mellanox/mlx5/core/main.c    |  26 ++-
 .../ethernet/mellanox/mlx5/core/sf/devlink.c  |   5 +
 include/linux/mlx5/driver.h                   |   1 +
 include/linux/mlx5/eswitch.h                  |   5 +
 10 files changed, 267 insertions(+), 73 deletions(-)

base-commit: 9700282a7ec721e285771d995ccfe33845e776dc
-- 
2.44.0

^ permalink raw reply

* Re: [PATCH net-next] selftests: net: py: add test case filtering and listing
From: Gal Pressman @ 2026-04-09 11:50 UTC (permalink / raw)
  To: Jakub Kicinski, davem
  Cc: netdev, edumazet, pabeni, andrew+netdev, horms, shuah, petrm,
	willemb, linux-kselftest
In-Reply-To: <20260407151715.3800579-1-kuba@kernel.org>

On 07/04/2026 18:17, Jakub Kicinski wrote:
> When developing new test cases and reproducing failures in
> existing ones we currently have to run the entire test which
> can take minutes to finish.
> 
> Add command line options for test selection, modeled after
> kselftest_harness.h:
> 
>   -l       list tests (all or filtered)
>   -t name  include test
>   -T name  exclude test
> 
> Since we don't have as clean separation into fixture / variant /
> test as kselftest_harness this is not really a 1 to 1 match.
> We have to lean on glob patterns instead.
> 
> Like in kselftest_harness filters are evaluated in order, first
> match wins. If only exclusions are specified everything else is
> included and vice versa.
> 
> Glob patterns (*, ?, [) are supported in addition to exact
> matching.
> 
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>

This is very useful!

Tested-by: Gal Pressman <gal@nvidia.com>

^ permalink raw reply

* Re: [PATCH net v3 0/5] bonding: 3ad: fix carrier state with no valid slaves
From: Louis Scalbert @ 2026-04-09 11:49 UTC (permalink / raw)
  To: Jonas Gorski
  Cc: Jakub Kicinski, netdev, andrew+netdev, jv, edumazet, pabeni, fbl,
	andy, shemminger, maheshb
In-Reply-To: <6631e1e7-8728-46a4-9999-ea9910a1abfb@gmail.com>

Hello,

Le jeu. 9 avr. 2026 à 08:53, Jonas Gorski <jonas.gorski@gmail.com> a écrit :
>
> On 09/04/2026 05:13, Jakub Kicinski wrote:
> > On Wed,  8 Apr 2026 17:23:48 +0200 Louis Scalbert wrote:
> >> The current behavior is not compliant with the LACP standard. This
> >> patchset introduces a working behavior that is not strictly
> >> standard-compliant either, but is widely adopted across the industry.
> >> It consists of bringing the bonding master interface down to signal to
> >> upper-layer processes that it is not usable.
> >
> > Is the only problem the compliance? If so I don't think this qualifies
> > as a fix. Please drop the Fixes tags and repost for net-next. Please
> > keep in mind the 24h reposting period (also I need some time tomorrow
> > to queue your patch to the CI so that the selftest passes when v4 is
> > posted :()

The problem is not only about compliance.

In his review of v2, Jay argued that the current behavior is
standard-compliant and pointed out that it enables some use cases, such
as PXE facing an LACP bond.

I replied that the current implementation is not actually compliant
with the standard, and that the PXE case does not truly work in a
reliable way, since success depends on a random link choice.

My goal is not to make the PXE use case work. Rather, this series fixes
other problematic scenarios. To avoid regressing setups that may
benefit from the current behavior, I added a configuration knob that
allows preserving the legacy behavior. The legacy mode should be
deprecated in my opinion.

>
> Signalling link up too early can cause issues for some protocols that
> may change behavior in the absence of PDUs from a link partner.

I agree with your point. I have observed issues with
keepalived VRRP when it is configured on top of a bonding interface.

When the bond reports carrier as up while no slave is actually able to
receive traffic (due to the partner not being ready, as indicated by the
absence of LACP negotiation), the VRRP process interprets the interface
as operational. At the same time, the absence of received VRRP
advertisements is interpreted as if it were the only router on the
segment. As a result, it transitions to the MASTER state.

In reality, another VRRP router may already be MASTER and actively
sending advertisements, but those packets are not received due to the
bonding state. This leads to a split-brain condition with multiple
masters on the network.

Such a situation breaks the assumptions of
VRRP, where a single MASTER is expected to handle traffic,
and can result in traffic inconsistency or loss when upper-layer
processes rely on this behavior.

>
> E.g. AFAIU RSTP may decide the bond is an edge port because no RSTP
> BPDUs received and put the bond in forwarding, which then temporarily
> creates a loop once the bond actually starts forwarding packets (until
> it receives the next RSTP BPDU, which may take up to two seconds).
>

There are also situations where BGP incorrectly assumes that the link is
usable while it is not.

Please confirm whether it should be a fix or not.

> Best regards,
> Jonas

Best regards,

Louis Scalbert

^ permalink raw reply

* Re: [PATCH net] l2tp: take a session reference in pppol2tp_ioctl()
From: Paolo Abeni @ 2026-04-09 11:41 UTC (permalink / raw)
  To: Yiqi Sun, jchapman; +Cc: davem, edumazet, kuba, horms, netdev
In-Reply-To: <20260404133245.2391409-1-sunyiqixm@gmail.com>

On 4/4/26 3:32 PM, Yiqi Sun wrote:
> pppol2tp_ioctl() reads sock->sk->sk_user_data and dereferences the
> returned l2tp_session without taking a reference on it.
> 
> Since the ppp socket/session lifetime rework, session teardown runs
> asynchronously and can clear sk_user_data and drop the last session
> reference in parallel with ioctl(). This leaves ioctl() with a stale
> session pointer and can trigger a use-after-free.

It's not immediately obvious to me which code paths are actually
involved; please include a more accurate description of the relevant race.

> 
> Fix this by using pppol2tp_sock_to_session() in pppol2tp_ioctl() and
> dropping the session reference before returning. This matches the
> existing getsockopt/setsockopt paths.
> 
> Fixes: c5cbaef992d64 ("l2tp: refactor ppp socket/session relationship")
> Signed-off-by: Yiqi Sun <sunyiqixm@gmail.com>
> ---
>  net/l2tp/l2tp_ppp.c | 88 +++++++++++++++++++++++++++------------------
>  1 file changed, 54 insertions(+), 34 deletions(-)
> 
> diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
> index ae4543d5597b..e6d7d3537180 100644
> --- a/net/l2tp/l2tp_ppp.c
> +++ b/net/l2tp/l2tp_ppp.c
> @@ -1042,66 +1042,79 @@ static int pppol2tp_tunnel_copy_stats(struct pppol2tp_ioc_stats *stats,
>  static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
>  			  unsigned long arg)
>  {
> +	struct sock *sk = sock->sk;
>  	struct pppol2tp_ioc_stats stats;
>  	struct l2tp_session *session;
> +	int err;
> +
> +	err = -ENOTCONN;
> +	if (!sk->sk_user_data)
> +		goto end;
> +
> +	err = -EBADF;
> +	session = pppol2tp_sock_to_session(sk);
> +	if (!session)
> +		goto end;

Consolidating the checks here brings a user-visible change, as
unsupported cmds previously returned -ENOIOCTLCMD.

>  
>  	switch (cmd) {
>  	case PPPIOCGMRU:
>  	case PPPIOCGFLAGS:
> -		session = sock->sk->sk_user_data;
> -		if (!session)
> -			return -ENOTCONN;
> -
>  		if (WARN_ON(session->magic != L2TP_SESSION_MAGIC))

This check is already present in pppol2tp_sock_to_session()

Similar chunks below.

/P


^ permalink raw reply

* Re: [PATCH net v2 2/2] net: phy: micrel: remove ksz9131_resume()
From: Russell King (Oracle) @ 2026-04-09 11:30 UTC (permalink / raw)
  To: Biju Das
  Cc: Ovidiu Panait, andrew@lunn.ch, hkallweit1@gmail.com,
	davem@davemloft.net, edumazet@google.com, kuba@kernel.org,
	pabeni@redhat.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-renesas-soc@vger.kernel.org
In-Reply-To: <TY3PR01MB11346732465160FFE9DCAADD686582@TY3PR01MB11346.jpnprd01.prod.outlook.com>

On Thu, Apr 09, 2026 at 11:19:43AM +0000, Biju Das wrote:
> Hi Russell King,
> 
> > -----Original Message-----
> > From: Russell King <linux@armlinux.org.uk>
> > Sent: 09 April 2026 12:05
> > Subject: Re: [PATCH net v2 2/2] net: phy: micrel: remove ksz9131_resume()
> > 
> > On Thu, Apr 09, 2026 at 10:52:35AM +0000, Biju Das wrote:
> > > Hi Russell King,
> > >
> > > Thanks for the feedback.
> > >
> > > > -----Original Message-----
> > > > From: Russell King <linux@armlinux.org.uk>
> > > > Sent: 09 April 2026 11:30
> > > > Subject: Re: [PATCH net v2 2/2] net: phy: micrel: remove
> > > > ksz9131_resume()
> > > >
> > > > phy_init_hw() will also call drv->config_intr(), so that doesn't need to be done either.
> > > >
> > > > It will also call drv->config_init(), which will call kszphy_config_reset().
> > > >
> > > > So most of kszphy_resume() becomes unnecessary. I think the only
> > > > thing that remains would be the call to kszphy_enable_clk() - and is it fine to call that after
> > phy_init_hw() ?
> > >
> > > It just needs kszphy_enable_clk() and phydev->drv->config_intr() to
> > > enable PHY interrupts for suspend-to-RAM to work on RZ/G3E SMARC EVK.
> > 
> > I think you mean WoL rather than suspend-to-RAM, although I don't see anything in micrel.c that hints
> > that WoL is supported, so please explain why and how the PHY interrupt impacts suspend-to-RAM.
> 
> This is not WoL. During Suspend-to-RAM, the DDR goes into retention mode while
> the CPU, SoC, and PHY power is cut off.
> 
> During resume, TF-A detects WARM_RESET, brings DDR out of retention, and jumps to
> the PSCI resume path.
> 
> > 
> > Note that a particular interrupt should not wake the system unless
> > enable_irq_wake() has been called for that specific interrupt.
> 
> If PHY interrupts are not configured during resume, no link interrupt is received and the message:
> "renesas-gbeth 11c30000.ethernet end0: Link is Up - 1Gbps/Full - flow control rx/tx"
> is not seen, as shown in [1].

... and why does that happen? Is it because the PHY has lost its
interrupt configuration and that needs to be reprogrammed?

If you don't disable the PHY interrupt in the suspend path, then
will the call to drv->config_intr() via phy_init_hw() before
phy_resume() be sufficient?

-- 
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 80Mbps down 10Mbps up. Decent connectivity at last!

^ permalink raw reply

* RE: [PATCH v5 net-next 0/8] dpll/ice: Add TXC DPLL type and full TX reference clock control for E825
From: Nitka, Grzegorz @ 2026-04-09 11:21 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	intel-wired-lan@lists.osuosl.org, Oros, Petr,
	richardcochran@gmail.com, andrew+netdev@lunn.ch,
	Kitszel, Przemyslaw, Nguyen, Anthony L,
	Prathosh.Satish@microchip.com, Vecera, Ivan, jiri@resnulli.us,
	Kubalewski, Arkadiusz, vadim.fedorenko@linux.dev,
	donald.hunter@gmail.com, horms@kernel.org, pabeni@redhat.com,
	davem@davemloft.net, edumazet@google.com
In-Reply-To: <20260406192312.0f7a2760@kernel.org>



> -----Original Message-----
> From: Jakub Kicinski <kuba@kernel.org>
> Sent: Tuesday, April 7, 2026 4:23 AM
> To: Nitka, Grzegorz <grzegorz.nitka@intel.com>
> Cc: netdev@vger.kernel.org; linux-kernel@vger.kernel.org; intel-wired-
> lan@lists.osuosl.org; Oros, Petr <poros@redhat.com>;
> richardcochran@gmail.com; andrew+netdev@lunn.ch; Kitszel, Przemyslaw
> <przemyslaw.kitszel@intel.com>; Nguyen, Anthony L
> <anthony.l.nguyen@intel.com>; Prathosh.Satish@microchip.com; Vecera,
> Ivan <ivecera@redhat.com>; jiri@resnulli.us; Kubalewski, Arkadiusz
> <arkadiusz.kubalewski@intel.com>; vadim.fedorenko@linux.dev;
> donald.hunter@gmail.com; horms@kernel.org; pabeni@redhat.com;
> davem@davemloft.net; edumazet@google.com
> Subject: Re: [PATCH v5 net-next 0/8] dpll/ice: Add TXC DPLL type and full TX
> reference clock control for E825
> 
> On Fri,  3 Apr 2026 01:06:18 +0200 Grzegorz Nitka wrote:
> > This series adds TX reference clock support for E825 devices and exposes
> > TX clock selection and synchronization status via the Linux DPLL
> > subsystem.
> > E825 hardware contains a dedicated Tx clock (TXC) domain that is
> > distinct
> > from PPS and EEC. TX reference clock selection is device‑wide, shared
> > across ports, and mediated by firmware as part of the link bring‑up
> > process. As a result, TX clock selection intent may differ from the
> > effective hardware configuration, and software must verify the outcome
> > after link‑up.
> > To support this, the series introduces TXC support incrementally across
> > the DPLL core and the ice driver:
> >
> > - add a new DPLL type (TXC) to represent transmit clock generators;
> 
> I'm not grasping why this is needed, isn't it part of any EEC system
> that the DPLL can drive the TXC? Is your system going to expose multiple
> DPLLs now for one NIC?
> 

Hello Jakub,
For the E825 device, the short answer is yes. We have a platform EEC now and
we want to add:
- TXC DPLLs per port, and
- PPS DPLL for TSPLL config purposes (in the near future)

An EEC (Ethernet Equipment Clock) type DPLL is designed to control multiple
source signals (internal to the NIC or external), one of which drives the DPLL
device. Multiple outputs are possible, and each can drive various components
as well as propagate the signal to external devices.
A TXC is a specific DPLL device that is associated with a single ETH port to
control its source; there is no need to declare any outputs, as the single
output is already determined. Basically, having a TXC DPLL indicates per-port
control over the SyncE (or some external) clock source.

> > - relax DPLL pin registration rules for firmware‑described shared pins
> >   and extend pin notifications with a source identifier;
> > - allow dynamic state control of SyncE reference pins where hardware
> >   supports it;
> > - add CPI infrastructure for PHY‑side TX clock control on E825C;
> > - introduce a TXC DPLL device and TX reference clock pins (EXT_EREF0 and
> >   SYNCE) in the ice driver;
> > - extend the Restart Auto‑Negotiation command to carry a TX reference
> >   clock index;
> > - implement hardware‑backed TX reference clock switching, post‑link
> >   verification, and TX synchronization reporting.
> >
> > TXCLK pins report TX reference topology only. Actual synchronization
> > success is reported via the TXC DPLL lock status, which is updated after
> > hardware verification: external Tx references report LOCKED, while the
> > internal ENET/TXCO source reports UNLOCKED.
> > This provides reliable TX reference selection and observability on E825
> > devices using standard DPLL interfaces, without conflating user intent
> > with effective hardware behavior.


^ permalink raw reply

* Re: [PATCH v2] netfilter: nft_fwd_netdev: use recursion counter in neigh egress path
From: Pablo Neira Ayuso @ 2026-04-09 11:21 UTC (permalink / raw)
  To: Weiming Shi
  Cc: Florian Westphal, David S . Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Phil Sutter, Simon Horman, netfilter-devel, coreteam,
	netdev, Xiang Mei
In-Reply-To: <adeIF7ZsJsZsgwQy@chamomile>

On Thu, Apr 09, 2026 at 01:06:03PM +0200, Pablo Neira Ayuso wrote:
> On Thu, Apr 09, 2026 at 06:49:12PM +0800, Weiming Shi wrote:
> > nft_fwd_neigh can be used in egress chains (NF_NETDEV_EGRESS). When the
> > forwarding rule targets the same device or two devices forward to each
> > other, neigh_xmit() triggers dev_queue_xmit() which re-enters
> > nf_hook_egress(), causing infinite recursion and stack overflow.
> > 
> > Move the nf_get_nf_dup_skb_recursion() accessor and NF_RECURSION_LIMIT
> > to the shared header nf_dup_netdev.h as a static inline, so that
> > nft_fwd_netdev can use the recursion counter directly without exported
> > function call overhead. Guard neigh_xmit() with the same recursion
> > limit already used in nf_do_netdev_egress().
> > 
> > Fixes: f87b9464d152 ("netfilter: nft_fwd_netdev: Support egress hook")
> 
> I would just restrict this "feature", I don't see a point in allowing
> this from egress?

Hm, actually this can be combined with if0 device, fixing it makes sense.

> > Reported-by: Xiang Mei <xmei5@asu.edu>
> > Signed-off-by: Weiming Shi <bestswngs@gmail.com>
> > ---
> >  include/net/netfilter/nf_dup_netdev.h | 13 +++++++++++++
> >  net/netfilter/nf_dup_netdev.c         | 16 ----------------
> >  net/netfilter/nft_fwd_netdev.c        |  7 +++++++
> >  3 files changed, 20 insertions(+), 16 deletions(-)
> > 
> > diff --git a/include/net/netfilter/nf_dup_netdev.h b/include/net/netfilter/nf_dup_netdev.h
> > index b175d271aec9..609bcf422a9b 100644
> > --- a/include/net/netfilter/nf_dup_netdev.h
> > +++ b/include/net/netfilter/nf_dup_netdev.h
> > @@ -3,10 +3,23 @@
> >  #define _NF_DUP_NETDEV_H_
> >  
> >  #include <net/netfilter/nf_tables.h>
> > +#include <linux/netdevice.h>
> > +#include <linux/sched.h>
> >  
> >  void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif);
> >  void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif);
> >  
> > +#define NF_RECURSION_LIMIT	2
> > +
> > +static inline u8 *nf_get_nf_dup_skb_recursion(void)
> > +{
> > +#ifndef CONFIG_PREEMPT_RT
> > +	return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion);
> > +#else
> > +	return &current->net_xmit.nf_dup_skb_recursion;
> > +#endif
> > +}
> > +
> >  struct nft_offload_ctx;
> >  struct nft_flow_rule;
> >  
> > diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
> > index fab8b9011098..a958a1b0c5be 100644
> > --- a/net/netfilter/nf_dup_netdev.c
> > +++ b/net/netfilter/nf_dup_netdev.c
> > @@ -13,22 +13,6 @@
> >  #include <net/netfilter/nf_tables_offload.h>
> >  #include <net/netfilter/nf_dup_netdev.h>
> >  
> > -#define NF_RECURSION_LIMIT	2
> > -
> > -#ifndef CONFIG_PREEMPT_RT
> > -static u8 *nf_get_nf_dup_skb_recursion(void)
> > -{
> > -	return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion);
> > -}
> > -#else
> > -
> > -static u8 *nf_get_nf_dup_skb_recursion(void)
> > -{
> > -	return &current->net_xmit.nf_dup_skb_recursion;
> > -}
> > -
> > -#endif
> > -
> >  static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev,
> >  				enum nf_dev_hooks hook)
> >  {
> > diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
> > index 152a9fb4d23a..492bb599a499 100644
> > --- a/net/netfilter/nft_fwd_netdev.c
> > +++ b/net/netfilter/nft_fwd_netdev.c
> > @@ -141,13 +141,20 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
> >  		goto out;
> >  	}
> >  
> > +	if (*nf_get_nf_dup_skb_recursion() > NF_RECURSION_LIMIT) {
> > +		verdict = NF_DROP;
> > +		goto out;
> > +	}
> > +
> >  	dev = dev_get_by_index_rcu(nft_net(pkt), oif);
> >  	if (dev == NULL)
> >  		return;
> >  
> >  	skb->dev = dev;
> >  	skb_clear_tstamp(skb);
> > +	(*nf_get_nf_dup_skb_recursion())++;
> >  	neigh_xmit(neigh_table, dev, addr, skb);
> > +	(*nf_get_nf_dup_skb_recursion())--;
> >  out:
> >  	regs->verdict.code = verdict;
> >  }
> > -- 
> > 2.43.0
> > 
> > 

^ permalink raw reply

* RE: [PATCH net v2 2/2] net: phy: micrel: remove ksz9131_resume()
From: Biju Das @ 2026-04-09 11:19 UTC (permalink / raw)
  To: Russell King
  Cc: Ovidiu Panait, andrew@lunn.ch, hkallweit1@gmail.com,
	davem@davemloft.net, edumazet@google.com, kuba@kernel.org,
	pabeni@redhat.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-renesas-soc@vger.kernel.org
In-Reply-To: <adeH5y5TiZdaK94d@shell.armlinux.org.uk>

Hi Russell King,

> -----Original Message-----
> From: Russell King <linux@armlinux.org.uk>
> Sent: 09 April 2026 12:05
> Subject: Re: [PATCH net v2 2/2] net: phy: micrel: remove ksz9131_resume()
> 
> On Thu, Apr 09, 2026 at 10:52:35AM +0000, Biju Das wrote:
> > Hi Russell King,
> >
> > Thanks for the feedback.
> >
> > > -----Original Message-----
> > > From: Russell King <linux@armlinux.org.uk>
> > > Sent: 09 April 2026 11:30
> > > Subject: Re: [PATCH net v2 2/2] net: phy: micrel: remove
> > > ksz9131_resume()
> > >
> > > phy_init_hw() will also call drv->config_intr(), so that doesn't need to be done either.
> > >
> > > It will also call drv->config_init(), which will call kszphy_config_reset().
> > >
> > > So most of kszphy_resume() becomes unnecessary. I think the only
> > > thing that remains would be the call to kszphy_enable_clk() - and is it fine to call that after
> phy_init_hw() ?
> >
> > It just needs kszphy_enable_clk() and phydev->drv->config_intr() to
> > enable PHY interrupts for suspend-to-RAM to work on RZ/G3E SMARC EVK.
> 
> I think you mean WoL rather than suspend-to-RAM, although I don't see anything in micrel.c that hints
> that WoL is supported, so please explain why and how the PHY interrupt impacts suspend-to-RAM.

This is not WoL. During Suspend-to-RAM, the DDR goes into retention mode while
the CPU, SoC, and PHY power is cut off.

During resume, TF-A detects WARM_RESET, brings DDR out of retention, and jumps to
the PSCI resume path.

> 
> Note that a particular interrupt should not wake the system unless
> enable_irq_wake() has been called for that specific interrupt.

If PHY interrupts are not configured during resume, no link interrupt is received and the message:
"renesas-gbeth 11c30000.ethernet end0: Link is Up - 1Gbps/Full - flow control rx/tx"
is not seen, as shown in [1].

Cheers,
Biju

[1]
root@smarc-rzg3l:~# echo mem > /sys/power/state
[  184.611719] PM: suspend entry (deep)
[  184.616854] Filesystems sync: 0.000 seconds
[  184.629390] Freezing user space processes
[  184.637539] Freezing user space processes completed (elapsed 0.003 seconds)
[  184.644541] OOM killer disabled.
[  184.647758] Freezing remaining freezable tasks
[  184.653520] Freezing remaining freezable tasks completed (elapsed 0.001 seconds)
[  184.660941] printk: Suspending console(s) (use no_console_suspend to debug)
NOTICE:  BL2: v2.10.5(release):2.10.5/rz_soc_dev-383-g15a06c881
NOTICE:  BL2: Built : 12:13:18, Apr  2 2026
INFO:    BL2: Doing platform setup
INFO:    Configuring TrustZone Controller
INFO:    Total 3 regions set.
INFO:    Configuring TrustZone Controller
INFO:    Total 1 regions set.
INFO:    Configuring TrustZone Controller
INFO:    Total 1 regions set.
INFO:    eMMC boot from partition 1
INFO:    Loading image id=39 at address 0x44428
INFO:    emmcdrv_block_len: len: 0x00001000
INFO:    Load dst=0x44428 src=(p:1)0x260000(4864) len=0x1000(8)
INFO:    Image id=39 loaded: 0x44428 - 0x45428
INFO:    DDR: Retention Exit (Rev. 02.05)
NOTICE:  BL2: SYS_LSI_MODE: 0x12051
NOTICE:  BL2: SYS_LSI_DEVID: 0x87d9447
INFO:    BL2: Skip loading image id 3
INFO:    BL2: Skip loading image id 5
NOTICE:  BL2: Booting BL31
INFO:    Entry point address = 0x44000000
INFO:    SPSR = 0x3cd
[  184.670380] renesas-gbeth 11c30000.ethernet end0: Link is Down
[  184.674006] Disabling non-boot CPUs ...
[  184.675870] psci: CPU3 killed (polled 4 ms)
[  184.679357] psci: CPU2 killed (polled 0 ms)
[  184.683525] psci: CPU1 killed (polled 0 ms)
[  184.685755] Enabling non-boot CPUs ...
[  184.686014] Detected VIPT I-cache on CPU1
[  184.686070] GICv3: CPU1: found redistributor 100 region 0:0x0000000012460000
[  184.686119] CPU1: Booted secondary processor 0x0000000100 [0x412fd050]
[  184.687190] CPU1 is up
[  184.687348] Detected VIPT I-cache on CPU2
[  184.687384] GICv3: CPU2: found redistributor 200 region 0:0x0000000012480000
[  184.687419] CPU2: Booted secondary processor 0x0000000200 [0x412fd050]
[  184.688357] CPU2 is up
[  184.688534] Detected VIPT I-cache on CPU3
[  184.688573] GICv3: CPU3: found redistributor 300 region 0:0x00000000124a0000
[  184.688615] CPU3: Booted secondary processor 0x0000000300 [0x412fd050]
[  184.689702] CPU3 is up
[  184.692965] da7213 3-001a: Unable to sync registers 0x23-0x23. -6
[  184.767008] dwmac4: Master AXI performs fixed burst length
[  184.767049] renesas-gbeth 11c30000.ethernet end0: No Safety Features support found
[  184.767090] renesas-gbeth 11c30000.ethernet end0: IEEE 1588-2008 Advanced Timestamp supported
[  184.769791] renesas-gbeth 11c30000.ethernet end0: configuring for phy/rgmii-id link mode
[  184.839754] dwmac4: Master AXI performs fixed burst length
[  184.839784] renesas-gbeth 11c40000.ethernet end1: No Safety Features support found
[  184.839814] renesas-gbeth 11c40000.ethernet end1: IEEE 1588-2008 Advanced Timestamp supported
[  184.840892] renesas-gbeth 11c40000.ethernet end1: configuring for phy/rgmii-id link mode
[  184.994774] OOM killer enabled.
[  184.997922] Restarting tasks: Starting
[  185.002227] Restarting tasks: Done
[  185.005781] random: crng reseeded on system resumption
[  185.011124] PM: suspend exit
root@smarc-rzg3l:~#
root@smarc-rzg3l:~# [  187.356951] renesas-gbeth 11c30000.ethernet end0: Link is Up - 1Gbps/Full - flow control rx/tx



^ permalink raw reply

* Re: [PATCH net-next v7 2/2] r8152: Add support for the RTL8157 hardware
From: Birger Koblitz @ 2026-04-09 11:18 UTC (permalink / raw)
  To: Paolo Abeni, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski
  Cc: linux-usb, netdev, linux-kernel, Chih Kai Hsu
In-Reply-To: <8b324f8c-f4f8-4e90-b5d6-9b87ec3daf2b@redhat.com>

On 09/04/2026 12:16, Paolo Abeni wrote:
> On 4/4/26 9:57 AM, Birger Koblitz wrote:
>> @@ -6534,8 +6842,11 @@ static void rtl8156_up(struct r8152 *tp)
>>   	ocp_word_clr_bits(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL3,
>>   			  PLA_MCU_SPDWN_EN);
>>   
>> -	ocp_word_clr_bits(tp, MCU_TYPE_USB, USB_SPEED_OPTION,
>> -			  RG_PWRDN_EN | ALL_SPEED_OFF);
>> +	ocp_word_clr_bits(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL3, PLA_MCU_SPDWN_EN);
> 
> AI review notes that the above leads to 2 consecutive:
> 
> 	ocp_word_clr_bits(tp, MCU_TYPE_PLA, PLA_MAC_PWR_CTRL3, PLA_MCU_SPDWN_EN);
> 
> with slightly different formatting, likely C&P error?!?
> 
> I think this is better handled with a follow-up, if needed, as I don't
> see any possible issue out of it.
> 
> Other AI comments look not relevant.
Thanks a lot, Paolo!
I will follow up on this for sure, there is also the RTL8159...
I also contacted the Realtek devs, but have not heard back so far; this
probably needs internal escalation...

Birger

^ permalink raw reply

* Re: possible deadlock in virtio_transport_release (vsock_register_mutex vs sk_lock)
From: Stefano Garzarella @ 2026-04-09 11:07 UTC (permalink / raw)
  To: weibo Zhang; +Cc: davem, edumazet, kuba, pabeni, netdev, linux-kernel
In-Reply-To: <CADaZ31BntFddQpK8nnLtGccyKA-kw-usOHZwEscA70+dgB2WSw@mail.gmail.com>

On Thu, 9 Apr 2026 at 11:06, weibo Zhang <weibozhang2050@gmail.com> wrote:
>
> Hi,
>
> I am reporting a possible circular locking dependency (AB-BA deadlock) in the
>
> vsock subsystem between vsock_register_mutex and sk_lock-AF_VSOCK, found by
>
> syzkaller-based kernel fuzzing on Linux 6.12.47.
>
> The deadlock is detected by lockdep and involves two lock ordering paths:
>
> Path #1 (bind path): sk_lock → vsock_register_mutex
>
> vsock_bind() → lock_sock(sk) → __vsock_bind() → vsock_find_cid()
>
> → vsock_registered_transport_cid() → mutex_lock(&vsock_register_mutex)
>
> Path #2 (connect + transport reassignment): vsock_register_mutex → sk_lock
>
> vsock_connect() → vsock_assign_transport()
>
> → mutex_lock(&vsock_register_mutex) [line 469, af_vsock.c]
>
> → old transport->release(vsk) [line 502, af_vsock.c]
>
> → virtio_transport_release()
>
> → virtio_transport_close()
>
> → virtio_transport_wait_close()
>
> → lock_sock(sk) [line 1207, virtio_transport_common.c]
>
> The second path is triggered when a socket that already has a transport
>
> assigned (e.g., loopback from a previous CID_LOCAL connect) attempts to
>
> connect to a different CID (e.g., CID_HOST), causing vsock_assign_transport()
>
> to release the old transport while holding vsock_register_mutex.
>
>
> A possible fix would be to drop vsock_register_mutex before calling
>
> transport->release() in vsock_assign_transport(), or to avoid calling

Which is exactly what we did with
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f7c877e7535260cc7a21484c994e8ce7e8cb6780

This was released with v6.18 and backported on stable branches.

>
> lock_sock() from within the transport release path when called from
>
> vsock_assign_transport().
>
> This can be reproduced on:
>
> HEAD commit: Linux 6.12.47

v6.12.56 should contain the backport of that patch, can you test it
(or a newer version)?

Thanks,
Stefano


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox