Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v3 02/10] enic: add admin channel open and close for SR-IOV
From: Satish Kharat via B4 Relay @ 2026-04-08 16:36 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, linux-kernel,
	20260401-enic-sriov-v2-prep-v4-0-d5834b2ef1b9, Satish Kharat
In-Reply-To: <20260408-enic-sriov-v2-admin-channel-v2-v3-0-1d4999a03cec@cisco.com>

From: Satish Kharat <satishkh@cisco.com>

The V2 SR-IOV design uses a dedicated admin channel (WQ/RQ/CQ/INTR
on separate BAR resources) for PF-VF mailbox communication rather
than firmware-proxied devcmds.

Introduce enic_admin_channel_open() and enic_admin_channel_close().
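Open allocates and initialises the admin WQ, RQ, two CQs (one per
direction) and one interrupt (a dedicated SRIOV_INTR resource on PFs,
a regular INTR_CTRL slot on VFs), enables the WQ and RQ, then issues
CMD_QP_TYPE_SET to tell firmware the queues are admin-type. Close
disables the queues, clears the admin QP type, then cleans and frees
the resources.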

Add CMD_QP_TYPE_SET (97) and QP_TYPE_ADMIN/DATA defines to
vnic_devcmd.h.

Signed-off-by: Satish Kharat <satishkh@cisco.com>
---
 drivers/net/ethernet/cisco/enic/Makefile      |   3 +-
 drivers/net/ethernet/cisco/enic/enic_admin.c  | 175 ++++++++++++++++++++++++++
 drivers/net/ethernet/cisco/enic/enic_admin.h  |  15 +++
 drivers/net/ethernet/cisco/enic/vnic_devcmd.h |   9 ++
 4 files changed, 201 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cisco/enic/Makefile b/drivers/net/ethernet/cisco/enic/Makefile
index a96b8332e6e2..7ae72fefc99a 100644
--- a/drivers/net/ethernet/cisco/enic/Makefile
+++ b/drivers/net/ethernet/cisco/enic/Makefile
@@ -3,5 +3,6 @@ obj-$(CONFIG_ENIC) := enic.o
 
 enic-y := enic_main.o vnic_cq.o vnic_intr.o vnic_wq.o \
 	enic_res.o enic_dev.o enic_pp.o vnic_dev.o vnic_rq.o vnic_vic.o \
-	enic_ethtool.o enic_api.o enic_clsf.o enic_rq.o enic_wq.o
+	enic_ethtool.o enic_api.o enic_clsf.o enic_rq.o enic_wq.o \
+	enic_admin.o
 
diff --git a/drivers/net/ethernet/cisco/enic/enic_admin.c b/drivers/net/ethernet/cisco/enic/enic_admin.c
new file mode 100644
index 000000000000..d1abe6a50095
--- /dev/null
+++ b/drivers/net/ethernet/cisco/enic/enic_admin.c
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2025 Cisco Systems, Inc.  All rights reserved.
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+
+#include "vnic_dev.h"
+#include "vnic_wq.h"
+#include "vnic_rq.h"
+#include "vnic_cq.h"
+#include "vnic_intr.h"
+#include "vnic_resource.h"
+#include "vnic_devcmd.h"
+#include "enic.h"
+#include "enic_admin.h"
+#include "cq_desc.h"
+#include "wq_enet_desc.h"
+#include "rq_enet_desc.h"
+
+/* No-op: admin WQ buffers are freed inline after completion polling */
+static void enic_admin_wq_buf_clean(struct vnic_wq *wq,
+				    struct vnic_wq_buf *buf)
+{
+}
+
+/* No-op: admin RQ buffer teardown is handled in enic_admin_channel_close */
+static void enic_admin_rq_buf_clean(struct vnic_rq *rq,
+				    struct vnic_rq_buf *buf)
+{
+}
+
+static int enic_admin_qp_type_set(struct enic *enic, u32 enable)
+{
+	u64 a0 = QP_TYPE_ADMIN, a1 = enable;
+	int wait = 1000;
+	int err;
+
+	spin_lock_bh(&enic->devcmd_lock);
+	err = vnic_dev_cmd(enic->vdev, CMD_QP_TYPE_SET, &a0, &a1, wait);
+	spin_unlock_bh(&enic->devcmd_lock);
+
+	return err;
+}
+
+static int enic_admin_alloc_resources(struct enic *enic)
+{
+	int err;
+
+	err = vnic_wq_alloc_with_type(enic->vdev, &enic->admin_wq, 0,
+				      ENIC_ADMIN_DESC_COUNT,
+				      sizeof(struct wq_enet_desc),
+				      RES_TYPE_ADMIN_WQ);
+	if (err)
+		return err;
+
+	err = vnic_rq_alloc_with_type(enic->vdev, &enic->admin_rq, 0,
+				      ENIC_ADMIN_DESC_COUNT,
+				      sizeof(struct rq_enet_desc),
+				      RES_TYPE_ADMIN_RQ);
+	if (err)
+		goto free_wq;
+
+	err = vnic_cq_alloc_with_type(enic->vdev, &enic->admin_cq[0], 0,
+				      ENIC_ADMIN_DESC_COUNT,
+				      sizeof(struct cq_desc),
+				      RES_TYPE_ADMIN_CQ);
+	if (err)
+		goto free_rq;
+
+	err = vnic_cq_alloc_with_type(enic->vdev, &enic->admin_cq[1], 1,
+				      ENIC_ADMIN_DESC_COUNT,
+				      16 << enic->ext_cq,
+				      RES_TYPE_ADMIN_CQ);
+	if (err)
+		goto free_cq0;
+
+	/* PFs have dedicated SRIOV_INTR resources for admin channel.
+	 * VFs lack SRIOV_INTR; use a regular INTR_CTRL slot instead.
+	 */
+	if (vnic_dev_get_res_count(enic->vdev, RES_TYPE_SRIOV_INTR) >= 1)
+		err = vnic_intr_alloc_with_type(enic->vdev,
+						&enic->admin_intr, 0,
+						RES_TYPE_SRIOV_INTR);
+	else
+		err = vnic_intr_alloc(enic->vdev, &enic->admin_intr,
+				      enic->intr_count);
+	if (err)
+		goto free_cq1;
+
+	return 0;
+
+free_cq1:
+	vnic_cq_free(&enic->admin_cq[1]);
+free_cq0:
+	vnic_cq_free(&enic->admin_cq[0]);
+free_rq:
+	vnic_rq_free(&enic->admin_rq);
+free_wq:
+	vnic_wq_free(&enic->admin_wq);
+	return err;
+}
+
+static void enic_admin_free_resources(struct enic *enic)
+{
+	vnic_intr_free(&enic->admin_intr);
+	vnic_cq_free(&enic->admin_cq[1]);
+	vnic_cq_free(&enic->admin_cq[0]);
+	vnic_rq_free(&enic->admin_rq);
+	vnic_wq_free(&enic->admin_wq);
+}
+
+static void enic_admin_init_resources(struct enic *enic)
+{
+	vnic_wq_init(&enic->admin_wq, 0, 0, 0);
+	vnic_rq_init(&enic->admin_rq, 1, 0, 0);
+	vnic_cq_init(&enic->admin_cq[0], 0, 1, 0, 0, 1, 0, 1, 0, 0, 0);
+	vnic_cq_init(&enic->admin_cq[1], 0, 1, 0, 0, 1, 0, 1, 0, 0, 0);
+	vnic_intr_init(&enic->admin_intr, 0, 0, 1);
+}
+
+int enic_admin_channel_open(struct enic *enic)
+{
+	int err;
+
+	if (!enic->has_admin_channel)
+		return -ENODEV;
+
+	err = enic_admin_alloc_resources(enic);
+	if (err) {
+		netdev_err(enic->netdev,
+			   "Failed to alloc admin channel resources: %d\n",
+			   err);
+		return err;
+	}
+
+	enic_admin_init_resources(enic);
+
+	vnic_wq_enable(&enic->admin_wq);
+	vnic_rq_enable(&enic->admin_rq);
+
+	err = enic_admin_qp_type_set(enic, 1);
+	if (err) {
+		netdev_err(enic->netdev,
+			   "Failed to set admin QP type: %d\n", err);
+		goto disable_queues;
+	}
+
+	return 0;
+
+disable_queues:
+	vnic_wq_disable(&enic->admin_wq);
+	vnic_rq_disable(&enic->admin_rq);
+	enic_admin_qp_type_set(enic, 0);
+	enic_admin_free_resources(enic);
+	return err;
+}
+
+void enic_admin_channel_close(struct enic *enic)
+{
+	if (!enic->has_admin_channel)
+		return;
+
+	vnic_wq_disable(&enic->admin_wq);
+	vnic_rq_disable(&enic->admin_rq);
+
+	enic_admin_qp_type_set(enic, 0);
+
+	vnic_wq_clean(&enic->admin_wq, enic_admin_wq_buf_clean);
+	vnic_rq_clean(&enic->admin_rq, enic_admin_rq_buf_clean);
+	vnic_cq_clean(&enic->admin_cq[0]);
+	vnic_cq_clean(&enic->admin_cq[1]);
+	vnic_intr_clean(&enic->admin_intr);
+
+	enic_admin_free_resources(enic);
+}
diff --git a/drivers/net/ethernet/cisco/enic/enic_admin.h b/drivers/net/ethernet/cisco/enic/enic_admin.h
new file mode 100644
index 000000000000..569aadeb9312
--- /dev/null
+++ b/drivers/net/ethernet/cisco/enic/enic_admin.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright 2025 Cisco Systems, Inc.  All rights reserved. */
+
+#ifndef _ENIC_ADMIN_H_
+#define _ENIC_ADMIN_H_
+
+#define ENIC_ADMIN_DESC_COUNT	64
+#define ENIC_ADMIN_BUF_SIZE	2048
+
+struct enic;
+
+int enic_admin_channel_open(struct enic *enic);
+void enic_admin_channel_close(struct enic *enic);
+
+#endif /* _ENIC_ADMIN_H_ */
diff --git a/drivers/net/ethernet/cisco/enic/vnic_devcmd.h b/drivers/net/ethernet/cisco/enic/vnic_devcmd.h
index 7a4bce736105..a1c8f522c7d7 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_devcmd.h
+++ b/drivers/net/ethernet/cisco/enic/vnic_devcmd.h
@@ -455,8 +455,17 @@ enum vnic_devcmd_cmd {
 	 */
 	CMD_CQ_ENTRY_SIZE_SET = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 90),
 
+	/*
+	 * Set queue pair type (admin or data)
+	 * in: (u32) a0 = queue pair type (0 = admin, 1 = data)
+	 * in: (u32) a1 = enable (1) / disable (0)
+	 */
+	CMD_QP_TYPE_SET = _CMDC(_CMD_DIR_WRITE, _CMD_VTYPE_ENET, 97),
 };
 
+#define QP_TYPE_ADMIN	0
+#define QP_TYPE_DATA	1
+
 /* CMD_ENABLE2 flags */
 #define CMD_ENABLE2_STANDBY 0x0
 #define CMD_ENABLE2_ACTIVE  0x1

-- 
2.43.0



^ permalink raw reply related

* [PATCH net-next v3 05/10] enic: define MBOX message types and header structures
From: Satish Kharat via B4 Relay @ 2026-04-08 16:36 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, linux-kernel,
	20260401-enic-sriov-v2-prep-v4-0-d5834b2ef1b9, Satish Kharat
In-Reply-To: <20260408-enic-sriov-v2-admin-channel-v2-v3-0-1d4999a03cec@cisco.com>

From: Satish Kharat <satishkh@cisco.com>

Define the mailbox protocol used for PF-VF communication over the
admin channel. The protocol uses request/reply pairs where even
message types are requests and odd are replies.

Initial message types cover the core SR-IOV handshake:
  - VF_CAPABILITY: version negotiation
  - VF_REGISTER/UNREGISTER: VF lifecycle management
  - PF_LINK_STATE_NOTIF: PF-initiated link state changes

Each message carries a common header (src/dst vnic ID, type, flags,
length, sequence number) followed by a type-specific payload.

Signed-off-by: Satish Kharat <satishkh@cisco.com>
---
 drivers/net/ethernet/cisco/enic/enic_mbox.h | 75 +++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/drivers/net/ethernet/cisco/enic/enic_mbox.h b/drivers/net/ethernet/cisco/enic/enic_mbox.h
new file mode 100644
index 000000000000..84cb6bbc1ead
--- /dev/null
+++ b/drivers/net/ethernet/cisco/enic/enic_mbox.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright 2025 Cisco Systems, Inc.  All rights reserved. */
+
+#ifndef _ENIC_MBOX_H_
+#define _ENIC_MBOX_H_
+
+/*
+ * Mailbox protocol for PF-VF communication over the admin channel.
+ *
+ * Even numbers are requests, odd numbers are replies/acks.
+ * The prefix indicates the initiator: VF_ = VF-initiated, PF_ = PF-initiated.
+ */
+enum enic_mbox_msg_type {
+	ENIC_MBOX_VF_CAPABILITY_REQUEST		= 0,
+	ENIC_MBOX_VF_CAPABILITY_REPLY		= 1,
+	ENIC_MBOX_VF_REGISTER_REQUEST		= 2,
+	ENIC_MBOX_VF_REGISTER_REPLY		= 3,
+	ENIC_MBOX_VF_UNREGISTER_REQUEST		= 4,
+	ENIC_MBOX_VF_UNREGISTER_REPLY		= 5,
+	ENIC_MBOX_PF_LINK_STATE_NOTIF		= 6,
+	ENIC_MBOX_PF_LINK_STATE_ACK		= 7,
+	ENIC_MBOX_MAX
+};
+
+struct enic_mbox_hdr {
+	__le16 src_vnic_id;
+	__le16 dst_vnic_id;
+	u8 msg_type;
+	u8 flags;
+	__le16 msg_len;
+	__le64 msg_num;
+};
+
+struct enic_mbox_generic_reply {
+	__le16 ret_major;
+	__le16 ret_minor;
+};
+
+#define ENIC_MBOX_ERR_GENERIC		BIT(0)
+#define ENIC_MBOX_ERR_VF_NOT_REGISTERED	BIT(1)
+#define ENIC_MBOX_ERR_MSG_NOT_SUPPORTED	BIT(2)
+
+/* ENIC_MBOX_VF_CAPABILITY_REQUEST / _REPLY */
+#define ENIC_MBOX_CAP_VERSION_0		0
+#define ENIC_MBOX_CAP_VERSION_1		1
+
+struct enic_mbox_vf_capability_msg {
+	__le32 version;
+	__le32 reserved[32];
+};
+
+struct enic_mbox_vf_capability_reply_msg {
+	struct enic_mbox_generic_reply reply;
+	__le32 version;
+	__le32 reserved[32];
+};
+
+/* ENIC_MBOX_VF_REGISTER / _UNREGISTER */
+struct enic_mbox_vf_register_reply_msg {
+	struct enic_mbox_generic_reply reply;
+};
+
+/* ENIC_MBOX_PF_LINK_STATE_NOTIF / _ACK */
+#define ENIC_MBOX_LINK_STATE_DISABLE	0
+#define ENIC_MBOX_LINK_STATE_ENABLE	1
+
+struct enic_mbox_pf_link_state_notif_msg {
+	__le32 link_state;
+};
+
+struct enic_mbox_pf_link_state_ack_msg {
+	struct enic_mbox_generic_reply ack;
+};
+
+#endif /* _ENIC_MBOX_H_ */

-- 
2.43.0



^ permalink raw reply related

* [PATCH net-next v3 01/10] enic: verify firmware supports V2 SR-IOV at probe time
From: Satish Kharat via B4 Relay @ 2026-04-08 16:36 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, linux-kernel,
	20260401-enic-sriov-v2-prep-v4-0-d5834b2ef1b9, Satish Kharat,
	Breno Leitao
In-Reply-To: <20260408-enic-sriov-v2-admin-channel-v2-v3-0-1d4999a03cec@cisco.com>

From: Satish Kharat <satishkh@cisco.com>

During PF probe, query the firmware get-supported-feature interface
to verify that the running firmware supports V2 SR-IOV. Firmware
version 5.3(4.72) and later report VIC_FEATURE_SRIOV via
CMD_GET_SUPP_FEATURE_VER. If the firmware does not support the
feature, set vf_type to ENIC_VF_TYPE_NONE and log a warning so the
admin knows a firmware upgrade is needed.

The VIC_FEATURE_SRIOV enum value (4) matches the firmware ABI. A
placeholder entry (VIC_FEATURE_PTP at position 3) is added to keep
the enum in sync with firmware's feature numbering.

Suggested-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Satish Kharat <satishkh@cisco.com>
---
 drivers/net/ethernet/cisco/enic/enic_main.c   | 19 +++++++++++++++++++
 drivers/net/ethernet/cisco/enic/vnic_devcmd.h |  2 ++
 2 files changed, 21 insertions(+)

diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index e7125b818087..73bb59eef7a0 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -2641,8 +2641,10 @@ static void enic_iounmap(struct enic *enic)
 static void enic_sriov_detect_vf_type(struct enic *enic)
 {
 	struct pci_dev *pdev = enic->pdev;
+	u64 supported_versions, a1 = 0;
 	int pos;
 	u16 vf_dev_id;
+	int err;
 
 	if (enic_is_sriov_vf(enic) || enic_is_dynamic(enic))
 		return;
@@ -2669,6 +2671,23 @@ static void enic_sriov_detect_vf_type(struct enic *enic)
 		enic->vf_type = ENIC_VF_TYPE_NONE;
 		break;
 	}
+
+	if (enic->vf_type != ENIC_VF_TYPE_V2)
+		return;
+
+	/* A successful command means firmware recognizes
+	 * VIC_FEATURE_SRIOV; supported_versions is available
+	 * for sub-feature versioning in the future.
+	 */
+	err = vnic_dev_get_supported_feature_ver(enic->vdev,
+						 VIC_FEATURE_SRIOV,
+						 &supported_versions,
+						 &a1);
+	if (err) {
+		dev_warn(&pdev->dev,
+			 "SR-IOV V2 not supported by current firmware. Upgrade to VIC FW 5.3(4.72) or higher.\n");
+		enic->vf_type = ENIC_VF_TYPE_NONE;
+	}
 }
 #endif
 
diff --git a/drivers/net/ethernet/cisco/enic/vnic_devcmd.h b/drivers/net/ethernet/cisco/enic/vnic_devcmd.h
index 605ef17f967e..7a4bce736105 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_devcmd.h
+++ b/drivers/net/ethernet/cisco/enic/vnic_devcmd.h
@@ -734,6 +734,8 @@ enum vic_feature_t {
 	VIC_FEATURE_VXLAN,
 	VIC_FEATURE_RDMA,
 	VIC_FEATURE_VXLAN_PATCH,
+	VIC_FEATURE_PTP,
+	VIC_FEATURE_SRIOV,
 	VIC_FEATURE_MAX,
 };
 

-- 
2.43.0



^ permalink raw reply related

* [PATCH net-next v3 03/10] enic: add admin RQ buffer management
From: Satish Kharat via B4 Relay @ 2026-04-08 16:36 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, linux-kernel,
	20260401-enic-sriov-v2-prep-v4-0-d5834b2ef1b9, Satish Kharat
In-Reply-To: <20260408-enic-sriov-v2-admin-channel-v2-v3-0-1d4999a03cec@cisco.com>

From: Satish Kharat <satishkh@cisco.com>

The admin receive queue needs pre-posted DMA buffers for incoming
mailbox messages from VFs. Each buffer is a kmalloc'd region mapped
for DMA (2048 bytes, sufficient for any MBOX message).

Add enic_admin_rq_fill() to post buffers at open time, and
enic_admin_rq_drain() to unmap and free them at close time.
Wire both into the admin channel open/close paths.

Signed-off-by: Satish Kharat <satishkh@cisco.com>
---
 drivers/net/ethernet/cisco/enic/enic_admin.c | 66 +++++++++++++++++++++++++++-
 1 file changed, 64 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cisco/enic/enic_admin.c b/drivers/net/ethernet/cisco/enic/enic_admin.c
index d1abe6a50095..a8fcd5f116d1 100644
--- a/drivers/net/ethernet/cisco/enic/enic_admin.c
+++ b/drivers/net/ethernet/cisco/enic/enic_admin.c
@@ -3,6 +3,7 @@
 
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
+#include <linux/dma-mapping.h>
 
 #include "vnic_dev.h"
 #include "vnic_wq.h"
@@ -23,10 +24,63 @@ static void enic_admin_wq_buf_clean(struct vnic_wq *wq,
 {
 }
 
-/* No-op: admin RQ buffer teardown is handled in enic_admin_channel_close */
 static void enic_admin_rq_buf_clean(struct vnic_rq *rq,
 				    struct vnic_rq_buf *buf)
 {
+	struct enic *enic = vnic_dev_priv(rq->vdev);
+
+	if (!buf->os_buf)
+		return;
+
+	dma_unmap_single(&enic->pdev->dev, buf->dma_addr, buf->len,
+			 DMA_FROM_DEVICE);
+	kfree(buf->os_buf);
+	buf->os_buf = NULL;
+}
+
+static int enic_admin_rq_post_one(struct enic *enic)
+{
+	struct vnic_rq *rq = &enic->admin_rq;
+	struct rq_enet_desc *desc;
+	dma_addr_t dma_addr;
+	void *buf;
+
+	buf = kmalloc(ENIC_ADMIN_BUF_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	dma_addr = dma_map_single(&enic->pdev->dev, buf, ENIC_ADMIN_BUF_SIZE,
+				  DMA_FROM_DEVICE);
+	if (dma_mapping_error(&enic->pdev->dev, dma_addr)) {
+		kfree(buf);
+		return -ENOMEM;
+	}
+
+	desc = vnic_rq_next_desc(rq);
+	rq_enet_desc_enc(desc, (u64)dma_addr | VNIC_PADDR_TARGET,
+			 RQ_ENET_TYPE_ONLY_SOP, ENIC_ADMIN_BUF_SIZE);
+	vnic_rq_post(rq, buf, 0, dma_addr, ENIC_ADMIN_BUF_SIZE, 0);
+
+	return 0;
+}
+
+static int enic_admin_rq_fill(struct enic *enic)
+{
+	struct vnic_rq *rq = &enic->admin_rq;
+	int err;
+
+	while (vnic_rq_desc_avail(rq) > 0) {
+		err = enic_admin_rq_post_one(enic);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static void enic_admin_rq_drain(struct enic *enic)
+{
+	vnic_rq_clean(&enic->admin_rq, enic_admin_rq_buf_clean);
 }
 
 static int enic_admin_qp_type_set(struct enic *enic, u32 enable)
@@ -138,6 +192,13 @@ int enic_admin_channel_open(struct enic *enic)
 	vnic_wq_enable(&enic->admin_wq);
 	vnic_rq_enable(&enic->admin_rq);
 
+	err = enic_admin_rq_fill(enic);
+	if (err) {
+		netdev_err(enic->netdev,
+			   "Failed to fill admin RQ buffers: %d\n", err);
+		goto disable_queues;
+	}
+
 	err = enic_admin_qp_type_set(enic, 1);
 	if (err) {
 		netdev_err(enic->netdev,
@@ -151,6 +212,7 @@ int enic_admin_channel_open(struct enic *enic)
 	vnic_wq_disable(&enic->admin_wq);
 	vnic_rq_disable(&enic->admin_rq);
 	enic_admin_qp_type_set(enic, 0);
+	enic_admin_rq_drain(enic);
 	enic_admin_free_resources(enic);
 	return err;
 }
@@ -166,7 +228,7 @@ void enic_admin_channel_close(struct enic *enic)
 	enic_admin_qp_type_set(enic, 0);
 
 	vnic_wq_clean(&enic->admin_wq, enic_admin_wq_buf_clean);
-	vnic_rq_clean(&enic->admin_rq, enic_admin_rq_buf_clean);
+	enic_admin_rq_drain(enic);
 	vnic_cq_clean(&enic->admin_cq[0]);
 	vnic_cq_clean(&enic->admin_cq[1]);
 	vnic_intr_clean(&enic->admin_intr);

-- 
2.43.0



^ permalink raw reply related

* [PATCH net-next v3 06/10] enic: add MBOX core send and receive for admin channel
From: Satish Kharat via B4 Relay @ 2026-04-08 16:36 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, linux-kernel,
	20260401-enic-sriov-v2-prep-v4-0-d5834b2ef1b9, Satish Kharat
In-Reply-To: <20260408-enic-sriov-v2-admin-channel-v2-v3-0-1d4999a03cec@cisco.com>

From: Satish Kharat <satishkh@cisco.com>

Implement the mailbox protocol engine used for PF-VF communication
over the admin channel.

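The send path (enic_mbox_send_msg) builds a message with a common
header, DMA-maps it, posts a single WQ descriptor with the
destination encoded in the VLAN tag field (0 for the PF, VF index + 1
for a VF), and polls the WQ CQ for completion for up to 5 seconds.
On timeout the send returns -ETIMEDOUT, the buffer is left mapped
because the descriptor is still live in hardware, and further sends
are refused until the admin channel is re-opened.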

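The receive path (enic_mbox_recv_handler) is installed as the admin
RQ callback and validates incoming message headers (minimum length
and known message type). Before a received message is queued, the
RQ CQ service overwrites the header's src_vnic_id with the sender
identity taken from the CQ descriptor's VLAN field, so the handler
does not rely on the value supplied by the sender. PF/VF-specific
dispatch will be added in subsequent commits.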

Signed-off-by: Satish Kharat <satishkh@cisco.com>
---
 drivers/net/ethernet/cisco/enic/Makefile     |   2 +-
 drivers/net/ethernet/cisco/enic/enic.h       |   6 ++
 drivers/net/ethernet/cisco/enic/enic_admin.c |  23 +++-
 drivers/net/ethernet/cisco/enic/enic_mbox.c  | 156 +++++++++++++++++++++++++++
 drivers/net/ethernet/cisco/enic/enic_mbox.h  |   8 ++
 5 files changed, 193 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cisco/enic/Makefile b/drivers/net/ethernet/cisco/enic/Makefile
index 7ae72fefc99a..e38aaf34c148 100644
--- a/drivers/net/ethernet/cisco/enic/Makefile
+++ b/drivers/net/ethernet/cisco/enic/Makefile
@@ -4,5 +4,5 @@ obj-$(CONFIG_ENIC) := enic.o
 enic-y := enic_main.o vnic_cq.o vnic_intr.o vnic_wq.o \
 	enic_res.o enic_dev.o enic_pp.o vnic_dev.o vnic_rq.o vnic_vic.o \
 	enic_ethtool.o enic_api.o enic_clsf.o enic_rq.o enic_wq.o \
-	enic_admin.o
+	enic_admin.o enic_mbox.o
 
diff --git a/drivers/net/ethernet/cisco/enic/enic.h b/drivers/net/ethernet/cisco/enic/enic.h
index 1c09da3c0b1a..42f345aceced 100644
--- a/drivers/net/ethernet/cisco/enic/enic.h
+++ b/drivers/net/ethernet/cisco/enic/enic.h
@@ -292,6 +292,8 @@ struct enic {
 
 	/* Admin channel resources for SR-IOV MBOX */
 	bool has_admin_channel;
+	/* set on send timeout; cleared on channel re-open */
+	bool mbox_send_disabled;
 	struct vnic_wq admin_wq;
 	struct vnic_rq admin_rq;
 	struct vnic_cq admin_cq[2];
@@ -304,6 +306,10 @@ struct enic {
 	u64 admin_msg_drop_cnt;
 	void (*admin_rq_handler)(struct enic *enic, void *buf,
 				 unsigned int len);
+
+	/* MBOX protocol state */
+	struct mutex mbox_lock;
+	u64 mbox_msg_num;
 };
 
 static inline struct net_device *vnic_get_netdev(struct vnic_dev *vdev)
diff --git a/drivers/net/ethernet/cisco/enic/enic_admin.c b/drivers/net/ethernet/cisco/enic/enic_admin.c
index 345d194c6eeb..c96268adc173 100644
--- a/drivers/net/ethernet/cisco/enic/enic_admin.c
+++ b/drivers/net/ethernet/cisco/enic/enic_admin.c
@@ -19,6 +19,7 @@
 #include "cq_enet_desc.h"
 #include "wq_enet_desc.h"
 #include "rq_enet_desc.h"
+#include "enic_mbox.h"
 
 /* No-op: admin WQ buffers are freed inline after completion polling */
 static void enic_admin_wq_buf_clean(struct vnic_wq *wq,
@@ -156,7 +157,26 @@ unsigned int enic_admin_rq_cq_service(struct enic *enic, unsigned int budget)
 					buf->dma_addr, buf->len,
 					DMA_FROM_DEVICE);
 
-		enic_admin_msg_enqueue(enic, buf->os_buf, buf->len);
+		if (enic->admin_rq_handler) {
+			struct cq_enet_rq_desc *rq_desc = desc;
+			u16 sender_vlan;
+
+			/* Firmware sets the CQ VLAN field to identify the
+			 * sender: 0 = PF, 1-based = VF index.  Overwrite
+			 * the untrusted src_vnic_id in the MBOX header with
+			 * the hardware-verified value.
+			 */
+			sender_vlan = le16_to_cpu(rq_desc->vlan);
+			if (buf->len >= sizeof(struct enic_mbox_hdr)) {
+				struct enic_mbox_hdr *hdr = buf->os_buf;
+
+				hdr->src_vnic_id = (sender_vlan == 0) ?
+					cpu_to_le16(ENIC_MBOX_DST_PF) :
+					cpu_to_le16(sender_vlan - 1);
+			}
+
+			enic_admin_msg_enqueue(enic, buf->os_buf, buf->len);
+		}
 
 		enic_admin_rq_buf_clean(rq, rq->to_clean);
 		rq->to_clean = rq->to_clean->next;
@@ -389,6 +409,7 @@ int enic_admin_channel_open(struct enic *enic)
 	if (!enic->has_admin_channel)
 		return -ENODEV;
 
+	enic->mbox_send_disabled = false;
 	err = enic_admin_alloc_resources(enic);
 	if (err) {
 		netdev_err(enic->netdev,
diff --git a/drivers/net/ethernet/cisco/enic/enic_mbox.c b/drivers/net/ethernet/cisco/enic/enic_mbox.c
new file mode 100644
index 000000000000..00ab76a47a35
--- /dev/null
+++ b/drivers/net/ethernet/cisco/enic/enic_mbox.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2025 Cisco Systems, Inc.  All rights reserved.
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/dma-mapping.h>
+#include <linux/delay.h>
+
+#include "vnic_dev.h"
+#include "vnic_wq.h"
+#include "vnic_cq.h"
+#include "enic.h"
+#include "enic_admin.h"
+#include "enic_mbox.h"
+#include "wq_enet_desc.h"
+
+#define ENIC_MBOX_POLL_TIMEOUT_US	5000000
+#define ENIC_MBOX_POLL_INTERVAL_US	100
+
+static void enic_mbox_fill_hdr(struct enic *enic, struct enic_mbox_hdr *hdr,
+			       u8 msg_type, u16 dst_vnic_id, u16 msg_len)
+{
+	memset(hdr, 0, sizeof(*hdr));
+	hdr->dst_vnic_id = cpu_to_le16(dst_vnic_id);
+	hdr->msg_type = msg_type;
+	hdr->msg_len = cpu_to_le16(msg_len);
+	hdr->msg_num = cpu_to_le64(++enic->mbox_msg_num);
+}
+
+int enic_mbox_send_msg(struct enic *enic, u8 msg_type, u16 dst_vnic_id,
+		       void *payload, u16 payload_len)
+{
+	u16 total_len = sizeof(struct enic_mbox_hdr) + payload_len;
+	struct vnic_wq *wq = &enic->admin_wq;
+	struct wq_enet_desc *desc;
+	dma_addr_t dma_addr;
+	unsigned long timeout;
+	u16 vlan_tag;
+	void *buf;
+	int err;
+
+	/* Serialize MBOX sends. The admin channel is a low-frequency
+	 * control path; holding the mutex across the poll is acceptable.
+	 */
+	mutex_lock(&enic->mbox_lock);
+
+	if (!enic->has_admin_channel || enic->mbox_send_disabled) {
+		err = -ENODEV;
+		goto unlock;
+	}
+
+	if (vnic_wq_desc_avail(wq) == 0) {
+		err = -ENOSPC;
+		goto unlock;
+	}
+
+	buf = kmalloc(total_len, GFP_KERNEL);
+	if (!buf) {
+		err = -ENOMEM;
+		goto unlock;
+	}
+
+	enic_mbox_fill_hdr(enic, buf, msg_type, dst_vnic_id, total_len);
+	if (payload_len) {
+		void *dst = buf + sizeof(struct enic_mbox_hdr);
+
+		memcpy(dst, payload, payload_len);
+	}
+
+	dma_addr = dma_map_single(&enic->pdev->dev, buf, total_len,
+				  DMA_TO_DEVICE);
+	if (dma_mapping_error(&enic->pdev->dev, dma_addr)) {
+		kfree(buf);
+		err = -ENOMEM;
+		goto unlock;
+	}
+
+	/* Firmware uses vlan field for routing: 0 = PF, 1-based = VF index */
+	if (dst_vnic_id == ENIC_MBOX_DST_PF)
+		vlan_tag = 0;
+	else
+		vlan_tag = dst_vnic_id + 1;
+
+	desc = vnic_wq_next_desc(wq);
+	wq_enet_desc_enc(desc, (u64)dma_addr | VNIC_PADDR_TARGET,
+			 total_len, 0, 0, 0, 1, 1, 0, 1, vlan_tag, 0);
+	vnic_wq_post(wq, buf, dma_addr, total_len, 1, 1, 1, 1, 0, 0);
+	vnic_wq_doorbell(wq);
+
+	timeout = jiffies + usecs_to_jiffies(ENIC_MBOX_POLL_TIMEOUT_US);
+	err = -ETIMEDOUT;
+	while (time_before(jiffies, timeout)) {
+		if (enic_admin_wq_cq_service(enic)) {
+			err = 0;
+			break;
+		}
+		usleep_range(ENIC_MBOX_POLL_INTERVAL_US,
+			     ENIC_MBOX_POLL_INTERVAL_US + 50);
+	}
+
+	if (!err) {
+		wq->to_clean = wq->to_clean->next;
+		wq->ring.desc_avail++;
+		dma_unmap_single(&enic->pdev->dev, dma_addr, total_len,
+				 DMA_TO_DEVICE);
+		kfree(buf);
+	} else {
+		netdev_err(enic->netdev,
+			   "MBOX send timed out (type %u dst %u), disabling channel\n",
+			   msg_type, dst_vnic_id);
+		/*
+		 * The WQ descriptor is still live in hardware. Do not unmap
+		 * or free the buffer: the device may still DMA from dma_addr.
+		 * Mark the channel unusable so no further sends are attempted.
+		 */
+		enic->mbox_send_disabled = true;
+	}
+
+	netdev_dbg(enic->netdev,
+		   "MBOX send msg_type %u dst %u vlan %u err %d\n",
+		   msg_type, dst_vnic_id, vlan_tag, err);
+unlock:
+	mutex_unlock(&enic->mbox_lock);
+	return err;
+}
+
+static void enic_mbox_recv_handler(struct enic *enic, void *buf,
+				   unsigned int len)
+{
+	struct enic_mbox_hdr *hdr = buf;
+
+	if (len < sizeof(*hdr)) {
+		netdev_warn(enic->netdev,
+			    "MBOX: truncated message (len %u < %zu)\n",
+			    len, sizeof(*hdr));
+		return;
+	}
+
+	if (hdr->msg_type >= ENIC_MBOX_MAX) {
+		netdev_warn(enic->netdev, "MBOX: unknown msg type %u\n",
+			    hdr->msg_type);
+		return;
+	}
+
+	netdev_dbg(enic->netdev,
+		   "MBOX recv: type %u from vnic %u len %u\n",
+		   hdr->msg_type, le16_to_cpu(hdr->src_vnic_id),
+		   le16_to_cpu(hdr->msg_len));
+}
+
+void enic_mbox_init(struct enic *enic)
+{
+	enic->mbox_msg_num = 0;
+	mutex_init(&enic->mbox_lock);
+	enic->admin_rq_handler = enic_mbox_recv_handler;
+}
diff --git a/drivers/net/ethernet/cisco/enic/enic_mbox.h b/drivers/net/ethernet/cisco/enic/enic_mbox.h
index 84cb6bbc1ead..554269b78780 100644
--- a/drivers/net/ethernet/cisco/enic/enic_mbox.h
+++ b/drivers/net/ethernet/cisco/enic/enic_mbox.h
@@ -72,4 +72,12 @@ struct enic_mbox_pf_link_state_ack_msg {
 	struct enic_mbox_generic_reply ack;
 };
 
+#define ENIC_MBOX_DST_PF	0xFFFF
+
+struct enic;
+
+void enic_mbox_init(struct enic *enic);
+int enic_mbox_send_msg(struct enic *enic, u8 msg_type, u16 dst_vnic_id,
+		       void *payload, u16 payload_len);
+
 #endif /* _ENIC_MBOX_H_ */

-- 
2.43.0



^ permalink raw reply related

* [PATCH net-next v3 04/10] enic: add admin CQ service with MSI-X interrupt and NAPI polling
From: Satish Kharat via B4 Relay @ 2026-04-08 16:36 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, linux-kernel,
	20260401-enic-sriov-v2-prep-v4-0-d5834b2ef1b9, Satish Kharat
In-Reply-To: <20260408-enic-sriov-v2-admin-channel-v2-v3-0-1d4999a03cec@cisco.com>

From: Satish Kharat <satishkh@cisco.com>

Add completion queue service for the admin channel WQ and RQ, driven
by an MSI-X interrupt and NAPI polling.

The receive pipeline is: MSI-X ISR -> NAPI poll -> RQ CQ service ->
message enqueue -> workqueue handler -> admin_rq_handler callback.
NAPI drains the RQ CQ in softirq context, copying each received
buffer into an enic_admin_msg and appending it to a spinlock-protected
list.  A system workqueue handler then processes each message in
process context where sleeping (mutex, GFP_KERNEL allocations) is
safe.

The WQ CQ service counts transmit completions and is called from the
synchronous MBOX send path.

enic_admin_rq_post_one() and enic_admin_rq_fill() gain a gfp_t
argument: RQ buffer refills issued from NAPI context during CQ
processing use GFP_ATOMIC, since sleeping is not allowed there.

The admin channel open/close paths set up and tear down the MSI-X
interrupt, NAPI instance, and workqueue.  CQ init enables interrupt
delivery and sets the interrupt offset so completions trigger the
admin ISR.

Signed-off-by: Satish Kharat <satishkh@cisco.com>
---
 drivers/net/ethernet/cisco/enic/enic.h       |   8 +
 drivers/net/ethernet/cisco/enic/enic_admin.c | 297 +++++++++++++++++++++++++--
 drivers/net/ethernet/cisco/enic/enic_admin.h |  12 ++
 3 files changed, 295 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/cisco/enic/enic.h b/drivers/net/ethernet/cisco/enic/enic.h
index 08472420f3a1..1c09da3c0b1a 100644
--- a/drivers/net/ethernet/cisco/enic/enic.h
+++ b/drivers/net/ethernet/cisco/enic/enic.h
@@ -296,6 +296,14 @@ struct enic {
 	struct vnic_rq admin_rq;
 	struct vnic_cq admin_cq[2];
 	struct vnic_intr admin_intr;
+	struct napi_struct admin_napi;
+	unsigned int admin_intr_index;
+	struct work_struct admin_msg_work;
+	spinlock_t admin_msg_lock;	/* protects admin_msg_list */
+	struct list_head admin_msg_list;
+	u64 admin_msg_drop_cnt;
+	void (*admin_rq_handler)(struct enic *enic, void *buf,
+				 unsigned int len);
 };
 
 static inline struct net_device *vnic_get_netdev(struct vnic_dev *vdev)
diff --git a/drivers/net/ethernet/cisco/enic/enic_admin.c b/drivers/net/ethernet/cisco/enic/enic_admin.c
index a8fcd5f116d1..345d194c6eeb 100644
--- a/drivers/net/ethernet/cisco/enic/enic_admin.c
+++ b/drivers/net/ethernet/cisco/enic/enic_admin.c
@@ -4,6 +4,7 @@
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
 #include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
 
 #include "vnic_dev.h"
 #include "vnic_wq.h"
@@ -15,6 +16,7 @@
 #include "enic.h"
 #include "enic_admin.h"
 #include "cq_desc.h"
+#include "cq_enet_desc.h"
 #include "wq_enet_desc.h"
 #include "rq_enet_desc.h"
 
@@ -38,14 +40,14 @@ static void enic_admin_rq_buf_clean(struct vnic_rq *rq,
 	buf->os_buf = NULL;
 }
 
-static int enic_admin_rq_post_one(struct enic *enic)
+static int enic_admin_rq_post_one(struct enic *enic, gfp_t gfp)
 {
 	struct vnic_rq *rq = &enic->admin_rq;
 	struct rq_enet_desc *desc;
 	dma_addr_t dma_addr;
 	void *buf;
 
-	buf = kmalloc(ENIC_ADMIN_BUF_SIZE, GFP_KERNEL);
+	buf = kmalloc(ENIC_ADMIN_BUF_SIZE, gfp);
 	if (!buf)
 		return -ENOMEM;
 
@@ -64,13 +66,13 @@ static int enic_admin_rq_post_one(struct enic *enic)
 	return 0;
 }
 
-static int enic_admin_rq_fill(struct enic *enic)
+static int enic_admin_rq_fill(struct enic *enic, gfp_t gfp)
 {
 	struct vnic_rq *rq = &enic->admin_rq;
 	int err;
 
 	while (vnic_rq_desc_avail(rq) > 0) {
-		err = enic_admin_rq_post_one(enic);
+		err = enic_admin_rq_post_one(enic, gfp);
 		if (err)
 			return err;
 	}
@@ -83,6 +85,207 @@ static void enic_admin_rq_drain(struct enic *enic)
 	vnic_rq_clean(&enic->admin_rq, enic_admin_rq_buf_clean);
 }
 
+static unsigned int enic_admin_cq_color(void *cq_desc, unsigned int desc_size)
+{	/* Return the color field stored in the last byte of a CQ descriptor. */
+	u8 type_color = *((u8 *)cq_desc + desc_size - 1);
+
+	return (type_color >> CQ_DESC_COLOR_SHIFT) & CQ_DESC_COLOR_MASK;
+}
+
+unsigned int enic_admin_wq_cq_service(struct enic *enic)
+{	/* Consume every pending admin_cq[0] entry; returns the number consumed. */
+	struct vnic_cq *cq = &enic->admin_cq[0];
+	unsigned int work = 0;
+	void *desc;
+
+	desc = vnic_cq_to_clean(cq);
+	while (enic_admin_cq_color(desc, cq->ring.desc_size) !=
+	       cq->last_color) {
+		/* Ensure color bit is read before descriptor fields */
+		rmb();
+		vnic_cq_inc_to_clean(cq);	/* entry is only counted, not decoded */
+		work++;
+		desc = vnic_cq_to_clean(cq);
+	}
+
+	return work;
+}
+
+static void enic_admin_msg_enqueue(struct enic *enic, void *buf,
+				   unsigned int len)
+{	/* Copy len bytes of buf onto admin_msg_list; count a drop on OOM. */
+	struct enic_admin_msg *msg;
+
+	msg = kmalloc(struct_size(msg, data, len), GFP_ATOMIC);
+	if (!msg) {
+		enic->admin_msg_drop_cnt++;
+		if (net_ratelimit())
+			netdev_warn(enic->netdev,
+				    "admin msg enqueue drop (len=%u drops=%llu)\n",
+				    len, enic->admin_msg_drop_cnt);
+		return;
+	}
+
+	msg->len = len;
+	memcpy(msg->data, buf, len);
+
+	spin_lock(&enic->admin_msg_lock);	/* consumers take the _bh variant */
+	list_add_tail(&msg->list, &enic->admin_msg_list);
+	spin_unlock(&enic->admin_msg_lock);
+}
+
+unsigned int enic_admin_rq_cq_service(struct enic *enic, unsigned int budget)
+{	/* Service up to budget admin_cq[1] entries, then refill the admin RQ. */
+	struct vnic_cq *cq = &enic->admin_cq[1];
+	struct vnic_rq *rq = &enic->admin_rq;
+	struct vnic_rq_buf *buf;
+	unsigned int work = 0;
+	void *desc;
+
+	desc = vnic_cq_to_clean(cq);
+	while (work < budget &&
+	       enic_admin_cq_color(desc, cq->ring.desc_size) !=
+	       cq->last_color) {
+		/* Ensure CQ descriptor fields are read after
+		 * the color/valid check.
+		 */
+		rmb();
+		buf = rq->to_clean;
+
+		dma_sync_single_for_cpu(&enic->pdev->dev,
+					buf->dma_addr, buf->len,
+					DMA_FROM_DEVICE);
+		/* NOTE(review): buf->len looks like the buffer size, not bytes received */
+		enic_admin_msg_enqueue(enic, buf->os_buf, buf->len);
+
+		enic_admin_rq_buf_clean(rq, rq->to_clean);
+		rq->to_clean = rq->to_clean->next;
+		rq->ring.desc_avail++;
+
+		vnic_cq_inc_to_clean(cq);
+		work++;
+		desc = vnic_cq_to_clean(cq);
+	}
+	/* Best-effort refill: the result of enic_admin_rq_fill() is ignored */
+	enic_admin_rq_fill(enic, GFP_ATOMIC);
+
+	return work;
+}
+
+static irqreturn_t enic_admin_isr_msix(int irq, void *data)
+{	/* MSI-X handler: data is the napi_struct to schedule. */
+	struct napi_struct *napi = data;
+
+	napi_schedule_irqoff(napi);
+
+	return IRQ_HANDLED;
+}
+
+static void enic_admin_msg_work_handler(struct work_struct *work)
+{	/* Hand each queued admin message to admin_rq_handler, then free it. */
+	struct enic *enic = container_of(work, struct enic, admin_msg_work);
+	struct enic_admin_msg *msg, *tmp;
+	LIST_HEAD(local_list);
+
+	spin_lock_bh(&enic->admin_msg_lock);
+	list_splice_init(&enic->admin_msg_list, &local_list);
+	spin_unlock_bh(&enic->admin_msg_lock);
+	/* The batch is now private, so the handler runs without the lock held */
+	list_for_each_entry_safe(msg, tmp, &local_list, list) {
+		if (enic->admin_rq_handler)
+			enic->admin_rq_handler(enic, msg->data, msg->len);
+		list_del(&msg->list);	/* freed even when no handler is set */
+		kfree(msg);
+	}
+}
+
+static int enic_admin_napi_poll(struct napi_struct *napi, int budget)
+{	/* NAPI poll: service RQ completions, kick the work item, return credits. */
+	struct enic *enic = container_of(napi, struct enic, admin_napi);
+	unsigned int credits;
+	unsigned int rq_work;
+
+	credits = vnic_intr_credits(&enic->admin_intr);
+	/* Only admin_cq[1] (the RQ side) is serviced from this poll routine */
+	rq_work = enic_admin_rq_cq_service(enic, budget);
+
+	if (rq_work > 0)
+		schedule_work(&enic->admin_msg_work);
+	/* Unmask only when under budget and napi_complete_done() succeeded */
+	if (rq_work < budget && napi_complete_done(napi, rq_work)) {
+		if (credits)
+			vnic_intr_return_credits(&enic->admin_intr, credits,
+						 1 /* unmask */, 0);
+	} else {
+		if (credits)
+			vnic_intr_return_credits(&enic->admin_intr, credits,
+						 0 /* don't unmask */, 0);
+	}
+
+	return rq_work;
+}
+
+/* Set up the admin MSI-X interrupt and its NAPI; returns 0 or -errno. */
+static int enic_admin_setup_intr(struct enic *enic)
+{
+	unsigned int intr_index = enic->intr_count;
+	int err;
+
+	if (vnic_dev_get_intr_mode(enic->vdev) != VNIC_DEV_INTR_MODE_MSIX ||
+	    intr_index >= enic->intr_avail)
+		return -ENODEV;
+
+	err = vnic_intr_alloc(enic->vdev, &enic->admin_intr, intr_index);
+	if (err) {
+		netdev_warn(enic->netdev,
+			    "Failed to alloc admin intr at index %u: %d\n",
+			    intr_index, err);
+		return err;
+	}
+
+	enic->admin_intr_index = intr_index;
+	snprintf(enic->msix[intr_index].devname,
+		 sizeof(enic->msix[intr_index].devname),
+		 "%s-admin", enic->netdev->name);
+	enic->msix[intr_index].isr = enic_admin_isr_msix;
+	enic->msix[intr_index].devid = &enic->admin_napi;
+
+	/* Init NAPI before the ISR can run; the ISR schedules admin_napi */
+	netif_napi_add(enic->netdev, &enic->admin_napi, enic_admin_napi_poll);
+	err = request_irq(enic->msix_entry[intr_index].vector,
+			  enic->msix[intr_index].isr, 0,
+			  enic->msix[intr_index].devname,
+			  enic->msix[intr_index].devid);
+	if (err) {
+		netdev_warn(enic->netdev,
+			    "Failed to request admin MSI-X irq: %d\n", err);
+		netif_napi_del(&enic->admin_napi);
+		vnic_intr_free(&enic->admin_intr);
+		return err;
+	}
+
+	enic->msix[intr_index].requested = 1;
+	napi_enable(&enic->admin_napi);
+
+	netdev_dbg(enic->netdev,
+		   "admin channel using MSI-X interrupt (index %u)\n",
+		   intr_index);
+
+	return 0;
+}
+
+static void enic_admin_teardown_intr(struct enic *enic)
+{	/* Disable and delete the admin NAPI, then free its MSI-X IRQ. */
+	unsigned int intr_index = enic->admin_intr_index;
+
+	napi_disable(&enic->admin_napi);
+	netif_napi_del(&enic->admin_napi);
+	/* NOTE(review): admin_intr is not released here; confirm a caller does */
+	free_irq(enic->msix_entry[intr_index].vector,
+		 enic->msix[intr_index].devid);
+	enic->msix[intr_index].requested = 0;
+}
+
 static int enic_admin_qp_type_set(struct enic *enic, u32 enable)
 {
 	u64 a0 = QP_TYPE_ADMIN, a1 = enable;
@@ -128,23 +331,8 @@ static int enic_admin_alloc_resources(struct enic *enic)
 	if (err)
 		goto free_cq0;
 
-	/* PFs have dedicated SRIOV_INTR resources for admin channel.
-	 * VFs lack SRIOV_INTR; use a regular INTR_CTRL slot instead.
-	 */
-	if (vnic_dev_get_res_count(enic->vdev, RES_TYPE_SRIOV_INTR) >= 1)
-		err = vnic_intr_alloc_with_type(enic->vdev,
-						&enic->admin_intr, 0,
-						RES_TYPE_SRIOV_INTR);
-	else
-		err = vnic_intr_alloc(enic->vdev, &enic->admin_intr,
-				      enic->intr_count);
-	if (err)
-		goto free_cq1;
-
 	return 0;
 
-free_cq1:
-	vnic_cq_free(&enic->admin_cq[1]);
 free_cq0:
 	vnic_cq_free(&enic->admin_cq[0]);
 free_rq:
@@ -165,10 +353,32 @@ static void enic_admin_free_resources(struct enic *enic)
 
 static void enic_admin_init_resources(struct enic *enic)
 {
+	unsigned int intr_offset = enic->admin_intr_index;
+
 	vnic_wq_init(&enic->admin_wq, 0, 0, 0);
 	vnic_rq_init(&enic->admin_rq, 1, 0, 0);
-	vnic_cq_init(&enic->admin_cq[0], 0, 1, 0, 0, 1, 0, 1, 0, 0, 0);
-	vnic_cq_init(&enic->admin_cq[1], 0, 1, 0, 0, 1, 0, 1, 0, 0, 0);
+	vnic_cq_init(&enic->admin_cq[0],
+		     0 /* flow_control_enable */,
+		     1 /* color_enable */,
+		     0 /* cq_head */,
+		     0 /* cq_tail */,
+		     1 /* cq_tail_color */,
+		     1 /* interrupt_enable */,
+		     1 /* cq_entry_enable */,
+		     0 /* cq_message_enable */,
+		     intr_offset,
+		     0 /* cq_message_addr */);
+	vnic_cq_init(&enic->admin_cq[1],
+		     0 /* flow_control_enable */,
+		     1 /* color_enable */,
+		     0 /* cq_head */,
+		     0 /* cq_tail */,
+		     1 /* cq_tail_color */,
+		     1 /* interrupt_enable */,
+		     1 /* cq_entry_enable */,
+		     0 /* cq_message_enable */,
+		     intr_offset,
+		     0 /* cq_message_addr */);
 	vnic_intr_init(&enic->admin_intr, 0, 0, 1);
 }
 
@@ -187,12 +397,24 @@ int enic_admin_channel_open(struct enic *enic)
 		return err;
 	}
 
+	err = enic_admin_setup_intr(enic);
+	if (err) {
+		netdev_err(enic->netdev,
+			   "Admin channel requires MSI-X, SR-IOV unavailable: %d\n",
+			   err);
+		goto free_resources;
+	}
+
+	spin_lock_init(&enic->admin_msg_lock);
+	INIT_LIST_HEAD(&enic->admin_msg_list);
+	INIT_WORK(&enic->admin_msg_work, enic_admin_msg_work_handler);
+
 	enic_admin_init_resources(enic);
 
 	vnic_wq_enable(&enic->admin_wq);
 	vnic_rq_enable(&enic->admin_rq);
 
-	err = enic_admin_rq_fill(enic);
+	err = enic_admin_rq_fill(enic, GFP_KERNEL);
 	if (err) {
 		netdev_err(enic->netdev,
 			   "Failed to fill admin RQ buffers: %d\n", err);
@@ -206,22 +428,53 @@ int enic_admin_channel_open(struct enic *enic)
 		goto disable_queues;
 	}
 
+	vnic_intr_unmask(&enic->admin_intr);
+
+	netdev_dbg(enic->netdev,
+		   "admin channel open: intr=%u wq_avail=%u rq_avail=%u cq0_color=%u cq1_color=%u\n",
+		   enic->admin_intr_index,
+		   vnic_wq_desc_avail(&enic->admin_wq),
+		   vnic_rq_desc_avail(&enic->admin_rq),
+		   enic->admin_cq[0].last_color,
+		   enic->admin_cq[1].last_color);
+
 	return 0;
 
 disable_queues:
+	enic_admin_teardown_intr(enic);
 	vnic_wq_disable(&enic->admin_wq);
 	vnic_rq_disable(&enic->admin_rq);
 	enic_admin_qp_type_set(enic, 0);
 	enic_admin_rq_drain(enic);
+free_resources:
 	enic_admin_free_resources(enic);
 	return err;
 }
 
+static void enic_admin_msg_drain(struct enic *enic)
+{	/* Unlink and free every message still queued on admin_msg_list. */
+	struct enic_admin_msg *msg, *tmp;
+
+	spin_lock_bh(&enic->admin_msg_lock);
+	list_for_each_entry_safe(msg, tmp, &enic->admin_msg_list, list) {
+		list_del(&msg->list);
+		kfree(msg);
+	}
+	spin_unlock_bh(&enic->admin_msg_lock);
+}
+
 void enic_admin_channel_close(struct enic *enic)
 {
 	if (!enic->has_admin_channel)
 		return;
 
+	netdev_dbg(enic->netdev, "admin channel close\n");
+
+	vnic_intr_mask(&enic->admin_intr);
+	enic_admin_teardown_intr(enic);
+	cancel_work_sync(&enic->admin_msg_work);
+	enic_admin_msg_drain(enic);
+
 	vnic_wq_disable(&enic->admin_wq);
 	vnic_rq_disable(&enic->admin_rq);
 
diff --git a/drivers/net/ethernet/cisco/enic/enic_admin.h b/drivers/net/ethernet/cisco/enic/enic_admin.h
index 569aadeb9312..73cdd3dac7ec 100644
--- a/drivers/net/ethernet/cisco/enic/enic_admin.h
+++ b/drivers/net/ethernet/cisco/enic/enic_admin.h
@@ -9,7 +9,19 @@
 
 struct enic;
 
+/* Wrapper for received admin messages queued for deferred processing.
+ * NAPI enqueues these; a workqueue handler processes them in process context
+ * where sleeping (mutex, GFP_KERNEL) is safe.
+ */
+struct enic_admin_msg {
+	struct list_head list;	/* entry on enic->admin_msg_list */
+	unsigned int len;	/* number of bytes stored in data[] */
+	u8 data[];		/* copy of the received buffer contents */
+};
+
 int enic_admin_channel_open(struct enic *enic);
 void enic_admin_channel_close(struct enic *enic);
+unsigned int enic_admin_wq_cq_service(struct enic *enic);
+unsigned int enic_admin_rq_cq_service(struct enic *enic, unsigned int budget);
 
 #endif /* _ENIC_ADMIN_H_ */

-- 
2.43.0



^ permalink raw reply related

* [PATCH net-next v3 08/10] enic: add MBOX VF handlers for capability, register and link state
From: Satish Kharat via B4 Relay @ 2026-04-08 16:36 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, linux-kernel,
	20260401-enic-sriov-v2-prep-v4-0-d5834b2ef1b9, Satish Kharat
In-Reply-To: <20260408-enic-sriov-v2-admin-channel-v2-v3-0-1d4999a03cec@cisco.com>

From: Satish Kharat <satishkh@cisco.com>

Implement VF-side mailbox message processing for SR-IOV V2
admin channel communication.

VF receive handlers:
  - VF_CAPABILITY_REPLY: store PF protocol version, signal
    completion
  - VF_REGISTER_REPLY: mark VF as registered, signal completion
  - VF_UNREGISTER_REPLY: mark VF as unregistered, signal
    completion
  - PF_LINK_STATE_NOTIF: update carrier state via
    netif_carrier_on/off, send ACK back to PF

VF initiation functions for the probe-time handshake:
  - enic_mbox_vf_capability_check: send capability request,
    wait for PF reply via completion
  - enic_mbox_vf_register: send register request, wait for
    PF confirmation via completion
  - enic_mbox_vf_unregister: send unregister request, wait
    for PF confirmation

The wait helper (enic_mbox_wait_reply) uses
wait_for_completion_timeout, signaled when the admin ISR/NAPI/
workqueue pipeline delivers the reply message.

Signed-off-by: Satish Kharat <satishkh@cisco.com>
---
 drivers/net/ethernet/cisco/enic/enic.h      |   9 +-
 drivers/net/ethernet/cisco/enic/enic_mbox.c | 220 ++++++++++++++++++++++++++++
 drivers/net/ethernet/cisco/enic/enic_mbox.h |   3 +
 3 files changed, 231 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cisco/enic/enic.h b/drivers/net/ethernet/cisco/enic/enic.h
index 9b1fa3857df5..29ce26284493 100644
--- a/drivers/net/ethernet/cisco/enic/enic.h
+++ b/drivers/net/ethernet/cisco/enic/enic.h
@@ -258,6 +258,8 @@ struct enic {
 	u32 tx_coalesce_usecs;
 	u16 num_vfs;
 	enum enic_vf_type vf_type;
+	bool vf_registered;
+	u32 pf_cap_version;
 	unsigned int enable_count;
 	spinlock_t enic_api_lock;
 	bool enic_api_busy;
@@ -305,9 +307,14 @@ struct enic {
 	void (*admin_rq_handler)(struct enic *enic, void *buf,
 				 unsigned int len);
 
-	/* MBOX protocol state */
+	/* MBOX protocol state -- single-flight: on the VF, all callers
+	 * that wait on mbox_comp run under RTNL or during probe/remove,
+	 * so only one completion is outstanding at a time. mbox_lock
+	 * protects the shared admin WQ from concurrent senders.
+	 */
 	struct mutex mbox_lock;
 	u64 mbox_msg_num;
+	struct completion mbox_comp;
 
 	/* PF: per-VF MBOX state, allocated when SRIOV V2 is enabled */
 	struct enic_vf_state {
diff --git a/drivers/net/ethernet/cisco/enic/enic_mbox.c b/drivers/net/ethernet/cisco/enic/enic_mbox.c
index eb5049b538b1..0db05b124557 100644
--- a/drivers/net/ethernet/cisco/enic/enic_mbox.c
+++ b/drivers/net/ethernet/cisco/enic/enic_mbox.c
@@ -5,6 +5,7 @@
 #include <linux/netdevice.h>
 #include <linux/dma-mapping.h>
 #include <linux/delay.h>
+#include <linux/completion.h>
 
 #include "vnic_dev.h"
 #include "vnic_wq.h"
@@ -124,6 +125,16 @@ int enic_mbox_send_msg(struct enic *enic, u8 msg_type, u16 dst_vnic_id,
 	return err;
 }
 
+static int enic_mbox_wait_reply(struct enic *enic, unsigned long timeout_ms)
+{	/* Wait up to timeout_ms on mbox_comp; 0 if completed, else -ETIMEDOUT. */
+	unsigned long left;
+
+	left = wait_for_completion_timeout(&enic->mbox_comp,
+					   msecs_to_jiffies(timeout_ms));
+
+	return left ? 0 : -ETIMEDOUT;
+}
+
 int enic_mbox_send_link_state(struct enic *enic, u16 vf_id, u32 link_state)
 {
 	struct enic_mbox_pf_link_state_notif_msg notif = {};
@@ -280,6 +291,136 @@ static void enic_mbox_pf_process_msg(struct enic *enic,
 			    hdr->msg_type, vf_id, err);
 }
 
+static void enic_mbox_vf_handle_capability_reply(struct enic *enic,
+						 void *payload)
+{	/* Store the PF's version when ret_major is 0, then wake the waiter. */
+	struct enic_mbox_vf_capability_reply_msg *reply = payload;
+
+	if (le16_to_cpu(reply->reply.ret_major) == 0)
+		enic->pf_cap_version = le32_to_cpu(reply->version);
+	complete(&enic->mbox_comp);	/* signalled on error replies too */
+}
+
+static void enic_mbox_vf_handle_register_reply(struct enic *enic,
+					       void *payload)
+{	/* Set vf_registered unless the PF reports an error; wake the waiter. */
+	struct enic_mbox_vf_register_reply_msg *reply = payload;
+
+	if (le16_to_cpu(reply->reply.ret_major)) {
+		netdev_warn(enic->netdev,
+			    "MBOX: VF register rejected by PF: %u/%u\n",
+			    le16_to_cpu(reply->reply.ret_major),
+			    le16_to_cpu(reply->reply.ret_minor));
+	} else {
+		enic->vf_registered = true;
+	}
+	complete(&enic->mbox_comp);	/* signalled on rejection too */
+}
+
+static void enic_mbox_vf_handle_unregister_reply(struct enic *enic,
+						 void *payload)
+{	/* Clear vf_registered unless the PF reports an error; wake the waiter. */
+	struct enic_mbox_vf_register_reply_msg *reply = payload;
+
+	if (le16_to_cpu(reply->reply.ret_major)) {
+		netdev_warn(enic->netdev,
+			    "MBOX: VF unregister rejected by PF: %u/%u\n",
+			    le16_to_cpu(reply->reply.ret_major),
+			    le16_to_cpu(reply->reply.ret_minor));
+	} else {
+		enic->vf_registered = false;
+	}
+	complete(&enic->mbox_comp);	/* signalled on rejection too */
+}
+
+static void enic_mbox_vf_handle_link_state(struct enic *enic, void *payload)
+{	/* Apply a PF link-state notification to the carrier, then ACK it. */
+	struct enic_mbox_pf_link_state_notif_msg *notif = payload;
+	struct enic_mbox_pf_link_state_ack_msg ack = {};
+
+	switch (le32_to_cpu(notif->link_state)) {
+	case ENIC_MBOX_LINK_STATE_ENABLE:
+		if (!netif_carrier_ok(enic->netdev))
+			netif_carrier_on(enic->netdev);
+		netdev_dbg(enic->netdev, "MBOX: link state -> UP\n");
+		break;
+	case ENIC_MBOX_LINK_STATE_DISABLE:
+		if (netif_carrier_ok(enic->netdev))
+			netif_carrier_off(enic->netdev);
+		netdev_dbg(enic->netdev, "MBOX: link state -> DOWN\n");
+		break;
+	default:
+		netdev_warn(enic->netdev, "MBOX: unknown link state %u\n",
+			    le32_to_cpu(notif->link_state));
+		ack.ack.ret_major = cpu_to_le16(ENIC_MBOX_ERR_GENERIC);
+		break;
+	}
+	/* The ACK goes out in every case; the send result is not checked */
+	enic_mbox_send_msg(enic, ENIC_MBOX_PF_LINK_STATE_ACK, ENIC_MBOX_DST_PF,
+			   &ack, sizeof(ack));
+}
+
+static bool enic_mbox_vf_payload_ok(struct enic *enic, u8 msg_type,
+				    u16 payload_len, size_t min_len)
+{	/* True when payload_len >= min_len; otherwise warn and return false. */
+	if (payload_len < min_len) {
+		netdev_warn(enic->netdev,
+			    "MBOX: short payload for type %u (%u < %zu)\n",
+			    msg_type, payload_len, min_len);
+		return false;
+	}
+	return true;
+}
+
+static void enic_mbox_vf_process_msg(struct enic *enic,
+				     struct enic_mbox_hdr *hdr, void *payload,
+				     u16 payload_len)
+{	/* Dispatch a VF-side message by type after a minimum-length check. */
+	switch (hdr->msg_type) {
+	case ENIC_MBOX_VF_CAPABILITY_REPLY: {
+		size_t exp = sizeof(struct enic_mbox_vf_capability_reply_msg);
+
+		if (!enic_mbox_vf_payload_ok(enic, hdr->msg_type,
+					     payload_len, exp))
+			return;	/* short payload: message is dropped */
+		enic_mbox_vf_handle_capability_reply(enic, payload);
+		break;
+	}
+	case ENIC_MBOX_VF_REGISTER_REPLY: {
+		size_t exp = sizeof(struct enic_mbox_vf_register_reply_msg);
+
+		if (!enic_mbox_vf_payload_ok(enic, hdr->msg_type,
+					     payload_len, exp))
+			return;
+		enic_mbox_vf_handle_register_reply(enic, payload);
+		break;
+	}
+	case ENIC_MBOX_VF_UNREGISTER_REPLY: {
+		size_t exp = sizeof(struct enic_mbox_vf_register_reply_msg);
+
+		if (!enic_mbox_vf_payload_ok(enic, hdr->msg_type,
+					     payload_len, exp))
+			return;
+		enic_mbox_vf_handle_unregister_reply(enic, payload);
+		break;
+	}
+	case ENIC_MBOX_PF_LINK_STATE_NOTIF: {
+		size_t exp = sizeof(struct enic_mbox_pf_link_state_notif_msg);
+
+		if (!enic_mbox_vf_payload_ok(enic, hdr->msg_type,
+					     payload_len, exp))
+			return;
+		enic_mbox_vf_handle_link_state(enic, payload);
+		break;
+	}
+	default:	/* unknown types are only logged at debug level */
+		netdev_dbg(enic->netdev,
+			   "MBOX: VF unhandled msg type %u\n",
+			   hdr->msg_type);
+		break;
+	}
+}
+
 static void enic_mbox_recv_handler(struct enic *enic, void *buf,
 				   unsigned int len)
 {
@@ -316,11 +457,90 @@ static void enic_mbox_recv_handler(struct enic *enic, void *buf,
 
 	if (enic->vf_state)
 		enic_mbox_pf_process_msg(enic, hdr, payload);
+	else
+		enic_mbox_vf_process_msg(enic, hdr, payload,
+					 msg_len - (u16)sizeof(*hdr));
+}
+
+int enic_mbox_vf_capability_check(struct enic *enic)
+{	/* Send a capability request and wait up to 3000 ms for the PF's reply. */
+	struct enic_mbox_vf_capability_msg req = {};
+	int err;
+
+	enic->pf_cap_version = 0;
+	reinit_completion(&enic->mbox_comp);
+	req.version = cpu_to_le32(ENIC_MBOX_CAP_VERSION_1);
+
+	err = enic_mbox_send_msg(enic, ENIC_MBOX_VF_CAPABILITY_REQUEST,
+				 ENIC_MBOX_DST_PF, &req, sizeof(req));
+	if (err)
+		return err;
+
+	err = enic_mbox_wait_reply(enic, 3000);
+	if (err) {
+		netdev_warn(enic->netdev,
+			    "MBOX: no capability reply from PF\n");
+		return err;
+	}
+	/* pf_cap_version is still 0 if the reply carried a non-zero ret_major */
+	if (enic->pf_cap_version < ENIC_MBOX_CAP_VERSION_1) {
+		netdev_warn(enic->netdev,
+			    "MBOX: PF version %u too old\n",
+			    enic->pf_cap_version);
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+int enic_mbox_vf_register(struct enic *enic)
+{	/* Send a register request and wait up to 3000 ms for the PF's reply. */
+	int err;
+
+	enic->vf_registered = false;
+	reinit_completion(&enic->mbox_comp);
+
+	err = enic_mbox_send_msg(enic, ENIC_MBOX_VF_REGISTER_REQUEST,
+				 ENIC_MBOX_DST_PF, NULL, 0);
+	if (err)
+		return err;
+
+	err = enic_mbox_wait_reply(enic, 3000);
+	if (err) {
+		netdev_warn(enic->netdev,
+			    "MBOX: VF registration with PF timed out\n");
+		return err;
+	}
+	/* A reply arrived but the reply handler left the flag clear */
+	if (!enic->vf_registered)
+		return -ENODEV;
+
+	return 0;
+}
+
+/* Ask the PF to drop this VF's registration; returns 0 or -errno. */
+int enic_mbox_vf_unregister(struct enic *enic)
+{
+	int err;
+
+	if (!enic->vf_registered)
+		return 0;
+
+	reinit_completion(&enic->mbox_comp);
+	err = enic_mbox_send_msg(enic, ENIC_MBOX_VF_UNREGISTER_REQUEST,
+				 ENIC_MBOX_DST_PF, NULL, 0);
+	if (!err)
+		err = enic_mbox_wait_reply(enic, 3000);
+	if (err)
+		return err;
+	/* Replied, but the reply handler left the flag set: PF refused */
+	return enic->vf_registered ? -EIO : 0;
+}
 
 void enic_mbox_init(struct enic *enic)
 {
 	enic->mbox_msg_num = 0;
 	mutex_init(&enic->mbox_lock);
+	init_completion(&enic->mbox_comp);
 	enic->admin_rq_handler = enic_mbox_recv_handler;
 }
diff --git a/drivers/net/ethernet/cisco/enic/enic_mbox.h b/drivers/net/ethernet/cisco/enic/enic_mbox.h
index a6f6798d14f4..fa2fb08bf7d0 100644
--- a/drivers/net/ethernet/cisco/enic/enic_mbox.h
+++ b/drivers/net/ethernet/cisco/enic/enic_mbox.h
@@ -80,5 +80,8 @@ void enic_mbox_init(struct enic *enic);
 int enic_mbox_send_msg(struct enic *enic, u8 msg_type, u16 dst_vnic_id,
 		       void *payload, u16 payload_len);
 int enic_mbox_send_link_state(struct enic *enic, u16 vf_id, u32 link_state);
+int enic_mbox_vf_capability_check(struct enic *enic);
+int enic_mbox_vf_register(struct enic *enic);
+int enic_mbox_vf_unregister(struct enic *enic);
 
 #endif /* _ENIC_MBOX_H_ */

-- 
2.43.0



^ permalink raw reply related

* [PATCH net-next v3 07/10] enic: add MBOX PF handlers for VF register and capability
From: Satish Kharat via B4 Relay @ 2026-04-08 16:36 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, linux-kernel,
	20260401-enic-sriov-v2-prep-v4-0-d5834b2ef1b9, Satish Kharat
In-Reply-To: <20260408-enic-sriov-v2-admin-channel-v2-v3-0-1d4999a03cec@cisco.com>

From: Satish Kharat <satishkh@cisco.com>

Implement PF-side mailbox message processing for SR-IOV V2
admin channel communication.

When the PF receives messages from VFs, the dispatch routes
them to type-specific handlers:
  - VF_CAPABILITY_REQUEST: reply with protocol version 1
  - VF_REGISTER_REQUEST: mark VF registered, reply, then
    send PF_LINK_STATE_NOTIF with link enabled
  - VF_UNREGISTER_REQUEST: mark VF unregistered, send reply
  - PF_LINK_STATE_ACK: log errors from VF acknowledgment

Per-VF state (struct enic_vf_state) is tracked via enic->vf_state
which will be allocated when SRIOV V2 is enabled.

Remove the CONFIG_PCI_IOV guard from num_vfs in struct enic. The
PF handlers reference enic->num_vfs for VF ID bounds checking in
enic_mbox.c, which is compiled unconditionally. The field must be
visible regardless of CONFIG_PCI_IOV to avoid build failures.

Add enic_mbox_send_link_state() helper for PF-initiated link
state notifications, also used later by ndo_set_vf_link_state.

Signed-off-by: Satish Kharat <satishkh@cisco.com>
---
 drivers/net/ethernet/cisco/enic/enic.h      |   7 +-
 drivers/net/ethernet/cisco/enic/enic_mbox.c | 174 +++++++++++++++++++++++++++-
 drivers/net/ethernet/cisco/enic/enic_mbox.h |   1 +
 3 files changed, 178 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/cisco/enic/enic.h b/drivers/net/ethernet/cisco/enic/enic.h
index 42f345aceced..9b1fa3857df5 100644
--- a/drivers/net/ethernet/cisco/enic/enic.h
+++ b/drivers/net/ethernet/cisco/enic/enic.h
@@ -256,9 +256,7 @@ struct enic {
 	struct enic_rx_coal rx_coalesce_setting;
 	u32 rx_coalesce_usecs;
 	u32 tx_coalesce_usecs;
-#ifdef CONFIG_PCI_IOV
 	u16 num_vfs;
-#endif
 	enum enic_vf_type vf_type;
 	unsigned int enable_count;
 	spinlock_t enic_api_lock;
@@ -310,6 +308,11 @@ struct enic {
 	/* MBOX protocol state */
 	struct mutex mbox_lock;
 	u64 mbox_msg_num;
+
+	/* PF: per-VF MBOX state, allocated when SRIOV V2 is enabled */
+	struct enic_vf_state {
+		bool registered;
+	} *vf_state;
 };
 
 static inline struct net_device *vnic_get_netdev(struct vnic_dev *vdev)
diff --git a/drivers/net/ethernet/cisco/enic/enic_mbox.c b/drivers/net/ethernet/cisco/enic/enic_mbox.c
index 00ab76a47a35..eb5049b538b1 100644
--- a/drivers/net/ethernet/cisco/enic/enic_mbox.c
+++ b/drivers/net/ethernet/cisco/enic/enic_mbox.c
@@ -124,10 +124,168 @@ int enic_mbox_send_msg(struct enic *enic, u8 msg_type, u16 dst_vnic_id,
 	return err;
 }
 
+int enic_mbox_send_link_state(struct enic *enic, u16 vf_id, u32 link_state)
+{	/* Send link_state to VF vf_id if it is registered; otherwise return 0. */
+	struct enic_mbox_pf_link_state_notif_msg notif = {};
+
+	if (!enic->vf_state || vf_id >= enic->num_vfs ||
+	    !enic->vf_state[vf_id].registered) {
+		netdev_dbg(enic->netdev,
+			   "MBOX: skip link state to unregistered VF %u\n",
+			   vf_id);
+		return 0;
+	}
+
+	notif.link_state = cpu_to_le32(link_state);
+	return enic_mbox_send_msg(enic, ENIC_MBOX_PF_LINK_STATE_NOTIF, vf_id,
+				  &notif, sizeof(notif));
+}
+
+static int enic_mbox_pf_handle_capability(struct enic *enic, void *msg,
+					  u16 vf_id, u64 msg_num)
+{	/* Answer with ENIC_MBOX_CAP_VERSION_1; msg and msg_num are unused. */
+	struct enic_mbox_vf_capability_reply_msg reply = {};
+
+	reply.reply.ret_major = cpu_to_le16(0);
+	reply.version = cpu_to_le32(ENIC_MBOX_CAP_VERSION_1);
+
+	return enic_mbox_send_msg(enic, ENIC_MBOX_VF_CAPABILITY_REPLY, vf_id,
+				  &reply, sizeof(reply));
+}
+
+static int enic_mbox_pf_handle_register(struct enic *enic, void *msg,
+					u16 vf_id, u64 msg_num)
+{	/* Accept a VF registration, reply, then push the initial link state. */
+	struct enic_mbox_vf_register_reply_msg reply = {};
+	int err;
+
+	if (!enic->vf_state || vf_id >= enic->num_vfs) {
+		netdev_warn(enic->netdev,
+			    "MBOX: register from invalid VF %u\n", vf_id);
+		return -EINVAL;
+	}
+
+	/* VF re-registering (e.g. guest reboot without clean unregister):
+	 * mark the previous registration inactive before accepting the new one.
+	 */
+	if (enic->vf_state[vf_id].registered) {
+		netdev_dbg(enic->netdev,
+			   "MBOX: VF %u re-register, cleaning previous state\n",
+			   vf_id);
+		enic->vf_state[vf_id].registered = false;
+	}
+
+	reply.reply.ret_major = cpu_to_le16(0);
+	err = enic_mbox_send_msg(enic, ENIC_MBOX_VF_REGISTER_REPLY, vf_id,
+				 &reply, sizeof(reply));
+	if (err)
+		return err;
+	/* Marked registered only after the reply was sent without error */
+	enic->vf_state[vf_id].registered = true;
+	netdev_info(enic->netdev, "VF %u registered via MBOX\n", vf_id);
+
+	err = enic_mbox_send_link_state(enic, vf_id,
+					ENIC_MBOX_LINK_STATE_ENABLE);
+	if (err)
+		netdev_warn(enic->netdev,
+			    "VF %u: failed to send initial link state: %d\n",
+			    vf_id, err);
+	/* Registration succeeded; link state will be (re-)sent on next
+	 * enic_link_check() event.
+	 */
+	return 0;
+}
+
+static int enic_mbox_pf_handle_unregister(struct enic *enic, void *msg,
+					  u16 vf_id, u64 msg_num)
+{	/* Reply to a VF unregister request and clear its registered flag. */
+	struct enic_mbox_vf_register_reply_msg reply = {};
+	int err;
+
+	if (!enic->vf_state || vf_id >= enic->num_vfs) {
+		netdev_warn(enic->netdev,
+			    "MBOX: unregister from invalid VF %u\n", vf_id);
+		return -EINVAL;
+	}
+
+	reply.reply.ret_major = cpu_to_le16(0);
+	err = enic_mbox_send_msg(enic, ENIC_MBOX_VF_UNREGISTER_REPLY, vf_id,
+				 &reply, sizeof(reply));
+	if (err)
+		return err;
+	/* Reached only if the reply was sent; on error the flag is unchanged */
+	enic->vf_state[vf_id].registered = false;
+
+	netdev_info(enic->netdev, "VF %u unregistered via MBOX\n", vf_id);
+
+	return 0;
+}
+
+static void enic_mbox_pf_process_msg(struct enic *enic,
+				     struct enic_mbox_hdr *hdr, void *payload)
+{	/* Validate the source VF and dispatch a PF-side message by type. */
+	u16 vf_id = le16_to_cpu(hdr->src_vnic_id);
+	u16 msg_len = le16_to_cpu(hdr->msg_len);
+	int err = 0;
+
+	if (!enic->vf_state) {
+		netdev_dbg(enic->netdev,
+			   "MBOX: PF received msg but SRIOV not active\n");
+		return;
+	}
+
+	if (vf_id >= enic->num_vfs) {
+		netdev_warn(enic->netdev,
+			    "MBOX: PF received msg from invalid VF %u\n",
+			    vf_id);
+		return;
+	}
+
+	switch (hdr->msg_type) {
+	case ENIC_MBOX_VF_CAPABILITY_REQUEST:
+		err = enic_mbox_pf_handle_capability(enic, payload, vf_id,
+						     le64_to_cpu(hdr->msg_num));
+		break;
+	case ENIC_MBOX_VF_REGISTER_REQUEST:
+		err = enic_mbox_pf_handle_register(enic, payload, vf_id,
+						   le64_to_cpu(hdr->msg_num));
+		break;
+	case ENIC_MBOX_VF_UNREGISTER_REQUEST:
+		err = enic_mbox_pf_handle_unregister(enic, payload, vf_id,
+						     le64_to_cpu(hdr->msg_num));
+		break;
+	case ENIC_MBOX_PF_LINK_STATE_ACK: {
+		struct enic_mbox_pf_link_state_ack_msg *ack = payload;
+
+		if (msg_len < sizeof(*hdr) + sizeof(*ack))
+			break;	/* truncated ACK: ignored without a log */
+		if (le16_to_cpu(ack->ack.ret_major))
+			netdev_warn(enic->netdev,
+				    "MBOX: VF %u link state ACK error %u/%u\n",
+				    vf_id, le16_to_cpu(ack->ack.ret_major),
+				    le16_to_cpu(ack->ack.ret_minor));
+		break;
+	}
+	default:
+		netdev_dbg(enic->netdev,
+			   "MBOX: PF unhandled msg type %u from VF %u\n",
+			   hdr->msg_type, vf_id);
+		err = -EOPNOTSUPP;
+		break;
+	}
+	/* One warning for any failed handler, including unknown types */
+	if (err)
+		netdev_warn(enic->netdev,
+			    "MBOX: PF handler for msg type %u from VF %u failed: %d\n",
+			    hdr->msg_type, vf_id, err);
+}
+
 static void enic_mbox_recv_handler(struct enic *enic, void *buf,
 				   unsigned int len)
 {
 	struct enic_mbox_hdr *hdr = buf;
+	void *payload;
+	u16 msg_len;
 
 	if (len < sizeof(*hdr)) {
 		netdev_warn(enic->netdev,
@@ -142,10 +300,22 @@ static void enic_mbox_recv_handler(struct enic *enic, void *buf,
 		return;
 	}
 
+	msg_len = le16_to_cpu(hdr->msg_len);
+	if (msg_len < sizeof(*hdr) || msg_len > len) {
+		netdev_warn(enic->netdev,
+			    "MBOX: invalid msg_len %u (buf len %u)\n",
+			    msg_len, len);
+		return;
+	}
+
 	netdev_dbg(enic->netdev,
 		   "MBOX recv: type %u from vnic %u len %u\n",
-		   hdr->msg_type, le16_to_cpu(hdr->src_vnic_id),
-		   le16_to_cpu(hdr->msg_len));
+		   hdr->msg_type, le16_to_cpu(hdr->src_vnic_id), msg_len);
+
+	payload = buf + sizeof(*hdr);
+
+	if (enic->vf_state)
+		enic_mbox_pf_process_msg(enic, hdr, payload);
 }
 
 void enic_mbox_init(struct enic *enic)
diff --git a/drivers/net/ethernet/cisco/enic/enic_mbox.h b/drivers/net/ethernet/cisco/enic/enic_mbox.h
index 554269b78780..a6f6798d14f4 100644
--- a/drivers/net/ethernet/cisco/enic/enic_mbox.h
+++ b/drivers/net/ethernet/cisco/enic/enic_mbox.h
@@ -79,5 +79,6 @@ struct enic;
 void enic_mbox_init(struct enic *enic);
 int enic_mbox_send_msg(struct enic *enic, u8 msg_type, u16 dst_vnic_id,
 		       void *payload, u16 payload_len);
+int enic_mbox_send_link_state(struct enic *enic, u16 vf_id, u32 link_state);
 
 #endif /* _ENIC_MBOX_H_ */

-- 
2.43.0



^ permalink raw reply related

* [PATCH net-next v3 10/10] enic: add V2 VF probe with admin channel and PF registration
From: Satish Kharat via B4 Relay @ 2026-04-08 16:36 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, linux-kernel,
	20260401-enic-sriov-v2-prep-v4-0-d5834b2ef1b9, Satish Kharat
In-Reply-To: <20260408-enic-sriov-v2-admin-channel-v2-v3-0-1d4999a03cec@cisco.com>

From: Satish Kharat <satishkh@cisco.com>

When a V2 SR-IOV VF probes, open the admin channel, initialize the
MBOX protocol, perform the capability check with the PF, and register
with the PF. This establishes the PF-VF communication path that the PF
uses to send link state notifications.

The admin channel and MBOX registration happen after enic_dev_init()
(which discovers admin channel resources) and before register_netdev()
so the VF is fully initialized before the interface is visible to
userspace.

On remove, the VF unregisters from the PF and closes its admin channel
before tearing down data path resources.

V2 VFs are not provisioned with an RES_TYPE_SRIOV_INTR resource by
firmware, so bypass that check in the admin channel capability
detection for V2 VFs. The PF still requires this resource.

Signed-off-by: Satish Kharat <satishkh@cisco.com>
---
 drivers/net/ethernet/cisco/enic/enic.h      |  1 +
 drivers/net/ethernet/cisco/enic/enic_main.c | 58 ++++++++++++++++++++++++++++-
 drivers/net/ethernet/cisco/enic/enic_res.c  |  3 +-
 3 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/cisco/enic/enic.h b/drivers/net/ethernet/cisco/enic/enic.h
index 29ce26284493..6301930903ee 100644
--- a/drivers/net/ethernet/cisco/enic/enic.h
+++ b/drivers/net/ethernet/cisco/enic/enic.h
@@ -441,6 +441,7 @@ void enic_reset_addr_lists(struct enic *enic);
 int enic_sriov_enabled(struct enic *enic);
 int enic_is_valid_vf(struct enic *enic, int vf);
 int enic_is_dynamic(struct enic *enic);
+int enic_is_sriov_vf_v2(struct enic *enic);
 void enic_set_ethtool_ops(struct net_device *netdev);
 int __enic_set_rsskey(struct enic *enic);
 void enic_ext_cq(struct enic *enic);
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index 39b0d635f1fc..cd21259ebe43 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -316,6 +316,11 @@ static int enic_is_sriov_vf(struct enic *enic)
 	       enic->pdev->device == PCI_DEVICE_ID_CISCO_VIC_ENET_VF_V2;
 }
 
+int enic_is_sriov_vf_v2(struct enic *enic)
+{	/* Non-zero when the PCI device ID is PCI_DEVICE_ID_CISCO_VIC_ENET_VF_V2. */
+	return enic->pdev->device == PCI_DEVICE_ID_CISCO_VIC_ENET_VF_V2;
+}
+
 int enic_is_valid_vf(struct enic *enic, int vf)
 {
 #ifdef CONFIG_PCI_IOV
@@ -2990,6 +2995,32 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto err_out_dev_close;
 	}
 
+	/* V2 VF: open admin channel and register with PF.
+	 * Must happen before register_netdev so the VF is fully
+	 * initialized before the interface is visible to userspace.
+	 */
+	if (enic_is_sriov_vf_v2(enic)) {
+		err = enic_admin_channel_open(enic);
+		if (err) {
+			dev_err(dev,
+				"Failed to open admin channel: %d\n", err);
+			goto err_out_dev_deinit;
+		}
+		enic_mbox_init(enic);
+		err = enic_mbox_vf_capability_check(enic);
+		if (err) {
+			dev_err(dev,
+				"MBOX capability check failed: %d\n", err);
+			goto err_out_admin_close;
+		}
+		err = enic_mbox_vf_register(enic);
+		if (err) {
+			dev_err(dev,
+				"MBOX VF registration failed: %d\n", err);
+			goto err_out_admin_close;
+		}
+	}
+
 	netif_set_real_num_tx_queues(netdev, enic->wq_count);
 	netif_set_real_num_rx_queues(netdev, enic->rq_count);
 
@@ -3014,7 +3045,7 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	err = enic_set_mac_addr(netdev, enic->mac_addr);
 	if (err) {
 		dev_err(dev, "Invalid MAC address, aborting\n");
-		goto err_out_dev_deinit;
+		goto err_out_admin_close;
 	}
 
 	enic->tx_coalesce_usecs = enic->config.intr_timer_usec;
@@ -3112,11 +3143,23 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	err = register_netdev(netdev);
 	if (err) {
 		dev_err(dev, "Cannot register net device, aborting\n");
-		goto err_out_dev_deinit;
+		goto err_out_admin_close;
 	}
 
 	return 0;
 
+err_out_admin_close:
+	if (enic_is_sriov_vf_v2(enic)) {
+		if (enic->vf_registered) {
+			int unreg_err = enic_mbox_vf_unregister(enic);
+
+			if (unreg_err)
+				netdev_warn(netdev,
+					    "Failed to unregister from PF: %d\n",
+					    unreg_err);
+		}
+		enic_admin_channel_close(enic);
+	}
 err_out_dev_deinit:
 	enic_dev_deinit(enic);
 err_out_dev_close:
@@ -3154,6 +3197,17 @@ static void enic_remove(struct pci_dev *pdev)
 		cancel_work_sync(&enic->reset);
 		cancel_work_sync(&enic->change_mtu_work);
 		unregister_netdev(netdev);
+		if (enic_is_sriov_vf_v2(enic)) {
+			if (enic->vf_registered) {
+				int unreg_err = enic_mbox_vf_unregister(enic);
+
+				if (unreg_err)
+					netdev_warn(netdev,
+						    "Failed to unregister from PF: %d\n",
+						    unreg_err);
+			}
+			enic_admin_channel_close(enic);
+		}
 #ifdef CONFIG_PCI_IOV
 		if (enic_sriov_enabled(enic)) {
 			if (enic->vf_type == ENIC_VF_TYPE_V2)
diff --git a/drivers/net/ethernet/cisco/enic/enic_res.c b/drivers/net/ethernet/cisco/enic/enic_res.c
index 436326ace049..74cd2ee3af5c 100644
--- a/drivers/net/ethernet/cisco/enic/enic_res.c
+++ b/drivers/net/ethernet/cisco/enic/enic_res.c
@@ -211,7 +211,8 @@ void enic_get_res_counts(struct enic *enic)
 		vnic_dev_get_res_count(enic->vdev, RES_TYPE_ADMIN_RQ) >= 1 &&
 		vnic_dev_get_res_count(enic->vdev, RES_TYPE_ADMIN_CQ) >=
 			ARRAY_SIZE(enic->admin_cq) &&
-		vnic_dev_get_res_count(enic->vdev, RES_TYPE_SRIOV_INTR) >= 1;
+		(enic_is_sriov_vf_v2(enic) ||
+		 vnic_dev_get_res_count(enic->vdev, RES_TYPE_SRIOV_INTR) >= 1);
 
 	dev_info(enic_get_dev(enic),
 		"vNIC resources avail: wq %d rq %d cq %d intr %d admin %s\n",

-- 
2.43.0



^ permalink raw reply related

* [PATCH net-next v3 09/10] enic: wire V2 SR-IOV enable with admin channel and MBOX
From: Satish Kharat via B4 Relay @ 2026-04-08 16:36 UTC (permalink / raw)
  To: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: netdev, linux-kernel,
	20260401-enic-sriov-v2-prep-v4-0-d5834b2ef1b9, Satish Kharat
In-Reply-To: <20260408-enic-sriov-v2-admin-channel-v2-v3-0-1d4999a03cec@cisco.com>

From: Satish Kharat <satishkh@cisco.com>

Extend enic_sriov_configure() to handle V2 SR-IOV VFs. When the PF
detects V2 VF device IDs, the enable path allocates per-VF MBOX state,
opens the admin channel, initializes the MBOX protocol, and then calls
pci_enable_sriov(). The admin channel must be ready before VFs are
created so that VF drivers can immediately begin the MBOX capability
and registration handshake during their probe.

The disable path reverses this order: pci_disable_sriov() first (so VF
drivers unregister via MBOX), then the admin channel is closed and
per-VF state is freed.

The existing V1/USNIC SR-IOV paths are unchanged.

Signed-off-by: Satish Kharat <satishkh@cisco.com>
---
 drivers/net/ethernet/cisco/enic/enic_main.c | 137 ++++++++++++++++++++++++++--
 drivers/net/ethernet/cisco/enic/enic_res.c  |   1 +
 drivers/net/ethernet/cisco/enic/vnic_enet.h |   4 +-
 3 files changed, 134 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index 73bb59eef7a0..39b0d635f1fc 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -60,6 +60,8 @@
 #include "enic_clsf.h"
 #include "enic_rq.h"
 #include "enic_wq.h"
+#include "enic_admin.h"
+#include "enic_mbox.h"
 
 #define ENIC_NOTIFY_TIMER_PERIOD	(2 * HZ)
 
@@ -2689,6 +2691,120 @@ static void enic_sriov_detect_vf_type(struct enic *enic)
 		enic->vf_type = ENIC_VF_TYPE_NONE;
 	}
 }
+
+static int __maybe_unused
+enic_sriov_v2_enable(struct enic *enic, int num_vfs)
+{
+	int err;
+
+	if (!enic->has_admin_channel) {
+		netdev_err(enic->netdev,
+			   "V2 SR-IOV requires admin channel resources\n");
+		return -EOPNOTSUPP;
+	}
+
+	enic->vf_state = kcalloc(num_vfs, sizeof(*enic->vf_state), GFP_KERNEL);
+	if (!enic->vf_state)
+		return -ENOMEM;
+
+	err = enic_admin_channel_open(enic);
+	if (err) {
+		netdev_err(enic->netdev,
+			   "Failed to open admin channel: %d\n", err);
+		goto free_vf_state;
+	}
+
+	enic_mbox_init(enic);
+
+	enic->num_vfs = num_vfs;
+
+	err = pci_enable_sriov(enic->pdev, num_vfs);
+	if (err) {
+		netdev_err(enic->netdev,
+			   "pci_enable_sriov failed: %d\n", err);
+		goto close_admin;
+	}
+
+	enic->priv_flags |= ENIC_SRIOV_ENABLED;
+	return num_vfs;
+
+close_admin:
+	enic->num_vfs = 0;
+	enic_admin_channel_close(enic);
+free_vf_state:
+	kfree(enic->vf_state);
+	enic->vf_state = NULL;
+	return err;
+}
+
+static void enic_sriov_v2_disable(struct enic *enic)
+{
+	pci_disable_sriov(enic->pdev);
+	enic_admin_channel_close(enic);
+	kfree(enic->vf_state);
+	enic->vf_state = NULL;
+	enic->num_vfs = 0;
+	enic->priv_flags &= ~ENIC_SRIOV_ENABLED;
+}
+
+static int __maybe_unused
+enic_sriov_configure(struct pci_dev *pdev, int num_vfs)
+{
+	struct net_device *netdev = pci_get_drvdata(pdev);
+	struct enic *enic = netdev_priv(netdev);
+	struct enic_port_profile *pp;
+	int err;
+
+	if (num_vfs > 0) {
+		if (enic->config.mq_subvnic_count) {
+			netdev_err(netdev,
+				   "SR-IOV not supported with multi-queue sub-vnics\n");
+			return -EOPNOTSUPP;
+		}
+
+		if (enic->vf_type == ENIC_VF_TYPE_NONE) {
+			netdev_err(netdev,
+				   "SR-IOV not supported on this firmware version\n");
+			return -EOPNOTSUPP;
+		}
+
+		if (enic->vf_type == ENIC_VF_TYPE_V2)
+			return enic_sriov_v2_enable(enic, num_vfs);
+
+		pp = kcalloc(num_vfs, sizeof(*pp), GFP_KERNEL);
+		if (!pp)
+			return -ENOMEM;
+
+		err = pci_enable_sriov(pdev, num_vfs);
+		if (err) {
+			kfree(pp);
+			return err;
+		}
+
+		kfree(enic->pp);
+		enic->pp = pp;
+		enic->num_vfs = num_vfs;
+		enic->priv_flags |= ENIC_SRIOV_ENABLED;
+		return num_vfs;
+	}
+
+	if (!enic_sriov_enabled(enic))
+		return 0;
+
+	if (enic->vf_type == ENIC_VF_TYPE_V2) {
+		enic_sriov_v2_disable(enic);
+		return 0;
+	}
+
+	pci_disable_sriov(pdev);
+	enic->num_vfs = 0;
+	enic->priv_flags &= ~ENIC_SRIOV_ENABLED;
+
+	kfree(enic->pp);
+	enic->pp = kzalloc(sizeof(*enic->pp), GFP_KERNEL);
+
+	return 0;
+}
 #endif
 
 static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
@@ -2787,12 +2903,18 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto err_out_vnic_unregister;
 
 #ifdef CONFIG_PCI_IOV
-	/* Get number of subvnics */
+	enic_sriov_detect_vf_type(enic);
+
+	/* Auto-enable SR-IOV if VFs were pre-configured (e.g. at boot).
+	 * V2 VFs require the admin channel, which is not yet set up at probe
+	 * time; use sysfs (enic_sriov_configure) to enable V2 SR-IOV instead.
+	 */
 	pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
 	if (pos) {
 		pci_read_config_word(pdev, pos + PCI_SRIOV_TOTAL_VF,
 			&enic->num_vfs);
-		if (enic->num_vfs) {
+		if (enic->num_vfs &&
+		    enic->vf_type != ENIC_VF_TYPE_V2) {
 			err = pci_enable_sriov(pdev, enic->num_vfs);
 			if (err) {
 				dev_err(dev, "SRIOV enable failed, aborting."
@@ -2804,7 +2926,6 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 			num_pps = enic->num_vfs;
 		}
 	}
-	enic_sriov_detect_vf_type(enic);
 #endif
 
 	/* Allocate structure for port profiles */
@@ -3033,14 +3154,16 @@ static void enic_remove(struct pci_dev *pdev)
 		cancel_work_sync(&enic->reset);
 		cancel_work_sync(&enic->change_mtu_work);
 		unregister_netdev(netdev);
-		enic_dev_deinit(enic);
-		vnic_dev_close(enic->vdev);
 #ifdef CONFIG_PCI_IOV
 		if (enic_sriov_enabled(enic)) {
-			pci_disable_sriov(pdev);
-			enic->priv_flags &= ~ENIC_SRIOV_ENABLED;
+			if (enic->vf_type == ENIC_VF_TYPE_V2)
+				enic_sriov_v2_disable(enic);
+			else
+				pci_disable_sriov(pdev);
 		}
 #endif
+		enic_dev_deinit(enic);
+		vnic_dev_close(enic->vdev);
 		kfree(enic->pp);
 		vnic_dev_unregister(enic->vdev);
 		enic_iounmap(enic);
diff --git a/drivers/net/ethernet/cisco/enic/enic_res.c b/drivers/net/ethernet/cisco/enic/enic_res.c
index 2b7545d6a67f..436326ace049 100644
--- a/drivers/net/ethernet/cisco/enic/enic_res.c
+++ b/drivers/net/ethernet/cisco/enic/enic_res.c
@@ -59,6 +59,7 @@ int enic_get_vnic_config(struct enic *enic)
 	GET_CONFIG(intr_timer_usec);
 	GET_CONFIG(loop_tag);
 	GET_CONFIG(num_arfs);
+	GET_CONFIG(mq_subvnic_count);
 	GET_CONFIG(max_rq_ring);
 	GET_CONFIG(max_wq_ring);
 	GET_CONFIG(max_cq_ring);
diff --git a/drivers/net/ethernet/cisco/enic/vnic_enet.h b/drivers/net/ethernet/cisco/enic/vnic_enet.h
index 9e8e86262a3f..519d2969990b 100644
--- a/drivers/net/ethernet/cisco/enic/vnic_enet.h
+++ b/drivers/net/ethernet/cisco/enic/vnic_enet.h
@@ -21,7 +21,9 @@ struct vnic_enet_config {
 	u16 loop_tag;
 	u16 vf_rq_count;
 	u16 num_arfs;
-	u8 reserved[66];
+	u8 reserved1[32];
+	u16 mq_subvnic_count;
+	u8 reserved2[32];
 	u32 max_rq_ring;	// MAX RQ ring size
 	u32 max_wq_ring;	// MAX WQ ring size
 	u32 max_cq_ring;	// MAX CQ ring size

-- 
2.43.0



^ permalink raw reply related

* Re: [PATCH 4/6] net: ipa: add IPA v5.2 configuration data
From: Simon Horman @ 2026-04-08 16:36 UTC (permalink / raw)
  To: Luca Weiss
  Cc: Alex Elder, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Rob Herring, Krzysztof Kozlowski,
	Conor Dooley, Bjorn Andersson, Konrad Dybcio, Alexander Koskovich,
	~postmarketos/upstreaming, phone-devel, netdev, linux-kernel,
	linux-arm-msm, devicetree
In-Reply-To: <20260403-milos-ipa-v1-4-01e9e4e03d3e@fairphone.com>

On Fri, Apr 03, 2026 at 06:43:50PM +0200, Luca Weiss wrote:
> Add the configuration data required for IPA v5.2, which is used in
> the Qualcomm Milos SoC.
> 
> Signed-off-by: Luca Weiss <luca.weiss@fairphone.com>
> ---
>  drivers/net/ipa/Makefile             |   2 +-
>  drivers/net/ipa/data/ipa_data-v5.2.c | 452 +++++++++++++++++++++++++++++++++++
>  drivers/net/ipa/gsi_reg.c            |   1 +
>  drivers/net/ipa/ipa_data.h           |   1 +
>  drivers/net/ipa/ipa_main.c           |   4 +
>  drivers/net/ipa/ipa_reg.c            |   1 +
>  drivers/net/ipa/ipa_sysfs.c          |   2 +
>  drivers/net/ipa/ipa_version.h        |   2 +
>  8 files changed, 464 insertions(+), 1 deletion(-)

Reviewed-by: Simon Horman <horms@kernel.org>

I'm not suggesting a change to this patch.

But it does seem to me that there is a lot of commonality
between drivers/net/ipa/data/ipa_data-v*.c.
And it would be nice if that could be consolidated somehow.

...

^ permalink raw reply

* Re: [PATCH net-next v2 2/2] selftests: net: add ECMP rehash test
From: Jakub Kicinski @ 2026-04-08 16:38 UTC (permalink / raw)
  To: Neil Spring; +Cc: netdev, edumazet, davem
In-Reply-To: <20260408070514.1840227-3-ntspring@meta.com>

On Wed,  8 Apr 2026 00:05:14 -0700 Neil Spring wrote:
> Add ecmp_rehash.sh to exercise TCP ECMP path re-selection on
> retransmission timeout.  Three tests cover client SYN rehash, server
> SYN/ACK rehash, and midstream RTO rehash of an established connection
> over a two-path ECMP topology with one leg blocked by tc.
> 
> The SYN test retries 26 times, so has a false negative probability
> of ~(1/2)^25 ≈ 3e-8.

I think this is failing in netdev CI:

TAP version 13
1..1
# timeout set to 3600
# selftests: net: ecmp_rehash.sh
# 30.31 [+30.31] TEST: ECMP RTO rehash: establish with blocked paths                 [FAIL]
# 30.31 [+0.00] SYNs did not appear on both paths (rehash not working)
# 30.32 [+0.01] 2026/04/07 23:36:18 socat[18706] W exiting on signal 15
# 33.44 [+3.12] 2026/04/07 23:36:21 socat[21325] E read(7, 0x560db4041000, 8192): Connection reset by peer
# 33.45 [+0.01] TEST: ECMP SYN/ACK rehash: blocked return path                      [ OK ]
# 33.45 [+0.01] 2026/04/07 23:36:21 socat[21137] W exiting on signal 15
# 34.76 [+1.31] 51TEST: ECMP midstream rehash: block active path                      [FAIL]
# 99.81 [+65.04] data transfer failed after blocking veth0a
# 99.82 [+0.01] 2026/04/07 23:37:27 socat[21345] W exiting on signal 15
not ok 1 selftests: net: ecmp_rehash.sh # exit=1


Also please start a new thread when you post v3.

^ permalink raw reply

* [PATCH 0/5] selftests: net: add multithread and multiqueue support to iou-zcrx
From: Juanlu Herrero @ 2026-04-08 16:38 UTC (permalink / raw)
  To: netdev; +Cc: Juanlu Herrero

Add multithreaded support to the iou-zcrx selftest and a new
rss_multiqueue test variant that exercises multi-queue zero-copy
receive with per-port flow rule steering.

Juanlu Herrero (5):
  selftests: net: fix get_refill_ring_size() to use its local variable
  selftests: net: add multithread client support to iou-zcrx
  selftests: net: remove unused variable in process_recvzc()
  selftests: net: add multithread server support to iou-zcrx
  selftests: net: add rss_multiqueue test variant to iou-zcrx

 .../testing/selftests/drivers/net/hw/Makefile |   2 +-
 .../selftests/drivers/net/hw/iou-zcrx.c       | 361 ++++++++++++------
 .../selftests/drivers/net/hw/iou-zcrx.py      |  45 ++-
 3 files changed, 281 insertions(+), 127 deletions(-)

-- 
2.53.0


^ permalink raw reply

* [PATCH 1/5] selftests: net: fix get_refill_ring_size() to use its local variable
From: Juanlu Herrero @ 2026-04-08 16:38 UTC (permalink / raw)
  To: netdev; +Cc: Juanlu Herrero
In-Reply-To: <20260408163816.2760-1-juanlu@fastmail.com>

In preparation for multi-threaded rss selftests, fix
get_refill_ring_size() to use its local `size` variable
instead of the global `ring_size`.

Signed-off-by: Juanlu Herrero <juanlu@fastmail.com>
---
 tools/testing/selftests/drivers/net/hw/iou-zcrx.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
index 240d13dbc54e..334985083f61 100644
--- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
+++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
@@ -132,10 +132,10 @@ static inline size_t get_refill_ring_size(unsigned int rq_entries)
 {
 	size_t size;
 
-	ring_size = rq_entries * sizeof(struct io_uring_zcrx_rqe);
+	size = rq_entries * sizeof(struct io_uring_zcrx_rqe);
 	/* add space for the header (head/tail/etc.) */
-	ring_size += page_size;
-	return ALIGN_UP(ring_size, page_size);
+	size += page_size;
+	return ALIGN_UP(size, page_size);
 }
 
 static void setup_zcrx(struct io_uring *ring)
-- 
2.53.0


^ permalink raw reply related

* [PATCH 2/5] selftests: net: add multithread client support to iou-zcrx
From: Juanlu Herrero @ 2026-04-08 16:38 UTC (permalink / raw)
  To: netdev; +Cc: Juanlu Herrero
In-Reply-To: <20260408163816.2760-1-juanlu@fastmail.com>

Add pthreads to the iou-zcrx client so that multiple connections can be
established simultaneously. Each client thread connects to the server
and sends its payload independently.

Introduce struct thread_ctx and the -t option to control the number of
threads (default 1), preserving backwards compatibility with existing
tests.

Signed-off-by: Juanlu Herrero <juanlu@fastmail.com>
---
 .../testing/selftests/drivers/net/hw/Makefile |  2 +-
 .../selftests/drivers/net/hw/iou-zcrx.c       | 46 +++++++++++++++++--
 2 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile
index deeca3f8d080..227adfec706c 100644
--- a/tools/testing/selftests/drivers/net/hw/Makefile
+++ b/tools/testing/selftests/drivers/net/hw/Makefile
@@ -80,5 +80,5 @@ include ../../../net/ynl.mk
 include ../../../net/bpf.mk
 
 ifeq ($(HAS_IOURING_ZCRX),y)
-$(OUTPUT)/iou-zcrx: LDLIBS += -luring
+$(OUTPUT)/iou-zcrx: LDLIBS += -luring -lpthread
 endif
diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
index 334985083f61..de2eea78a5b6 100644
--- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
+++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
@@ -4,6 +4,7 @@
 #include <error.h>
 #include <fcntl.h>
 #include <limits.h>
+#include <pthread.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -85,8 +86,14 @@ static int cfg_send_size = SEND_SIZE;
 static struct sockaddr_in6 cfg_addr;
 static unsigned int cfg_rx_buf_len;
 static bool cfg_dry_run;
+static int cfg_num_threads = 1;
 
 static char *payload;
+
+struct thread_ctx {
+	int			thread_id;
+};
+
 static void *area_ptr;
 static void *ring_ptr;
 static size_t ring_size;
@@ -376,7 +383,7 @@ static void run_server(void)
 		error(1, 0, "test failed\n");
 }
 
-static void run_client(void)
+static void *client_worker(void *arg)
 {
 	ssize_t to_send = cfg_send_size;
 	ssize_t sent = 0;
@@ -402,12 +409,42 @@ static void run_client(void)
 	}
 
 	close(fd);
+	return NULL;
+}
+
+static void run_client(void)
+{
+	struct thread_ctx *ctxs;
+	pthread_t *threads;
+	int i, ret;
+
+	ctxs = calloc(cfg_num_threads, sizeof(*ctxs));
+	threads = calloc(cfg_num_threads, sizeof(*threads));
+	if (!ctxs || !threads)
+		error(1, 0, "calloc()");
+
+	for (i = 0; i < cfg_num_threads; i++)
+		ctxs[i].thread_id = i;
+
+	for (i = 0; i < cfg_num_threads; i++) {
+		ret = pthread_create(&threads[i], NULL, client_worker,
+				     &ctxs[i]);
+		if (ret)
+			error(1, ret, "pthread_create()");
+	}
+
+	for (i = 0; i < cfg_num_threads; i++)
+		pthread_join(threads[i], NULL);
+
+	free(threads);
+	free(ctxs);
 }
 
 static void usage(const char *filepath)
 {
 	error(1, 0, "Usage: %s (-4|-6) (-s|-c) -h<server_ip> -p<port> "
-		    "-l<payload_size> -i<ifname> -q<rxq_id>", filepath);
+		    "-l<payload_size> -i<ifname> -q<rxq_id> -t<num_threads>",
+		    filepath);
 }
 
 static void parse_opts(int argc, char **argv)
@@ -425,7 +462,7 @@ static void parse_opts(int argc, char **argv)
 		usage(argv[0]);
 	cfg_payload_len = max_payload_len;
 
-	while ((c = getopt(argc, argv, "sch:p:l:i:q:o:z:x:d")) != -1) {
+	while ((c = getopt(argc, argv, "sch:p:l:i:q:o:z:x:dt:")) != -1) {
 		switch (c) {
 		case 's':
 			if (cfg_client)
@@ -466,6 +503,9 @@ static void parse_opts(int argc, char **argv)
 		case 'd':
 			cfg_dry_run = true;
 			break;
+		case 't':
+			cfg_num_threads = strtoul(optarg, NULL, 0);
+			break;
 		}
 	}
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH 3/5] selftests: net: remove unused variable in process_recvzc()
From: Juanlu Herrero @ 2026-04-08 16:38 UTC (permalink / raw)
  To: netdev; +Cc: Juanlu Herrero
In-Reply-To: <20260408163816.2760-1-juanlu@fastmail.com>

Remove unused `sqe` variable in preparation for multiqueue
rss selftest changes to process_recvzc() in the following
commit.

Signed-off-by: Juanlu Herrero <juanlu@fastmail.com>
---
 tools/testing/selftests/drivers/net/hw/iou-zcrx.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
index de2eea78a5b6..6185c855b85c 100644
--- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
+++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
@@ -276,7 +276,6 @@ static void process_recvzc(struct io_uring *ring, struct io_uring_cqe *cqe)
 	unsigned rq_mask = rq_ring.ring_entries - 1;
 	struct io_uring_zcrx_cqe *rcqe;
 	struct io_uring_zcrx_rqe *rqe;
-	struct io_uring_sqe *sqe;
 	uint64_t mask;
 	char *data;
 	ssize_t n;
-- 
2.53.0


^ permalink raw reply related

* [PATCH 4/5] selftests: net: add multithread server support to iou-zcrx
From: Juanlu Herrero @ 2026-04-08 16:38 UTC (permalink / raw)
  To: netdev; +Cc: Juanlu Herrero
In-Reply-To: <20260408163816.2760-1-juanlu@fastmail.com>

Move server state (io_uring ring, zcrx area, receive tracking) from
global variables into struct thread_ctx and thread the server side.

The main thread creates a single listening socket, spawns N worker
threads (each setting up its own io_uring and zcrx instance), then
accepts N connections and assigns one to each worker, using pthread
barriers for synchronization.

Signed-off-by: Juanlu Herrero <juanlu@fastmail.com>
---
 .../selftests/drivers/net/hw/iou-zcrx.c       | 247 ++++++++++--------
 1 file changed, 140 insertions(+), 107 deletions(-)

diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
index 6185c855b85c..646682167bb0 100644
--- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
+++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
@@ -89,20 +89,22 @@ static bool cfg_dry_run;
 static int cfg_num_threads = 1;
 
 static char *payload;
+static pthread_barrier_t barrier;
 
 struct thread_ctx {
+	struct io_uring		ring;
+	void			*area_ptr;
+	void			*ring_ptr;
+	size_t			ring_size;
+	struct io_uring_zcrx_rq	rq_ring;
+	unsigned long		area_token;
+	int			connfd;
+	bool			stop;
+	size_t			received;
+	int			queue_id;
 	int			thread_id;
 };
 
-static void *area_ptr;
-static void *ring_ptr;
-static size_t ring_size;
-static struct io_uring_zcrx_rq rq_ring;
-static unsigned long area_token;
-static int connfd;
-static bool stop;
-static size_t received;
-
 static unsigned long gettimeofday_ms(void)
 {
 	struct timeval tv;
@@ -145,7 +147,7 @@ static inline size_t get_refill_ring_size(unsigned int rq_entries)
 	return ALIGN_UP(size, page_size);
 }
 
-static void setup_zcrx(struct io_uring *ring)
+static void setup_zcrx(struct thread_ctx *ctx)
 {
 	unsigned int ifindex;
 	unsigned int rq_entries = 4096;
@@ -156,58 +158,58 @@ static void setup_zcrx(struct io_uring *ring)
 		error(1, 0, "bad interface name: %s", cfg_ifname);
 
 	if (cfg_rx_buf_len && cfg_rx_buf_len != page_size) {
-		area_ptr = mmap(NULL,
-				AREA_SIZE,
-				PROT_READ | PROT_WRITE,
-				MAP_ANONYMOUS | MAP_PRIVATE |
-				MAP_HUGETLB | MAP_HUGE_2MB,
-				-1,
-				0);
-		if (area_ptr == MAP_FAILED) {
+		ctx->area_ptr = mmap(NULL,
+				     AREA_SIZE,
+				     PROT_READ | PROT_WRITE,
+				     MAP_ANONYMOUS | MAP_PRIVATE |
+				     MAP_HUGETLB | MAP_HUGE_2MB,
+				     -1,
+				     0);
+		if (ctx->area_ptr == MAP_FAILED) {
 			printf("Can't allocate huge pages\n");
 			exit(SKIP_CODE);
 		}
 	} else {
-		area_ptr = mmap(NULL,
-				AREA_SIZE,
-				PROT_READ | PROT_WRITE,
-				MAP_ANONYMOUS | MAP_PRIVATE,
-				0,
-				0);
-		if (area_ptr == MAP_FAILED)
+		ctx->area_ptr = mmap(NULL,
+				     AREA_SIZE,
+				     PROT_READ | PROT_WRITE,
+				     MAP_ANONYMOUS | MAP_PRIVATE,
+				     0,
+				     0);
+		if (ctx->area_ptr == MAP_FAILED)
 			error(1, 0, "mmap(): zero copy area");
 	}
 
-	ring_size = get_refill_ring_size(rq_entries);
-	ring_ptr = mmap(NULL,
-			ring_size,
-			PROT_READ | PROT_WRITE,
-			MAP_ANONYMOUS | MAP_PRIVATE,
-			0,
-			0);
+	ctx->ring_size = get_refill_ring_size(rq_entries);
+	ctx->ring_ptr = mmap(NULL,
+			     ctx->ring_size,
+			     PROT_READ | PROT_WRITE,
+			     MAP_ANONYMOUS | MAP_PRIVATE,
+			     0,
+			     0);
 
 	struct io_uring_region_desc region_reg = {
-		.size = ring_size,
-		.user_addr = (__u64)(unsigned long)ring_ptr,
+		.size = ctx->ring_size,
+		.user_addr = (__u64)(unsigned long)ctx->ring_ptr,
 		.flags = IORING_MEM_REGION_TYPE_USER,
 	};
 
 	struct io_uring_zcrx_area_reg area_reg = {
-		.addr = (__u64)(unsigned long)area_ptr,
+		.addr = (__u64)(unsigned long)ctx->area_ptr,
 		.len = AREA_SIZE,
 		.flags = 0,
 	};
 
 	struct t_io_uring_zcrx_ifq_reg reg = {
 		.if_idx = ifindex,
-		.if_rxq = cfg_queue_id,
+		.if_rxq = ctx->queue_id,
 		.rq_entries = rq_entries,
 		.area_ptr = (__u64)(unsigned long)&area_reg,
 		.region_ptr = (__u64)(unsigned long)&region_reg,
 		.rx_buf_len = cfg_rx_buf_len,
 	};
 
-	ret = io_uring_register_ifq(ring, (void *)&reg);
+	ret = io_uring_register_ifq(&ctx->ring, (void *)&reg);
 	if (cfg_rx_buf_len && (ret == -EINVAL || ret == -EOPNOTSUPP ||
 			       ret == -ERANGE)) {
 		printf("Large chunks are not supported %i\n", ret);
@@ -216,64 +218,40 @@ static void setup_zcrx(struct io_uring *ring)
 		error(1, 0, "io_uring_register_ifq(): %d", ret);
 	}
 
-	rq_ring.khead = (unsigned int *)((char *)ring_ptr + reg.offsets.head);
-	rq_ring.ktail = (unsigned int *)((char *)ring_ptr + reg.offsets.tail);
-	rq_ring.rqes = (struct io_uring_zcrx_rqe *)((char *)ring_ptr + reg.offsets.rqes);
-	rq_ring.rq_tail = 0;
-	rq_ring.ring_entries = reg.rq_entries;
-
-	area_token = area_reg.rq_area_token;
-}
-
-static void add_accept(struct io_uring *ring, int sockfd)
-{
-	struct io_uring_sqe *sqe;
-
-	sqe = io_uring_get_sqe(ring);
+	ctx->rq_ring.khead = (unsigned int *)((char *)ctx->ring_ptr + reg.offsets.head);
+	ctx->rq_ring.ktail = (unsigned int *)((char *)ctx->ring_ptr + reg.offsets.tail);
+	ctx->rq_ring.rqes = (struct io_uring_zcrx_rqe *)((char *)ctx->ring_ptr + reg.offsets.rqes);
+	ctx->rq_ring.rq_tail = 0;
+	ctx->rq_ring.ring_entries = reg.rq_entries;
 
-	io_uring_prep_accept(sqe, sockfd, NULL, NULL, 0);
-	sqe->user_data = 1;
+	ctx->area_token = area_reg.rq_area_token;
 }
 
-static void add_recvzc(struct io_uring *ring, int sockfd)
+static void add_recvzc(struct thread_ctx *ctx, int sockfd)
 {
 	struct io_uring_sqe *sqe;
 
-	sqe = io_uring_get_sqe(ring);
+	sqe = io_uring_get_sqe(&ctx->ring);
 
 	io_uring_prep_rw(IORING_OP_RECV_ZC, sqe, sockfd, NULL, 0, 0);
 	sqe->ioprio |= IORING_RECV_MULTISHOT;
 	sqe->user_data = 2;
 }
 
-static void add_recvzc_oneshot(struct io_uring *ring, int sockfd, size_t len)
+static void add_recvzc_oneshot(struct thread_ctx *ctx, int sockfd, size_t len)
 {
 	struct io_uring_sqe *sqe;
 
-	sqe = io_uring_get_sqe(ring);
+	sqe = io_uring_get_sqe(&ctx->ring);
 
 	io_uring_prep_rw(IORING_OP_RECV_ZC, sqe, sockfd, NULL, len, 0);
 	sqe->ioprio |= IORING_RECV_MULTISHOT;
 	sqe->user_data = 2;
 }
 
-static void process_accept(struct io_uring *ring, struct io_uring_cqe *cqe)
+static void process_recvzc(struct thread_ctx *ctx, struct io_uring_cqe *cqe)
 {
-	if (cqe->res < 0)
-		error(1, 0, "accept()");
-	if (connfd)
-		error(1, 0, "Unexpected second connection");
-
-	connfd = cqe->res;
-	if (cfg_oneshot)
-		add_recvzc_oneshot(ring, connfd, page_size);
-	else
-		add_recvzc(ring, connfd);
-}
-
-static void process_recvzc(struct io_uring *ring, struct io_uring_cqe *cqe)
-{
-	unsigned rq_mask = rq_ring.ring_entries - 1;
+	unsigned rq_mask = ctx->rq_ring.ring_entries - 1;
 	struct io_uring_zcrx_cqe *rcqe;
 	struct io_uring_zcrx_rqe *rqe;
 	uint64_t mask;
@@ -282,7 +260,7 @@ static void process_recvzc(struct io_uring *ring, struct io_uring_cqe *cqe)
 	int i;
 
 	if (cqe->res == 0 && cqe->flags == 0 && cfg_oneshot_recvs == 0) {
-		stop = true;
+		ctx->stop = true;
 		return;
 	}
 
@@ -291,59 +269,99 @@ static void process_recvzc(struct io_uring *ring, struct io_uring_cqe *cqe)
 
 	if (cfg_oneshot) {
 		if (cqe->res == 0 && cqe->flags == 0 && cfg_oneshot_recvs) {
-			add_recvzc_oneshot(ring, connfd, page_size);
+			add_recvzc_oneshot(ctx, ctx->connfd, page_size);
 			cfg_oneshot_recvs--;
 		}
 	} else if (!(cqe->flags & IORING_CQE_F_MORE)) {
-		add_recvzc(ring, connfd);
+		add_recvzc(ctx, ctx->connfd);
 	}
 
 	rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1);
 
 	n = cqe->res;
 	mask = (1ULL << IORING_ZCRX_AREA_SHIFT) - 1;
-	data = (char *)area_ptr + (rcqe->off & mask);
+	data = (char *)ctx->area_ptr + (rcqe->off & mask);
 
 	for (i = 0; i < n; i++) {
-		if (*(data + i) != payload[(received + i)])
+		if (*(data + i) != payload[(ctx->received + i)])
 			error(1, 0, "payload mismatch at %d", i);
 	}
-	received += n;
+	ctx->received += n;
 
-	rqe = &rq_ring.rqes[(rq_ring.rq_tail & rq_mask)];
-	rqe->off = (rcqe->off & ~IORING_ZCRX_AREA_MASK) | area_token;
+	rqe = &ctx->rq_ring.rqes[(ctx->rq_ring.rq_tail & rq_mask)];
+	rqe->off = (rcqe->off & ~IORING_ZCRX_AREA_MASK) | ctx->area_token;
 	rqe->len = cqe->res;
-	io_uring_smp_store_release(rq_ring.ktail, ++rq_ring.rq_tail);
+	io_uring_smp_store_release(ctx->rq_ring.ktail, ++ctx->rq_ring.rq_tail);
 }
 
-static void server_loop(struct io_uring *ring)
+static void server_loop(struct thread_ctx *ctx)
 {
 	struct io_uring_cqe *cqe;
 	unsigned int count = 0;
 	unsigned int head;
 	int i, ret;
 
-	io_uring_submit_and_wait(ring, 1);
+	io_uring_submit_and_wait(&ctx->ring, 1);
 
-	io_uring_for_each_cqe(ring, head, cqe) {
-		if (cqe->user_data == 1)
-			process_accept(ring, cqe);
-		else if (cqe->user_data == 2)
-			process_recvzc(ring, cqe);
+	io_uring_for_each_cqe(&ctx->ring, head, cqe) {
+		if (cqe->user_data == 2)
+			process_recvzc(ctx, cqe);
 		else
 			error(1, 0, "unknown cqe");
 		count++;
 	}
-	io_uring_cq_advance(ring, count);
+	io_uring_cq_advance(&ctx->ring, count);
 }
 
-static void run_server(void)
+static void *server_worker(void *arg)
 {
+	struct thread_ctx *ctx = arg;
 	unsigned int flags = 0;
-	struct io_uring ring;
-	int fd, enable, ret;
 	uint64_t tstop;
 
+	flags |= IORING_SETUP_COOP_TASKRUN;
+	flags |= IORING_SETUP_SINGLE_ISSUER;
+	flags |= IORING_SETUP_DEFER_TASKRUN;
+	flags |= IORING_SETUP_SUBMIT_ALL;
+	flags |= IORING_SETUP_CQE32;
+
+	io_uring_queue_init(512, &ctx->ring, flags);
+
+	setup_zcrx(ctx);
+
+	pthread_barrier_wait(&barrier);
+
+	if (cfg_dry_run)
+		return NULL;
+
+	pthread_barrier_wait(&barrier);
+
+	if (cfg_oneshot)
+		add_recvzc_oneshot(ctx, ctx->connfd, page_size);
+	else
+		add_recvzc(ctx, ctx->connfd);
+
+	tstop = gettimeofday_ms() + 5000;
+	while (!ctx->stop && gettimeofday_ms() < tstop)
+		server_loop(ctx);
+
+	if (!ctx->stop)
+		error(1, 0, "test failed\n");
+
+	return NULL;
+}
+
+static void run_server(void)
+{
+	struct thread_ctx *ctxs;
+	pthread_t *threads;
+	int fd, ret, i, enable;
+
+	ctxs = calloc(cfg_num_threads, sizeof(*ctxs));
+	threads = calloc(cfg_num_threads, sizeof(*threads));
+	if (!ctxs || !threads)
+		error(1, 0, "calloc()");
+
 	fd = socket(AF_INET6, SOCK_STREAM, 0);
 	if (fd == -1)
 		error(1, 0, "socket()");
@@ -360,26 +378,41 @@ static void run_server(void)
 	if (listen(fd, 1024) < 0)
 		error(1, 0, "listen()");
 
-	flags |= IORING_SETUP_COOP_TASKRUN;
-	flags |= IORING_SETUP_SINGLE_ISSUER;
-	flags |= IORING_SETUP_DEFER_TASKRUN;
-	flags |= IORING_SETUP_SUBMIT_ALL;
-	flags |= IORING_SETUP_CQE32;
+	pthread_barrier_init(&barrier, NULL, cfg_num_threads + 1);
+
+	for (i = 0; i < cfg_num_threads; i++) {
+		ctxs[i].queue_id = cfg_queue_id + i;
+		ctxs[i].thread_id = i;
+	}
 
-	io_uring_queue_init(512, &ring, flags);
+	for (i = 0; i < cfg_num_threads; i++) {
+		ret = pthread_create(&threads[i], NULL, server_worker,
+				     &ctxs[i]);
+		if (ret)
+			error(1, ret, "pthread_create()");
+	}
+
+	pthread_barrier_wait(&barrier);
 
-	setup_zcrx(&ring);
 	if (cfg_dry_run)
-		return;
+		goto join;
+
+	for (i = 0; i < cfg_num_threads; i++) {
+		ctxs[i].connfd = accept(fd, NULL, NULL);
+		if (ctxs[i].connfd < 0)
+			error(1, 0, "accept()");
+	}
 
-	add_accept(&ring, fd);
+	pthread_barrier_wait(&barrier);
 
-	tstop = gettimeofday_ms() + 5000;
-	while (!stop && gettimeofday_ms() < tstop)
-		server_loop(&ring);
+join:
+	for (i = 0; i < cfg_num_threads; i++)
+		pthread_join(threads[i], NULL);
 
-	if (!stop)
-		error(1, 0, "test failed\n");
+	pthread_barrier_destroy(&barrier);
+	close(fd);
+	free(threads);
+	free(ctxs);
 }
 
 static void *client_worker(void *arg)
-- 
2.53.0


^ permalink raw reply related

* [PATCH 5/5] selftests: net: add rss_multiqueue test variant to iou-zcrx
From: Juanlu Herrero @ 2026-04-08 16:38 UTC (permalink / raw)
  To: netdev; +Cc: Juanlu Herrero
In-Reply-To: <20260408163816.2760-1-juanlu@fastmail.com>

Add multi-port support to the iou-zcrx test binary and a new
rss_multiqueue Python test variant that exercises multi-queue zero-copy
receive with per-port flow rule steering.

In multi-port mode, the server creates N listening sockets on
consecutive ports (cfg_port, cfg_port+1, ...) and uses epoll to accept
one connection per socket. Each client thread connects to its
corresponding port. Per-port ntuple flow rules steer traffic to
different NIC hardware queues, each with its own zcrx instance.

For single-thread mode (the default), behavior is unchanged: one socket
on cfg_port, one thread, one queue.

Signed-off-by: Juanlu Herrero <juanlu@fastmail.com>
---
 .../selftests/drivers/net/hw/iou-zcrx.c       | 81 ++++++++++++++-----
 .../selftests/drivers/net/hw/iou-zcrx.py      | 45 ++++++++++-
 2 files changed, 104 insertions(+), 22 deletions(-)

diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
index 646682167bb0..1f33d7127185 100644
--- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
+++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c
@@ -102,6 +102,7 @@ struct thread_ctx {
 	bool			stop;
 	size_t			received;
 	int			queue_id;
+	int			port;
 	int			thread_id;
 };
 
@@ -353,35 +354,47 @@ static void *server_worker(void *arg)
 
 static void run_server(void)
 {
+	struct epoll_event ev, events[64];
 	struct thread_ctx *ctxs;
+	struct sockaddr_in6 addr;
 	pthread_t *threads;
-	int fd, ret, i, enable;
+	int *fds;
+	int epfd, nfds, accepted;
+	int ret, i, enable;
 
 	ctxs = calloc(cfg_num_threads, sizeof(*ctxs));
 	threads = calloc(cfg_num_threads, sizeof(*threads));
-	if (!ctxs || !threads)
+	fds = calloc(cfg_num_threads, sizeof(*fds));
+	if (!ctxs || !threads || !fds)
 		error(1, 0, "calloc()");
 
-	fd = socket(AF_INET6, SOCK_STREAM, 0);
-	if (fd == -1)
-		error(1, 0, "socket()");
+	for (i = 0; i < cfg_num_threads; i++) {
+		fds[i] = socket(AF_INET6, SOCK_STREAM, 0);
+		if (fds[i] == -1)
+			error(1, 0, "socket()");
+
+		enable = 1;
+		ret = setsockopt(fds[i], SOL_SOCKET, SO_REUSEADDR,
+				 &enable, sizeof(int));
+		if (ret < 0)
+			error(1, 0, "setsockopt(SO_REUSEADDR)");
 
-	enable = 1;
-	ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
-	if (ret < 0)
-		error(1, 0, "setsockopt(SO_REUSEADDR)");
+		addr = cfg_addr;
+		addr.sin6_port = htons(cfg_port + i);
 
-	ret = bind(fd, (struct sockaddr *)&cfg_addr, sizeof(cfg_addr));
-	if (ret < 0)
-		error(1, 0, "bind()");
+		ret = bind(fds[i], (struct sockaddr *)&addr, sizeof(addr));
+		if (ret < 0)
+			error(1, 0, "bind()");
 
-	if (listen(fd, 1024) < 0)
-		error(1, 0, "listen()");
+		if (listen(fds[i], 1024) < 0)
+			error(1, 0, "listen()");
+	}
 
 	pthread_barrier_init(&barrier, NULL, cfg_num_threads + 1);
 
 	for (i = 0; i < cfg_num_threads; i++) {
 		ctxs[i].queue_id = cfg_queue_id + i;
+		ctxs[i].port = cfg_port + i;
 		ctxs[i].thread_id = i;
 	}
 
@@ -397,12 +410,36 @@ static void run_server(void)
 	if (cfg_dry_run)
 		goto join;
 
+	epfd = epoll_create1(0);
+	if (epfd < 0)
+		error(1, 0, "epoll_create1()");
+
 	for (i = 0; i < cfg_num_threads; i++) {
-		ctxs[i].connfd = accept(fd, NULL, NULL);
-		if (ctxs[i].connfd < 0)
-			error(1, 0, "accept()");
+		ev.events = EPOLLIN;
+		ev.data.u32 = i;
+		if (epoll_ctl(epfd, EPOLL_CTL_ADD, fds[i], &ev) < 0)
+			error(1, 0, "epoll_ctl()");
 	}
 
+	accepted = 0;
+	while (accepted < cfg_num_threads) {
+		nfds = epoll_wait(epfd, events, 64, 5000);
+		if (nfds < 0)
+			error(1, 0, "epoll_wait()");
+		if (nfds == 0)
+			error(1, 0, "epoll_wait() timeout");
+
+		for (i = 0; i < nfds; i++) {
+			int idx = events[i].data.u32;
+
+			ctxs[idx].connfd = accept(fds[idx], NULL, NULL);
+			if (ctxs[idx].connfd < 0)
+				error(1, 0, "accept()");
+			accepted++;
+		}
+	}
+
+	close(epfd);
 	pthread_barrier_wait(&barrier);
 
 join:
@@ -410,23 +447,29 @@ static void run_server(void)
 		pthread_join(threads[i], NULL);
 
 	pthread_barrier_destroy(&barrier);
-	close(fd);
+	for (i = 0; i < cfg_num_threads; i++)
+		close(fds[i]);
+	free(fds);
 	free(threads);
 	free(ctxs);
 }
 
 static void *client_worker(void *arg)
 {
+	struct thread_ctx *ctx = arg;
+	struct sockaddr_in6 addr = cfg_addr;
 	ssize_t to_send = cfg_send_size;
 	ssize_t sent = 0;
 	ssize_t chunk, res;
 	int fd;
 
+	addr.sin6_port = htons(cfg_port + ctx->thread_id);
+
 	fd = socket(AF_INET6, SOCK_STREAM, 0);
 	if (fd == -1)
 		error(1, 0, "socket()");
 
-	if (connect(fd, (struct sockaddr *)&cfg_addr, sizeof(cfg_addr)))
+	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)))
 		error(1, 0, "connect()");
 
 	while (to_send) {
diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py
index e81724cb5542..c918cdaf6b1b 100755
--- a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py
+++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py
@@ -35,6 +35,12 @@ def set_flow_rule(cfg):
     return int(values)
 
 
+def set_flow_rule_port(cfg, port, queue):
+    output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {port} action {queue}").stdout
+    values = re.search(r'ID (\d+)', output).group(1)
+    return int(values)
+
+
 def set_flow_rule_rss(cfg, rss_ctx_id):
     output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} context {rss_ctx_id}").stdout
     values = re.search(r'ID (\d+)', output).group(1)
@@ -100,18 +106,51 @@ def rss(cfg):
     defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}")
 
 
+def rss_multiqueue(cfg):
+    channels = cfg.ethnl.channels_get({'header': {'dev-index': cfg.ifindex}})
+    channels = channels['combined-count']
+    if channels < 3:
+        raise KsftSkipEx('Test requires NETIF with at least 3 combined channels')
+
+    rings = cfg.ethnl.rings_get({'header': {'dev-index': cfg.ifindex}})
+    rx_rings = rings['rx']
+    hds_thresh = rings.get('hds-thresh', 0)
+
+    cfg.ethnl.rings_set({'header': {'dev-index': cfg.ifindex},
+                         'tcp-data-split': 'enabled',
+                         'hds-thresh': 0,
+                         'rx': 64})
+    defer(cfg.ethnl.rings_set, {'header': {'dev-index': cfg.ifindex},
+                                'tcp-data-split': 'unknown',
+                                'hds-thresh': hds_thresh,
+                                'rx': rx_rings})
+    defer(mp_clear_wait, cfg)
+
+    cfg.num_threads = 2
+    cfg.target = channels - cfg.num_threads
+    ethtool(f"-X {cfg.ifname} equal {cfg.target}")
+    defer(ethtool, f"-X {cfg.ifname} default")
+
+    for i in range(cfg.num_threads):
+        flow_rule_id = set_flow_rule_port(cfg, cfg.port + i, cfg.target + i)
+        defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}")
+
+
 @ksft_variants([
     KsftNamedVariant("single", single),
     KsftNamedVariant("rss", rss),
+    KsftNamedVariant("rss_multiqueue", rss_multiqueue),
 ])
 def test_zcrx(cfg, setup) -> None:
     cfg.require_ipver('6')
 
+    cfg.num_threads = getattr(cfg, 'num_threads', 1)
     setup(cfg)
-    rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.target}"
-    tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 12840"
+    rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.target} -t {cfg.num_threads}"
+    tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 12840 -t {cfg.num_threads}"
     with bkg(rx_cmd, exit_wait=True):
-        wait_port_listen(cfg.port, proto="tcp")
+        for i in range(cfg.num_threads):
+            wait_port_listen(cfg.port + i, proto="tcp")
         cmd(tx_cmd, host=cfg.remote)
 
 
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH net-next v6 00/10] Decouple receive and transmit enablement in team driver
From: Jakub Kicinski @ 2026-04-08 16:40 UTC (permalink / raw)
  To: Marc Harvey
  Cc: Jiri Pirko, Andrew Lunn, David S. Miller, Eric Dumazet,
	Paolo Abeni, Shuah Khan, Simon Horman, netdev, linux-kernel,
	linux-kselftest
In-Reply-To: <20260408-teaming-driver-internal-v6-0-e5bcdcf72504@google.com>

On Wed, 08 Apr 2026 02:52:19 +0000 Marc Harvey wrote:
> Allow independent control over receive and transmit enablement states
> for aggregated ports in the team driver.
> 
> The motivation is that IEEE 802.3ad LACP "independent control" can't
> be implemented for the team driver currently. This was added to the
> bonding driver in commit 240fd405528b ("bonding: Add independent
> control state machine").
> 
> This series also has a few patches that add tests to show that the old
> coupled enablement still works and that the new decoupled enablement
> works as intended (4, 5, and 10).
> 
> There are three patches with small fixes as well, with the goal of
> making the final decoupling patch clearer (1, 2, and 3).

It pains me to report on non-debug kernels:

make: Entering directory '/srv/vmksft/testing/wt-9/tools/testing/selftests'
make[1]: Nothing to be done for 'all'.
TAP version 13
1..1
# timeout set to 45
# selftests: drivers/net/team: teamd_activebackup.sh
# Setting up two-link aggregation for runner activebackup
# Teamd version is: teamd 1.32
# Conf files are /tmp/tmp.ZeEAwlX4kB and /tmp/tmp.Q8XVmtXmXY
# This program is not intended to be run as root.
# This program is not intended to be run as root.
# Created team devices
# Teamd PIDs are 30274 and 30278
# PING fd00::2 (fd00::2) 56 data bytes
# 64 bytes from fd00::2: icmp_seq=1 ttl=64 time=0.037 ms
# 
# --- fd00::2 ping statistics ---
# 1 packets transmitted, 1 received, 0% packet loss, time 0ms
# rtt min/avg/max/mdev = 0.037/0.037/0.037/0.000 msPacket count for test_team2 was 121
# Waiting for eth0 in ns2-yYZzD5 to stop receiving
# Packet count for eth0 was 0Packet count for eth0 was 0
# Packet count for eth1 was 243
# Waiting for eth1 in ns2-yYZzD5 to stop receiving
# Packet count for eth1 was 0Packet count for eth0 was 365
# Packet count for eth1 was 0
# TEST: teamd active backup runner test                               [ OK ]
# Tearing down two-link aggregation
# Failed to kill daemon: Timer expired
#
not ok 1 selftests: drivers/net/team: teamd_activebackup.sh # TIMEOUT 45 seconds

Retry:

make: Entering directory '/srv/vmksft/testing/wt-9/tools/testing/selftests'
make[1]: Nothing to be done for 'all'.
TAP version 13
1..1
# timeout set to 45
# selftests: drivers/net/team: teamd_activebackup.sh
# Setting up two-link aggregation for runner activebackup
# Teamd version is: teamd 1.32
# Conf files are /tmp/tmp.0pmbsXgdH5 and /tmp/tmp.ehbGB6jJTZ
# This program is not intended to be run as root.
# This program is not intended to be run as root.
# Created team devices
# Teamd PIDs are 1314 and 1318
# PING fd00::2 (fd00::2) 56 data bytes
# 64 bytes from fd00::2: icmp_seq=1 ttl=64 time=0.032 ms
# 
# --- fd00::2 ping statistics ---
# 1 packets transmitted, 1 received, 0% packet loss, time 0ms
# rtt min/avg/max/mdev = 0.032/0.032/0.032/0.000 msPacket count for test_team2 was 121
# Waiting for eth0 in ns2-H0Yrq8 to stop receiving
# Packet count for eth0 was 0Packet count for eth0 was 0
# Packet count for eth1 was 243
# Waiting for eth1 in ns2-H0Yrq8 to stop receiving
# Packet count for eth1 was 0Packet count for eth0 was 366
# Packet count for eth1 was 0
# TEST: teamd active backup runner test                               [ OK ]
# Tearing down two-link aggregation
# Failed to kill daemon: Timer expired
#
not ok 1 selftests: drivers/net/team: teamd_activebackup.sh # TIMEOUT 45 seconds

^ permalink raw reply

* Re: BUG: net-next (7.0-rc6 based and later) fails to boot on Jetson Xavier NX
From: Robin Murphy @ 2026-04-08 16:40 UTC (permalink / raw)
  To: Russell King (Oracle), netdev, linux-arm-kernel, linux-kernel,
	iommu, linux-ext4, Linus Torvalds, dmaengine
  Cc: Marek Szyprowski, Theodore Ts'o, Andreas Dilger, Vinod Koul,
	Frank Li
In-Reply-To: <adZ_ZmjcE8S22vR1@shell.armlinux.org.uk>

On 2026-04-08 5:16 pm, Russell King (Oracle) wrote:
> On Wed, Apr 08, 2026 at 05:08:34PM +0100, Russell King (Oracle) wrote:
>> The rebase is still progressing, but it's landed on:
>>
>> c7d812e33f3e dmaengine: xilinx: xilinx_dma: Fix unmasked residue subtraction

FWIW I don't see a Tegra having the Xilinx IP in it anyway - judging by 
the DT it has their own tegra-gpcdma engine...

There's a fair chance this could be 90c5def10bea ("iommu: Do not call 
drivers for empty gathers"), which JonH also reported causing boot 
issues on Tegras - in short, SMMU TLB maintenance may not be completed 
properly which could lead to recycled DMA addresses causing exactly this 
kind of random memory corruption. I CC'd you on a patch:

https://lore.kernel.org/linux-iommu/20260408162846.GE3357077@nvidia.com/T/#t

Thanks,
Robin.

>>
>> and while this boots to a login prompt, it spat out a BUG():
>>
>> BUG: sleeping function called from invalid context at kernel/locking/mutex.c:591
>> in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 56, name: kworker/u24:3
>> preempt_count: 0, expected: 0
>> RCU nest depth: 0, expected: 0
>> 3 locks held by kworker/u24:3/56:
>>   #0: ffff000080042148 ((wq_completion)events_unbound#2){+.+.}-{0:0}, at: process_one_work+0x184/0x780
>>   #1: ffff80008299bdf8 (deferred_probe_work){+.+.}-{0:0}, at: process_one_work+0x1ac/0x780
>>   #2: ffff0000808b48f8 (&dev->mutex){....}-{4:4}, at: __device_attach+0x2c/0x188
>> irq event stamp: 10872
>> hardirqs last  enabled at (10871): [<ffff80008013a410>] ktime_get+0x130/0x180
>> hardirqs last disabled at (10872): [<ffff800080d61ac8>] _raw_spin_lock_irqsave+0x84/0x88
>> softirqs last  enabled at (9216): [<ffff80008002807c>] fpsimd_save_and_flush_current_state+0x3c/0x80
>> softirqs last disabled at (9214): [<ffff800080028098>] fpsimd_save_and_flush_current_state+0x58/0x80
>> CPU: 5 UID: 0 PID: 56 Comm: kworker/u24:3 Not tainted 7.0.0-rc1-bisect+ #654 PREEMPT
>> Hardware name: NVIDIA NVIDIA Jetson Xavier NX Developer Kit/Jetson, BIOS 6.0-37391689 08/28/2024
>> Workqueue: events_unbound deferred_probe_work_func
>> Call trace:
>>   show_stack+0x18/0x30 (C)
>>   dump_stack_lvl+0x6c/0x94
>>   dump_stack+0x18/0x24
>>   __might_resched+0x154/0x220
>>   __might_sleep+0x48/0x80
>>   __mutex_lock+0x48/0x800
>>   mutex_lock_nested+0x24/0x30
>>   pinmux_disable_setting+0x9c/0x180
>>   pinctrl_commit_state+0x5c/0x260
>>   pinctrl_pm_select_idle_state+0x4c/0xa0
>>   tegra_i2c_runtime_suspend+0x2c/0x3c
>>   pm_generic_runtime_suspend+0x2c/0x44
>>   __rpm_callback+0x48/0x1ec
>>   rpm_callback+0x74/0x80
>>   rpm_suspend+0xec/0x630
>>   rpm_idle+0x2c0/0x420
>>   __pm_runtime_idle+0x44/0x160
>>   tegra_i2c_probe+0x2e4/0x640
>>   platform_probe+0x5c/0xa4
>>   really_probe+0xbc/0x2c0
>>   __driver_probe_device+0x78/0x120
>>   driver_probe_device+0x3c/0x160
>>   __device_attach_driver+0xbc/0x160
>>   bus_for_each_drv+0x70/0xb8
>>   __device_attach+0xa4/0x188
>>   device_initial_probe+0x50/0x54
>>   bus_probe_device+0x38/0xa4
>>   deferred_probe_work_func+0x90/0xcc
>>   process_one_work+0x204/0x780
>>   worker_thread+0x1c8/0x36c
>>   kthread+0x138/0x144
>>   ret_from_fork+0x10/0x20
>>
>> This is reproducible.
> 
> I've just realised that it's the Tegra I2C bug that is already known
> about, but took ages to be fixed in mainline - it's unrelated to the
> memory corruption, so can be ignored. Sorry for the noise.
> 


^ permalink raw reply

* Re: [PATCH net-next v2 1/5] tools: ynl: move ethtool.py to selftest
From: Stanislav Fomichev @ 2026-04-08 16:42 UTC (permalink / raw)
  To: Hangbin Liu
  Cc: Donald Hunter, Jakub Kicinski, David S. Miller, Eric Dumazet,
	Paolo Abeni, Simon Horman, Andrew Lunn, netdev, linux-kernel
In-Reply-To: <20260408-b4-ynl_ethtool-v2-1-7623a5e8f70b@gmail.com>

On 04/08, Hangbin Liu wrote:
> We have converted all the samples to selftests. This script is
> the last piece of random "PoC" code we still have lying around.
> Let's move it to tests.
> 
> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
> ---
>  tools/net/ynl/tests/Makefile              | 5 ++++-
>  tools/net/ynl/{pyynl => tests}/ethtool.py | 2 +-
>  tools/net/ynl/tests/test_ynl_ethtool.sh   | 2 +-
>  3 files changed, 6 insertions(+), 3 deletions(-)
> 
> diff --git a/tools/net/ynl/tests/Makefile b/tools/net/ynl/tests/Makefile
> index 2a02958c7039..94bf0346b54d 100644
> --- a/tools/net/ynl/tests/Makefile
> +++ b/tools/net/ynl/tests/Makefile
> @@ -36,7 +36,10 @@ TEST_GEN_FILES := \
>  	rt-route \
>  # end of TEST_GEN_FILES
>  
> -TEST_FILES := ynl_nsim_lib.sh
> +TEST_FILES := \
> +	ethtool.py \
> +	ynl_nsim_lib.sh \
> +# end of TEST_FILES
>  
>  CFLAGS_netdev:=$(CFLAGS_netdev) $(CFLAGS_rt-link)
>  CFLAGS_ovs:=$(CFLAGS_ovs_datapath)
> diff --git a/tools/net/ynl/pyynl/ethtool.py b/tools/net/ynl/tests/ethtool.py
> similarity index 99%
> rename from tools/net/ynl/pyynl/ethtool.py
> rename to tools/net/ynl/tests/ethtool.py
> index f1a2a2a89985..6eeeb867edcf 100755
> --- a/tools/net/ynl/pyynl/ethtool.py
> +++ b/tools/net/ynl/tests/ethtool.py
> @@ -14,7 +14,7 @@ import re
>  import os
>  
>  # pylint: disable=no-name-in-module,wrong-import-position
> -sys.path.append(pathlib.Path(__file__).resolve().parent.as_posix())
> +sys.path.append(pathlib.Path(__file__).resolve().parent.parent.joinpath('pyynl').as_posix())
>  # pylint: disable=import-error
>  from cli import schema_dir, spec_dir
>  from lib import YnlFamily
> diff --git a/tools/net/ynl/tests/test_ynl_ethtool.sh b/tools/net/ynl/tests/test_ynl_ethtool.sh
> index b826269017f4..b4480e9be7b7 100755
> --- a/tools/net/ynl/tests/test_ynl_ethtool.sh
> +++ b/tools/net/ynl/tests/test_ynl_ethtool.sh
> @@ -8,7 +8,7 @@ KSELFTEST_KTAP_HELPERS="$(dirname "$(realpath "$0")")/../../../testing/selftests
>  source "$KSELFTEST_KTAP_HELPERS"
>  
>  # Default ynl-ethtool path for direct execution, can be overridden by make install
> -ynl_ethtool="../pyynl/ethtool.py"
> +ynl_ethtool="./ethtool.py"
>  
>  readonly NSIM_ID="1337"
>  readonly NSIM_DEV_NAME="nsim${NSIM_ID}"

Do we need to add some expects/asserts to the script to really make it into
a test? Right now it just prints things, so it's not really a test.

^ permalink raw reply

* Re: [PATCH net-next v2 5/5] ethtool: strset: check nla_len overflow
From: Stanislav Fomichev @ 2026-04-08 16:43 UTC (permalink / raw)
  To: Hangbin Liu
  Cc: Donald Hunter, Jakub Kicinski, David S. Miller, Eric Dumazet,
	Paolo Abeni, Simon Horman, Andrew Lunn, netdev, linux-kernel
In-Reply-To: <20260408-b4-ynl_ethtool-v2-5-7623a5e8f70b@gmail.com>

On 04/08, Hangbin Liu wrote:
> The netlink attribute length field nla_len is a __u16, which can only
> represent values up to 65535 bytes. NICs with a large number of
> statistics strings (e.g. mlx5_core with thousands of ETH_SS_STATS
> entries) can produce an ETHTOOL_A_STRINGSET_STRINGS nest that exceeds
> this limit.
> 
> When nla_nest_end() writes the actual nest size back to nla_len, the
> value is silently truncated. This results in a corrupted netlink message
> being sent to userspace: the parser reads a wrong (truncated) attribute
> length and misaligns all subsequent attribute boundaries, causing decode
> errors.
> 
> Fix this by using the new helper nla_nest_end_safe and error out if
> the size exceeds U16_MAX.

Not sure what the user is supposed to do? Does it mean there is no way
to retrieve ETHTOOL_A_STRINGSET_STRINGS for those devices with too
many strings?

^ permalink raw reply

* Re: linux-next: manual merge of the net-next tree with the netfilter tree
From: Matthieu Baerts @ 2026-04-08 16:43 UTC (permalink / raw)
  To: Mark Brown, David Miller, Jakub Kicinski, Paolo Abeni, Networking
  Cc: Andrea Mayer, Justin Iurman, Linux Kernel Mailing List,
	Linux Next Mailing List
In-Reply-To: <729b748b-e00b-475e-81e0-a666eab24fc6@sirena.org.uk>

Hello,

On 08/04/2026 17:08, Mark Brown wrote:
> On Wed, Apr 08, 2026 at 03:10:10PM +0100, Mark Brown wrote:
>> Hi all,
>>
>> Today's linux-next merge of the net-next tree got a conflict in:
>>
>>   net/ipv6/seg6_iptunnel.c
>>
>> between commit:
>>
>>   c3812651b522f ("seg6: separate dst_cache for input and output paths in seg6 lwtunnel")
>>
>> from the netfilter tree and commit:
>>
>>   78723a62b969a ("seg6: add per-route tunnel source address")
>>
>> from the net-next tree.
>>
>> I fixed it up (see below) and can carry the fix as necessary. This
>> is now fixed as far as linux-next is concerned, but any non trivial
>> conflicts should be mentioned to your upstream maintainer when your tree
>> is submitted for merging.  You may also want to consider cooperating
>> with the maintainer of the conflicting tree to minimise any particularly
>> complex conflicts.

Thank you for having fixed the conflict on linux-next!

>> diff --cc net/ipv6/seg6_iptunnel.c
>> index d6a0f7df90807,e76cc0cc481ec..0000000000000
>> --- a/net/ipv6/seg6_iptunnel.c
>> +++ b/net/ipv6/seg6_iptunnel.c
>> @@@ -48,8 -48,8 +48,9 @@@ static size_t seg6_lwt_headroom(struct 
>>   }
>>   
>>   struct seg6_lwt {
>>  -	struct dst_cache cache;
>>  +	struct dst_cache cache_input;
>>  +	struct dst_cache cache_output;
>> + 	struct in6_addr tunsrc;
>>   	struct seg6_iptunnel_encap tuninfo[];
>>   };
>>   
> 
> This also needs a fixup for a new jump to the error handling paths that
> was added in seg6_build_state().

I also had this other conflict there, and I did this when resolving it
in MPTCP tree:

--------------------- 8< ---------------------
diff --cc net/ipv6/seg6_iptunnel.c
index e76cc0cc481e,d6a0f7df9080..97b50d9b1365
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@@ -48,8 -48,8 +48,9 @@@ static size_t seg6_lwt_headroom(struct 
  }
  
  struct seg6_lwt {
- 	struct dst_cache cache;
+ 	struct dst_cache cache_input;
+ 	struct dst_cache cache_output;
 +	struct in6_addr tunsrc;
  	struct seg6_iptunnel_encap tuninfo[];
  };
  
@@@ -726,18 -712,6 +731,18 @@@ static int seg6_build_state(struct net 
  
  	memcpy(&slwt->tuninfo, tuninfo, tuninfo_len);
  
 +	if (tb[SEG6_IPTUNNEL_SRC]) {
 +		slwt->tunsrc = nla_get_in6_addr(tb[SEG6_IPTUNNEL_SRC]);
 +
 +		if (ipv6_addr_any(&slwt->tunsrc) ||
 +		    ipv6_addr_is_multicast(&slwt->tunsrc) ||
 +		    ipv6_addr_loopback(&slwt->tunsrc)) {
 +			NL_SET_ERR_MSG(extack, "invalid tunsrc address");
 +			err = -EINVAL;
- 			goto free_dst_cache;
++			goto err_destroy_output;
 +		}
 +	}
 +
  	newts->type = LWTUNNEL_ENCAP_SEG6;
  	newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
  
@@@ -750,9 -724,9 +755,11 @@@
  
  	return 0;
  
- free_dst_cache:
- 	dst_cache_destroy(&slwt->cache);
- free_lwt_state:
++err_destroy_output:
++	dst_cache_destroy(&slwt->cache_output);
+ err_destroy_input:
+ 	dst_cache_destroy(&slwt->cache_input);
+ err_free_newts:
  	kfree(newts);
  	return err;
  }
--------------------- 8< ---------------------

I took the liberty of adding the err_destroy_output label, similar to the
new err_destroy_input one.

Just in case, rerere cache file is available there:

 https://github.com/multipath-tcp/mptcp-upstream-rr-cache/commit/dbb6675

Cheers,
Matt
-- 
Sponsored by the NGI0 Core fund.


^ permalink raw reply

* Re: [PATCH v12 6/6] selftests: net: add TLS hardware offload test
From: Sabrina Dubroca @ 2026-04-08 16:45 UTC (permalink / raw)
  To: kuba, Rishikesh Jethwani
  Cc: netdev, saeedm, tariqt, mbloch, borisp, john.fastabend, davem,
	pabeni, edumazet, leon
In-Reply-To: <20260402235511.664801-7-rjethwani@purestorage.com>

@Jakub [top-posting so you don't have to scroll through the rest of my
comments to find some global questions about this patch]

tools/testing/selftests/drivers/net/README.rst mentions "Local
host is the DUT", but this test does rekeys on both sides and sends a
bit of traffic back and forth. Is that acceptable?

Another thought: is there a "standard" for stdout vs stderr, as well as
verbosity of "test progress"/"debug" type messages ("sent
keyupdate"/"received keyupdate"/"server listening"/"setup complete"
etc) for those test programs? Any expectation for a --{debug,verbose}
option to only display all this stuff on request?

Output for 1 test:
-------- 8< --------
TLS Version: TLS 1.3
Cipher: AES-GCM-128
Buffer size: 16384
Connecting to 192.168.13.1:4433...
Connected!
Installing TLS_TX AES-GCM-128 gen 0...
TLS_TX AES-GCM-128 gen 0 installed
Installing TLS_RX AES-GCM-128 gen 0...
TLS_RX AES-GCM-128 gen 0 installed
TLS setup complete.
Sending 100 messages of 16384 bytes...
Sent 16384 bytes (iteration 1)
Received echo 16384 bytes (ok)
[...repeated]
Sent 16384 bytes (iteration 100)
Received echo 16384 bytes (ok)
-------- 8< --------

With some rekeys I get 300L of output on each side.

If not, I guess we can fall back to what makes the most sense for
NIPA?

2026-04-02, 17:55:11 -0600, Rishikesh Jethwani wrote:
> diff --git a/tools/testing/selftests/drivers/net/hw/tls_hw_offload.c b/tools/testing/selftests/drivers/net/hw/tls_hw_offload.c
> new file mode 100644
> index 000000000000..788891890ec8
> --- /dev/null
> +++ b/tools/testing/selftests/drivers/net/hw/tls_hw_offload.c

I think it would make sense to add the new files to the
"NETWORKING [TLS]" entry in MAINTAINERS. We already have the existing
selftest listed.

[...]
> +#define TEST_ITERATIONS	100
> +#define MAX_REKEYS	99

Making TEST_ITERATIONS a test argument would allow a test that lasts
more than half a second. Then MAX_REKEYS can be removed and the
argument validation switched to num_rekeys < num_iterations. (doing
rekeys without a send() in between [other than the send_keyupdate]
should be fine, but it makes the logic for "do we send a rekey and/or
application_data this round?" more complex, so let's keep the check on
num_rekeys)

In general, this test is a good start, but it doesn't cover enough
(see my reply to 5/6). How did you test the "unacked records" part of
the rekey patch? It's the most complex part of the series.

[...]
> +static void derive_key_fields(unsigned char *key, int key_size,
> +			      unsigned char *iv, int iv_size,
> +			      unsigned char *salt, int salt_size,
> +			      unsigned char *rec_seq, int rec_seq_size,
> +			      int generation)
> +{
> +	unsigned char pattern;
> +	int i;
> +
> +	if (generation == 0)
> +		return;
> +
> +	pattern = (unsigned char)((generation * 0x1B) ^ 0x63);
> +	for (i = 0; i < key_size; i++) {
> +		key[i] ^= pattern;
> +		pattern = (pattern << 1) | (pattern >> 7);
> +	}
> +
> +	pattern = (unsigned char)((generation * 0x2D) ^ 0x7C);
> +	for (i = 0; i < iv_size; i++) {
> +		iv[i] ^= pattern;
> +		pattern = (pattern << 1) | (pattern >> 7);
> +	}
> +
> +	for (i = 0; i < salt_size; i++)
> +		salt[i] ^= (unsigned char)(generation & 0xFF);
> +
> +	memset(rec_seq, 0, rec_seq_size);
> +}

This isn't wrong (so I'm not requesting to change it), but it seems
way overkill for "we want to fill tls12_crypto_info_aes_gcm_* with
some predictable data based only on key_generation".

> +/* Send TLS 1.3 KeyUpdate handshake message */
> +static int send_tls_key_update(int fd, int request_update)
> +{
> +	char cmsg_buf[CMSG_SPACE(sizeof(unsigned char))];
> +	unsigned char key_update_msg[5];
> +	struct msghdr msg = {0};
> +	struct cmsghdr *cmsg;
> +	struct iovec iov;
> +
> +	key_update_msg[0] = TLS_HANDSHAKE_KEY_UPDATE;
> +	key_update_msg[1] = 0;
> +	key_update_msg[2] = 0;
> +	key_update_msg[3] = 1;
> +	key_update_msg[4] = request_update ? KEY_UPDATE_REQUESTED
> +					   : KEY_UPDATE_NOT_REQUESTED;

I'm still really not convinced we need to bother with things that
would only affect a "proper" userspace.

[...]
> +static int validate_keyupdate(const char *buf, int len)
> +{
> +	if (len != 5) {
> +		printf("KeyUpdate: expected 5 bytes, got %d\n", len);
> +		return -1;
> +	}
> +
> +	if ((unsigned char)buf[0] != TLS_HANDSHAKE_KEY_UPDATE) {
> +		printf("Expected KeyUpdate (0x%02x), got 0x%02x\n",
> +		       TLS_HANDSHAKE_KEY_UPDATE, (unsigned char)buf[0]);
> +		return -1;
> +	}
> +
> +	if (buf[1] != 0 || buf[2] != 0 || buf[3] != 1) {
> +		printf("KeyUpdate: bad length field %02x%02x%02x\n",
> +		       (unsigned char)buf[1], (unsigned char)buf[2],
> +		       (unsigned char)buf[3]);
> +		return -1;
> +	}

And same here.

> +	if ((unsigned char)buf[4] != KEY_UPDATE_NOT_REQUESTED &&
> +	    (unsigned char)buf[4] != KEY_UPDATE_REQUESTED) {
> +		printf("KeyUpdate: invalid request_update value %u\n",
> +		       (unsigned char)buf[4]);
> +		return -1;
> +	}

And here.

What type of issue do you expect to identify by validating the
contents of the key update message that your test program is sending
to itself?

> +	printf("Received TLS KeyUpdate (request_update=%u)\n",
> +	       (unsigned char)buf[4]);
> +	return 0;
> +}
> +

...
> +static int do_client(void)
> +{
> +	char *buf = NULL, *echo_buf = NULL;
> +	int max_size, rekey_interval;
> +	ssize_t echo_total, echo_n;
> +	int csk = -1, ret, i, j;
> +	struct sockaddr_in sa;
> +	int test_result = -1;
> +	int current_gen = 0;
> +	int next_rekey_at;
> +	ssize_t n;
> +
> +	if (!server_ip) {
> +		printf("ERROR: Client requires -s <ip> option\n");
> +		return -1;
> +	}
> +
> +	max_size = random_size_max > 0 ? random_size_max : send_size;
> +	if (max_size < MIN_BUF_SIZE)
> +		max_size = MIN_BUF_SIZE;
> +	buf = malloc(max_size);
> +	echo_buf = malloc(max_size);
> +	if (!buf || !echo_buf) {
> +		printf("failed to allocate buffers\n");
> +		goto out;
> +	}
> +
> +	csk = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
> +	if (csk < 0) {
> +		printf("failed to create socket: %s\n", strerror(errno));

Those failures (malloc, socket, connect, maybe also initial key setup)
could use a common prefix (maybe "SETUP ERROR") to differentiate them
from "invalid arguments" and "actual test failure".

[...]
> +	rekey_interval = TEST_ITERATIONS / (num_rekeys + 1);
> +	if (rekey_interval < 1)
> +		rekey_interval = 1;

nit: should never happen since num_rekeys < TEST_ITERATIONS

> +	next_rekey_at = rekey_interval;
> +
> +	for (i = 0; i < TEST_ITERATIONS; i++) {

nit: i itself is never used, only i+1, so iterating from 1 up to
i <= TEST_ITERATIONS would be a tiny bit more readable.

> +		int this_size;
> +
> +		if (random_size_max > 0)
> +			this_size = (rand() % random_size_max) + 1;
> +		else
> +			this_size = send_size;
> +
> +		for (j = 0; j < this_size; j++)
> +			buf[j] = rand() & 0xFF;
> +
> +		n = send(csk, buf, this_size, 0);
> +		if (n != this_size) {
> +			printf("FAIL: send failed: %s\n", strerror(errno));

The test failure messages in the client consistently use "FAIL:" as a
prefix...

[...]
> +static int do_server(void)
> +{
[...]
> +	/* Main receive loop */
> +	while (1) {
[...]
> +			printf("recv failed: %s\n", strerror(errno));
[...]
> +				printf("Failed to send KeyUpdate\n");
[...]
> +				printf("Echo send failed: %s\n",
> +				       strerror(errno));

...but not in the server.

[...]
> +int main(int argc, char *argv[])
> +{
[...]
> +	if (tls_version == TLS_1_2_VERSION && num_rekeys) {
> +		printf("ERROR: TLS 1.2 does not support rekey\n");
> +		return -1;
> +	}

Maybe also a check to make setting random_size_max incompatible with
setting send_size. And probably at least a warning when client-only
options (server_ip, random_size, num_rekeys) are passed to the server
and will be ignored.

The "Client requires -s <ip> option" check would also fit better here,
with the rest of the options checking.

> diff --git a/tools/testing/selftests/drivers/net/hw/tls_hw_offload.py b/tools/testing/selftests/drivers/net/hw/tls_hw_offload.py
> new file mode 100755
> index 000000000000..66c5ddfd8125
> --- /dev/null
> +++ b/tools/testing/selftests/drivers/net/hw/tls_hw_offload.py
[...]
> +def verify_tls_counters(stats_before, stats_after, expected_rekeys, is_server):

This is much cleaner now, thanks.

> +def run_tls_test(cfg, cipher="128", tls_version="1.3", rekey=0, buffer_size=None, random_max=None):
> +    port = rand_port()
> +    send_size = random_max or buffer_size
> +
> +    server_args = f"{cfg.bin_remote} server -p {port} -c {cipher} -v {tls_version}"

These variables were called {server,client}_cmd in the previous
version, I wonder why you renamed them to something less accurate
(since it's actually the full command, not just the arguments).

> +    if send_size:
> +        server_args += f" -b {send_size}"
> +
> +    client_args = (f"{cfg.bin_local} client -s {cfg.remote_addr_v['4']} "
> +                   f"-p {port} -c {cipher} -v {tls_version}")
> +    if rekey:
> +        client_args += f" -k {rekey}"
> +    if random_max:
> +        client_args += f" -r {random_max}"
> +    elif send_size:
> +        client_args += f" -b {send_size}"

nit: I find the use of send_size here (instead of directly
buffer_size) slightly confusing, since we just took care of
"random_max was set".

-- 
Sabrina

^ permalink raw reply

* Re: [PATCH net-next] l2tp: Drop large packets with UDP encap
From: Simon Horman @ 2026-04-08 16:48 UTC (permalink / raw)
  To: Alice Mikityanska
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	James Chapman, netdev, Alice Mikityanska,
	syzbot+ci3edea60a44225dec
In-Reply-To: <20260403174949.843941-1-alice.kernel@fastmail.im>

On Fri, Apr 03, 2026 at 08:49:49PM +0300, Alice Mikityanska wrote:
> From: Alice Mikityanska <alice@isovalent.com>
> 
> syzbot reported a WARN on my patch series [1]. The actual issue is an
> overflow of 16-bit UDP length field, and it exists in the upstream code.
> My series added a debug WARN with an overflow check that exposed the
> issue, that's why syzbot tripped on my patches, rather than on upstream
> code.
> 
> syzbot's repro:
> 
> # {"procs":1,"slowdown":1,"sandbox":"","sandbox_arg":0,"close_fds":false,"callcomments":true}
> r0 = socket$pppl2tp(0x18, 0x1, 0x1)
> r1 = socket$inet6_udp(0xa, 0x2, 0x0)
> connect$inet6(r1, &(0x7f00000000c0)={0xa, 0x0, 0x0, @loopback, 0xfffffffc}, 0x1c)
> connect$pppl2tp(r0, &(0x7f0000000240)=@pppol2tpin6={0x18, 0x1, {0x0, r1, 0x4, 0x0, 0x0, 0x0, {0xa, 0x4e22, 0xffff, @ipv4={'\x00', '\xff\xff', @empty}}}}, 0x32)
> writev(r0, &(0x7f0000000080)=[{&(0x7f0000000000)="ee", 0x34000}], 0x1)
> 
> It basically sends an oversized (0x34000 bytes) PPPoL2TP packet with UDP
> encapsulation, and l2tp_xmit_core doesn't check for overflows when it
> assigns the UDP length field. The value gets trimmed to 16 bits.
> 
> Add an overflow check that drops oversized packets and avoids sending
> packets with trimmed UDP length to the wire.
> 
> syzbot's stack trace (with my patch applied):
> 
> len >= 65536u
> WARNING: ./include/linux/udp.h:38 at udp_set_len_short include/linux/udp.h:38 [inline], CPU#1: syz.0.17/5957
> WARNING: ./include/linux/udp.h:38 at l2tp_xmit_core net/l2tp/l2tp_core.c:1293 [inline], CPU#1: syz.0.17/5957
> WARNING: ./include/linux/udp.h:38 at l2tp_xmit_skb+0x1204/0x18d0 net/l2tp/l2tp_core.c:1327, CPU#1: syz.0.17/5957
> Modules linked in:
> CPU: 1 UID: 0 PID: 5957 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full)
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
> RIP: 0010:udp_set_len_short include/linux/udp.h:38 [inline]
> RIP: 0010:l2tp_xmit_core net/l2tp/l2tp_core.c:1293 [inline]
> RIP: 0010:l2tp_xmit_skb+0x1204/0x18d0 net/l2tp/l2tp_core.c:1327
> Code: 0f 0b 90 e9 21 f9 ff ff e8 e9 05 ec f6 90 0f 0b 90 e9 8d f9 ff ff e8 db 05 ec f6 90 0f 0b 90 e9 cc f9 ff ff e8 cd 05 ec f6 90 <0f> 0b 90 e9 de fa ff ff 44 89 f1 80 e1 07 80 c1 03 38 c1 0f 8c 4f
> RSP: 0018:ffffc90003d67878 EFLAGS: 00010293
> RAX: ffffffff8ad985e3 RBX: ffff8881a6400090 RCX: ffff8881697f0000
> RDX: 0000000000000000 RSI: 0000000000034010 RDI: 000000000000ffff
> RBP: dffffc0000000000 R08: 0000000000000003 R09: 0000000000000004
> R10: dffffc0000000000 R11: fffff520007acf00 R12: ffff8881baf20900
> R13: 0000000000034010 R14: ffff8881a640008e R15: ffff8881760f7000
> FS:  000055557e81f500(0000) GS:ffff8882a9467000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000200000033000 CR3: 00000001612f4000 CR4: 00000000000006f0
> Call Trace:
>  <TASK>
>  pppol2tp_sendmsg+0x40a/0x5f0 net/l2tp/l2tp_ppp.c:302
>  sock_sendmsg_nosec net/socket.c:727 [inline]
>  __sock_sendmsg net/socket.c:742 [inline]
>  sock_write_iter+0x503/0x550 net/socket.c:1195
>  do_iter_readv_writev+0x619/0x8c0 fs/read_write.c:-1
>  vfs_writev+0x33c/0x990 fs/read_write.c:1059
>  do_writev+0x154/0x2e0 fs/read_write.c:1105
>  do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
>  do_syscall_64+0x14d/0xf80 arch/x86/entry/syscall_64.c:94
>  entry_SYSCALL_64_after_hwframe+0x77/0x7f
> RIP: 0033:0x7f636479c629
> Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48
> RSP: 002b:00007ffffd4241c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000014
> RAX: ffffffffffffffda RBX: 00007f6364a15fa0 RCX: 00007f636479c629
> RDX: 0000000000000001 RSI: 0000200000000080 RDI: 0000000000000003
> RBP: 00007f6364832b39 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
> R13: 00007f6364a15fac R14: 00007f6364a15fa0 R15: 00007f6364a15fa0
>  </TASK>
> 
> [1]: https://lore.kernel.org/all/20260226201600.222044-1-alice.kernel@fastmail.im/
> 
> Reported-by: syzbot+ci3edea60a44225dec@syzkaller.appspotmail.com
> Closes: https://lore.kernel.org/netdev/69a1dfba.050a0220.3a55be.0026.GAE@google.com/

Hi Alice,

A Fixes tag needs to go here.
And if it's fixing code present in net - that is, the bug can manifest
there - then it should be targeted at net rather than net-next.

> Signed-off-by: Alice Mikityanska <alice@isovalent.com>
> ---
>  net/l2tp/l2tp_core.c | 5 +++++
>  1 file changed, 5 insertions(+)
> 
> diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
> index c89ae52764b8..157fc23ce4e1 100644
> --- a/net/l2tp/l2tp_core.c
> +++ b/net/l2tp/l2tp_core.c
> @@ -1290,6 +1290,11 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, uns
>  		uh->source = inet->inet_sport;
>  		uh->dest = inet->inet_dport;
>  		udp_len = uhlen + session->hdr_len + data_len;
> +		if (udp_len > U16_MAX) {
> +			kfree_skb(skb);
> +			ret = NET_XMIT_DROP;
> +			goto out_unlock;
> +		}

As a fix, this looks like the right approach.
But I do think this code could benefit from some goto labels
to handle unwinding error cases.

>  		uh->len = htons(udp_len);
>  
>  		/* Calculate UDP checksum if configured to do so */
> -- 
> 2.53.0
> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox