Linux-HyperV List
 help / color / mirror / Atom feed
* [PATCH net-next v8 6/6] RDMA/mana_ib: Allocate interrupt contexts on EQs
From: Long Li @ 2026-05-08 22:12 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui, shradhagupta
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260508221202.15725-1-longli@microsoft.com>

Use the GIC functions to allocate interrupt contexts for RDMA EQs. These
interrupt contexts may be shared with Ethernet EQs when MSI-X vectors
are limited.

The driver now supports allocating a dedicated MSI-X vector for each EQ.
Advertise this through the GDMA_DRV_CAP_FLAG_1_EQ_MSI_UNSHARE_MULTI_VPORT
driver capability bit. The RDMA EQs pass use_msi_bitmap=false to share
MSI-X vectors with Ethernet, while the capability flag advertises that
the driver supports per-vPort EQ separation when the hardware has
sufficient vectors.

Populate eq.irq on all RDMA EQs for consistency with the Ethernet path.

Also relocate the GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE define to its
numeric BIT(6) position among the other capability flags.

Signed-off-by: Long Li <longli@microsoft.com>
---
 drivers/infiniband/hw/mana/main.c | 43 +++++++++++++++++++++++++------
 include/net/mana/gdma.h           |  7 +++--
 2 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
index 8000ab6e8beb..7adab0457a66 100644
--- a/drivers/infiniband/hw/mana/main.c
+++ b/drivers/infiniband/hw/mana/main.c
@@ -749,7 +749,8 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev)
 {
 	struct gdma_context *gc = mdev_to_gc(mdev);
 	struct gdma_queue_spec spec = {};
-	int err, i;
+	struct gdma_irq_context *gic;
+	int err, i, msi;
 
 	spec.type = GDMA_EQ;
 	spec.monitor_avl_buf = false;
@@ -757,11 +758,19 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev)
 	spec.eq.callback = mana_ib_event_handler;
 	spec.eq.context = mdev;
 	spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE;
-	spec.eq.msix_index = 0;
+
+	msi = 0;
+	gic = mana_gd_get_gic(gc, false, &msi);
+	if (!gic)
+		return -ENOMEM;
+	spec.eq.msix_index = msi;
 
 	err = mana_gd_create_mana_eq(mdev->gdma_dev, &spec, &mdev->fatal_err_eq);
-	if (err)
+	if (err) {
+		mana_gd_put_gic(gc, false, 0);
 		return err;
+	}
+	mdev->fatal_err_eq->eq.irq = gic->irq;
 
 	mdev->eqs = kzalloc_objs(struct gdma_queue *,
 				 mdev->ib_dev.num_comp_vectors);
@@ -771,32 +780,50 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev)
 	}
 	spec.eq.callback = NULL;
 	for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) {
-		spec.eq.msix_index = (i + 1) % gc->num_msix_usable;
+		msi = (i + 1) % gc->num_msix_usable;
+
+		gic = mana_gd_get_gic(gc, false, &msi);
+		if (!gic) {
+			err = -ENOMEM;
+			goto destroy_eqs;
+		}
+		spec.eq.msix_index = msi;
+
 		err = mana_gd_create_mana_eq(mdev->gdma_dev, &spec, &mdev->eqs[i]);
-		if (err)
+		if (err) {
+			mana_gd_put_gic(gc, false, msi);
 			goto destroy_eqs;
+		}
+		mdev->eqs[i]->eq.irq = gic->irq;
 	}
 
 	return 0;
 
 destroy_eqs:
-	while (i-- > 0)
+	while (i-- > 0) {
 		mana_gd_destroy_queue(gc, mdev->eqs[i]);
+		mana_gd_put_gic(gc, false, (i + 1) % gc->num_msix_usable);
+	}
 	kfree(mdev->eqs);
 destroy_fatal_eq:
 	mana_gd_destroy_queue(gc, mdev->fatal_err_eq);
+	mana_gd_put_gic(gc, false, 0);
 	return err;
 }
 
 void mana_ib_destroy_eqs(struct mana_ib_dev *mdev)
 {
 	struct gdma_context *gc = mdev_to_gc(mdev);
-	int i;
+	int i, msi;
 
 	mana_gd_destroy_queue(gc, mdev->fatal_err_eq);
+	mana_gd_put_gic(gc, false, 0);
 
-	for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++)
+	for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) {
 		mana_gd_destroy_queue(gc, mdev->eqs[i]);
+		msi = (i + 1) % gc->num_msix_usable;
+		mana_gd_put_gic(gc, false, msi);
+	}
 
 	kfree(mdev->eqs);
 }
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 6c138cc77407..d84e474309a3 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -615,6 +615,7 @@ enum {
 #define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG BIT(3)
 #define GDMA_DRV_CAP_FLAG_1_GDMA_PAGES_4MB_1GB_2GB BIT(4)
 #define GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT BIT(5)
+#define GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6)
 
 /* Driver can handle holes (zeros) in the device list */
 #define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11)
@@ -631,7 +632,8 @@ enum {
 /* Driver detects stalled send queues and recovers them */
 #define GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY BIT(18)
 
-#define GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6)
+/* Driver supports separate EQ/MSIs for each vPort */
+#define GDMA_DRV_CAP_FLAG_1_EQ_MSI_UNSHARE_MULTI_VPORT BIT(19)
 
 /* Driver supports linearizing the skb when num_sge exceeds hardware limit */
 #define GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE BIT(20)
@@ -659,7 +661,8 @@ enum {
 	 GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE | \
 	 GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY | \
 	 GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY | \
-	 GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECOVERY)
+	 GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECOVERY | \
+	 GDMA_DRV_CAP_FLAG_1_EQ_MSI_UNSHARE_MULTI_VPORT)
 
 #define GDMA_DRV_CAP_FLAGS2 0
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v8 5/6] net: mana: Allocate interrupt context for each EQ when creating vPort
From: Long Li @ 2026-05-08 22:12 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui, shradhagupta
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260508221202.15725-1-longli@microsoft.com>

Use GIC functions to create a dedicated interrupt context or acquire a
shared interrupt context for each EQ when setting up a vPort.

The caller now owns the GIC reference across the EQ create/destroy
lifecycle: mana_create_eq() calls mana_gd_get_gic() before creating
each EQ and mana_destroy_eq() calls mana_gd_put_gic() after destroying
it. The msix_index invalidation is moved from mana_gd_deregister_irq()
to the mana_gd_create_eq() error path so that mana_destroy_eq() can
read the index before teardown.

Signed-off-by: Long Li <longli@microsoft.com>
---
 .../net/ethernet/microsoft/mana/gdma_main.c    |  2 +-
 drivers/net/ethernet/microsoft/mana/mana_en.c  | 18 +++++++++++++++++-
 include/net/mana/gdma.h                        |  1 +
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 3408bc1fd6ab..b70271a0624f 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -857,7 +857,6 @@ static void mana_gd_deregister_irq(struct gdma_queue *queue)
 	}
 	spin_unlock_irqrestore(&gic->lock, flags);
 
-	queue->eq.msix_index = INVALID_PCI_MSIX_INDEX;
 	synchronize_rcu();
 }
 
@@ -972,6 +971,7 @@ static int mana_gd_create_eq(struct gdma_dev *gd,
 out:
 	dev_err(dev, "Failed to create EQ: %d\n", err);
 	mana_gd_destroy_eq(gc, false, queue);
+	queue->eq.msix_index = INVALID_PCI_MSIX_INDEX;
 	return err;
 }
 
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 3f6cdc2cd82d..42fd517e56d2 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1626,6 +1626,7 @@ void mana_destroy_eq(struct mana_port_context *apc)
 	struct gdma_context *gc = ac->gdma_dev->gdma_context;
 	struct gdma_queue *eq;
 	int i;
+	unsigned int msi;
 
 	if (!apc->eqs)
 		return;
@@ -1638,7 +1639,9 @@ void mana_destroy_eq(struct mana_port_context *apc)
 		if (!eq)
 			continue;
 
+		msi = eq->eq.msix_index;
 		mana_gd_destroy_queue(gc, eq);
+		mana_gd_put_gic(gc, !gc->msi_sharing, msi);
 	}
 
 	kfree(apc->eqs);
@@ -1655,6 +1658,7 @@ static void mana_create_eq_debugfs(struct mana_port_context *apc, int i)
 	eq.mana_eq_debugfs = debugfs_create_dir(eqnum, apc->mana_eqs_debugfs);
 	debugfs_create_u32("head", 0400, eq.mana_eq_debugfs, &eq.eq->head);
 	debugfs_create_u32("tail", 0400, eq.mana_eq_debugfs, &eq.eq->tail);
+	debugfs_create_u32("irq", 0400, eq.mana_eq_debugfs, &eq.eq->eq.irq);
 	debugfs_create_file("eq_dump", 0400, eq.mana_eq_debugfs, eq.eq, &mana_dbg_q_fops);
 }
 
@@ -1665,6 +1669,8 @@ int mana_create_eq(struct mana_port_context *apc)
 	struct gdma_queue_spec spec = {};
 	int err;
 	int i;
+	int msi;
+	struct gdma_irq_context *gic;
 
 	WARN_ON(apc->eqs);
 	apc->eqs = kzalloc_objs(struct mana_eq, apc->num_queues);
@@ -1682,12 +1688,22 @@ int mana_create_eq(struct mana_port_context *apc)
 		debugfs_create_dir("EQs", apc->mana_port_debugfs);
 
 	for (i = 0; i < apc->num_queues; i++) {
-		spec.eq.msix_index = (i + 1) % gc->num_msix_usable;
+		msi = (i + 1) % gc->num_msix_usable;
+
+		gic = mana_gd_get_gic(gc, !gc->msi_sharing, &msi);
+		if (!gic) {
+			err = -ENOMEM;
+			goto out;
+		}
+		spec.eq.msix_index = msi;
+
 		err = mana_gd_create_mana_eq(gd, &spec, &apc->eqs[i].eq);
 		if (err) {
 			dev_err(gc->dev, "Failed to create EQ %d : %d\n", i, err);
+			mana_gd_put_gic(gc, !gc->msi_sharing, msi);
 			goto out;
 		}
+		apc->eqs[i].eq->eq.irq = gic->irq;
 		mana_create_eq_debugfs(apc, i);
 	}
 
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index fbe3c1427b45..6c138cc77407 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -342,6 +342,7 @@ struct gdma_queue {
 			void *context;
 
 			unsigned int msix_index;
+			unsigned int irq;
 
 			u32 log2_throttle_limit;
 		} eq;
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v8 4/6] net: mana: Use GIC functions to allocate global EQs
From: Long Li @ 2026-05-08 22:12 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui, shradhagupta
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260508221202.15725-1-longli@microsoft.com>

Replace the GDMA global interrupt setup code with the new GIC allocation
and release functions for managing interrupt contexts.

This changes the per-queue interrupt names in /proc/interrupts from
mana_q0, mana_q1, ... to mana_msi1, mana_msi2, ... to reflect the
MSI-X index rather than a zero-based queue number. The HWC interrupt
name (mana_hwc) is unchanged.

Signed-off-by: Long Li <longli@microsoft.com>
---
 .../net/ethernet/microsoft/mana/gdma_main.c   | 96 +++----------------
 1 file changed, 13 insertions(+), 83 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 78cb89c46ff3..3408bc1fd6ab 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -1878,7 +1878,7 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
 	struct gdma_context *gc = pci_get_drvdata(pdev);
 	struct gdma_irq_context *gic;
 	bool skip_first_cpu = false;
-	int *irqs, irq, err, i;
+	int *irqs, err, i;
 
 	irqs = kmalloc_objs(int, nvec);
 	if (!irqs)
@@ -1891,30 +1891,13 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
 	 * further used in irq_setup()
 	 */
 	for (i = 1; i <= nvec; i++) {
-		gic = kzalloc_obj(*gic);
+		gic = mana_gd_get_gic(gc, false, &i);
 		if (!gic) {
 			err = -ENOMEM;
 			goto free_irq;
 		}
-		gic->handler = mana_gd_process_eq_events;
-		INIT_LIST_HEAD(&gic->eq_list);
-		spin_lock_init(&gic->lock);
-
-		snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s",
-			 i - 1, pci_name(pdev));
-
-		/* one pci vector is already allocated for HWC */
-		irqs[i - 1] = pci_irq_vector(pdev, i);
-		if (irqs[i - 1] < 0) {
-			err = irqs[i - 1];
-			goto free_current_gic;
-		}
-
-		err = request_irq(irqs[i - 1], mana_gd_intr, 0, gic->name, gic);
-		if (err)
-			goto free_current_gic;
 
-		xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL);
+		irqs[i - 1] = gic->irq;
 	}
 
 	/*
@@ -1936,20 +1919,9 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
 	kfree(irqs);
 	return 0;
 
-free_current_gic:
-	kfree(gic);
 free_irq:
-	for (i -= 1; i > 0; i--) {
-		irq = pci_irq_vector(pdev, i);
-		gic = xa_load(&gc->irq_contexts, i);
-		if (WARN_ON(!gic))
-			continue;
-
-		irq_update_affinity_hint(irq, NULL);
-		free_irq(irq, gic);
-		xa_erase(&gc->irq_contexts, i);
-		kfree(gic);
-	}
+	for (i -= 1; i > 0; i--)
+		mana_gd_put_gic(gc, false, i);
 	kfree(irqs);
 	return err;
 }
@@ -1958,7 +1930,7 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev, int nvec)
 {
 	struct gdma_context *gc = pci_get_drvdata(pdev);
 	struct gdma_irq_context *gic;
-	int *irqs, *start_irqs, irq;
+	int *irqs, *start_irqs;
 	unsigned int cpu;
 	int err, i;
 
@@ -1969,34 +1941,13 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev, int nvec)
 	start_irqs = irqs;
 
 	for (i = 0; i < nvec; i++) {
-		gic = kzalloc_obj(*gic);
+		gic = mana_gd_get_gic(gc, false, &i);
 		if (!gic) {
 			err = -ENOMEM;
 			goto free_irq;
 		}
 
-		gic->handler = mana_gd_process_eq_events;
-		INIT_LIST_HEAD(&gic->eq_list);
-		spin_lock_init(&gic->lock);
-
-		if (!i)
-			snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_hwc@pci:%s",
-				 pci_name(pdev));
-		else
-			snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s",
-				 i - 1, pci_name(pdev));
-
-		irqs[i] = pci_irq_vector(pdev, i);
-		if (irqs[i] < 0) {
-			err = irqs[i];
-			goto free_current_gic;
-		}
-
-		err = request_irq(irqs[i], mana_gd_intr, 0, gic->name, gic);
-		if (err)
-			goto free_current_gic;
-
-		xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL);
+		irqs[i] = gic->irq;
 	}
 
 	/* If number of IRQ is one extra than number of online CPUs,
@@ -2025,20 +1976,9 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev, int nvec)
 	kfree(start_irqs);
 	return 0;
 
-free_current_gic:
-	kfree(gic);
 free_irq:
-	for (i -= 1; i >= 0; i--) {
-		irq = pci_irq_vector(pdev, i);
-		gic = xa_load(&gc->irq_contexts, i);
-		if (WARN_ON(!gic))
-			continue;
-
-		irq_update_affinity_hint(irq, NULL);
-		free_irq(irq, gic);
-		xa_erase(&gc->irq_contexts, i);
-		kfree(gic);
-	}
+	for (i -= 1; i >= 0; i--)
+		mana_gd_put_gic(gc, false, i);
 
 	kfree(start_irqs);
 	return err;
@@ -2112,26 +2052,16 @@ static int mana_gd_setup_remaining_irqs(struct pci_dev *pdev)
 static void mana_gd_remove_irqs(struct pci_dev *pdev)
 {
 	struct gdma_context *gc = pci_get_drvdata(pdev);
-	struct gdma_irq_context *gic;
-	int irq, i;
+	int i;
 
 	if (gc->max_num_msix < 1)
 		return;
 
 	for (i = 0; i < gc->max_num_msix; i++) {
-		irq = pci_irq_vector(pdev, i);
-		if (irq < 0)
-			continue;
-
-		gic = xa_load(&gc->irq_contexts, i);
-		if (WARN_ON(!gic))
+		if (!xa_load(&gc->irq_contexts, i))
 			continue;
 
-		/* Need to clear the hint before free_irq */
-		irq_update_affinity_hint(irq, NULL);
-		free_irq(irq, gic);
-		xa_erase(&gc->irq_contexts, i);
-		kfree(gic);
+		mana_gd_put_gic(gc, false, i);
 	}
 
 	pci_free_irq_vectors(pdev);
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v8 3/6] net: mana: Introduce GIC context with refcounting for interrupt management
From: Long Li @ 2026-05-08 22:11 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui, shradhagupta
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260508221202.15725-1-longli@microsoft.com>

To allow Ethernet EQs to use dedicated or shared MSI-X vectors and RDMA
EQs to share the same MSI-X, introduce a GIC (GDMA IRQ Context) with
reference counting. This allows the driver to create an interrupt context
on an assigned or unassigned MSI-X vector and share it across multiple
EQ consumers.

Signed-off-by: Long Li <longli@microsoft.com>
---
 .../net/ethernet/microsoft/mana/gdma_main.c   | 159 ++++++++++++++++++
 include/net/mana/gdma.h                       |  12 ++
 2 files changed, 171 insertions(+)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 4673ff62e6d9..78cb89c46ff3 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -1618,6 +1618,164 @@ static irqreturn_t mana_gd_intr(int irq, void *arg)
 	return IRQ_HANDLED;
 }
 
+void mana_gd_put_gic(struct gdma_context *gc, bool use_msi_bitmap, int msi)
+{
+	struct pci_dev *dev = to_pci_dev(gc->dev);
+	struct msi_map irq_map;
+	struct gdma_irq_context *gic;
+	int irq;
+
+	mutex_lock(&gc->gic_mutex);
+
+	gic = xa_load(&gc->irq_contexts, msi);
+	if (WARN_ON(!gic)) {
+		mutex_unlock(&gc->gic_mutex);
+		return;
+	}
+
+	if (use_msi_bitmap)
+		gic->bitmap_refs--;
+
+	if (use_msi_bitmap && gic->bitmap_refs == 0)
+		clear_bit(msi, gc->msi_bitmap);
+
+	if (!refcount_dec_and_test(&gic->refcount))
+		goto out;
+
+	irq = gic->irq;
+
+	irq_update_affinity_hint(irq, NULL);
+	free_irq(irq, gic);
+
+	if (gic->dyn_msix) {
+		irq_map.virq = irq;
+		irq_map.index = msi;
+		pci_msix_free_irq(dev, irq_map);
+	}
+
+	xa_erase(&gc->irq_contexts, msi);
+	kfree(gic);
+
+out:
+	mutex_unlock(&gc->gic_mutex);
+}
+EXPORT_SYMBOL_NS(mana_gd_put_gic, "NET_MANA");
+
+/*
+ * Get a GIC (GDMA IRQ Context) on an MSI vector.
+ * An MSI can be shared between different EQs. This function supports setting
+ * up separate MSIs using a bitmap, or directly using the requested MSI index.
+ *
+ * @use_msi_bitmap:
+ * True if the MSI is assigned by this function from a free slot in the bitmap.
+ * False if the MSI index is passed in via *msi_requested.
+ */
+struct gdma_irq_context *mana_gd_get_gic(struct gdma_context *gc,
+					 bool use_msi_bitmap,
+					 int *msi_requested)
+{
+	struct gdma_irq_context *gic;
+	struct pci_dev *dev = to_pci_dev(gc->dev);
+	struct msi_map irq_map = { };
+	int irq;
+	int msi;
+	int err;
+
+	mutex_lock(&gc->gic_mutex);
+
+	if (use_msi_bitmap) {
+		msi = find_first_zero_bit(gc->msi_bitmap, gc->num_msix_usable);
+		if (msi >= gc->num_msix_usable) {
+			dev_err(gc->dev, "No free MSI vectors available\n");
+			gic = NULL;
+			goto out;
+		}
+		*msi_requested = msi;
+	} else {
+		msi = *msi_requested;
+	}
+
+	gic = xa_load(&gc->irq_contexts, msi);
+	if (gic) {
+		refcount_inc(&gic->refcount);
+		if (use_msi_bitmap) {
+			gic->bitmap_refs++;
+			set_bit(msi, gc->msi_bitmap);
+		}
+		goto out;
+	}
+
+	irq = pci_irq_vector(dev, msi);
+	if (irq == -EINVAL) {
+		irq_map = pci_msix_alloc_irq_at(dev, msi, NULL);
+		if (!irq_map.virq) {
+			err = irq_map.index;
+			dev_err(gc->dev,
+				"Failed to alloc irq_map msi %d err %d\n",
+				msi, err);
+			gic = NULL;
+			goto out;
+		}
+		irq = irq_map.virq;
+		msi = irq_map.index;
+	}
+
+	gic = kzalloc(sizeof(*gic), GFP_KERNEL);
+	if (!gic) {
+		if (irq_map.virq)
+			pci_msix_free_irq(dev, irq_map);
+		goto out;
+	}
+
+	gic->handler = mana_gd_process_eq_events;
+	gic->msi = msi;
+	gic->irq = irq;
+	INIT_LIST_HEAD(&gic->eq_list);
+	spin_lock_init(&gic->lock);
+
+	if (!gic->msi)
+		snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_hwc@pci:%s",
+			 pci_name(dev));
+	else
+		snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_msi%d@pci:%s",
+			 gic->msi, pci_name(dev));
+
+	err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
+	if (err) {
+		dev_err(gc->dev, "Failed to request irq %d %s\n",
+			irq, gic->name);
+		kfree(gic);
+		gic = NULL;
+		if (irq_map.virq)
+			pci_msix_free_irq(dev, irq_map);
+		goto out;
+	}
+
+	gic->dyn_msix = !!irq_map.virq;
+	refcount_set(&gic->refcount, 1);
+	gic->bitmap_refs = use_msi_bitmap ? 1 : 0;
+
+	err = xa_err(xa_store(&gc->irq_contexts, msi, gic, GFP_KERNEL));
+	if (err) {
+		dev_err(gc->dev, "Failed to store irq context for msi %d: %d\n",
+			msi, err);
+		free_irq(irq, gic);
+		kfree(gic);
+		gic = NULL;
+		if (irq_map.virq)
+			pci_msix_free_irq(dev, irq_map);
+		goto out;
+	}
+
+	if (use_msi_bitmap)
+		set_bit(msi, gc->msi_bitmap);
+
+out:
+	mutex_unlock(&gc->gic_mutex);
+	return gic;
+}
+EXPORT_SYMBOL_NS(mana_gd_get_gic, "NET_MANA");
+
 int mana_gd_alloc_res_map(u32 res_avail, struct gdma_resource *r)
 {
 	r->map = bitmap_zalloc(res_avail, GFP_KERNEL);
@@ -2107,6 +2265,7 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto release_region;
 
 	mutex_init(&gc->eq_test_event_mutex);
+	mutex_init(&gc->gic_mutex);
 	pci_set_drvdata(pdev, gc);
 	gc->bar0_pa = pci_resource_start(pdev, 0);
 	gc->bar0_size = pci_resource_len(pdev, 0);
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 9c05b1e15c3e..fbe3c1427b45 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -388,6 +388,11 @@ struct gdma_irq_context {
 	spinlock_t lock;
 	struct list_head eq_list;
 	char name[MANA_IRQ_NAME_SZ];
+	unsigned int msi;
+	unsigned int irq;
+	refcount_t refcount;
+	unsigned int bitmap_refs;
+	bool dyn_msix;
 };
 
 enum gdma_context_flags {
@@ -449,6 +454,9 @@ struct gdma_context {
 
 	unsigned long		flags;
 
+	/* Protect access to GIC context */
+	struct mutex		gic_mutex;
+
 	/* Indicate if this device is sharing MSI for EQs on MANA */
 	bool msi_sharing;
 
@@ -1026,6 +1034,10 @@ int mana_gd_resume(struct pci_dev *pdev);
 
 bool mana_need_log(struct gdma_context *gc, int err);
 
+struct gdma_irq_context *mana_gd_get_gic(struct gdma_context *gc,
+					 bool use_msi_bitmap,
+					 int *msi_requested);
+void mana_gd_put_gic(struct gdma_context *gc, bool use_msi_bitmap, int msi);
 int mana_gd_query_device_cfg(struct gdma_context *gc, u32 proto_major_ver,
 			     u32 proto_minor_ver, u32 proto_micro_ver,
 			     u16 *max_num_vports, u8 *bm_hostmode);
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v8 2/6] net: mana: Query device capabilities and configure MSI-X sharing for EQs
From: Long Li @ 2026-05-08 22:11 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui, shradhagupta
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260508221202.15725-1-longli@microsoft.com>

When querying the device, adjust the max number of queues to allow
dedicated MSI-X vectors for each vPort. The number of queues per vPort
is clamped to no less than MANA_DEF_NUM_QUEUES. MSI-X sharing among
vPorts is disabled by default and is only enabled when there are not
enough MSI-X vectors for dedicated allocation.

Rename mana_query_device_cfg() to mana_gd_query_device_cfg() as it is
used at GDMA device probe time for querying device capabilities.

Signed-off-by: Long Li <longli@microsoft.com>
---
 .../net/ethernet/microsoft/mana/gdma_main.c   | 59 ++++++++++++++++++-
 drivers/net/ethernet/microsoft/mana/mana_en.c | 40 ++++++++-----
 include/net/mana/gdma.h                       | 13 +++-
 3 files changed, 93 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index f3316e929175..4673ff62e6d9 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -149,6 +149,9 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
 	struct gdma_context *gc = pci_get_drvdata(pdev);
 	struct gdma_query_max_resources_resp resp = {};
 	struct gdma_general_req req = {};
+	unsigned int max_num_queues;
+	u8 bm_hostmode;
+	u16 num_ports;
 	int err;
 
 	mana_gd_init_req_hdr(&req.hdr, GDMA_QUERY_MAX_RESOURCES,
@@ -197,6 +200,43 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
 	if (gc->max_num_queues == 0)
 		return -ENOSPC;
 
+	err = mana_gd_query_device_cfg(gc, MANA_MAJOR_VERSION,
+				       MANA_MINOR_VERSION,
+				       MANA_MICRO_VERSION,
+				       &num_ports, &bm_hostmode);
+	if (err)
+		return err;
+
+	if (!num_ports)
+		return -EINVAL;
+
+	/*
+	 * Adjust the per-vPort max queue count to allow dedicated MSI-X
+	 * vectors for each vPort. Clamp to no less than MANA_DEF_NUM_QUEUES.
+	 */
+	max_num_queues = (gc->num_msix_usable - 1) / num_ports;
+	max_num_queues = rounddown_pow_of_two(max(max_num_queues, 1U));
+	if (max_num_queues < MANA_DEF_NUM_QUEUES)
+		max_num_queues = MANA_DEF_NUM_QUEUES;
+
+	/*
+	 * Use dedicated MSI-X for EQs whenever possible; use MSI-X sharing for
+	 * Ethernet EQs when (max_num_queues * num_ports > num_msix_usable - 1)
+	 */
+	max_num_queues = min(gc->max_num_queues, max_num_queues);
+	if (max_num_queues * num_ports > gc->num_msix_usable - 1)
+		gc->msi_sharing = true;
+
+	/* If MSI is shared, use max allowed value */
+	if (gc->msi_sharing)
+		gc->max_num_queues_vport = min(gc->num_msix_usable - 1,
+					       gc->max_num_queues);
+	else
+		gc->max_num_queues_vport = max_num_queues;
+
+	dev_info(gc->dev, "MSI sharing mode %d max queues %d\n",
+		 gc->msi_sharing, gc->max_num_queues);
+
 	return 0;
 }
 
@@ -1859,6 +1899,7 @@ static int mana_gd_setup_hwc_irqs(struct pci_dev *pdev)
 		/* Need 1 interrupt for HWC */
 		max_irqs = min(num_online_cpus(), MANA_MAX_NUM_QUEUES) + 1;
 		min_irqs = 2;
+		gc->msi_sharing = true;
 	}
 
 	nvec = pci_alloc_irq_vectors(pdev, min_irqs, max_irqs, PCI_IRQ_MSIX);
@@ -1937,6 +1978,8 @@ static void mana_gd_remove_irqs(struct pci_dev *pdev)
 
 	pci_free_irq_vectors(pdev);
 
+	bitmap_free(gc->msi_bitmap);
+	gc->msi_bitmap = NULL;
 	gc->max_num_msix = 0;
 	gc->num_msix_usable = 0;
 }
@@ -1971,6 +2014,10 @@ static int mana_gd_setup(struct pci_dev *pdev)
 	if (err)
 		goto destroy_hwc;
 
+	err = mana_gd_detect_devices(pdev);
+	if (err)
+		goto destroy_hwc;
+
 	err = mana_gd_query_max_resources(pdev);
 	if (err)
 		goto destroy_hwc;
@@ -1981,9 +2028,15 @@ static int mana_gd_setup(struct pci_dev *pdev)
 		goto destroy_hwc;
 	}
 
-	err = mana_gd_detect_devices(pdev);
-	if (err)
-		goto destroy_hwc;
+	if (!gc->msi_sharing) {
+		gc->msi_bitmap = bitmap_zalloc(gc->num_msix_usable, GFP_KERNEL);
+		if (!gc->msi_bitmap) {
+			err = -ENOMEM;
+			goto destroy_hwc;
+		}
+		/* Set bit for HWC */
+		set_bit(0, gc->msi_bitmap);
+	}
 
 	dev_dbg(&pdev->dev, "mana gdma setup successful\n");
 	return 0;
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 2f3d619e0f2e..3f6cdc2cd82d 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1007,10 +1007,9 @@ static int mana_init_port_context(struct mana_port_context *apc)
 	return !apc->rxqs ? -ENOMEM : 0;
 }
 
-static int mana_send_request(struct mana_context *ac, void *in_buf,
-			     u32 in_len, void *out_buf, u32 out_len)
+static int gdma_mana_send_request(struct gdma_context *gc, void *in_buf,
+				  u32 in_len, void *out_buf, u32 out_len)
 {
-	struct gdma_context *gc = ac->gdma_dev->gdma_context;
 	struct gdma_resp_hdr *resp = out_buf;
 	struct gdma_req_hdr *req = in_buf;
 	struct device *dev = gc->dev;
@@ -1044,6 +1043,14 @@ static int mana_send_request(struct mana_context *ac, void *in_buf,
 	return 0;
 }
 
+static int mana_send_request(struct mana_context *ac, void *in_buf,
+			     u32 in_len, void *out_buf, u32 out_len)
+{
+	struct gdma_context *gc = ac->gdma_dev->gdma_context;
+
+	return gdma_mana_send_request(gc, in_buf, in_len, out_buf, out_len);
+}
+
 static int mana_verify_resp_hdr(const struct gdma_resp_hdr *resp_hdr,
 				const enum mana_command_code expected_code,
 				const u32 min_size)
@@ -1177,11 +1184,10 @@ static void mana_pf_deregister_filter(struct mana_port_context *apc)
 			   err, resp.hdr.status);
 }
 
-static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver,
-				 u32 proto_minor_ver, u32 proto_micro_ver,
-				 u16 *max_num_vports, u8 *bm_hostmode)
+int mana_gd_query_device_cfg(struct gdma_context *gc, u32 proto_major_ver,
+			     u32 proto_minor_ver, u32 proto_micro_ver,
+			     u16 *max_num_vports, u8 *bm_hostmode)
 {
-	struct gdma_context *gc = ac->gdma_dev->gdma_context;
 	struct mana_query_device_cfg_resp resp = {};
 	struct mana_query_device_cfg_req req = {};
 	struct device *dev = gc->dev;
@@ -1196,7 +1202,8 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver,
 	req.proto_minor_ver = proto_minor_ver;
 	req.proto_micro_ver = proto_micro_ver;
 
-	err = mana_send_request(ac, &req, sizeof(req), &resp, sizeof(resp));
+	err = gdma_mana_send_request(gc, &req, sizeof(req),
+				     &resp, sizeof(resp));
 	if (err) {
 		dev_err(dev, "Failed to query config: %d", err);
 		return err;
@@ -1230,8 +1237,6 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver,
 	else
 		*bm_hostmode = 0;
 
-	debugfs_create_u16("adapter-MTU", 0400, gc->mana_pci_debugfs, &gc->adapter_mtu);
-
 	return 0;
 }
 
@@ -3415,7 +3420,7 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
 	int err;
 
 	ndev = alloc_etherdev_mq(sizeof(struct mana_port_context),
-				 gc->max_num_queues);
+				 gc->max_num_queues_vport);
 	if (!ndev)
 		return -ENOMEM;
 
@@ -3424,9 +3429,9 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
 	apc = netdev_priv(ndev);
 	apc->ac = ac;
 	apc->ndev = ndev;
-	apc->max_queues = gc->max_num_queues;
+	apc->max_queues = gc->max_num_queues_vport;
 	/* Use MANA_DEF_NUM_QUEUES as default, still honoring the HW limit */
-	apc->num_queues = min(gc->max_num_queues, MANA_DEF_NUM_QUEUES);
+	apc->num_queues = min(gc->max_num_queues_vport, MANA_DEF_NUM_QUEUES);
 	apc->tx_queue_size = DEF_TX_BUFFERS_PER_QUEUE;
 	apc->rx_queue_size = DEF_RX_BUFFERS_PER_QUEUE;
 	apc->port_handle = INVALID_MANA_HANDLE;
@@ -3690,13 +3695,18 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 
 	INIT_DELAYED_WORK(&ac->gf_stats_work, mana_gf_stats_work_handler);
 
-	err = mana_query_device_cfg(ac, MANA_MAJOR_VERSION, MANA_MINOR_VERSION,
-				    MANA_MICRO_VERSION, &num_ports, &bm_hostmode);
+	err = mana_gd_query_device_cfg(gc, MANA_MAJOR_VERSION,
+				       MANA_MINOR_VERSION,
+				       MANA_MICRO_VERSION,
+				       &num_ports, &bm_hostmode);
 	if (err)
 		goto out;
 
 	ac->bm_hostmode = bm_hostmode;
 
+	debugfs_create_u16("adapter-MTU", 0400,
+			   gc->mana_pci_debugfs, &gc->adapter_mtu);
+
 	if (!resuming) {
 		ac->num_ports = num_ports;
 	} else {
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 6d836060976a..9c05b1e15c3e 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -399,8 +399,10 @@ struct gdma_context {
 	struct device		*dev;
 	struct dentry		*mana_pci_debugfs;
 
-	/* Per-vPort max number of queues */
+	/* Hardware max number of queues */
 	unsigned int		max_num_queues;
+	/* Per-vPort max number of queues */
+	unsigned int		max_num_queues_vport;
 	unsigned int		max_num_msix;
 	unsigned int		num_msix_usable;
 	struct xarray		irq_contexts;
@@ -446,6 +448,12 @@ struct gdma_context {
 	struct workqueue_struct *service_wq;
 
 	unsigned long		flags;
+
+	/* Indicate if this device is sharing MSI for EQs on MANA */
+	bool msi_sharing;
+
+	/* Bitmap tracks where MSI is allocated when it is not shared for EQs */
+	unsigned long *msi_bitmap;
 };
 
 static inline bool mana_gd_is_mana(struct gdma_dev *gd)
@@ -1018,4 +1026,7 @@ int mana_gd_resume(struct pci_dev *pdev);
 
 bool mana_need_log(struct gdma_context *gc, int err);
 
+int mana_gd_query_device_cfg(struct gdma_context *gc, u32 proto_major_ver,
+			     u32 proto_minor_ver, u32 proto_micro_ver,
+			     u16 *max_num_vports, u8 *bm_hostmode);
 #endif /* _GDMA_H */
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v8 1/6] net: mana: Create separate EQs for each vPort
From: Long Li @ 2026-05-08 22:11 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui, shradhagupta
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260508221202.15725-1-longli@microsoft.com>

To prepare for assigning vPorts to dedicated MSI-X vectors, remove EQ
sharing among the vPorts and create dedicated EQs for each vPort.

Move the EQ definition from struct mana_context to struct mana_port_context
and update related support functions. Export mana_create_eq() and
mana_destroy_eq() for use by the MANA RDMA driver.

Signed-off-by: Long Li <longli@microsoft.com>
---
 drivers/infiniband/hw/mana/main.c             |  19 ++-
 drivers/infiniband/hw/mana/qp.c               |  16 ++-
 drivers/net/ethernet/microsoft/mana/mana_en.c | 111 ++++++++++--------
 include/net/mana/mana.h                       |   7 +-
 4 files changed, 98 insertions(+), 55 deletions(-)

diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
index ac5e75dd3494..8000ab6e8beb 100644
--- a/drivers/infiniband/hw/mana/main.c
+++ b/drivers/infiniband/hw/mana/main.c
@@ -20,8 +20,10 @@ void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd,
 	pd->vport_use_count--;
 	WARN_ON(pd->vport_use_count < 0);
 
-	if (!pd->vport_use_count)
+	if (!pd->vport_use_count) {
+		mana_destroy_eq(mpc);
 		mana_uncfg_vport(mpc);
+	}
 
 	mutex_unlock(&pd->vport_mutex);
 }
@@ -55,15 +57,22 @@ int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd,
 		return err;
 	}
 
-	mutex_unlock(&pd->vport_mutex);
 
 	pd->tx_shortform_allowed = mpc->tx_shortform_allowed;
 	pd->tx_vp_offset = mpc->tx_vp_offset;
+	err = mana_create_eq(mpc);
+	if (err) {
+		mana_uncfg_vport(mpc);
+		pd->vport_use_count--;
+	}
 
-	ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n",
-		  mpc->port_handle, pd->pdn, doorbell_id);
+	mutex_unlock(&pd->vport_mutex);
 
-	return 0;
+	if (!err)
+		ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n",
+			  mpc->port_handle, pd->pdn, doorbell_id);
+
+	return err;
 }
 
 int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
index 645581359cee..6f1043383e8c 100644
--- a/drivers/infiniband/hw/mana/qp.c
+++ b/drivers/infiniband/hw/mana/qp.c
@@ -168,7 +168,15 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
 		cq_spec.gdma_region = cq->queue.gdma_region;
 		cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE;
 		cq_spec.modr_ctx_id = 0;
-		eq = &mpc->ac->eqs[cq->comp_vector];
+		/* EQs are created when a raw QP configures the vport.
+		 * A raw QP must be created before creating rwq_ind_tbl.
+		 */
+		if (!mpc->eqs) {
+			ret = -EINVAL;
+			i--;
+			goto fail;
+		}
+		eq = &mpc->eqs[cq->comp_vector % mpc->num_queues];
 		cq_spec.attached_eq = eq->eq->id;
 
 		ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ,
@@ -317,7 +325,11 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd,
 	cq_spec.queue_size = send_cq->cqe * COMP_ENTRY_SIZE;
 	cq_spec.modr_ctx_id = 0;
 	eq_vec = send_cq->comp_vector;
-	eq = &mpc->ac->eqs[eq_vec];
+	if (!mpc->eqs) {
+		err = -EINVAL;
+		goto err_destroy_queue;
+	}
+	eq = &mpc->eqs[eq_vec % mpc->num_queues];
 	cq_spec.attached_eq = eq->eq->id;
 
 	err = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_SQ, &wq_spec,
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 462a457e7d53..2f3d619e0f2e 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1615,78 +1615,83 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type,
 }
 EXPORT_SYMBOL_NS(mana_destroy_wq_obj, "NET_MANA");
 
-static void mana_destroy_eq(struct mana_context *ac)
+void mana_destroy_eq(struct mana_port_context *apc)
 {
+	struct mana_context *ac = apc->ac;
 	struct gdma_context *gc = ac->gdma_dev->gdma_context;
 	struct gdma_queue *eq;
 	int i;
 
-	if (!ac->eqs)
+	if (!apc->eqs)
 		return;
 
-	debugfs_remove_recursive(ac->mana_eqs_debugfs);
-	ac->mana_eqs_debugfs = NULL;
+	debugfs_remove_recursive(apc->mana_eqs_debugfs);
+	apc->mana_eqs_debugfs = NULL;
 
-	for (i = 0; i < gc->max_num_queues; i++) {
-		eq = ac->eqs[i].eq;
+	for (i = 0; i < apc->num_queues; i++) {
+		eq = apc->eqs[i].eq;
 		if (!eq)
 			continue;
 
 		mana_gd_destroy_queue(gc, eq);
 	}
 
-	kfree(ac->eqs);
-	ac->eqs = NULL;
+	kfree(apc->eqs);
+	apc->eqs = NULL;
 }
+EXPORT_SYMBOL_NS(mana_destroy_eq, "NET_MANA");
 
-static void mana_create_eq_debugfs(struct mana_context *ac, int i)
+static void mana_create_eq_debugfs(struct mana_port_context *apc, int i)
 {
-	struct mana_eq eq = ac->eqs[i];
+	struct mana_eq eq = apc->eqs[i];
 	char eqnum[32];
 
 	sprintf(eqnum, "eq%d", i);
-	eq.mana_eq_debugfs = debugfs_create_dir(eqnum, ac->mana_eqs_debugfs);
+	eq.mana_eq_debugfs = debugfs_create_dir(eqnum, apc->mana_eqs_debugfs);
 	debugfs_create_u32("head", 0400, eq.mana_eq_debugfs, &eq.eq->head);
 	debugfs_create_u32("tail", 0400, eq.mana_eq_debugfs, &eq.eq->tail);
 	debugfs_create_file("eq_dump", 0400, eq.mana_eq_debugfs, eq.eq, &mana_dbg_q_fops);
 }
 
-static int mana_create_eq(struct mana_context *ac)
+int mana_create_eq(struct mana_port_context *apc)
 {
-	struct gdma_dev *gd = ac->gdma_dev;
+	struct gdma_dev *gd = apc->ac->gdma_dev;
 	struct gdma_context *gc = gd->gdma_context;
 	struct gdma_queue_spec spec = {};
 	int err;
 	int i;
 
-	ac->eqs = kzalloc_objs(struct mana_eq, gc->max_num_queues);
-	if (!ac->eqs)
+	WARN_ON(apc->eqs);
+	apc->eqs = kzalloc_objs(struct mana_eq, apc->num_queues);
+	if (!apc->eqs)
 		return -ENOMEM;
 
 	spec.type = GDMA_EQ;
 	spec.monitor_avl_buf = false;
 	spec.queue_size = EQ_SIZE;
 	spec.eq.callback = NULL;
-	spec.eq.context = ac->eqs;
+	spec.eq.context = apc->eqs;
 	spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE;
 
-	ac->mana_eqs_debugfs = debugfs_create_dir("EQs", gc->mana_pci_debugfs);
+	apc->mana_eqs_debugfs =
+		debugfs_create_dir("EQs", apc->mana_port_debugfs);
 
-	for (i = 0; i < gc->max_num_queues; i++) {
+	for (i = 0; i < apc->num_queues; i++) {
 		spec.eq.msix_index = (i + 1) % gc->num_msix_usable;
-		err = mana_gd_create_mana_eq(gd, &spec, &ac->eqs[i].eq);
+		err = mana_gd_create_mana_eq(gd, &spec, &apc->eqs[i].eq);
 		if (err) {
 			dev_err(gc->dev, "Failed to create EQ %d : %d\n", i, err);
 			goto out;
 		}
-		mana_create_eq_debugfs(ac, i);
+		mana_create_eq_debugfs(apc, i);
 	}
 
 	return 0;
 out:
-	mana_destroy_eq(ac);
+	mana_destroy_eq(apc);
 	return err;
 }
+EXPORT_SYMBOL_NS(mana_create_eq, "NET_MANA");
 
 static int mana_fence_rq(struct mana_port_context *apc, struct mana_rxq *rxq)
 {
@@ -2451,7 +2456,7 @@ static int mana_create_txq(struct mana_port_context *apc,
 		spec.monitor_avl_buf = false;
 		spec.queue_size = cq_size;
 		spec.cq.callback = mana_schedule_napi;
-		spec.cq.parent_eq = ac->eqs[i].eq;
+		spec.cq.parent_eq = apc->eqs[i].eq;
 		spec.cq.context = cq;
 		err = mana_gd_create_mana_wq_cq(gd, &spec, &cq->gdma_cq);
 		if (err)
@@ -2844,13 +2849,12 @@ static void mana_create_rxq_debugfs(struct mana_port_context *apc, int idx)
 static int mana_add_rx_queues(struct mana_port_context *apc,
 			      struct net_device *ndev)
 {
-	struct mana_context *ac = apc->ac;
 	struct mana_rxq *rxq;
 	int err = 0;
 	int i;
 
 	for (i = 0; i < apc->num_queues; i++) {
-		rxq = mana_create_rxq(apc, i, &ac->eqs[i], ndev);
+		rxq = mana_create_rxq(apc, i, &apc->eqs[i], ndev);
 		if (!rxq) {
 			err = -ENOMEM;
 			netdev_err(ndev, "Failed to create rxq %d : %d\n", i, err);
@@ -2869,9 +2873,8 @@ static int mana_add_rx_queues(struct mana_port_context *apc,
 	return err;
 }
 
-static void mana_destroy_vport(struct mana_port_context *apc)
+static void mana_destroy_rxqs(struct mana_port_context *apc)
 {
-	struct gdma_dev *gd = apc->ac->gdma_dev;
 	struct mana_rxq *rxq;
 	u32 rxq_idx;
 
@@ -2883,8 +2886,12 @@ static void mana_destroy_vport(struct mana_port_context *apc)
 		mana_destroy_rxq(apc, rxq, true);
 		apc->rxqs[rxq_idx] = NULL;
 	}
+}
+
+static void mana_destroy_vport(struct mana_port_context *apc)
+{
+	struct gdma_dev *gd = apc->ac->gdma_dev;
 
-	mana_destroy_txq(apc);
 	mana_uncfg_vport(apc);
 
 	if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode)
@@ -2905,11 +2912,7 @@ static int mana_create_vport(struct mana_port_context *apc,
 			return err;
 	}
 
-	err = mana_cfg_vport(apc, gd->pdid, gd->doorbell);
-	if (err)
-		return err;
-
-	return mana_create_txq(apc, net);
+	return mana_cfg_vport(apc, gd->pdid, gd->doorbell);
 }
 
 static int mana_rss_table_alloc(struct mana_port_context *apc)
@@ -3195,21 +3198,36 @@ int mana_alloc_queues(struct net_device *ndev)
 
 	err = mana_create_vport(apc, ndev);
 	if (err) {
-		netdev_err(ndev, "Failed to create vPort %u : %d\n", apc->port_idx, err);
+		netdev_err(ndev, "Failed to create vPort %u : %d\n",
+			   apc->port_idx, err);
 		return err;
 	}
 
+	err = mana_create_eq(apc);
+	if (err) {
+		netdev_err(ndev, "Failed to create EQ on vPort %u: %d\n",
+			   apc->port_idx, err);
+		goto destroy_vport;
+	}
+
+	err = mana_create_txq(apc, ndev);
+	if (err) {
+		netdev_err(ndev, "Failed to create TXQ on vPort %u: %d\n",
+			   apc->port_idx, err);
+		goto destroy_eq;
+	}
+
 	err = netif_set_real_num_tx_queues(ndev, apc->num_queues);
 	if (err) {
 		netdev_err(ndev,
 			   "netif_set_real_num_tx_queues () failed for ndev with num_queues %u : %d\n",
 			   apc->num_queues, err);
-		goto destroy_vport;
+		goto destroy_txq;
 	}
 
 	err = mana_add_rx_queues(apc, ndev);
 	if (err)
-		goto destroy_vport;
+		goto destroy_rxq;
 
 	apc->rss_state = apc->num_queues > 1 ? TRI_STATE_TRUE : TRI_STATE_FALSE;
 
@@ -3218,7 +3236,7 @@ int mana_alloc_queues(struct net_device *ndev)
 		netdev_err(ndev,
 			   "netif_set_real_num_rx_queues () failed for ndev with num_queues %u : %d\n",
 			   apc->num_queues, err);
-		goto destroy_vport;
+		goto destroy_rxq;
 	}
 
 	mana_rss_table_init(apc);
@@ -3226,19 +3244,25 @@ int mana_alloc_queues(struct net_device *ndev)
 	err = mana_config_rss(apc, TRI_STATE_TRUE, true, true);
 	if (err) {
 		netdev_err(ndev, "Failed to configure RSS table: %d\n", err);
-		goto destroy_vport;
+		goto destroy_rxq;
 	}
 
 	if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode) {
 		err = mana_pf_register_filter(apc);
 		if (err)
-			goto destroy_vport;
+			goto destroy_rxq;
 	}
 
 	mana_chn_setxdp(apc, mana_xdp_get(apc));
 
 	return 0;
 
+destroy_rxq:
+	mana_destroy_rxqs(apc);
+destroy_txq:
+	mana_destroy_txq(apc);
+destroy_eq:
+	mana_destroy_eq(apc);
 destroy_vport:
 	mana_destroy_vport(apc);
 	return err;
@@ -3343,6 +3367,9 @@ static int mana_dealloc_queues(struct net_device *ndev)
 	mana_fence_rqs(apc);
 
 	/* Even in err case, still need to cleanup the vPort */
+	mana_destroy_rxqs(apc);
+	mana_destroy_txq(apc);
+	mana_destroy_eq(apc);
 	mana_destroy_vport(apc);
 
 	return 0;
@@ -3663,12 +3690,6 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 
 	INIT_DELAYED_WORK(&ac->gf_stats_work, mana_gf_stats_work_handler);
 
-	err = mana_create_eq(ac);
-	if (err) {
-		dev_err(dev, "Failed to create EQs: %d\n", err);
-		goto out;
-	}
-
 	err = mana_query_device_cfg(ac, MANA_MAJOR_VERSION, MANA_MINOR_VERSION,
 				    MANA_MICRO_VERSION, &num_ports, &bm_hostmode);
 	if (err)
@@ -3808,8 +3829,6 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
 		free_netdev(ndev);
 	}
 
-	mana_destroy_eq(ac);
-
 	if (ac->per_port_queue_reset_wq) {
 		destroy_workqueue(ac->per_port_queue_reset_wq);
 		ac->per_port_queue_reset_wq = NULL;
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index aa90a858c8e3..c8e7d16f6685 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -480,8 +480,6 @@ struct mana_context {
 	u8 bm_hostmode;
 
 	struct mana_ethtool_hc_stats hc_stats;
-	struct mana_eq *eqs;
-	struct dentry *mana_eqs_debugfs;
 	struct workqueue_struct *per_port_queue_reset_wq;
 	/* Workqueue for querying hardware stats */
 	struct delayed_work gf_stats_work;
@@ -501,6 +499,9 @@ struct mana_port_context {
 
 	u8 mac_addr[ETH_ALEN];
 
+	struct mana_eq *eqs;
+	struct dentry *mana_eqs_debugfs;
+
 	enum TRI_STATE rss_state;
 
 	mana_handle_t default_rxobj;
@@ -1034,6 +1035,8 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type,
 int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
 		   u32 doorbell_pg_id);
 void mana_uncfg_vport(struct mana_port_context *apc);
+int mana_create_eq(struct mana_port_context *apc);
+void mana_destroy_eq(struct mana_port_context *apc);
 
 struct net_device *mana_get_primary_netdev(struct mana_context *ac,
 					   u32 port_index,
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v8 0/6] net: mana: Per-vPort EQ and MSI-X interrupt management
From: Long Li @ 2026-05-08 22:11 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui, shradhagupta
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel

This series moves EQ ownership from the shared mana_context to per-vPort
mana_port_context, enabling each vPort to have dedicated MSI-X vectors
when the hardware provides enough vectors. When vectors are limited, the
driver falls back to sharing MSI-X among vPorts.

The series introduces a GDMA IRQ Context (GIC) abstraction with reference
counting to manage interrupt context lifecycle. This allows both Ethernet
and RDMA EQs to dynamically acquire dedicated or shared MSI-X vectors at
vPort creation time rather than pre-allocating all vectors at probe time.

Key changes:
- Per-vPort EQ allocation with exported lifecycle functions for RDMA use
- Device capability query to determine dedicated vs shared MSI-X mode
- GIC context with refcounting for flexible interrupt management
- On-demand interrupt context allocation when creating vPort EQs
- RDMA EQ integration with the GIC framework

Changes in v8:
- Fix comment to reference per-vPort queue count instead of
  gc->max_num_queues (patch 2)
- Remove duplicate irq_update_affinity_hint() calls from error paths
  and mana_gd_remove_irqs(); the clearing is now centralized in
  mana_gd_put_gic() (patch 4)
- Note the IRQ name change (mana_q -> mana_msi) in the commit
  message (patch 4)
- Remove dead conditional write to spec.eq.msix_index (patch 5)
- Document GIC ownership contract and msix_index invariant change
  in commit message (patch 5)
- Populate eq.irq on RDMA EQs for consistency with the Ethernet
  path (patch 6)
- Document BIT(6) relocation and capability flag semantics in
  commit message (patch 6)
- Fix checkpatch --strict alignment and line length warnings

Changes in v7:
- Use rounddown_pow_of_two() instead of roundup_pow_of_two() when
  computing per-vPort queue count to avoid unnecessarily forcing shared
  MSI-X mode (patch 2)
- Call mana_gd_setup_remaining_irqs() unconditionally to ensure
  irq_contexts are populated in both dedicated and shared MSI-X modes,
  fixing bisectability between patches 2 and 5 (patch 2)
- Guard ibdev_dbg() in mana_ib_cfg_vport() with error check so the
  vport handle is not logged on the failure path (patch 1)
- Use cached gic->irq instead of pci_irq_vector() lookup in
  mana_gd_put_gic() for consistency with the allocation path (patch 3)
- Fix unsigned int* to int* pointer type mismatch when calling
  mana_gd_get_gic() by using a local int variable for the MSI index
  (patches 5, 6)

Changes in v6:
- Rebased on net-next/main (v7.1-rc1)

Changes in v5:
- Rebased on net-next/main

Changes in v4:
- Rebased on net-next/main 7.0-rc4
- Patch 2: Use MANA_DEF_NUM_QUEUES instead of hardcoded 16 for
  max_num_queues clamping
- Patch 3: Track dyn_msix in GIC context instead of re-checking
  pci_msix_can_alloc_dyn() on each call; improved remove_irqs iteration
  to skip unallocated entries

Changes in v3:
- Rebased on net-next/main
- Patch 1: Added NULL check for mpc->eqs in mana_ib_create_qp_rss() to
  prevent NULL pointer dereference when RSS QP is created before a raw QP
  has configured the vport and allocated EQs

Changes in v2:
- Rebased on net-next/main (adapted to kzalloc_objs/kzalloc_obj macros,
  new GDMA_DRV_CAP_FLAG definitions)
- Patch 2: Fixed misleading comment for max_num_queues vs
  max_num_queues_vport in gdma.h
- Patch 3: Fixed spelling typo in gdma_main.c ("difference" -> "different")

Long Li (6):
  net: mana: Create separate EQs for each vPort
  net: mana: Query device capabilities and configure MSI-X sharing for
    EQs
  net: mana: Introduce GIC context with refcounting for interrupt
    management
  net: mana: Use GIC functions to allocate global EQs
  net: mana: Allocate interrupt context for each EQ when creating vPort
  RDMA/mana_ib: Allocate interrupt contexts on EQs

 drivers/infiniband/hw/mana/main.c             |  62 +++-
 drivers/infiniband/hw/mana/qp.c               |  16 +-
 .../net/ethernet/microsoft/mana/gdma_main.c   | 316 +++++++++++++-----
 drivers/net/ethernet/microsoft/mana/mana_en.c | 169 ++++++----
 include/net/mana/gdma.h                       |  33 +-
 include/net/mana/mana.h                       |   7 +-
 6 files changed, 434 insertions(+), 169 deletions(-)

-- 
2.43.0

^ permalink raw reply

* [PATCH net-next] net: mana: Add handler for sriov configure
From: Haiyang Zhang @ 2026-05-08 22:04 UTC (permalink / raw)
  To: linux-hyperv, netdev, K. Y. Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Andrew Lunn, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Bjorn Helgaas, Simon Horman,
	Shradha Gupta, Dipayaan Roy, Erni Sri Satya Vennela, linux-kernel,
	linux-pci
  Cc: paulros

From: Haiyang Zhang <haiyangz@microsoft.com>

Add the sriov_configure callback to the MANA pci_driver.

Also disable VF autoprobe when the driver runs as the PF driver on bare
metal, since the hardware side may not have the VF ready immediately.

Export pci_vf_drivers_autoprobe() so the driver can toggle the VF
autoprobe flag.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 .../net/ethernet/microsoft/mana/gdma_main.c   | 20 +++++++++++++++++++
 drivers/pci/iov.c                             |  1 +
 2 files changed, 21 insertions(+)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 3bc3fff55999..767f11d5b351 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -2094,6 +2094,11 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	gc->numa_node = dev_to_node(&pdev->dev);
 	gc->is_pf = mana_is_pf(pdev->device);
+
+	/* Disable VF autoprobe on BM */
+	if (gc->is_pf)
+		pci_vf_drivers_autoprobe(pdev, false);
+
 	gc->bar0_va = bar0_va;
 	gc->dev = &pdev->dev;
 	xa_init(&gc->irq_contexts);
@@ -2262,6 +2267,20 @@ static void mana_gd_shutdown(struct pci_dev *pdev)
 	pci_disable_device(pdev);
 }
 
+static int mana_sriov_configure(struct pci_dev *pdev, int numvfs)
+{
+	int err = 0;
+
+	dev_info(&pdev->dev, "Requested num VFs: %d\n", numvfs);
+
+	if (numvfs > 0)
+		err = pci_enable_sriov(pdev, numvfs);
+	else
+		pci_disable_sriov(pdev);
+
+	return err ? err : numvfs;
+}
+
 static const struct pci_device_id mana_id_table[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT, MANA_PF_DEVICE_ID) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT, MANA_VF_DEVICE_ID) },
@@ -2276,6 +2295,7 @@ static struct pci_driver mana_driver = {
 	.suspend	= mana_gd_suspend,
 	.resume		= mana_gd_resume,
 	.shutdown	= mana_gd_shutdown,
+	.sriov_configure = mana_sriov_configure,
 };
 
 static int __init mana_driver_init(void)
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 91ac4e37ecb9..5a701f44b8fd 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -1127,6 +1127,7 @@ void pci_vf_drivers_autoprobe(struct pci_dev *dev, bool auto_probe)
 	if (dev->is_physfn)
 		dev->sriov->drivers_autoprobe = auto_probe;
 }
+EXPORT_SYMBOL_GPL(pci_vf_drivers_autoprobe);
 
 /**
  * pci_iov_bus_range - find bus range used by Virtual Function
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH net-next v8] net: mana: Expose hardware diagnostic info via debugfs
From: Jakub Kicinski @ 2026-05-08 22:01 UTC (permalink / raw)
  To: ernis
  Cc: Jakub Kicinski, kys, haiyangz, wei.liu, decui, longli,
	andrew+netdev, davem, edumazet, pabeni, kotaranov, horms,
	shradhagupta, dipayanroy, shirazsaleem, yury.norov, kees,
	linux-hyperv, netdev, linux-kernel, linux-rdma
In-Reply-To: <20260506055128.291494-1-ernis@linux.microsoft.com>

This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
---
net: mana: Expose hardware diagnostic info via debugfs

Add debugfs entries to expose hardware configuration and diagnostic
information that aids in debugging driver initialization and runtime
operations without adding noise to dmesg.

> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
[ ... ]
> @@ -2231,13 +2239,18 @@ int mana_gd_resume(struct pci_dev *pdev)
>  
>  	err = mana_probe(&gc->mana, true);
>  	if (err)
> -		return err;
> +		goto cleanup_gd;
>  
>  	err = mana_rdma_probe(&gc->mana_ib);
>  	if (err)
> -		return err;
> +		goto cleanup_mana;
>  
>  	return 0;
> +cleanup_mana:
> +	mana_remove(&gc->mana, true);
> +cleanup_gd:
> +	mana_gd_cleanup_device(pdev);
> +	return err;
>  }

When mana_remove() is called with suspending set to true, it skips
unregister_netdevice(), so the network devices remain registered in the
system.

If mana_gd_cleanup_device() is then called, which destroys the hardware
channel and clears gc->hwc.driver_data, could this lead to a NULL pointer
dereference if userspace later interacts with the still-registered devices?

> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
[ ... ]
> @@ -3161,6 +3169,23 @@ static int mana_init_port(struct net_device *ndev)
>  	eth_hw_addr_set(ndev, apc->mac_addr);
>  	sprintf(vport, "vport%d", port_idx);
>  	apc->mana_port_debugfs = debugfs_create_dir(vport, gc->mana_pci_debugfs);
> +
> +	debugfs_create_u64("port_handle", 0400, apc->mana_port_debugfs,
> +			   &apc->port_handle);

When the device suspends, mana_gd_suspend() calls mana_gd_cleanup_device(),
which recursively removes the entire debugfs directory tree.

During resume, the parent debugfs directory is recreated, but mana_probe()
skips calling mana_init_port() for existing ports.

Does this mean the per-vPort debugfs entries are permanently lost after a
suspend and resume cycle?

[ ... ]
> @@ -3800,6 +3830,9 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
>  
>  	mana_gd_deregister_device(gd);
>  
> +	debugfs_lookup_and_remove("bm_hostmode", gc->mana_pci_debugfs);
> +	debugfs_lookup_and_remove("num_vports", gc->mana_pci_debugfs);
> +
>  	if (suspending)
>  		return;

Earlier in mana_remove(), free_netdev() is called, which frees the port
context structure (apc) allocated via netdev_priv(ndev).

Since the per-vPort debugfs files created in mana_init_port() directly
reference memory within apc, and apc->mana_port_debugfs is not explicitly
removed here, does this leave the debugfs files accessible but pointing to
freed memory until the parent directory is finally removed in
mana_gd_cleanup_device()?

^ permalink raw reply

* RE: [PATCH v3] mshv: support 1G hugepages by passing them as 2M-aligned chunks
From: Michael Kelley @ 2026-05-08 16:27 UTC (permalink / raw)
  To: Anirudh Rayabharam (Microsoft), K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li
  Cc: linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <20260506-huge_1g-v3-1-26e1e4c439e4@anirudhrb.com>

From: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com> Sent: Wednesday, May 6, 2026 6:45 AM
> 
> The hypervisor's map GPA hypercall coalesces contiguous 2M-aligned
> chunks into 1G mappings when alignment permits, so the driver can
> support 1G hugepages by feeding them in as 2M chunks. Note that this
> is the only way to make 1G mappings; there is no way to directly map
> a 1G hugepage using the hypercall.
> 
> Update mshv_chunk_stride() to:
> 
>   - Accept 2M-aligned tail pages of a larger folio. The previous
>     PageHead() check rejected every page after the head of a 1G
>     hugepage and fell back to 4K mappings for the remaining 1022 MB.
>     Replace it with a PFN alignment check so any 2M-aligned page of a
>     sufficiently large folio is acceptable.
> 
>   - Always emit a 2M (PMD_ORDER) stride for the huge-page case. The
>     hypercall has no 1G stride, so 1G folios are processed as a
>     sequence of 2M chunks. Folios whose order is neither PMD_ORDER nor
>     PUD_ORDER (e.g. mTHP) fall back to single-page stride; mapping
>     them as 2M would fail in the hypervisor anyway.
> 
> Assisted-by: Copilot-CLI:claude-opus-4.7
> Signed-off-by: Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> ---
> Changes in v3:
> - Fixed various corner cases reported by Sashiko.
> - Link to v2: https://lore.kernel.org/r/20260505-huge_1g-v2-1-b6a91327a88d@anirudhrb.com
> 
> Changes in v2:
> - Handled the case where we can have 2M aligned pages in the middle of a
>   1G page
> - Brought back the page order check but expanded it to include 1G
> - Clamp stride to requested page count in mshv_region_process_chunk
> - Link to v1: https://lore.kernel.org/r/20260416-huge_1g-v1-1-e066738cddfb@anirudhrb.com
> ---
>  drivers/hv/mshv_regions.c | 32 +++++++++++++++-----------------
>  1 file changed, 15 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
> index fdffd4f002f6..1756b733968c 100644
> --- a/drivers/hv/mshv_regions.c
> +++ b/drivers/hv/mshv_regions.c
> @@ -29,29 +29,28 @@
>   * Uses huge page stride if the backing page is huge and the guest mapping
>   * is properly aligned; otherwise falls back to single page stride.
>   *
> - * Return: Stride in pages, or -EINVAL if page order is unsupported.
> + * Return: Stride in pages.
>   */
> -static int mshv_chunk_stride(struct page *page,
> -			     u64 gfn, u64 page_count)
> +static unsigned int mshv_chunk_stride(struct page *page, u64 gfn,
> +				      u64 page_count)
>  {
> -	unsigned int page_order;
> +	unsigned int page_order = folio_order(page_folio(page));
> 
>  	/*
>  	 * Use single page stride by default. For huge page stride, the
> -	 * page must be compound and point to the head of the compound
> -	 * page, and both gfn and page_count must be huge-page aligned.
> +	 * page must be compound, the page's PFN must itself be 2M-aligned
> +	 * (so that a 2M-aligned tail page of a larger folio is acceptable),
> +	 * and both gfn and page_count must be huge-page aligned.
>  	 */
> -	if (!PageCompound(page) || !PageHead(page) ||
> +	if (!PageCompound(page) ||
> +	    !IS_ALIGNED(page_to_pfn(page), PTRS_PER_PMD) ||
>  	    !IS_ALIGNED(gfn, PTRS_PER_PMD) ||
> -	    !IS_ALIGNED(page_count, PTRS_PER_PMD))
> +	    !IS_ALIGNED(page_count, PTRS_PER_PMD) ||
> +	    (page_order != PMD_ORDER && page_order != PUD_ORDER))

One more thought on this patch:

This test could be unnecessarily restrictive. For example, if
there was a 4 MiB contiguous physical memory allocation,
page_order would be PMD_ORDER+1. There's no reason to
map such memory as single pages. While today there may
be no way for the user space VMM process address space
to be populated with a 4 MiB contiguous physical memory
range, who knows what the mm subsystem might do in the
future. I'd suggest doing (page_order < PMD_ORDER) to
allow page_orders of PMD_ORDER or bigger to be
processed in PMD-size chunks.

Michael

>  		return 1;
> 
> -	page_order = folio_order(page_folio(page));
> -	/* The hypervisor only supports 2M huge page */
> -	if (page_order != PMD_ORDER)
> -		return -EINVAL;
> -
> -	return 1 << page_order;
> +	/* Use 2M stride always i.e. process 1G folios as 2M chunks */
> +	return 1 << PMD_ORDER;
>  }
> 
>  /**
> @@ -86,15 +85,14 @@ static long mshv_region_process_chunk(struct mshv_mem_region *region,
>  	u64 gfn = region->start_gfn + page_offset;
>  	u64 count;
>  	struct page *page;
> -	int stride, ret;
> +	unsigned int stride;
> +	int ret;
> 
>  	page = region->mreg_pages[page_offset];
>  	if (!page)
>  		return -EINVAL;
> 
>  	stride = mshv_chunk_stride(page, gfn, page_count);
> -	if (stride < 0)
> -		return stride;
> 
>  	/* Start at stride since the first stride is validated */
>  	for (count = stride; count < page_count; count += stride) {
> 
> ---
> base-commit: cd9f2e7d6e5b1837ef40b96e300fa28b73ab5a77
> change-id: 20260416-huge_1g-e44461393c8f
> 
> Best regards,
> --
> Anirudh Rayabharam (Microsoft) <anirudh@anirudhrb.com>
> 


^ permalink raw reply

* [PATCH net-next,v9 2/2] net: mana: force full-page RX buffers via ethtool private flag
From: Dipayaan Roy @ 2026-05-08 14:27 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
	ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
	john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
In-Reply-To: <20260508142921.497921-1-dipayanroy@linux.microsoft.com>

On some ARM64 platforms with 4K PAGE_SIZE, page_pool fragment
allocation in the RX refill path can cause 15-20% throughput
regression under high connection counts (>16 TCP streams).

Add an ethtool private flag "full-page-rx" that allows the user to
force one RX buffer per page, bypassing the page_pool fragment path.
This restores line-rate (180+ Gbps) performance on affected platforms.

Usage:
  ethtool --set-priv-flags eth0 full-page-rx on

There is no behavioral change by default. The flag must be explicitly
enabled by the user or udev rule.

The existing single-buffer-per-page logic for XDP and jumbo frames is
consolidated into a new helper mana_use_single_rxbuf_per_page() which
is now the single decision point for both the automatic and
user-controlled paths.

Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c |  22 +++-
 .../ethernet/microsoft/mana/mana_ethtool.c    | 103 ++++++++++++++++++
 include/net/mana/mana.h                       |   8 ++
 3 files changed, 131 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 462a457e7d53..c4bc8bf19d75 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -744,6 +744,25 @@ static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da)
 	return va;
 }
 
+static bool
+mana_use_single_rxbuf_per_page(struct mana_port_context *apc, u32 mtu)
+{
+	/* On some platforms with 4K PAGE_SIZE, page_pool fragment allocation
+	 * in the RX refill path (~2kB buffer) can cause significant throughput
+	 * regression under high connection counts. Allow user to force one RX
+	 * buffer per page via ethtool private flag to bypass the fragment
+	 * path.
+	 */
+	if (apc->priv_flags & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF))
+		return true;
+
+	/* For xdp and jumbo frames make sure only one packet fits per page. */
+	if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc))
+		return true;
+
+	return false;
+}
+
 /* Get RX buffer's data size, alloc size, XDP headroom based on MTU */
 static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
 			       int mtu, u32 *datasize, u32 *alloc_size,
@@ -754,8 +773,7 @@ static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
 	/* Calculate datasize first (consistent across all cases) */
 	*datasize = mtu + ETH_HLEN;
 
-	/* For xdp and jumbo frames make sure only one packet fits per page */
-	if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc)) {
+	if (mana_use_single_rxbuf_per_page(apc, mtu)) {
 		if (mana_xdp_get(apc)) {
 			*headroom = XDP_PACKET_HEADROOM;
 			*alloc_size = PAGE_SIZE;
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 7e79681634db..f22bbb325948 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -133,6 +133,10 @@ static const struct mana_stats_desc mana_phy_stats[] = {
 	{ "hc_tc7_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc7_phy) },
 };
 
+static const char mana_priv_flags[MANA_PRIV_FLAG_MAX][ETH_GSTRING_LEN] = {
+	[MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF] = "full-page-rx"
+};
+
 static int mana_get_sset_count(struct net_device *ndev, int stringset)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
@@ -144,6 +148,10 @@ static int mana_get_sset_count(struct net_device *ndev, int stringset)
 		       ARRAY_SIZE(mana_phy_stats) +
 		       ARRAY_SIZE(mana_hc_stats)  +
 		       num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
+
+	case ETH_SS_PRIV_FLAGS:
+		return MANA_PRIV_FLAG_MAX;
+
 	default:
 		return -EINVAL;
 	}
@@ -192,6 +200,14 @@ static void mana_get_strings_stats(struct mana_port_context *apc, u8 **data)
 	}
 }
 
+static void mana_get_strings_priv_flags(u8 **data)
+{
+	int i;
+
+	for (i = 0; i < MANA_PRIV_FLAG_MAX; i++)
+		ethtool_puts(data, mana_priv_flags[i]);
+}
+
 static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
@@ -200,6 +216,9 @@ static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
 	case ETH_SS_STATS:
 		mana_get_strings_stats(apc, &data);
 		break;
+	case ETH_SS_PRIV_FLAGS:
+		mana_get_strings_priv_flags(&data);
+		break;
 	default:
 		break;
 	}
@@ -590,6 +609,88 @@ static int mana_get_link_ksettings(struct net_device *ndev,
 	return 0;
 }
 
+static u32 mana_get_priv_flags(struct net_device *ndev)
+{
+	struct mana_port_context *apc = netdev_priv(ndev);
+
+	return apc->priv_flags;
+}
+
+static int mana_set_priv_flags(struct net_device *ndev, u32 priv_flags)
+{
+	struct mana_port_context *apc = netdev_priv(ndev);
+	u32 changed = apc->priv_flags ^ priv_flags;
+	u32 old_priv_flags = apc->priv_flags;
+	bool schedule_port_reset = false;
+	int err = 0;
+
+	if (!changed)
+		return 0;
+
+	/* Reject unknown bits */
+	if (priv_flags & ~GENMASK(MANA_PRIV_FLAG_MAX - 1, 0))
+		return -EINVAL;
+
+	if (changed & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF)) {
+		apc->priv_flags = priv_flags;
+
+		if (!apc->port_is_up) {
+			/* Port is down, flag updated to apply on next up
+			 * so just return.
+			 */
+			return 0;
+		}
+
+		/* Pre-allocate buffers to prevent failure in mana_attach
+		 * later
+		 */
+		err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues);
+		if (err) {
+			netdev_err(ndev,
+				   "Insufficient memory for new allocations\n");
+			apc->priv_flags = old_priv_flags;
+			return err;
+		}
+
+		err = mana_detach(ndev, false);
+		if (err) {
+			netdev_err(ndev, "mana_detach failed: %d\n", err);
+			apc->priv_flags = old_priv_flags;
+
+			/* Port is in an inconsistent state. Restore
+			 * 'port_is_up' so that queue reset work handler
+			 * can properly detach and re-attach.
+			 */
+			apc->port_is_up = true;
+			schedule_port_reset = true;
+			goto out;
+		}
+
+		err = mana_attach(ndev);
+		if (err) {
+			netdev_err(ndev, "mana_attach failed: %d\n", err);
+			apc->priv_flags = old_priv_flags;
+
+			/* Restore 'port_is_up' so the reset work handler
+			 * can properly detach/attach. Without this,
+			 * the handler sees port_is_up=false and skips
+			 * queue allocation, leaving the port dead.
+			 */
+			apc->port_is_up = true;
+			schedule_port_reset = true;
+		}
+	}
+
+out:
+	mana_pre_dealloc_rxbufs(apc);
+
+	if (schedule_port_reset)
+		queue_work(apc->ac->per_port_queue_reset_wq,
+			   &apc->queue_reset_work);
+
+	return err;
+}
+
 const struct ethtool_ops mana_ethtool_ops = {
 	.supported_coalesce_params = ETHTOOL_COALESCE_RX_CQE_FRAMES,
 	.get_ethtool_stats	= mana_get_ethtool_stats,
@@ -608,4 +709,6 @@ const struct ethtool_ops mana_ethtool_ops = {
 	.set_ringparam          = mana_set_ringparam,
 	.get_link_ksettings	= mana_get_link_ksettings,
 	.get_link		= ethtool_op_get_link,
+	.get_priv_flags		= mana_get_priv_flags,
+	.set_priv_flags		= mana_set_priv_flags,
 };
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index aa90a858c8e3..1d44a78da520 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -30,6 +30,12 @@ enum TRI_STATE {
 	TRI_STATE_TRUE = 1
 };
 
+/* MANA ethtool private flag bit positions */
+enum mana_priv_flag_bits {
+	MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF = 0,
+	MANA_PRIV_FLAG_MAX,
+};
+
 /* Number of entries for hardware indirection table must be in power of 2 */
 #define MANA_INDIRECT_TABLE_MAX_SIZE 512
 #define MANA_INDIRECT_TABLE_DEF_SIZE 64
@@ -531,6 +537,8 @@ struct mana_port_context {
 	u32 rxbpre_headroom;
 	u32 rxbpre_frag_count;
 
+	u32 priv_flags;
+
 	struct bpf_prog *bpf_prog;
 
 	/* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. */
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next,v9 1/2] net: mana: refactor mana_get_strings() and mana_get_sset_count() to use switch
From: Dipayaan Roy @ 2026-05-08 14:27 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
	ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
	john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
In-Reply-To: <20260508142921.497921-1-dipayanroy@linux.microsoft.com>

Refactor mana_get_strings() and mana_get_sset_count() from if/else to
switch statements in preparation for adding ethtool private flags
support which requires handling ETH_SS_PRIV_FLAGS.

No functional change.

Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
 .../ethernet/microsoft/mana/mana_ethtool.c    | 75 ++++++++++++-------
 1 file changed, 46 insertions(+), 29 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 04350973e19e..7e79681634db 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -138,53 +138,70 @@ static int mana_get_sset_count(struct net_device *ndev, int stringset)
 	struct mana_port_context *apc = netdev_priv(ndev);
 	unsigned int num_queues = apc->num_queues;
 
-	if (stringset != ETH_SS_STATS)
+	switch (stringset) {
+	case ETH_SS_STATS:
+		return ARRAY_SIZE(mana_eth_stats) +
+		       ARRAY_SIZE(mana_phy_stats) +
+		       ARRAY_SIZE(mana_hc_stats)  +
+		       num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
+	default:
 		return -EINVAL;
-
-	return ARRAY_SIZE(mana_eth_stats) + ARRAY_SIZE(mana_phy_stats) + ARRAY_SIZE(mana_hc_stats) +
-			num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
+	}
 }
 
-static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
+static void mana_get_strings_stats(struct mana_port_context *apc, u8 **data)
 {
-	struct mana_port_context *apc = netdev_priv(ndev);
 	unsigned int num_queues = apc->num_queues;
 	int i, j;
 
-	if (stringset != ETH_SS_STATS)
-		return;
 	for (i = 0; i < ARRAY_SIZE(mana_eth_stats); i++)
-		ethtool_puts(&data, mana_eth_stats[i].name);
+		ethtool_puts(data, mana_eth_stats[i].name);
 
 	for (i = 0; i < ARRAY_SIZE(mana_hc_stats); i++)
-		ethtool_puts(&data, mana_hc_stats[i].name);
+		ethtool_puts(data, mana_hc_stats[i].name);
 
 	for (i = 0; i < ARRAY_SIZE(mana_phy_stats); i++)
-		ethtool_puts(&data, mana_phy_stats[i].name);
+		ethtool_puts(data, mana_phy_stats[i].name);
 
 	for (i = 0; i < num_queues; i++) {
-		ethtool_sprintf(&data, "rx_%d_packets", i);
-		ethtool_sprintf(&data, "rx_%d_bytes", i);
-		ethtool_sprintf(&data, "rx_%d_xdp_drop", i);
-		ethtool_sprintf(&data, "rx_%d_xdp_tx", i);
-		ethtool_sprintf(&data, "rx_%d_xdp_redirect", i);
-		ethtool_sprintf(&data, "rx_%d_pkt_len0_err", i);
+		ethtool_sprintf(data, "rx_%d_packets", i);
+		ethtool_sprintf(data, "rx_%d_bytes", i);
+		ethtool_sprintf(data, "rx_%d_xdp_drop", i);
+		ethtool_sprintf(data, "rx_%d_xdp_tx", i);
+		ethtool_sprintf(data, "rx_%d_xdp_redirect", i);
+		ethtool_sprintf(data, "rx_%d_pkt_len0_err", i);
 		for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1; j++)
-			ethtool_sprintf(&data, "rx_%d_coalesced_cqe_%d", i, j + 2);
+			ethtool_sprintf(data,
+					"rx_%d_coalesced_cqe_%d",
+					i,
+					j + 2);
 	}
 
 	for (i = 0; i < num_queues; i++) {
-		ethtool_sprintf(&data, "tx_%d_packets", i);
-		ethtool_sprintf(&data, "tx_%d_bytes", i);
-		ethtool_sprintf(&data, "tx_%d_xdp_xmit", i);
-		ethtool_sprintf(&data, "tx_%d_tso_packets", i);
-		ethtool_sprintf(&data, "tx_%d_tso_bytes", i);
-		ethtool_sprintf(&data, "tx_%d_tso_inner_packets", i);
-		ethtool_sprintf(&data, "tx_%d_tso_inner_bytes", i);
-		ethtool_sprintf(&data, "tx_%d_long_pkt_fmt", i);
-		ethtool_sprintf(&data, "tx_%d_short_pkt_fmt", i);
-		ethtool_sprintf(&data, "tx_%d_csum_partial", i);
-		ethtool_sprintf(&data, "tx_%d_mana_map_err", i);
+		ethtool_sprintf(data, "tx_%d_packets", i);
+		ethtool_sprintf(data, "tx_%d_bytes", i);
+		ethtool_sprintf(data, "tx_%d_xdp_xmit", i);
+		ethtool_sprintf(data, "tx_%d_tso_packets", i);
+		ethtool_sprintf(data, "tx_%d_tso_bytes", i);
+		ethtool_sprintf(data, "tx_%d_tso_inner_packets", i);
+		ethtool_sprintf(data, "tx_%d_tso_inner_bytes", i);
+		ethtool_sprintf(data, "tx_%d_long_pkt_fmt", i);
+		ethtool_sprintf(data, "tx_%d_short_pkt_fmt", i);
+		ethtool_sprintf(data, "tx_%d_csum_partial", i);
+		ethtool_sprintf(data, "tx_%d_mana_map_err", i);
+	}
+}
+
+static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
+{
+	struct mana_port_context *apc = netdev_priv(ndev);
+
+	switch (stringset) {
+	case ETH_SS_STATS:
+		mana_get_strings_stats(apc, &data);
+		break;
+	default:
+		break;
 	}
 }
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next,v9 0/2] net: mana: add ethtool private flag for full-page RX buffers
From: Dipayaan Roy @ 2026-05-08 14:27 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
	ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
	john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov

On some ARM64 platforms with 4K PAGE_SIZE, utilizing page_pool 
fragments for allocation in the RX refill path (~2kB buffer per fragment)
causes 15-20% throughput regression under high connection counts
(>16 TCP streams at 180+ Gbps). Using full-page buffers on these
platforms shows no regression and restores line-rate performance.

This behavior is observed on a single platform; other platforms
perform better with page_pool fragments, indicating this is not a
page_pool issue but platform-specific.

This series adds an ethtool private flag "full-page-rx" to let the
user opt in to one RX buffer per page:

  ethtool --set-priv-flags eth0 full-page-rx on

There is no behavioral change by default. The flag can be persisted
via udev rule for affected platforms.

Changes in v9:
  - Added the correct target tree (net-next) to the subject prefix.
Changes in v8:
  - Fixed queue_reset_work recovery by restoring port_is_up before
    scheduling reset so the handler can properly re-attach.
  - Simplified "err && schedule_port_reset" to "schedule_port_reset".
Changes in v7:
  - Rebased onto net-next.
  - Retained private flag approach after David Wei's testing on
    Grace (ARM64) confirmed that fragment mode outperforms
    full-page mode on other platforms, validating this is a
    single-platform workaround rather than a generic issue.
Changes in v6:
  - Added missed maintainers.
Changes in v5:
  - Split prep refactor into separate patch (patch 1/2)
Changes in v4:
  - Dropping the smbios string parsing and add ethtool priv flag
    to reconfigure the queues with full page rx buffers.
Changes in v3:
  - changed u8* to char*
Changes in v2:
  - separate reading string index and the string, remove inline.

Dipayaan Roy (2):
  net: mana: refactor mana_get_strings() and mana_get_sset_count() to
    use switch
  net: mana: force full-page RX buffers via ethtool private flag

 drivers/net/ethernet/microsoft/mana/mana_en.c |  22 ++-
 .../ethernet/microsoft/mana/mana_ethtool.c    | 178 +++++++++++++++---
 include/net/mana/mana.h                       |   8 +
 3 files changed, 177 insertions(+), 31 deletions(-)

-- 
2.43.0


^ permalink raw reply

* [PATCH v8 2/2] net: mana: force full-page RX buffers via ethtool private flag
From: Dipayaan Roy @ 2026-05-08 11:46 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
	ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
	john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
In-Reply-To: <20260508115100.488506-1-dipayanroy@linux.microsoft.com>

On some ARM64 platforms with 4K PAGE_SIZE, page_pool fragment
allocation in the RX refill path can cause 15-20% throughput
regression under high connection counts (>16 TCP streams).

Add an ethtool private flag "full-page-rx" that allows the user to
force one RX buffer per page, bypassing the page_pool fragment path.
This restores line-rate (180+ Gbps) performance on affected platforms.

Usage:
  ethtool --set-priv-flags eth0 full-page-rx on

There is no behavioral change by default. The flag must be explicitly
enabled by the user or udev rule.

The existing single-buffer-per-page logic for XDP and jumbo frames is
consolidated into a new helper mana_use_single_rxbuf_per_page() which
is now the single decision point for both the automatic and
user-controlled paths.

Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
 drivers/net/ethernet/microsoft/mana/mana_en.c |  22 +++-
 .../ethernet/microsoft/mana/mana_ethtool.c    | 103 ++++++++++++++++++
 include/net/mana/mana.h                       |   8 ++
 3 files changed, 131 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 462a457e7d53..c4bc8bf19d75 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -744,6 +744,25 @@ static void *mana_get_rxbuf_pre(struct mana_rxq *rxq, dma_addr_t *da)
 	return va;
 }
 
+static bool
+mana_use_single_rxbuf_per_page(struct mana_port_context *apc, u32 mtu)
+{
+	/* On some platforms with 4K PAGE_SIZE, page_pool fragment allocation
+	 * in the RX refill path (~2kB buffer) can cause significant throughput
+	 * regression under high connection counts. Allow user to force one RX
+	 * buffer per page via ethtool private flag to bypass the fragment
+	 * path.
+	 */
+	if (apc->priv_flags & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF))
+		return true;
+
+	/* For xdp and jumbo frames make sure only one packet fits per page. */
+	if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc))
+		return true;
+
+	return false;
+}
+
 /* Get RX buffer's data size, alloc size, XDP headroom based on MTU */
 static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
 			       int mtu, u32 *datasize, u32 *alloc_size,
@@ -754,8 +773,7 @@ static void mana_get_rxbuf_cfg(struct mana_port_context *apc,
 	/* Calculate datasize first (consistent across all cases) */
 	*datasize = mtu + ETH_HLEN;
 
-	/* For xdp and jumbo frames make sure only one packet fits per page */
-	if (mtu + MANA_RXBUF_PAD > PAGE_SIZE / 2 || mana_xdp_get(apc)) {
+	if (mana_use_single_rxbuf_per_page(apc, mtu)) {
 		if (mana_xdp_get(apc)) {
 			*headroom = XDP_PACKET_HEADROOM;
 			*alloc_size = PAGE_SIZE;
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 7e79681634db..f22bbb325948 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -133,6 +133,10 @@ static const struct mana_stats_desc mana_phy_stats[] = {
 	{ "hc_tc7_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc7_phy) },
 };
 
+static const char mana_priv_flags[MANA_PRIV_FLAG_MAX][ETH_GSTRING_LEN] = {
+	[MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF] = "full-page-rx"
+};
+
 static int mana_get_sset_count(struct net_device *ndev, int stringset)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
@@ -144,6 +148,10 @@ static int mana_get_sset_count(struct net_device *ndev, int stringset)
 		       ARRAY_SIZE(mana_phy_stats) +
 		       ARRAY_SIZE(mana_hc_stats)  +
 		       num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
+
+	case ETH_SS_PRIV_FLAGS:
+		return MANA_PRIV_FLAG_MAX;
+
 	default:
 		return -EINVAL;
 	}
@@ -192,6 +200,14 @@ static void mana_get_strings_stats(struct mana_port_context *apc, u8 **data)
 	}
 }
 
+static void mana_get_strings_priv_flags(u8 **data)
+{
+	int i;
+
+	for (i = 0; i < MANA_PRIV_FLAG_MAX; i++)
+		ethtool_puts(data, mana_priv_flags[i]);
+}
+
 static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
@@ -200,6 +216,9 @@ static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
 	case ETH_SS_STATS:
 		mana_get_strings_stats(apc, &data);
 		break;
+	case ETH_SS_PRIV_FLAGS:
+		mana_get_strings_priv_flags(&data);
+		break;
 	default:
 		break;
 	}
@@ -590,6 +609,88 @@ static int mana_get_link_ksettings(struct net_device *ndev,
 	return 0;
 }
 
+static u32 mana_get_priv_flags(struct net_device *ndev)
+{
+	struct mana_port_context *apc = netdev_priv(ndev);
+
+	return apc->priv_flags;
+}
+
+static int mana_set_priv_flags(struct net_device *ndev, u32 priv_flags)
+{
+	struct mana_port_context *apc = netdev_priv(ndev);
+	u32 changed = apc->priv_flags ^ priv_flags;
+	u32 old_priv_flags = apc->priv_flags;
+	bool schedule_port_reset = false;
+	int err = 0;
+
+	if (!changed)
+		return 0;
+
+	/* Reject unknown bits */
+	if (priv_flags & ~GENMASK(MANA_PRIV_FLAG_MAX - 1, 0))
+		return -EINVAL;
+
+	if (changed & BIT(MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF)) {
+		apc->priv_flags = priv_flags;
+
+		if (!apc->port_is_up) {
+			/* Port is down, flag updated to apply on next up
+			 * so just return.
+			 */
+			return 0;
+		}
+
+		/* Pre-allocate buffers to prevent failure in mana_attach
+		 * later
+		 */
+		err = mana_pre_alloc_rxbufs(apc, ndev->mtu, apc->num_queues);
+		if (err) {
+			netdev_err(ndev,
+				   "Insufficient memory for new allocations\n");
+			apc->priv_flags = old_priv_flags;
+			return err;
+		}
+
+		err = mana_detach(ndev, false);
+		if (err) {
+			netdev_err(ndev, "mana_detach failed: %d\n", err);
+			apc->priv_flags = old_priv_flags;
+
+			/* Port is in an inconsistent state. Restore
+			 * 'port_is_up' so that queue reset work handler
+			 * can properly detach and re-attach.
+			 */
+			apc->port_is_up = true;
+			schedule_port_reset = true;
+			goto out;
+		}
+
+		err = mana_attach(ndev);
+		if (err) {
+			netdev_err(ndev, "mana_attach failed: %d\n", err);
+			apc->priv_flags = old_priv_flags;
+
+			/* Restore 'port_is_up' so the reset work handler
+			 * can properly detach/attach. Without this,
+			 * the handler sees port_is_up=false and skips
+			 * queue allocation, leaving the port dead.
+			 */
+			apc->port_is_up = true;
+			schedule_port_reset = true;
+		}
+	}
+
+out:
+	mana_pre_dealloc_rxbufs(apc);
+
+	if (schedule_port_reset)
+		queue_work(apc->ac->per_port_queue_reset_wq,
+			   &apc->queue_reset_work);
+
+	return err;
+}
+
 const struct ethtool_ops mana_ethtool_ops = {
 	.supported_coalesce_params = ETHTOOL_COALESCE_RX_CQE_FRAMES,
 	.get_ethtool_stats	= mana_get_ethtool_stats,
@@ -608,4 +709,6 @@ const struct ethtool_ops mana_ethtool_ops = {
 	.set_ringparam          = mana_set_ringparam,
 	.get_link_ksettings	= mana_get_link_ksettings,
 	.get_link		= ethtool_op_get_link,
+	.get_priv_flags		= mana_get_priv_flags,
+	.set_priv_flags		= mana_set_priv_flags,
 };
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index aa90a858c8e3..1d44a78da520 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -30,6 +30,12 @@ enum TRI_STATE {
 	TRI_STATE_TRUE = 1
 };
 
+/* MANA ethtool private flag bit positions */
+enum mana_priv_flag_bits {
+	MANA_PRIV_FLAG_USE_FULL_PAGE_RXBUF = 0,
+	MANA_PRIV_FLAG_MAX,
+};
+
 /* Number of entries for hardware indirection table must be in power of 2 */
 #define MANA_INDIRECT_TABLE_MAX_SIZE 512
 #define MANA_INDIRECT_TABLE_DEF_SIZE 64
@@ -531,6 +537,8 @@ struct mana_port_context {
 	u32 rxbpre_headroom;
 	u32 rxbpre_frag_count;
 
+	u32 priv_flags;
+
 	struct bpf_prog *bpf_prog;
 
 	/* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. */
-- 
2.43.0


^ permalink raw reply related

* [PATCH v8 1/2] net: mana: refactor mana_get_strings() and mana_get_sset_count() to use switch
From: Dipayaan Roy @ 2026-05-08 11:46 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
	ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
	john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
In-Reply-To: <20260508115100.488506-1-dipayanroy@linux.microsoft.com>

Refactor mana_get_strings() and mana_get_sset_count() from if/else to
switch statements in preparation for adding ethtool private flags
support which requires handling ETH_SS_PRIV_FLAGS.

No functional change.

Signed-off-by: Dipayaan Roy <dipayanroy@linux.microsoft.com>
---
 .../ethernet/microsoft/mana/mana_ethtool.c    | 75 ++++++++++++-------
 1 file changed, 46 insertions(+), 29 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 04350973e19e..7e79681634db 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -138,53 +138,70 @@ static int mana_get_sset_count(struct net_device *ndev, int stringset)
 	struct mana_port_context *apc = netdev_priv(ndev);
 	unsigned int num_queues = apc->num_queues;
 
-	if (stringset != ETH_SS_STATS)
+	switch (stringset) {
+	case ETH_SS_STATS:
+		return ARRAY_SIZE(mana_eth_stats) +
+		       ARRAY_SIZE(mana_phy_stats) +
+		       ARRAY_SIZE(mana_hc_stats)  +
+		       num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
+	default:
 		return -EINVAL;
-
-	return ARRAY_SIZE(mana_eth_stats) + ARRAY_SIZE(mana_phy_stats) + ARRAY_SIZE(mana_hc_stats) +
-			num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT);
+	}
 }
 
-static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
+static void mana_get_strings_stats(struct mana_port_context *apc, u8 **data)
 {
-	struct mana_port_context *apc = netdev_priv(ndev);
 	unsigned int num_queues = apc->num_queues;
 	int i, j;
 
-	if (stringset != ETH_SS_STATS)
-		return;
 	for (i = 0; i < ARRAY_SIZE(mana_eth_stats); i++)
-		ethtool_puts(&data, mana_eth_stats[i].name);
+		ethtool_puts(data, mana_eth_stats[i].name);
 
 	for (i = 0; i < ARRAY_SIZE(mana_hc_stats); i++)
-		ethtool_puts(&data, mana_hc_stats[i].name);
+		ethtool_puts(data, mana_hc_stats[i].name);
 
 	for (i = 0; i < ARRAY_SIZE(mana_phy_stats); i++)
-		ethtool_puts(&data, mana_phy_stats[i].name);
+		ethtool_puts(data, mana_phy_stats[i].name);
 
 	for (i = 0; i < num_queues; i++) {
-		ethtool_sprintf(&data, "rx_%d_packets", i);
-		ethtool_sprintf(&data, "rx_%d_bytes", i);
-		ethtool_sprintf(&data, "rx_%d_xdp_drop", i);
-		ethtool_sprintf(&data, "rx_%d_xdp_tx", i);
-		ethtool_sprintf(&data, "rx_%d_xdp_redirect", i);
-		ethtool_sprintf(&data, "rx_%d_pkt_len0_err", i);
+		ethtool_sprintf(data, "rx_%d_packets", i);
+		ethtool_sprintf(data, "rx_%d_bytes", i);
+		ethtool_sprintf(data, "rx_%d_xdp_drop", i);
+		ethtool_sprintf(data, "rx_%d_xdp_tx", i);
+		ethtool_sprintf(data, "rx_%d_xdp_redirect", i);
+		ethtool_sprintf(data, "rx_%d_pkt_len0_err", i);
 		for (j = 0; j < MANA_RXCOMP_OOB_NUM_PPI - 1; j++)
-			ethtool_sprintf(&data, "rx_%d_coalesced_cqe_%d", i, j + 2);
+			ethtool_sprintf(data,
+					"rx_%d_coalesced_cqe_%d",
+					i,
+					j + 2);
 	}
 
 	for (i = 0; i < num_queues; i++) {
-		ethtool_sprintf(&data, "tx_%d_packets", i);
-		ethtool_sprintf(&data, "tx_%d_bytes", i);
-		ethtool_sprintf(&data, "tx_%d_xdp_xmit", i);
-		ethtool_sprintf(&data, "tx_%d_tso_packets", i);
-		ethtool_sprintf(&data, "tx_%d_tso_bytes", i);
-		ethtool_sprintf(&data, "tx_%d_tso_inner_packets", i);
-		ethtool_sprintf(&data, "tx_%d_tso_inner_bytes", i);
-		ethtool_sprintf(&data, "tx_%d_long_pkt_fmt", i);
-		ethtool_sprintf(&data, "tx_%d_short_pkt_fmt", i);
-		ethtool_sprintf(&data, "tx_%d_csum_partial", i);
-		ethtool_sprintf(&data, "tx_%d_mana_map_err", i);
+		ethtool_sprintf(data, "tx_%d_packets", i);
+		ethtool_sprintf(data, "tx_%d_bytes", i);
+		ethtool_sprintf(data, "tx_%d_xdp_xmit", i);
+		ethtool_sprintf(data, "tx_%d_tso_packets", i);
+		ethtool_sprintf(data, "tx_%d_tso_bytes", i);
+		ethtool_sprintf(data, "tx_%d_tso_inner_packets", i);
+		ethtool_sprintf(data, "tx_%d_tso_inner_bytes", i);
+		ethtool_sprintf(data, "tx_%d_long_pkt_fmt", i);
+		ethtool_sprintf(data, "tx_%d_short_pkt_fmt", i);
+		ethtool_sprintf(data, "tx_%d_csum_partial", i);
+		ethtool_sprintf(data, "tx_%d_mana_map_err", i);
+	}
+}
+
+static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data)
+{
+	struct mana_port_context *apc = netdev_priv(ndev);
+
+	switch (stringset) {
+	case ETH_SS_STATS:
+		mana_get_strings_stats(apc, &data);
+		break;
+	default:
+		break;
 	}
 }
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH v8 0/2] net: mana: add ethtool private flag for full-page RX buffers
From: Dipayaan Roy @ 2026-05-08 11:46 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
	ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
	john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov

On some ARM64 platforms with 4K PAGE_SIZE, utilizing page_pool 
fragments for allocation in the RX refill path (~2kB buffer per fragment)
causes 15-20% throughput regression under high connection counts
(>16 TCP streams at 180+ Gbps). Using full-page buffers on these
platforms shows no regression and restores line-rate performance.

This behavior is observed on a single platform; other platforms
perform better with page_pool fragments, indicating this is not a
page_pool issue but platform-specific.

This series adds an ethtool private flag "full-page-rx" to let the
user opt in to one RX buffer per page:

  ethtool --set-priv-flags eth0 full-page-rx on

There is no behavioral change by default. The flag can be persisted
via udev rule for affected platforms.

Changes in v8:
  - Fixed queue_reset_work recovery by restoring port_is_up before
    scheduling reset so the handler can properly re-attach.
  - Simplified "err && schedule_port_reset" to "schedule_port_reset".
Changes in v7:
  - Rebased onto net-next.
  - Retained private flag approach after David Wei's testing on
    Grace (ARM64) confirmed that fragment mode outperforms
    full-page mode on other platforms, validating this is a
    single-platform workaround rather than a generic issue.
Changes in v6:
  - Added missed maintainers.
Changes in v5:
  - Split prep refactor into separate patch (patch 1/2)
Changes in v4:
  - Dropping the smbios string parsing and add ethtool priv flag
    to reconfigure the queues with full page rx buffers.
Changes in v3:
  - changed u8* to char*
Changes in v2:
  - separate reading string index and the string, remove inline.

Dipayaan Roy (2):
  net: mana: refactor mana_get_strings() and mana_get_sset_count() to
    use switch
  net: mana: force full-page RX buffers via ethtool private flag

 drivers/net/ethernet/microsoft/mana/mana_en.c |  22 ++-
 .../ethernet/microsoft/mana/mana_ethtool.c    | 178 +++++++++++++++---
 include/net/mana/mana.h                       |   8 +
 3 files changed, 177 insertions(+), 31 deletions(-)

-- 
2.43.0


^ permalink raw reply

* Re: [PATCH net-next v7 0/2] net: mana: add ethtool private flag for full-page RX buffers
From: Dipayaan Roy @ 2026-05-08 10:48 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, andrew+netdev, davem, edumazet,
	kuba, pabeni, leon, longli, kotaranov, horms, shradhagupta,
	ssengar, ernis, shirazsaleem, linux-hyperv, netdev, linux-kernel,
	linux-rdma, stephen, jacob.e.keller, dipayanroy, leitao, kees,
	john.fastabend, hawk, bpf, daniel, ast, sdf, yury.norov
In-Reply-To: <20260506170034.327907-1-dipayanroy@linux.microsoft.com>

On Wed, May 06, 2026 at 09:58:56AM -0700, Dipayaan Roy wrote:
> On some ARM64 platforms with 4K PAGE_SIZE, utilizing page_pool 
> fragments for allocation in the RX refill path (~2kB buffer per fragment)
> causes 15-20% throughput regression under high connection counts
> (>16 TCP streams at 180+ Gbps). Using full-page buffers on these
> platforms shows no regression and restores line-rate performance.
> 
> This behavior is observed on a single platform; other platforms
> perform better with page_pool fragments, indicating this is not a
> page_pool issue but platform-specific.
> 
> This series adds an ethtool private flag "full-page-rx" to let the
> user opt in to one RX buffer per page:
> 
>   ethtool --set-priv-flags eth0 full-page-rx on
> 
> There is no behavioral change by default. The flag can be persisted
> via udev rule for affected platforms.
> 
> Changes in v7:
>   - Rebased onto net-next.
>   - Retained private flag approach after David Wei's testing on
>     Grace (ARM64) confirmed that fragment mode outperforms
>     full-page mode on other platforms, validating this is a
>     single-platform workaround rather than a generic issue.
> Changes in v6:
>   - Added missed maintainers.
> Changes in v5:
>   - Split prep refactor into separate patch (patch 1/2)
> Changes in v4:
>   - Dropping the smbios string parsing and add ethtool priv flag
>     to reconfigure the queues with full page rx buffers.
> Changes in v3:
>   - changed u8* to char*
> Changes in v2:
>   - separate reading string index and the string, remove inline.
> 
> Dipayaan Roy (2):
>   net: mana: refactor mana_get_strings() and mana_get_sset_count() to
>     use switch
>   net: mana: force full-page RX buffers via ethtool private flag
> 
>  drivers/net/ethernet/microsoft/mana/mana_en.c |  22 ++-
>  .../ethernet/microsoft/mana/mana_ethtool.c    | 164 ++++++++++++++----
>  include/net/mana/mana.h                       |   8 +
>  3 files changed, 163 insertions(+), 31 deletions(-)
> 
> -- 
> 2.43.0
>

Sashiko pointed out a valid issue; I will send a v8 addressing that.
https://netdev-ai.bots.linux.dev/sashiko/#/patchset/20260506170034.327907-1-dipayanroy%40linux.microsoft.com

Thank you 

^ permalink raw reply

* Re: [PATCH v2 07/15] arm64: hyperv: Add support for mshv_vtl_return_call
From: Naman Jain @ 2026-05-08  9:56 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: Mark Rutland, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H . Peter Anvin, Arnd Bergmann, Paul Walmsley, Palmer Dabbelt,
	Albert Ou, Alexandre Ghiti, Michael Kelley, Timothy Hayes,
	Lorenzo Pieralisi, Sascha Bischoff, mrigendrachaubey,
	linux-hyperv, linux-arm-kernel, linux-kernel, linux-arch,
	linux-riscv, vdso, ssengar
In-Reply-To: <86mryaxng3.wl-maz@kernel.org>



On 5/8/2026 2:55 PM, Marc Zyngier wrote:
> On Wed, 29 Apr 2026 10:56:11 +0100,
> Naman Jain <namjain@linux.microsoft.com> wrote:
>>
> 
> [...]
> 
>> Merging threads for addressing comments from Mark Rutland and Marc
>> Zyngier on this patch.
>>
>> Thanks for reviewing the changes. Please allow me to briefly explain
>> the use case here and then address your comments.
>>
>> Hyper-V's Virtual Trust Levels (VTLs) provide hardware-enforced
>> isolation within a single VM, analogous to ARM TrustZone. The kernel
>> runs in VTL2 (higher privilege) as a "paravisor", a security monitor
>> that handles intercepts for the primary OS in VTL0 (lower
>> privilege). The VTL switch (mshv_vtl_return_call) is functionally
>> equivalent to KVM's guest enter/exit. It saves VTL2 state, loads
>> VTL0's GPRs and other registers from a shared context structure, issues
>> hvc #3 to let VTL0 run, and on return saves VTL0's updated state back.
> 
> No, this is fundamentally different. KVM is purely architectural,
> doesn't try to "sanitise" anything, and context switches *all* of the
> guest state. No ifs, no buts, no "reserved registers".
> 
> [...]

Acked.

> 
>> Regarding Non-SMCCC "hvc #3" call, I have a limitation here owing to
>> the ABI that is defined by the Hyper-V hypervisor. Fixing this
>> requires a hypervisor-side change to support SMCCC-style dispatch for
>> VTL return. Until then, hvc #3 is the only working interface. Moreover
>> there would be backward compatibility issues with this new ABI
>> interface, if at all it is added.
> 
> Left hand, please talk to right hand. This is not the first time we
> push back on this, and we already had this annoying discussion back
> when arm64 as a Hyper-V guest was first proposed (6, 7 years ago?).
> 
> What has changed since? Absolutely nothing.
> 
> If the Hyper-V folks decide to ignore the standard and go their own
> way, that's fine. They only get to keep the pieces.
> 
> Thanks,
> 
> 	M.
> 

Thanks for the feedback. I understand your and Mark’s concerns with this 
approach now, and I’ve initiated internal discussions with the team to 
explore potential solutions.

Regards,
Naman

^ permalink raw reply

* Re: [PATCH v2 07/15] arm64: hyperv: Add support for mshv_vtl_return_call
From: Marc Zyngier @ 2026-05-08  9:25 UTC (permalink / raw)
  To: Naman Jain
  Cc: Mark Rutland, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H . Peter Anvin, Arnd Bergmann, Paul Walmsley, Palmer Dabbelt,
	Albert Ou, Alexandre Ghiti, Michael Kelley, Timothy Hayes,
	Lorenzo Pieralisi, Sascha Bischoff, mrigendrachaubey,
	linux-hyperv, linux-arm-kernel, linux-kernel, linux-arch,
	linux-riscv, vdso, ssengar
In-Reply-To: <f4059f5d-a82b-40c2-942e-3e24cefab94f@linux.microsoft.com>

On Wed, 29 Apr 2026 10:56:11 +0100,
Naman Jain <namjain@linux.microsoft.com> wrote:
> 

[...]

> Merging threads for addressing comments from Mark Rutland and Marc
> Zyngier on this patch.
> 
> Thanks for reviewing the changes. Please allow me to briefly explain
> the use case here and then address your comments.
> 
> Hyper-V's Virtual Trust Levels (VTLs) provide hardware-enforced
> isolation within a single VM, analogous to ARM TrustZone. The kernel
> runs in VTL2 (higher privilege) as a "paravisor", a security monitor
> that handles intercepts for the primary OS in VTL0 (lower
> privilege). The VTL switch (mshv_vtl_return_call) is functionally
> equivalent to KVM's guest enter/exit. It saves VTL2 state, loads
> VTL0's GPRs and other registers from a shared context structure, issues
> hvc #3 to let VTL0 run, and on return saves VTL0's updated state back.

No, this is fundamentally different. KVM is purely architectural,
doesn't try to "sanitise" anything, and context switches *all* of the
guest state. No ifs, no buts, no "reserved registers".

[...]

> Regarding Non-SMCCC "hvc #3" call, I have a limitation here owing to
> the ABI that is defined by the Hyper-V hypervisor. Fixing this
> requires a hypervisor-side change to support SMCCC-style dispatch for
> VTL return. Until then, hvc #3 is the only working interface. Moreover
> there would be backward compatibility issues with this new ABI
> interface, if at all it is added.

Left hand, please talk to right hand. This is not the first time we
push back on this, and we already had this annoying discussion back
when arm64 as a Hyper-V guest was first proposed (6, 7 years ago?).

What has changed since? Absolutely nothing.

If the Hyper-V folks decide to ignore the standard and go their own
way, that's fine. They only get to keep the pieces.

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.

^ permalink raw reply

* Re: [PATCH net v2] net: mana: Optimize irq affinity for low vcpu configs
From: Shradha Gupta @ 2026-05-08  5:51 UTC (permalink / raw)
  To: Yury Norov
  Cc: Dexuan Cui, Wei Liu, Haiyang Zhang, K. Y. Srinivasan, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Konstantin Taranov, Simon Horman, Erni Sri Satya Vennela,
	Dipayaan Roy, Shiraz Saleem, Michael Kelley, Long Li, Yury Norov,
	linux-hyperv, linux-kernel, netdev, Paul Rosswurm, Shradha Gupta,
	Saurabh Singh Sengar, stable
In-Reply-To: <afoQHm28qj8JnKww@yury>

On Tue, May 05, 2026 at 11:43:26AM -0400, Yury Norov wrote:
> On Mon, May 04, 2026 at 11:15:03PM -0700, Shradha Gupta wrote:
> > On Sat, May 02, 2026 at 01:15:36PM -0400, Yury Norov wrote:
> > > On Sat, May 02, 2026 at 07:37:43AM -0700, Shradha Gupta wrote:
> > > > On Fri, May 01, 2026 at 12:22:20PM -0400, Yury Norov wrote:
> > > > > On Wed, Apr 29, 2026 at 02:06:37AM -0700, Shradha Gupta wrote:
> > > > > > In mana driver, the number of IRQs allocated is capped by the
> > > > > > min(num_cpu + 1, queue count). In cases, where the IRQ count is greater
> > > > > > than the vcpu count, we want to utilize all the vCPUs, irrespective of
> > > > > > their NUMA/core bindings.
> > > > > > 
> > > > > > This is important, especially in the envs where number of vCPUs are so
> > > > > > few that the softIRQ handling overhead on two IRQs on the same vCPU is
> > > > > > much more than their overheads if they were spread across sibling vCPUs.
> > > > > > 
> > > > > > This behaviour is more evident with dynamic IRQ allocation. Since MANA
> > > > > > IRQs are assigned at a later stage compared to static allocation, other
> > > > > > device IRQs may already be affinitized to the vCPUs. As a result, IRQ
> > > > > > weights become imbalanced, causing multiple MANA IRQs to land on the
> > > > > > same vCPU, while some vCPUs have none.
> > > > > > 
> > > > > > In such cases when many parallel TCP connections are tested, the
> > > > > > throughput drops significantly.
> > > > > > 
> > > > > > Test envs:
> > > > > > =======================================================
> > > > > > Case 1: without this patch
> > > > > > =======================================================
> > > > > > 4 vcpu(2 cores), 5 MANA IRQs (1 HWC + 4 Queue)
> > > > > > 
> > > > > > 	TYPE		effective vCPU aff
> > > > > > =======================================================
> > > > > > IRQ0:	HWC		0
> > > > > > IRQ1:	mana_q1		0
> > > > > > IRQ2:	mana_q2		2
> > > > > > IRQ3:	mana_q3		0
> > > > > > IRQ4:	mana_q4		3
> > > > > > 
> > > > > > %soft on each vCPU(mpstat -P ALL 1) on receiver
> > > > > > vCPU		0	1	2	3
> > > > > > =======================================================
> > > > > > pass 1:		38.85	0.03	24.89	24.65
> > > > > > pass 2:		39.15	0.03	24.57	25.28
> > > > > > pass 3:		40.36	0.03	23.20	23.17
> > > > > > 
> > > > > > =======================================================
> > > > > > Case 2: with this patch
> > > > > > =======================================================
> > > > > > 4 vcpu(2 cores), 5 MANA IRQs (1 HWC + 4 Queue)
> > > > > > 
> > > > > >         TYPE            effective vCPU aff
> > > > > > =======================================================
> > > > > > IRQ0:   HWC             0
> > > > > > IRQ1:   mana_q1         0
> > > > > > IRQ2:   mana_q2         1
> > > > > > IRQ3:   mana_q3         2
> > > > > > IRQ4:   mana_q4         3
> > > > > > 
> > > > > > %soft on each vCPU(mpstat -P ALL 1) on receiver
> > > > > > vCPU            0       1       2       3
> > > > > > =======================================================
> > > > > > pass 1:         15.42	15.85	14.99	14.51
> > > > > > pass 2:         15.53	15.94	15.81	15.93
> > > > > > pass 3:         16.41	16.35	16.40	16.36
> > > > > > 
> > > > > > =======================================================
> > > > > > Throughput Impact(in Gbps, same env)
> > > > > > =======================================================
> > > > > > TCP conn	with patch	w/o patch
> > > > > > 20480		15.65		7.73
> > > > > > 10240		15.63		8.93
> > > > > > 8192		15.64		9.69
> > > > > > 6144		15.64		13.16
> > > > > > 4096		15.69		15.75
> > > > > > 2048		15.69		15.83
> > > > > > 1024		15.71		15.28
> > > > > > 
> > > > > > Fixes: 755391121038 ("net: mana: Allocate MSI-X vectors dynamically")
> > > > > > Cc: stable@vger.kernel.org
> > > > > > Co-developed-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
> > > > > > Signed-off-by: Erni Sri Satya Vennela <ernis@linux.microsoft.com>
> > > > > > Signed-off-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
> > > > > > Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
> > > > > > ---
> > > > > > Changes in v2
> > > > > >  * Removed the unused skip_first_cpu variable
> > > > > >  * fixed exit condition in irq_setup_linear() with len == 0
> > > > > >  * changed return type of irq_setup_linear() as it will always be 0
> > > > > >  * removed the unnecessary rcu_read_lock() in irq_setup_linear()
> > > > > >  * added appropriate comments to indicate expected behaviour when
> > > > > >    IRQs are more than or equal to num_online_cpus()
> > > > > > ---
> > > > > >  .../net/ethernet/microsoft/mana/gdma_main.c   | 47 ++++++++++++++++---
> > > > > >  1 file changed, 40 insertions(+), 7 deletions(-)
> > > > > > 
> > > > > > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > > > > index 098fbda0d128..d740d1dc43da 100644
> > > > > > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > > > > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > > > > > @@ -167,6 +167,8 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
> > > > > >  	} else {
> > > > > >  		/* If dynamic allocation is enabled we have already allocated
> > > > > >  		 * hwc msi
> > > > > > +		 * Also, we make sure in this case the following is always true
> > > > > > +		 * (num_msix_usable - 1 HWC) <= num_online_cpus()
> > > > > >  		 */
> > > > > >  		gc->num_msix_usable = min(resp.max_msix, num_online_cpus() + 1);
> > > > > >  	}
> > > > > > @@ -1672,11 +1674,24 @@ static int irq_setup(unsigned int *irqs, unsigned int len, int node,
> > > > > >  	return 0;
> > > > > >  }
> > > > > >  
> > > > > > +/* should be called with cpus_read_lock() held */
> > > > > > +static void irq_setup_linear(unsigned int *irqs, unsigned int len)
> > > > > > +{
> > > > > > +	int cpu;
> > > > > > +
> > > > > > +	for_each_online_cpu(cpu) {
> > > > > > +		if (len == 0)
> > > > > > +			break;
> > > > > > +
> > > > > > +		irq_set_affinity_and_hint(*irqs++, cpumask_of(cpu));
> > > > > > +		len--;
> > > > > > +	}
> > > > > > +}
> > > > > > +
> > > > > >  static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
> > > > > >  {
> > > > > >  	struct gdma_context *gc = pci_get_drvdata(pdev);
> > > > > >  	struct gdma_irq_context *gic;
> > > > > > -	bool skip_first_cpu = false;
> > > > > >  	int *irqs, irq, err, i;
> > > > > >  
> > > > > >  	irqs = kmalloc_objs(int, nvec);
> > > > > 
> > > > > So what about WARN_ON() and nvec adjustment before kmalloc?
> > > > Hey Yury,
> > > > 
> > > > I am still a bit unsure about the WARN_ON() before kmalloc, as after
> > > > that also, in the same function till we take the cpus_read_lock() the
> > > > num_online_cpus() can change(or reduce). That's why I introduced the
> > > > dev_dbg() to capture hot-remove edge case.
> > > 
> > > OK.
> > >  
> > > > Do you still think it adds more value?
> > > 
> > > It's your driver, so you know better. I just wonder because you said
> > > it's good to add WARN_ON(), and then didn't do that.
> > > 
> > > > > 
> > > > > > @@ -1722,13 +1737,31 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
> > > > > >  	 * first CPU sibling group since they are already affinitized to HWC IRQ
> > > > > >  	 */
> > > > > >  	cpus_read_lock();
> > > > > > -	if (gc->num_msix_usable <= num_online_cpus())
> > > > > > -		skip_first_cpu = true;
> > > > > > +	if (gc->num_msix_usable <= num_online_cpus()) {
> > > > > > +		err = irq_setup(irqs, nvec, gc->numa_node, true);
> > > > > > +		if (err) {
> > > > > > +			cpus_read_unlock();
> > > > > > +			goto free_irq;
> > > > > 
> > > > > One thing puzzles me: if you skip first CPU with this 'true', and the
> > > > > gc->num_msix_usable == num_online_cpus(), it's one more than you can
> > > > > distribute. What do I miss?
> > > > > 
> > > > 
> > > > Let me explain this case a bit better then,
> > > > 
> > > > - num_msix_usable = HWC IRQ + Queue IRQ
> > > > - nvec in this functions is only Queue IRQ (HWC already setup)
> > > > 
> > > > When num_online_cpus == num_msix_usable:
> > > > - nvec = num_online_cpus - 1
> > > > - first CPU is already assigned to HWC IRQ, so skip it
> > > > - Queue IRQs fit in the remaining CPUs
> > > > 
> > > > please let me know if I did not get your question right
> > > 
> > > Can you put that in a comment?
> > 
> > Sure I will. thanks
> > 
> > > 
> > > > > > +		}
> > > > > > +	} else {
> > > > > > +		/*
> > > > > > +		 * When num_msix_usable are more than num_online_cpus, we try to
> > > > > > +		 * make sure we are using all vcpus. In such a case NUMA or
> > > > > > +		 * CPU core affinity does not matter.
> > > > > 
> > > > > If it doesn't matter, why don't you assign each IRQ to all CPUs then?
> > > > > In theory, the system would have most of flexibility to balance them.
> > > > > 
> > > > 
> > > > Okay, let me fix the comment and elaborate on this. It doesn't matter
> > > > because in such a case we want to anyway exhaust and distribute the
> > > > Queue IRQs to all vCPUs.
> > > > We don't want to rely on the system's balancer in this case as it could
> > > > be skewed by other devices' IRQ weights
> > > 
> > > I don't understand this. If I want to reserve some CPUs to solely
> > > handle IRQs from my high-priority hardware, then I configure my system
> > > accordingly. For example, assign all non-networking IRQs on CPU0, and
> > > all networking IRQs to all CPUs.
> > > 
> > > In your case, you distribute IRQs evenly, which means you've no
> > > preferred CPUs. So, assuming the system is only running your IRQ
> > > driver, it's at most as good as all-CPU distribution. In case of
> > > heavy loading some particular CPU, your scheme could cause
> > > corresponding IRQs to starve.
> > > 
> > > I recall, when we were working on irq_setup(), the original idea was to
> > > distribute IRQs one-to-one, but then I suggested the
> > > 
> > >         irq_set_affinity_and_hint(*irqs++, topology_sibling_cpumask(cpu));
> > > 
> > > and after experiments, you agreed on that.
> > > 
> > > Can you please run your throughput test for my suggested distribution
> > > too? Would be also nice to see how each distribution works when some
> > > CPUs are under stress.
> > > 
> > > Thanks,
> > > Yury
> > 
> > The design of irq_setup() works exactly how we want it for our IRQs for
> > almost all of our usecases, so we want to keep that as is. The only
> > scenarios where this is an issue in terms of significant throughput drop
> > is when we are working with low vCPU VMs (vCPU <= 4 with high TCP
> > connection counts) and where there are additional NVMe devices attached
> > to the VM.
> > 
> > The current patch about utilizing all the vCPUs helps in that case and
> > doesn't cause any regression for other cases.
> > 
> > This linear path is only taken when num_msix_usable > num_online_cpus(),
> > which is limited to low-vCPU VMs. Larger VMs continue using irq_setup()
> > as before.
> > 
> > We can definitely get our throughput run results on other suggestions
> > you have. And about that, I just needed a bit more clarity on what to
> > test against. Are you suggesting, with irq_setup() intact and in use, we
> > configure the non-mana IRQs to say CPU0 and capture the numbers?
> 
> Can you try this:
> 
>        while(len--)
>                // Or cpu_online_mask or cpu_all_mask?
>                irq_set_affinity_and_hint(*irqs++, NULL);
> 
> And compare it to the linear version under your vCPU scenario?
> 
> Can you run your throughput test alone and on parallel with some
> IRQ torture test?
> 
>         stress-ng --timer 4 --timeout 60s
> 
> And maybe pin the stress test to the default CPU. Assuming it's 0:
> 
>         taskset -c 0 stress-ng --timer 4 --timeout 60s
> 
> Unless the 'linear' version is significantly faster, I'd stick to the
> above.
> 
> Thanks,
> Yury

Hey Yury,

We tried a few tests with your suggestion, and throughput seems to be
the same compared to the linear distribution approach. We stressed out
CPU0 in both the cases and the results were similar. No IRQ migration
was observed in either case and no throughput drop.
 
But one observation I had was that "irq_set_affinity_and_hint(*irqs++,
NULL);" is essentially a no-op and we end up relying on the initial
placement from pci_alloc_irq_vectors(). Even though we were not able to
reproduce it in these tests, with this distribution there is a
chance we end up clustering the mana queue IRQs, while other vCPUs are
not running any network load. This is because the placement depends on
the system-wide IRQ state at allocation time.
 
The linear approach, however, guarantees each queue IRQ lands on a
distinct vCPU regardless of system state. Even after stressing the CPUs
using stress-ng, we did not observe any significant throughput drop.


regards,
Shradha.

^ permalink raw reply

* Re: [PATCH v2 07/15] arm64: hyperv: Add support for mshv_vtl_return_call
From: Naman Jain @ 2026-05-08  4:26 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Marc Zyngier, K . Y . Srinivasan, Haiyang Zhang, Wei Liu,
	Dexuan Cui, Long Li, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H . Peter Anvin, Arnd Bergmann, Paul Walmsley, Palmer Dabbelt,
	Albert Ou, Alexandre Ghiti, Michael Kelley, Timothy Hayes,
	Lorenzo Pieralisi, Sascha Bischoff, mrigendrachaubey,
	linux-hyperv, linux-arm-kernel, linux-kernel, linux-arch,
	linux-riscv, vdso, ssengar
In-Reply-To: <afrzKl3ixCUUVL6C@J2N7QTR9R3>



On 5/6/2026 1:22 PM, Mark Rutland wrote:
> On Wed, Apr 29, 2026 at 03:26:11PM +0530, Naman Jain wrote:
>> On 4/23/2026 7:26 PM, Mark Rutland wrote:
>>> On Thu, Apr 23, 2026 at 12:41:57PM +0000, Naman Jain wrote:
> 
> [ non-SMCCC hypercall code omitted for brevity ]
> 
>>> NAK to this.
>>>
>>> * This is a non-SMCCC hypercall, which we have NAK'd in general in the
>>>     past for various reasons that I am not going to rehash here.
>>>
>>> * It's not clear how this is going to be extended with necessary
>>>     architecture state in future (e.g. SVE, SME). This is not
>>>     future-proof, and I don't believe this is maintainable.
>>>
>>> * This breaks general requirements for reliable stacktracing by
>>>     clobbering state (e.g. x29) that we depend upon being valid AT ALL
>>>     TIMES outside of entry code.
>>>
>>> * IMO, if this needs to be saved/restored, that should happen in
>>>     whatever you are calling.
>>>
>>> Mark.
>>
>> Merging threads for addressing comments from Mark Rutland and Marc Zyngier
>> on this patch.
>>
>> Thanks for reviewing the changes. Please allow me to briefly explain the use
>> case here and then address your comments.
>>
>> Hyper-V's Virtual Trust Levels (VTLs) provide hardware-enforced isolation
>> within a single VM, analogous to ARM TrustZone. The kernel runs in VTL2
>> (higher privilege) as a "paravisor", a security monitor that handles
>> intercepts for the primary OS in VTL0 (lower privilege). The VTL switch
>> (mshv_vtl_return_call) is functionally equivalent to KVM's guest enter/exit.
> 
> It's worth noting that for KVM, the KVM hyp code is *tightly* coupled
> with the host kernel (they are one single binary object), and the
> calling convention between the two is an implementation detail that can
> change at any time without any ABI concerns.
> 
> While I appreciate this might be trying to do the same thing from a
> *functional* perspective, it's certainly different from a
> maintainability perspective, and can't be treated in the same way.
> 
>> It saves VTL2 state, loads VTL0's GPRs and other registers from a shared context
>> structure, issues hvc #3 to let VTL0 run, and on return saves VTL0's updated
>> state back.
>>
>> Coming to the problems with the code, I have identified a few ways to
>> address them.
>>
>> I can put the assembly code in a separate .S file with
>> SYM_FUNC_START/SYM_FUNC_END and marked as noinstr, to prevent ftrace/kprobes
>> from instrumenting between the GPR load and the hvc, which could have
>> corrupted VTL0 register state. This should solve x29 clobbering, stack
>> tracing problems.
> 
> My point was that you must not clobber those registers.
> 
> Looking at the TLFS document you linked below, it says:
> 
> | Note: X29 (FP/frame pointer), X30 (LR/link register), and SP are private
> | per-VTL
> 
> ... so clobbering those doesn't seem to be necessary anyway. Clearly
> having an arbitrary calling convention is confusing for everyone.
> 
>> I should use kernel_neon_begin()/kernel_neon_end() to save/restore the full
>> extended FP state of the current task in VTL2. VTL0's Q0-Q31 can be
>> loaded/saved separately via fpsimd_load_state()/fpsimd_save_state(). This
>> way, the assembly touches none of the SIMD registers. This is SVE/SME-safe
>> for VTL2's task state. VTL0 still only carries Q0-Q31 in the context struct,
>> and extending to SVE, SME is a future context struct change, which will need
>> Hyper-V arm64 ABI support.
>> This way, VTL2's callee-saved regs (x19-x28, x29, x30) are explicitly saved
>> to the stack frame at the top and restored at the bottom of assembly code.
>> The C caller (in hv_vtl.c) is a clean function call.
> 
> That doesn't really address my concerns here.
> 
> I do not think that Linux should have to save/restore anything here;
> that should be the job of the real hypervisor. The arbitrary separation
> of PE state into private and shared (with shared state being directly
> exposed to Linux) is a problem for maintainability and forward
> compatibility.
> 
> Looking at the TLFS document you linked below, I see:
> 
> | Note: SVE state (Z0-Z31, P0-P15, FFR) and SME state are VTL-private.
> | The lower 128-bit portion (Q registers) is shared, but the upper bits
> | of Z registers may be corrupted on VTL transitions. Software should
> | not rely on Z register contents being preserved across VTL switches.
> 
> ... which is certainly going to be a pain to manage.
> 
> Note in particular "SME state" is not an architectural term. I don't
> know which state in particular that is intended to cover (e.g. ZA, ZT0,
> SVCR, all streaming mode state)?
> 
> There's no mention of SVCR, so I don't know how this is going to
> interact with management of ZA state (ZA and ZT0, which are dependent
> upon SVCR.ZA) or streaming mode (dependent upon SVCR.SM). That state has
> been *incredibly* painful for us to manage generally. Regardless of the
> SMCCC concerns, that needs to be specified better.
> 
>> Regarding Non-SMCCC "hvc #3" call, I have a limitation here owing to the ABI
>> that is defined by the Hyper-V hypervisor. Fixing this requires a
>> hypervisor-side change to support SMCCC-style dispatch for VTL return. Until
>> then, hvc #3 is the only working interface. Moreover there would be backward
>> compatibility issues with this new ABI interface, if at all it is added.
> 
> To be clear, that's Microsoft's problem, not the Linux kernel
> community's problem. My NAK still stands.
> 
> Multiple years ago now, we made it clear that we would not accept a
> non-SMCCC calling convention. Ignoring the substance of that feedback,
> and inventing a new calling convention after that point is a
> self-inflicted problem.
> 
> [...]
> 
>> Link to TLFS: https://learn.microsoft.com/en-us/virtualization/hyper-v-on-windows/tlfs/vsm#on-arm64-platforms-3
> 
> For shared state, aside from GPRs and FPSIMD/SVE/SME state, that says:
> 
> | * System Information Registers (read-only or non-security-critical):
> |   * System identification and feature registers
> |   * Cache and TLB type information
> 
> It's *implied* that some of those registers might be writable, but as
> the specific set of registers is not described I cannot tell. Are there
> any writable system registers which are shared?
> 
> I don't see how we can know which registers we might need to
> save/restore without that being explicitly documented.
> 
> I also see:
> 
> | Note: SPE (Statistical Profiling Extension) state is shared across VTLs,
> | except for PMBSR_EL1 which is VTL-private.
> 
> If "SPE state" includes PMBPTR or PMBLIMITR (which is the obvious
> reading), this would be a security problem, as a lower-privileged VTL
> could clobber those and cause SPE to write to arbitrary memory
> immediately upon return to the higher-privileged VTL. Having PMBSR be
> private on its own isn't sufficient to prevent that (e.g. since the
> higher-privileged VTL could have its own active SPE profiling session).
> 
> I'm not keen on requiring hyper-v specific hooks in the SPE driver to
> achieve that, and I'm also not keen on having hyper-v support code poke
> SPE registers behind the SPE driver's back.
> 
> This does not give me confidence that any future PE state (e.g. things
> like TRBE) will be managed in a safe way either.
> 
> Mark.


Thanks for sharing this, I'll discuss it internally and come up with a plan.

Regards,
Naman

^ permalink raw reply

* Re: [PATCH V2 09/11] x86/hyperv: Implement hyperv virtual IOMMU
From: Mukesh R @ 2026-05-08  1:54 UTC (permalink / raw)
  To: Souradeep Chakrabarti
  Cc: hpa, robin.murphy, robh, wei.liu, mhklinux, muislam, namjain,
	magnuskulke, anbelski, linux-kernel, linux-hyperv, iommu,
	linux-pci, linux-arch, kys, haiyangz, decui, longli, tglx, mingo,
	bp, dave.hansen, x86, joro, will, lpieralisi, kwilczynski,
	bhelgaas, arnd
In-Reply-To: <afsSxVAlDvtpZ63q@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net>

On 5/6/26 03:07, Souradeep Chakrabarti wrote:
> On Thu, Apr 30, 2026 at 05:41:55PM -0700, Mukesh R wrote:
>> Add a new file to implement management of device domains, mapping and
>> unmapping of IOMMU memory, and other iommu_ops to fit within the VFIO
>> framework for PCI passthru on Hyper-V running Linux as baremetal root
>> or L1VH root. This also implements direct attach mechanism (see below),
>> a special feature of Hyper-V for PCI passthru, and it is also made to
>> work within the VFIO framework.
>>
>> At a high level, during boot the hypervisor creates a default identity
>> domain and attaches all devices to it. This nicely maps to Linux IOMMU
>> subsystem IOMMU_DOMAIN_IDENTITY domain. As a result, Linux does not
>> need to explicitly ask Hyper-V to attach devices and do maps/unmaps
>> during boot. As mentioned previously, Hyper-V supports two ways to do
>> PCI passthru:
>>
>>    1. Device Domain (aka Domain Attach): root must create a device domain
>>       in the hypervisor, and do map/unmap hypercalls for mapping and
>>       unmapping guest RAM for DMA. All hypervisor communications use
>>       device ID of type PCI for identifying and referencing the device.
>>
>>    2. Direct Attach: the hypervisor will simply use the guest's HW
>>       page table for mappings, thus the root need not map/unmap guest
>>       memory for DMA. As such, direct attach passthru setup during guest
>>       boot is extremely fast. A direct attached device must always be
>>       referenced via logical device ID and not via the PCI device ID.
>>
>> At present, L1VH root only supports direct attaches. Also direct attach is
>> default in non-L1VH cases because there are some significant performance
>> issues with domain attach implementations currently for guests with higher
>> RAM (say more than 8GB), and that unfortunately cannot be addressed in
>> the short term.
>>
>> Co-developed-by: Wei Liu <wei.liu@kernel.org>
>> Signed-off-by: Wei Liu <wei.liu@kernel.org>
>> Signed-off-by: Mukesh R <mrathor@linux.microsoft.com>
>> ---
>>   MAINTAINERS                       |   1 +
>>   arch/x86/kernel/pci-dma.c         |   2 +
>>   drivers/iommu/Kconfig             |   5 +-
>>   drivers/iommu/Makefile            |   1 +
>>   drivers/iommu/hyperv-iommu-root.c | 908 ++++++++++++++++++++++++++++++
>>   include/asm-generic/mshyperv.h    |  17 +
>>   include/linux/hyperv.h            |   6 +
>>   7 files changed, 937 insertions(+), 3 deletions(-)
>>   create mode 100644 drivers/iommu/hyperv-iommu-root.c
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index f803a6a38fee..8ae040b89a56 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -11914,6 +11914,7 @@ F:	drivers/clocksource/hyperv_timer.c
>>   F:	drivers/hid/hid-hyperv.c
>>   F:	drivers/hv/
>>   F:	drivers/input/serio/hyperv-keyboard.c
>> +F:	drivers/iommu/hyperv-iommu-root.c
>>   F:	drivers/iommu/hyperv-irq.c
>>   F:	drivers/net/ethernet/microsoft/
>>   F:	drivers/net/hyperv/
>> diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
>> index 6267363e0189..cfeee6505e17 100644
>> --- a/arch/x86/kernel/pci-dma.c
>> +++ b/arch/x86/kernel/pci-dma.c
>> @@ -8,6 +8,7 @@
>>   #include <linux/gfp.h>
>>   #include <linux/pci.h>
>>   #include <linux/amd-iommu.h>
>> +#include <linux/hyperv.h>
>>   
>>   #include <asm/proto.h>
>>   #include <asm/dma.h>
>> @@ -105,6 +106,7 @@ void __init pci_iommu_alloc(void)
>>   	gart_iommu_hole_init();
>>   	amd_iommu_detect();
>>   	detect_intel_iommu();
>> +	hv_iommu_detect();
>>   	swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags);
>>   }
>>   
>> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
>> index f86262b11416..7909cf4373a6 100644
>> --- a/drivers/iommu/Kconfig
>> +++ b/drivers/iommu/Kconfig
>> @@ -352,13 +352,12 @@ config MTK_IOMMU_V1
>>   	  if unsure, say N here.
>>   
>>   config HYPERV_IOMMU
>> -	bool "Hyper-V IRQ Handling"
>> +	bool "Hyper-V IOMMU Unit"
>>   	depends on HYPERV && X86
>>   	select IOMMU_API
>>   	default HYPERV
>>   	help
>> -	  Stub IOMMU driver to handle IRQs to support Hyper-V Linux
>> -	  guest and root partitions.
>> +	  Hyper-V pseudo IOMMU unit.
>>   
>>   config VIRTIO_IOMMU
>>   	tristate "Virtio IOMMU driver"
>> diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
>> index 335ea77cced6..296fbc6ca829 100644
>> --- a/drivers/iommu/Makefile
>> +++ b/drivers/iommu/Makefile
>> @@ -31,6 +31,7 @@ obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o
>>   obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o
>>   obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
>>   obj-$(CONFIG_HYPERV) += hyperv-irq.o
>> +obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu-root.o
>>   obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
>>   obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o
>>   obj-$(CONFIG_IOMMU_IOPF) += io-pgfault.o
>> diff --git a/drivers/iommu/hyperv-iommu-root.c b/drivers/iommu/hyperv-iommu-root.c
>> new file mode 100644
>> index 000000000000..739bbf39dea2
>> --- /dev/null
>> +++ b/drivers/iommu/hyperv-iommu-root.c
>> @@ -0,0 +1,908 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Hyper-V root vIOMMU driver.
>> + * Copyright (C) 2026, Microsoft, Inc.
>> + */
>> +
>> +#include <linux/pci.h>
>> +#include <linux/dma-map-ops.h>
>> +#include <linux/interval_tree.h>
>> +#include <linux/hyperv.h>
>> +#include "dma-iommu.h"
>> +#include <asm/iommu.h>
>> +#include <asm/mshyperv.h>
>> +
>> +/* We will not claim these PCI devices, eg hypervisor needs it for debugger */
>> +static char *pci_devs_to_skip;
>> +static int __init hv_iommu_setup_skip(char *str)
>> +{
>> +	pci_devs_to_skip = str;
>> +
>> +	return 0;
>> +}
>> +/* hv_iommu_skip=(SSSS:BB:DD.F)(SSSS:BB:DD.F) */
>> +__setup("hv_iommu_skip=", hv_iommu_setup_skip);
>> +
>> +bool hv_no_attdev;	 /* disable direct device attach for passthru */
>> +EXPORT_SYMBOL_GPL(hv_no_attdev);
>> +static int __init setup_hv_no_attdev(char *str)
>> +{
>> +	hv_no_attdev = true;
>> +	return 0;
>> +}
>> +__setup("hv_no_attdev", setup_hv_no_attdev);
>> +
>> +/* Iommu device that we export to the world. HyperV supports max of one */
>> +static struct iommu_device hv_virt_iommu;
>> +
>> +struct hv_domain {
>> +	struct iommu_domain iommu_dom;
>> +	u32 domid_num;			      /* as opposed to domain_id.type */
>> +	bool attached_dom;		      /* is this direct attached dom? */
>> +	u64 partid;			      /* partition id */
>> +	spinlock_t mappings_lock;	      /* protects mappings_tree */
>> +	struct rb_root_cached mappings_tree;  /* iova to pa lookup tree */
>> +};
>> +
>> +#define to_hv_domain(d) container_of(d, struct hv_domain, iommu_dom)
>> +
>> +struct hv_iommu_mapping {
>> +	phys_addr_t paddr;
>> +	struct interval_tree_node iova;
>> +	u32 flags;
>> +};
>> +
>> +/*
>> + * By default, during boot the hypervisor creates one Stage 2 (S2) default
>> + * domain. Stage 2 means that the page table is controlled by the hypervisor.
>> + *   S2 default: access to entire root partition memory. This for us easily
>> + *		 maps to IOMMU_DOMAIN_IDENTITY in the iommu subsystem, and
>> + *		 is called HV_DEVICE_DOMAIN_ID_S2_DEFAULT in the hypervisor.
>> + *
>> + * Device Management:
>> + *   There are two ways to manage device attaches to domains:
>> + *     1. Domain Attach: A device domain is created in the hypervisor, the
>> + *			 device is attached to this domain, and then memory
>> + *			 ranges are mapped in the map callbacks.
>> + *     2. Direct Attach: No need to create a domain in the hypervisor for direct
>> + *			 attached devices. A hypercall is made to tell the
>> + *			 hypervisor to attach the device to a guest. There is
>> + *			 no need for explicit memory mappings because the
>> + *			 hypervisor will just use the guest HW page table.
>> + *
>> + * Since a direct attach is much faster, it is the default. This can be
>> + * changed via hv_no_attdev.
>> + *
>> + * L1VH: hypervisor only supports direct attach.
>> + */
>> +
>> +/*
>> + * Create dummy domains to correspond to hypervisor prebuilt default identity
>> + * and null domains (dummy because we do not make hypercalls to create them).
>> + */
>> +static struct hv_domain hv_def_identity_dom;
>> +static struct hv_domain hv_null_dom;
>> +
>> +static bool hv_special_domain(struct hv_domain *hvdom)
>> +{
>> +	return hvdom == &hv_def_identity_dom || hvdom == &hv_null_dom;
>> +}
>> +
>> +struct iommu_domain_geometry default_geometry = (struct iommu_domain_geometry) {
>> +	.aperture_start = 0,
>> +	.aperture_end = -1UL,
>> +	.force_aperture = true,
>> +};
>> +
>> +/*
>> + * Since the relevant hypercalls can only fit less than 512 PFNs in the pfn
>> + * array, report 1M max.
>> + */
>> +#define HV_IOMMU_PGSIZES (SZ_4K | SZ_1M)
>> +
>> +static u32 unique_id;	      /* unique numeric id of a new domain */
>> +
>> +static void hv_iommu_detach_dev(struct iommu_domain *immdom,
>> +				struct device *dev);
>> +static size_t hv_iommu_unmap_pages(struct iommu_domain *immdom, ulong iova,
>> +				   size_t pgsize, size_t pgcount,
>> +				   struct iommu_iotlb_gather *gather);
>> +
>> +/*
>> + * If the current thread is a VMM thread, return the partition id of the VM it
>> + * is managing, else return HV_PARTITION_ID_INVALID.
>> + */
>> +u64 hv_get_current_partid(void)
>> +{
>> +	u64 (*fn)(void);
>> +	u64 ptid;
>> +
>> +	fn = symbol_get(mshv_current_partid);
>> +	if (!fn)
>> +		return HV_PARTITION_ID_INVALID;
>> +
>> +	ptid = fn();
>> +	symbol_put(mshv_current_partid);
>> +
>> +	return ptid;
>> +}
>> +EXPORT_SYMBOL_GPL(hv_get_current_partid);
>> +
>> +/* If this is a VMM thread, then this domain is for a guest vm */
>> +static bool hv_curr_thread_is_vmm(void)
>> +{
>> +	return hv_get_current_partid() != HV_PARTITION_ID_INVALID;
>> +}
>> +
>> +/* As opposed to some host app like SPDK etc... */
>> +static bool hv_dom_owner_is_vmm(struct hv_domain *hvdom)
>> +{
>> +	return hvdom && hvdom->partid != HV_PARTITION_ID_INVALID;
>> +}
>> +
>> +static bool hv_iommu_capable(struct device *dev, enum iommu_cap cap)
>> +{
>> +	switch (cap) {
>> +	case IOMMU_CAP_CACHE_COHERENCY:
>> +		return true;
>> +	default:
>> +		return false;
>> +	}
>> +}
>> +
>> +/*
>> + * Check if given pci device is a direct attached device. Caller must have
>> + * verified pdev is a valid pci device.
>> + */
>> +bool hv_pcidev_is_attached_dev(struct pci_dev *pdev)
>> +{
>> +	struct iommu_domain *iommu_domain;
>> +	struct hv_domain *hvdom;
>> +	struct device *dev = &pdev->dev;
>> +
>> +	iommu_domain = iommu_get_domain_for_dev(dev);
>> +	if (iommu_domain) {
>> +		hvdom = to_hv_domain(iommu_domain);
>> +		return hvdom->attached_dom;
>> +	}
>> +
>> +	return false;
>> +}
>> +EXPORT_SYMBOL_GPL(hv_pcidev_is_attached_dev);
>> +
>> +bool hv_pcidev_is_pthru_dev(struct pci_dev *pdev)
>> +{
>> +	struct device *dev = &pdev->dev;
>> +	struct hv_domain *hvdom = dev_iommu_priv_get(dev);
>> +
>> +	if (hvdom && !hv_special_domain(hvdom))
>> +		return true;
>> +
>> +	return false;
>> +}
>> +EXPORT_SYMBOL_GPL(hv_pcidev_is_pthru_dev);
>> +
>> +/* Build device id for direct attached devices */
>> +static u64 hv_build_devid_type_logical(struct pci_dev *pdev)
>> +{
>> +	hv_pci_segment segment;
>> +	union hv_device_id hv_devid;
>> +	union hv_pci_bdf bdf = {.as_uint16 = 0};
>> +	u32 rid = PCI_DEVID(pdev->bus->number, pdev->devfn);
>> +
>> +	segment = pci_domain_nr(pdev->bus);
>> +	bdf.bus = PCI_BUS_NUM(rid);
>> +	bdf.device = PCI_SLOT(rid);
>> +	bdf.function = PCI_FUNC(rid);
>> +
>> +	hv_devid.as_uint64 = 0;
>> +	hv_devid.device_type = HV_DEVICE_TYPE_LOGICAL;
>> +	hv_devid.logical.id = (u64)segment << 16 | bdf.as_uint16;
>> +
>> +	return hv_devid.as_uint64;
>> +}
>> +
>> +u64 hv_build_devid_oftype(struct pci_dev *pdev, enum hv_device_type type)
>> +{
>> +	if (type == HV_DEVICE_TYPE_LOGICAL) {
>> +		if (hv_l1vh_partition())
>> +			return hv_pci_vmbus_device_id(pdev);
>> +		else
>> +			return hv_build_devid_type_logical(pdev);
>> +	} else if (type == HV_DEVICE_TYPE_PCI)
>> +#ifdef CONFIG_X86
>> +		return hv_build_devid_type_pci(pdev);
>> +#else
>> +		return 0;
>> +#endif
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(hv_build_devid_oftype);
>> +
>> +/* Create a new device domain in the hypervisor */
>> +static int hv_iommu_create_hyp_devdom(struct hv_domain *hvdom)
>> +{
>> +	u64 status;
>> +	struct hv_input_device_domain *ddp;
>> +	struct hv_input_create_device_domain *input;
>> +	unsigned long flags;
>> +
>> +	local_irq_save(flags);
>> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +	memset(input, 0, sizeof(*input));
>> +
>> +	ddp = &input->device_domain;
>> +	ddp->partition_id = HV_PARTITION_ID_SELF;
>> +	ddp->domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
>> +	ddp->domain_id.id = hvdom->domid_num;
>> +
>> +	input->create_device_domain_flags.forward_progress_required = 1;
>> +	input->create_device_domain_flags.inherit_owning_vtl = 0;
>> +
>> +	status = hv_do_hypercall(HVCALL_CREATE_DEVICE_DOMAIN, input, NULL);
>> +
>> +	local_irq_restore(flags);
>> +
>> +	if (!hv_result_success(status))
>> +		hv_status_err(status, "\n");
>> +
>> +	return hv_result_to_errno(status);
>> +}
>> +
>> +static struct iommu_domain *hv_iommu_domain_alloc_paging(struct device *dev)
>> +{
>> +	struct hv_domain *hvdom;
>> +	int rc;
>> +
>> +	if (hv_l1vh_partition() && !hv_curr_thread_is_vmm()) {
>> +		pr_err("Hyper-V: l1vh iommu does not support host devices\n");
>> +		return NULL;
>> +	}
>> +
>> +	hvdom = kzalloc(sizeof(struct hv_domain), GFP_KERNEL);
>> +	if (hvdom == NULL)
>> +		return NULL;
>> +
>> +	spin_lock_init(&hvdom->mappings_lock);
>> +	hvdom->mappings_tree = RB_ROOT_CACHED;
>> +
>> +	/* Called under iommu group mutex, so single threaded */
>> +	if (++unique_id == HV_DEVICE_DOMAIN_ID_S2_DEFAULT)   /* ie, 0 */
>> +		goto out_err;
>> +
>> +	hvdom->domid_num = unique_id;
>> +	hvdom->partid = hv_get_current_partid();
>> +	hvdom->iommu_dom.geometry = default_geometry;
>> +	hvdom->iommu_dom.pgsize_bitmap = HV_IOMMU_PGSIZES;
>> +
>> +	/* For guests, by default we do direct attaches, so no domain in hyp */
>> +	if (hv_dom_owner_is_vmm(hvdom) && !hv_no_attdev)
>> +		hvdom->attached_dom = true;
>> +	else {
>> +		rc = hv_iommu_create_hyp_devdom(hvdom);
>> +		if (rc)
>> +			goto out_err;
>> +	}
>> +
>> +	return &hvdom->iommu_dom;
>> +
>> +out_err:
>> +	unique_id--;
>> +	kfree(hvdom);
>> +	return NULL;
>> +}
>> +
>> +static void hv_iommu_domain_free(struct iommu_domain *immdom)
>> +{
>> +	struct hv_domain *hvdom = to_hv_domain(immdom);
>> +	unsigned long flags;
>> +	u64 status;
>> +	struct hv_input_delete_device_domain *input;
>> +
>> +	if (hv_special_domain(hvdom))
>> +		return;
>> +
>> +	if (!hv_dom_owner_is_vmm(hvdom) || hv_no_attdev) {
>> +		struct hv_input_device_domain *ddp;
>> +
>> +		local_irq_save(flags);
>> +		input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +		ddp = &input->device_domain;
>> +		memset(input, 0, sizeof(*input));
>> +
>> +		ddp->partition_id = HV_PARTITION_ID_SELF;
>> +		ddp->domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
>> +		ddp->domain_id.id = hvdom->domid_num;
>> +
>> +		status = hv_do_hypercall(HVCALL_DELETE_DEVICE_DOMAIN, input,
>> +					 NULL);
>> +		local_irq_restore(flags);
>> +
>> +		if (!hv_result_success(status))
>> +			hv_status_err(status, "\n");
>> +	}
>> +
>> +	kfree(hvdom);
>> +}
>> +
>> +/* Attach a device to a domain previously created in the hypervisor */
>> +static int hv_iommu_att_dev2dom(struct hv_domain *hvdom, struct pci_dev *pdev)
>> +{
>> +	unsigned long flags;
>> +	u64 status;
>> +	enum hv_device_type dev_type;
>> +	struct hv_input_attach_device_domain *input;
>> +
>> +	local_irq_save(flags);
>> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +	memset(input, 0, sizeof(*input));
>> +
>> +	input->device_domain.partition_id = HV_PARTITION_ID_SELF;
>> +	input->device_domain.domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
>> +	input->device_domain.domain_id.id = hvdom->domid_num;
>> +
>> +	/* NB: Upon guest shutdown, device is re-attached to the default domain
>> +	 *     without explicit detach.
>> +	 */
>> +	if (hv_l1vh_partition())
>> +		dev_type = HV_DEVICE_TYPE_LOGICAL;
>> +	else
>> +		dev_type = HV_DEVICE_TYPE_PCI;
>> +
>> +	input->device_id.as_uint64 = hv_build_devid_oftype(pdev, dev_type);
>> +
>> +	status = hv_do_hypercall(HVCALL_ATTACH_DEVICE_DOMAIN, input, NULL);
>> +	local_irq_restore(flags);
>> +
>> +	if (!hv_result_success(status))
>> +		hv_status_err(status, "\n");
>> +
>> +	return hv_result_to_errno(status);
>> +}
>> +
>> +/* Caller must have validated that dev is a valid pci dev */
>> +static int hv_iommu_direct_attach_device(struct pci_dev *pdev, u64 ptid)
>> +{
>> +	struct hv_input_attach_device *input;
>> +	u64 status;
>> +	int rc;
>> +	unsigned long flags;
>> +	union hv_device_id host_devid;
>> +	enum hv_device_type dev_type;
>> +
>> +	if (ptid == HV_PARTITION_ID_INVALID) {
>> +		pr_err("Hyper-V: Invalid partition id in direct attach\n");
>> +		return -EINVAL;
>> +	}
>> +
>> +	if (hv_l1vh_partition())
>> +		dev_type = HV_DEVICE_TYPE_LOGICAL;
>> +	else
>> +		dev_type = HV_DEVICE_TYPE_PCI;
>> +
>> +	host_devid.as_uint64 = hv_build_devid_oftype(pdev, dev_type);
>> +
>> +	do {
>> +		local_irq_save(flags);
>> +		input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +		memset(input, 0, sizeof(*input));
>> +		input->partition_id = ptid;
>> +		input->device_id = host_devid;
>> +
>> +		/* Hypervisor associates logical_id with this device, and in
>> +		 * some hypercalls like retarget interrupts, logical_id must be
>> +		 * used instead of the BDF. It is a required parameter.
>> +		 */
>> +		input->attdev_flags.logical_id = 1;
>> +		input->logical_devid =
>> +			   hv_build_devid_oftype(pdev, HV_DEVICE_TYPE_LOGICAL);
>> +
>> +		status = hv_do_hypercall(HVCALL_ATTACH_DEVICE, input, NULL);
>> +		local_irq_restore(flags);
>> +
>> +		if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
>> +			rc = hv_call_deposit_pages(NUMA_NO_NODE, ptid, 1);
>> +			if (rc)
>> +				break;
>> +		}
>> +	} while (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY);
> This can become an infinite loop if, for some reason, HV continues to fail
> to attach the device for some reason other than insufficient memory. We can
> have a max retry count here.

I don't understand. The do-while loop will exit if the status is not
insufficient memory, right?

>> +
>> +	if (!hv_result_success(status))
>> +		hv_status_err(status, "\n");
>> +
>> +	return hv_result_to_errno(status);
>> +}
>> +
>> +/* Attach a device for passthru to guest VMs, host apps like SPDK, etc */
>> +static int hv_iommu_attach_dev(struct iommu_domain *immdom, struct device *dev,
>> +			       struct iommu_domain *old)
>> +{
>> +	struct pci_dev *pdev;
>> +	int rc;
>> +	struct hv_domain *hvdom_new = to_hv_domain(immdom);
>> +	struct hv_domain *hvdom_prev = dev_iommu_priv_get(dev);
>> +
>> +	/* Only allow PCI devices for now */
>> +	if (!dev_is_pci(dev))
>> +		return -EINVAL;
>> +
>> +	pdev = to_pci_dev(dev);
>> +
>> +	if (hv_l1vh_partition() && !hv_special_domain(hvdom_new) &&
>> +	    !hvdom_new->attached_dom)
>> +		return -EINVAL;
>> +
>> +	/* VFIO does not do explicit detach calls, hence check first if we need
>> +	 * to detach first. Also, in case of guest shutdown, it's the VMM
>> +	 * thread that attaches it back to the hv_def_identity_dom, and
>> +	 * hvdom_prev will not be null then. It is null during boot.
>> +	 */
>> +	if (hvdom_prev)
>> +		if (!hv_l1vh_partition() || !hv_special_domain(hvdom_prev))
>> +			hv_iommu_detach_dev(&hvdom_prev->iommu_dom, dev);
>> +
>> +	if (hv_l1vh_partition() && hv_special_domain(hvdom_new)) {
>> +		dev_iommu_priv_set(dev, hvdom_new);  /* sets "private" field */
>> +		return 0;
>> +	}
>> +
>> +	if (hvdom_new->attached_dom)
>> +		rc = hv_iommu_direct_attach_device(pdev, hvdom_new->partid);
>> +	else
>> +		rc = hv_iommu_att_dev2dom(hvdom_new, pdev);
> This performs a destructive detach before a fallible attach, with no rollback:
> 1. If hvdom_prev exists, issue HVCALL_DETACH_DEVICE_DOMAIN /
> HVCALL_DETACH_DEVICE against the old hypervisor domain.
> 2. Then issue the attach hypercall against the new domain.
> 3. Only on success, update dev_iommu_priv.
> 
> If step 2 fails, the device is left hypervisor-detached while
> dev_iommu_priv still points at the old domain, and the IOMMU core's
> recovery does not help.

Yeah, the error will cause either boot to fail or the VMM to abort. Worst
case would be to detach again, but I suppose we could just set
dev_iommu_priv to NULL.

>> +
>> +	if (rc == 0)
>> +		dev_iommu_priv_set(dev, hvdom_new);  /* sets "private" field */
>> +
>> +	return rc;
>> +}
>> +
>> +static void hv_iommu_det_dev_from_guest(struct pci_dev *pdev, u64 ptid)
>> +{
>> +	struct hv_input_detach_device *input;
>> +	u64 status, log_devid;
>> +	unsigned long flags;
>> +
>> +	log_devid = hv_build_devid_oftype(pdev, HV_DEVICE_TYPE_LOGICAL);
>> +
>> +	local_irq_save(flags);
>> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +	memset(input, 0, sizeof(*input));
>> +
>> +	input->partition_id = ptid;
>> +	input->logical_devid = log_devid;
>> +	status = hv_do_hypercall(HVCALL_DETACH_DEVICE, input, NULL);
>> +	local_irq_restore(flags);
>> +
>> +	if (!hv_result_success(status))
>> +		hv_status_err(status, "\n");
>> +}
>> +
>> +static void hv_iommu_det_dev_from_dom(struct pci_dev *pdev)
>> +{
>> +	u64 status, devid;
>> +	unsigned long flags;
>> +	struct hv_input_detach_device_domain *input;
>> +
>> +	devid = hv_build_devid_oftype(pdev, HV_DEVICE_TYPE_PCI);
>> +
>> +	local_irq_save(flags);
>> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +	memset(input, 0, sizeof(*input));
>> +
>> +	input->partition_id = HV_PARTITION_ID_SELF;
>> +	input->device_id.as_uint64 = devid;
>> +	status = hv_do_hypercall(HVCALL_DETACH_DEVICE_DOMAIN, input, NULL);
>> +	local_irq_restore(flags);
>> +
>> +	if (!hv_result_success(status))
>> +		hv_status_err(status, "\n");
>> +}
>> +
>> +static void hv_iommu_detach_dev(struct iommu_domain *immdom, struct device *dev)
>> +{
>> +	struct pci_dev *pdev;
>> +	struct hv_domain *hvdom = to_hv_domain(immdom);
>> +
>> +	/* See the attach function, only PCI devices for now */
>> +	if (!dev_is_pci(dev))
>> +		return;
>> +
>> +	pdev = to_pci_dev(dev);
>> +
>> +	if (hvdom->attached_dom)
>> +		hv_iommu_det_dev_from_guest(pdev, hvdom->partid);
>> +
>> +		/* Do not reset attached_dom, hv_iommu_unmap_pages happens
>> +		 * next.
>> +		 */
>> +	else
>> +		hv_iommu_det_dev_from_dom(pdev);
>> +}
>> +
>> +static int hv_iommu_add_tree_mapping(struct hv_domain *hvdom,
>> +				     unsigned long iova, phys_addr_t paddr,
>> +				     size_t size, u32 flags)
>> +{
>> +	unsigned long irqflags;
>> +	struct hv_iommu_mapping *mapping;
>> +
>> +	mapping = kzalloc(sizeof(*mapping), GFP_ATOMIC);
>> +	if (!mapping)
>> +		return -ENOMEM;
>> +
>> +	mapping->paddr = paddr;
>> +	mapping->iova.start = iova;
>> +	mapping->iova.last = iova + size - 1;
>> +	mapping->flags = flags;
>> +
>> +	spin_lock_irqsave(&hvdom->mappings_lock, irqflags);
>> +	interval_tree_insert(&mapping->iova, &hvdom->mappings_tree);
>> +	spin_unlock_irqrestore(&hvdom->mappings_lock, irqflags);
>> +
>> +	return 0;
>> +}
>> +
>> +static size_t hv_iommu_del_tree_mappings(struct hv_domain *hvdom,
>> +					unsigned long iova, size_t size)
>> +{
>> +	unsigned long flags;
>> +	size_t unmapped = 0;
>> +	unsigned long last = iova + size - 1;
>> +	struct hv_iommu_mapping *mapping = NULL;
>> +	struct interval_tree_node *node, *next;
>> +
>> +	spin_lock_irqsave(&hvdom->mappings_lock, flags);
>> +	next = interval_tree_iter_first(&hvdom->mappings_tree, iova, last);
>> +	while (next) {
>> +		node = next;
>> +		mapping = container_of(node, struct hv_iommu_mapping, iova);
>> +		next = interval_tree_iter_next(node, iova, last);
>> +
>> +		/* Trying to split a mapping? Not supported for now. */
>> +		if (mapping->iova.start < iova)
>> +			break;
>> +
>> +		unmapped += mapping->iova.last - mapping->iova.start + 1;
>> +
>> +		interval_tree_remove(node, &hvdom->mappings_tree);
>> +		kfree(mapping);
>> +	}
>> +	spin_unlock_irqrestore(&hvdom->mappings_lock, flags);
>> +
>> +	return unmapped;
>> +}
>> +
>> +/* Return: must return exact status from the hypercall without changes */
>> +static u64 hv_iommu_map_pgs(struct hv_domain *hvdom,
>> +			    unsigned long iova, phys_addr_t paddr,
>> +			    unsigned long npages, u32 map_flags)
>> +{
>> +	u64 status;
>> +	int i;
>> +	struct hv_input_map_device_gpa_pages *input;
>> +	unsigned long flags, pfn;
>> +
>> +	local_irq_save(flags);
>> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +	memset(input, 0, sizeof(*input));
>> +
>> +	input->device_domain.partition_id = HV_PARTITION_ID_SELF;
>> +	input->device_domain.domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
>> +	input->device_domain.domain_id.id = hvdom->domid_num;
>> +	input->map_flags = map_flags;
>> +	input->target_device_va_base = iova;
>> +
>> +	pfn = paddr >> HV_HYP_PAGE_SHIFT;
>> +	for (i = 0; i < npages; i++, pfn++)
>> +		input->gpa_page_list[i] = pfn;
> There is no bounds check on npages against gpa_page_list here, even though
> 512 PFNs in the pfn array is the limit for the rep hypercall.

Not needed. Like the comment above says:

/*
  * Since the relevant hypercalls can only fit less than 512 PFNs in the pfn
  * array, report 1M max.
  */
#define HV_IOMMU_PGSIZES (SZ_4K | SZ_1M)

We plan for it.

>> +
>> +	status = hv_do_rep_hypercall(HVCALL_MAP_DEVICE_GPA_PAGES, npages, 0,
>> +				     input, NULL);
> npages can get truncated here, as rep_count is u16.

See above.

Thanks,
-Mukesh

>> +
>> +	local_irq_restore(flags);
>> +	return status;
>> +}
>> +
>> +/*
>> + * The core VFIO code loops over memory ranges calling this function with
>> + * the largest size from HV_IOMMU_PGSIZES. cond_resched() is in vfio_iommu_map.
>> + */
>> +static int hv_iommu_map_pages(struct iommu_domain *immdom, ulong iova,
>> +			      phys_addr_t paddr, size_t pgsize, size_t pgcount,
>> +			      int prot, gfp_t gfp, size_t *mapped)
>> +{
>> +	u32 map_flags;
>> +	int ret;
>> +	u64 status;
>> +	unsigned long npages, done = 0;
>> +	struct hv_domain *hvdom = to_hv_domain(immdom);
>> +	size_t size = pgsize * pgcount;
>> +
>> +	map_flags = HV_MAP_GPA_READABLE;	/* required */
>> +	map_flags |= prot & IOMMU_WRITE ? HV_MAP_GPA_WRITABLE : 0;
>> +
>> +	ret = hv_iommu_add_tree_mapping(hvdom, iova, paddr, size, map_flags);
>> +	if (ret)
>> +		return ret;
>> +
>> +	if (hvdom->attached_dom) {
>> +		*mapped = size;
>> +		return 0;
>> +	}
>> +
>> +	npages = size >> HV_HYP_PAGE_SHIFT;
>> +	while (done < npages) {
>> +		ulong completed, remain = npages - done;
>> +
>> +		status = hv_iommu_map_pgs(hvdom, iova, paddr, remain,
>> +					  map_flags);
>> +
>> +		completed = hv_repcomp(status);
>> +		done = done + completed;
>> +		iova = iova + (completed << HV_HYP_PAGE_SHIFT);
>> +		paddr = paddr + (completed << HV_HYP_PAGE_SHIFT);
>> +
>> +		if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
>> +			ret = hv_call_deposit_pages(NUMA_NO_NODE,
>> +						    hv_current_partition_id,
>> +						    256);
>> +			if (ret)
>> +				break;
>> +			continue;
>> +		}
>> +		if (!hv_result_success(status))
>> +			break;
>> +	}
>> +
>> +	if (!hv_result_success(status)) {
>> +		size_t done_size = done << HV_HYP_PAGE_SHIFT;
>> +
>> +		hv_status_err(status, "pgs:%lx/%lx iova:%lx\n",
>> +			      done, npages, iova);
>> +		/*
>> +		 * lookup tree has all mappings [0 - size-1]. Below unmap will
>> +		 * only remove from [0 - done], we need to remove second chunk
>> +		 * [done+1 - size-1].
>> +		 */
>> +		hv_iommu_del_tree_mappings(hvdom, iova, size - done_size);
>> +		hv_iommu_unmap_pages(immdom, iova - done_size, HV_HYP_PAGE_SIZE,
>> +				     done, NULL);
>> +		if (mapped)
>> +			*mapped = 0;
>> +	} else
>> +		if (mapped)
>> +			*mapped = size;
>> +
>> +	return hv_result_to_errno(status);
>> +}
>> +
>> +static size_t hv_iommu_unmap_pages(struct iommu_domain *immdom, ulong iova,
>> +				   size_t pgsize, size_t pgcount,
>> +				   struct iommu_iotlb_gather *gather)
>> +{
>> +	unsigned long flags, npages;
>> +	struct hv_input_unmap_device_gpa_pages *input;
>> +	u64 status;
>> +	struct hv_domain *hvdom = to_hv_domain(immdom);
>> +	size_t unmapped, size = pgsize * pgcount;
>> +
>> +	unmapped = hv_iommu_del_tree_mappings(hvdom, iova, size);
>> +	if (unmapped < size)
>> +		pr_err("%s: could not delete all mappings (%lx:%lx/%lx)\n",
>> +		       __func__, iova, unmapped, size);
>> +
>> +	if (hvdom->attached_dom)
>> +		return size;
>> +
>> +	npages = size >> HV_HYP_PAGE_SHIFT;
>> +
>> +	local_irq_save(flags);
>> +	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
>> +	memset(input, 0, sizeof(*input));
>> +
>> +	input->device_domain.partition_id = HV_PARTITION_ID_SELF;
>> +	input->device_domain.domain_id.type = HV_DEVICE_DOMAIN_TYPE_S2;
>> +	input->device_domain.domain_id.id = hvdom->domid_num;
>> +	input->target_device_va_base = iova;
>> +
>> +	status = hv_do_rep_hypercall(HVCALL_UNMAP_DEVICE_GPA_PAGES, npages,
>> +				     0, input, NULL);
> npages can get truncated here.
>> +	local_irq_restore(flags);
>> +
>> +	if (!hv_result_success(status))
>> +		hv_status_err(status, "\n");
>> +
>> +	return unmapped;
>> +}
>> +
>> +static phys_addr_t hv_iommu_iova_to_phys(struct iommu_domain *immdom,
>> +					 dma_addr_t iova)
>> +{
>> +	unsigned long flags;
>> +	struct hv_iommu_mapping *mapping;
>> +	struct interval_tree_node *node;
>> +	u64 paddr = 0;
>> +	struct hv_domain *hvdom = to_hv_domain(immdom);
>> +
>> +	spin_lock_irqsave(&hvdom->mappings_lock, flags);
>> +	node = interval_tree_iter_first(&hvdom->mappings_tree, iova, iova);
>> +	if (node) {
>> +		mapping = container_of(node, struct hv_iommu_mapping, iova);
>> +		paddr = mapping->paddr + (iova - mapping->iova.start);
>> +	}
>> +	spin_unlock_irqrestore(&hvdom->mappings_lock, flags);
>> +
>> +	return paddr;
>> +}
>> +
>> +/*
>> + * Currently, hypervisor does not provide list of devices it is using
>> + * dynamically. So use this to allow users to manually specify devices that
>> + * should be skipped. (eg. hypervisor debugger using some network device).
>> + */
>> +static struct iommu_device *hv_iommu_probe_device(struct device *dev)
>> +{
>> +	if (!dev_is_pci(dev))
>> +		return ERR_PTR(-ENODEV);
>> +
>> +	if (pci_devs_to_skip && *pci_devs_to_skip) {
>> +		int rc, pos = 0;
>> +		int parsed;
>> +		int segment, bus, slot, func;
>> +		struct pci_dev *pdev = to_pci_dev(dev);
>> +
>> +		do {
>> +			parsed = 0;
>> +
>> +			rc = sscanf(pci_devs_to_skip + pos, " (%x:%x:%x.%x) %n",
>> +				    &segment, &bus, &slot, &func, &parsed);
>> +			if (rc)
>> +				break;
>> +			if (parsed <= 0)
>> +				break;
>> +
>> +			if (pci_domain_nr(pdev->bus) == segment &&
>> +			    pdev->bus->number == bus &&
>> +			    PCI_SLOT(pdev->devfn) == slot &&
>> +			    PCI_FUNC(pdev->devfn) == func) {
>> +
>> +				dev_info(dev, "skipped by Hyper-V IOMMU\n");
>> +				return ERR_PTR(-ENODEV);
>> +			}
>> +			pos += parsed;
>> +
>> +		} while (pci_devs_to_skip[pos]);
>> +	}
>> +
>> +	/* Device will be explicitly attached to the default domain, so no need
>> +	 * to do dev_iommu_priv_set() here.
>> +	 */
>> +
>> +	return &hv_virt_iommu;
>> +}
>> +
>> +static void hv_iommu_probe_finalize(struct device *dev)
>> +{
>> +	struct iommu_domain *immdom = iommu_get_domain_for_dev(dev);
>> +
>> +	if (immdom && immdom->type == IOMMU_DOMAIN_DMA)
>> +		iommu_setup_dma_ops(dev, immdom);
>> +	else
>> +		set_dma_ops(dev, NULL);
>> +}
>> +
>> +static void hv_iommu_release_device(struct device *dev)
>> +{
>> +	struct hv_domain *hvdom = dev_iommu_priv_get(dev);
>> +
>> +	/* Need to detach device from device domain if necessary. */
>> +	if (hvdom)
>> +		hv_iommu_detach_dev(&hvdom->iommu_dom, dev);
>> +
>> +	dev_iommu_priv_set(dev, NULL);
>> +	set_dma_ops(dev, NULL);
>> +}
>> +
>> +static struct iommu_group *hv_iommu_device_group(struct device *dev)
>> +{
>> +	if (dev_is_pci(dev))
>> +		return pci_device_group(dev);
>> +	else
>> +		return generic_device_group(dev);
>> +}
>> +
>> +static int hv_iommu_def_domain_type(struct device *dev)
>> +{
>> +	/* The hypervisor always creates this by default during boot */
>> +	return IOMMU_DOMAIN_IDENTITY;
>> +}
>> +
>> +static struct iommu_ops hv_iommu_ops = {
>> +	.capable	    = hv_iommu_capable,
>> +	.domain_alloc_paging	= hv_iommu_domain_alloc_paging,
>> +	.probe_device	    = hv_iommu_probe_device,
>> +	.probe_finalize     = hv_iommu_probe_finalize,
>> +	.release_device     = hv_iommu_release_device,
>> +	.def_domain_type    = hv_iommu_def_domain_type,
>> +	.device_group	    = hv_iommu_device_group,
>> +	.default_domain_ops = &(const struct iommu_domain_ops) {
>> +		.attach_dev   = hv_iommu_attach_dev,
>> +		.map_pages    = hv_iommu_map_pages,
>> +		.unmap_pages  = hv_iommu_unmap_pages,
>> +		.iova_to_phys = hv_iommu_iova_to_phys,
>> +		.free	      = hv_iommu_domain_free,
>> +	},
>> +	.owner		    = THIS_MODULE,
>> +	.identity_domain = &hv_def_identity_dom.iommu_dom,
>> +	.blocked_domain  = &hv_null_dom.iommu_dom,
>> +};
>> +
>> +static const struct iommu_domain_ops hv_special_domain_ops = {
>> +	.attach_dev = hv_iommu_attach_dev,
>> +};
>> +
>> +static void __init hv_initialize_special_domains(void)
>> +{
>> +	hv_def_identity_dom.iommu_dom.type = IOMMU_DOMAIN_IDENTITY;
>> +	hv_def_identity_dom.iommu_dom.ops = &hv_special_domain_ops;
>> +	hv_def_identity_dom.iommu_dom.owner = &hv_iommu_ops;
>> +	hv_def_identity_dom.iommu_dom.geometry = default_geometry;
>> +	hv_def_identity_dom.domid_num = HV_DEVICE_DOMAIN_ID_S2_DEFAULT; /* 0 */
>> +
>> +	hv_null_dom.iommu_dom.type = IOMMU_DOMAIN_BLOCKED;
>> +	hv_null_dom.iommu_dom.ops = &hv_special_domain_ops;
>> +	hv_null_dom.iommu_dom.owner = &hv_iommu_ops;
>> +	hv_null_dom.iommu_dom.geometry = default_geometry;
>> +	hv_null_dom.domid_num = HV_DEVICE_DOMAIN_ID_S2_NULL;  /* INTMAX */
>> +}
>> +
>> +static int __init hv_iommu_init(void)
>> +{
>> +	int ret;
>> +	struct iommu_device *iommup = &hv_virt_iommu;
>> +
>> +	if (!hv_is_hyperv_initialized())
>> +		return -ENODEV;
>> +
>> +	ret = iommu_device_sysfs_add(iommup, NULL, NULL, "%s", "hyperv-iommu");
>> +	if (ret) {
>> +		pr_err("Hyper-V: iommu_device_sysfs_add failed: %d\n", ret);
>> +		return ret;
>> +	}
>> +
>> +	/* This must come before iommu_device_register because the latter calls
>> +	 * into the hooks.
>> +	 */
>> +	hv_initialize_special_domains();
>> +
>> +	ret = iommu_device_register(iommup, &hv_iommu_ops, NULL);
>> +	if (ret) {
>> +		pr_err("Hyper-V: iommu_device_register failed: %d\n", ret);
>> +		goto err_sysfs_remove;
>> +	}
>> +
>> +	pr_info("Hyper-V IOMMU initialized\n");
>> +
>> +	return 0;
>> +
>> +err_sysfs_remove:
>> +	iommu_device_sysfs_remove(iommup);
>> +	return ret;
>> +}
>> +
>> +void __init hv_iommu_detect(void)
>> +{
>> +	if (no_iommu || iommu_detected)
>> +		return;
>> +
>> +	/* For l1vh, always expose an iommu unit */
>> +	if (!hv_l1vh_partition())
>> +		if (!(ms_hyperv.misc_features & HV_DEVICE_DOMAIN_AVAILABLE))
>> +			return;
>> +
>> +	iommu_detected = 1;
>> +	x86_init.iommu.iommu_init = hv_iommu_init;
>> +
>> +	pci_request_acs();
>> +}
>> diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
>> index a6878ab685e7..fca5ed68b5c2 100644
>> --- a/include/asm-generic/mshyperv.h
>> +++ b/include/asm-generic/mshyperv.h
>> @@ -337,6 +337,23 @@ static inline u64 hv_pci_vmbus_device_id(struct pci_dev *pdev)
>>   { return 0; }
>>   #endif /* IS_ENABLED(CONFIG_PCI_HYPERV) */
>>   
>> +#if IS_ENABLED(CONFIG_HYPERV_IOMMU)
>> +u64 hv_get_current_partid(void);
>> +bool hv_pcidev_is_attached_dev(struct pci_dev *pdev);
>> +bool hv_pcidev_is_pthru_dev(struct pci_dev *pdev);
>> +u64 hv_build_devid_oftype(struct pci_dev *pdev, enum hv_device_type type);
>> +#else
>> +static inline bool hv_pcidev_is_attached_dev(struct pci_dev *pdev)
>> +{ return false; }
>> +static inline bool hv_pcidev_is_pthru_dev(struct pci_dev *pdev)
>> +{ return false; }
>> +static inline u64 hv_build_devid_oftype(struct pci_dev *pdev,
>> +					enum hv_device_type type)
>> +{ return 0; }
>> +static inline u64 hv_get_current_partid(void)
>> +{ return HV_PARTITION_ID_INVALID; }
>> +#endif /* IS_ENABLED(CONFIG_HYPERV_IOMMU) */
>> +
>>   #if IS_ENABLED(CONFIG_MSHV_ROOT)
>>   static inline bool hv_root_partition(void)
>>   {
>> diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
>> index 5459e776ec17..6eee1cbf6f23 100644
>> --- a/include/linux/hyperv.h
>> +++ b/include/linux/hyperv.h
>> @@ -1769,4 +1769,10 @@ static inline unsigned long virt_to_hvpfn(void *addr)
>>   #define HVPFN_DOWN(x)	((x) >> HV_HYP_PAGE_SHIFT)
>>   #define page_to_hvpfn(page)	(page_to_pfn(page) * NR_HV_HYP_PAGES_IN_PAGE)
>>   
>> +#ifdef CONFIG_HYPERV_IOMMU
>> +void __init hv_iommu_detect(void);
>> +#else
>> +static inline void hv_iommu_detect(void) { }
>> +#endif /* CONFIG_HYPERV_IOMMU */
>> +
>>   #endif /* _HYPERV_H */
>> -- 
>> 2.51.2.vfs.0.1
>>


^ permalink raw reply

* RE: [EXTERNAL] Re: [PATCH v2] Drivers: hv: vmbus: Improve the logic of reserving fb_mmio on Gen2 VMs
From: Dexuan Cui @ 2026-05-07 21:43 UTC (permalink / raw)
  To: Matthew Ruffell
  Cc: Michael Kelley, KY Srinivasan, Haiyang Zhang, wei.liu@kernel.org,
	Long Li, linux-hyperv@vger.kernel.org,
	linux-kernel@vger.kernel.org, johansen@templeofstupid.com,
	hargar@linux.microsoft.com, stable@vger.kernel.org
In-Reply-To: <CAKAwkKtUo5XX_Qh4hSYcbxTWkZP=+i0hZQaPHX78G20MFdz2Lg@mail.gmail.com>

> From: Matthew Ruffell <matthew.ruffell@canonical.com>
> Sent: Wednesday, May 6, 2026 8:58 PM
> To: Dexuan Cui <DECUI@microsoft.com>
>  ...
> Hi Dexuan,
> 
> Thanks for making the amendments, and thank you Michael for all your
> reviews.

Thank you Matthew and Krister for all your help with testing!
Thank you Michael for all your valuable review comments, great analysis, and testing!

> Since you posted the diff to the V3, I went and tested the V3 patch.

There is no real code change between v2 and v3 :-)

>  ...
> Tested-by: Matthew Ruffell <matthew.ruffell@canonical.com>
> 
> Thanks,
> Matthew

I just posted v3 here:
https://lore.kernel.org/linux-hyperv/20260507212838.448891-1-decui@microsoft.com/T/#u

^ permalink raw reply

* [PATCH v3] Drivers: hv: vmbus: Improve the logic of reserving fb_mmio on Gen2 VMs
From: Dexuan Cui @ 2026-05-07 21:28 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, linux-hyperv, linux-kernel,
	mhklinux, matthew.ruffell, johansen, hargar
  Cc: stable, Krister Johansen

If vmbus_reserve_fb() in the kdump/kexec kernel fails to properly reserve
the framebuffer MMIO range (which is below 4GB) due to a Gen2 VM's
screen.lfb_base being zero [1], there is an MMIO conflict between the
drivers hyperv-drm and pci-hyperv: when the driver pci-hyperv's
hv_allocate_config_window() calls vmbus_allocate_mmio() to get an
MMIO range, typically it gets a 32-bit MMIO range that overlaps with the
framebuffer MMIO range, and later hv_pci_enter_d0() fails with an
error message "PCI Pass-through VSP failed D0 Entry with status" since
the host thinks that PCI devices must not use MMIO space that the
host has assigned to the framebuffer.

This is especially an issue if pci-hyperv is built-in and hyperv-drm is
built as a module. Consequently, the kdump/kexec kernel fails to detect
PCI devices via pci-hyperv, and may fail to mount the root file system,
which may reside on an NVMe disk. The issue described here has existed
for SR-IOV VF NICs since day one of the pci-hyperv driver, and has been
worked around on x64 when possible. With the recent introduction of
ARM64 VMs that boot from NVMe, there is no workaround, so we need a
formal fix.

On Gen2 VMs, if the screen.lfb_base is 0 in the kdump/kexec kernel [1],
fall back to the low MMIO base, which should be equal to the framebuffer
MMIO base [2] (the statement is true according to my testing on x64
Windows Server 2016, and on x64 and ARM64 Windows Server 2025 and on
Azure. I checked with the Hyper-V team and they said the statement should
continue to be true for Gen2 VMs). In the first kernel, screen.lfb_base
is not 0; if the user specifies a very high resolution, it's not enough
to only reserve 8MB: let's always reserve half of the space below 4GB,
but cap the reservation to 128MB, which is the required framebuffer size
of the highest resolution 7680*4320 supported by Hyper-V.

While at it, fix the comparison "end > VTPM_BASE_ADDRESS" by changing
the > to >=. Here the 'end' is an inclusive end (typically, it's
0xFFFF_FFFF for the low MMIO range).

Note: vmbus_reserve_fb() now also reserves an MMIO range at the beginning
of the low MMIO range on CVMs, which have no framebuffers (the
'screen.lfb_base' in vmbus_reserve_fb() is 0 for CVMs), just in case the
host might treat the beginning of the low MMIO range specially [3]. BTW,
the OpenHCL kernel is not affected by the change, because that kernel
boots with DeviceTree rather than ACPI (so vmbus_reserve_fb() won't run
there), and there is no framebuffer device for that kernel.

Note: normally Gen1 VMs don't have the MMIO conflict issue because the
framebuffer MMIO range (which is hardcoded to base=4GB-128MB and
size=64MB for Gen1 VMs by the host) is always reported via the legacy PCI
graphics device's BAR, so the kdump/kexec kernel can reserve the 64MB
MMIO range; however, if the VM is configured to use a very high resolution
and the required framebuffer size exceeds 64MB (AFAIK, in practice, this
isn't a typical configuration by users), the hyperv-drm driver may need to
allocate an MMIO range above 4GB and change the framebuffer MMIO location
to the allocated MMIO range -- in this case, there can still be issues [4]
which can't be easily fixed: any possible affected Gen1 users would have
to use a resolution whose framebuffer size is <= 64MB, or switch to Gen2
VMs.

[1] https://lore.kernel.org/all/SA1PR21MB692176C1BC53BFC9EAE5CF8EBF51A@SA1PR21MB6921.namprd21.prod.outlook.com/
[2] https://lore.kernel.org/all/SA1PR21MB69218F955B62DFF62E3E88D2BF222@SA1PR21MB6921.namprd21.prod.outlook.com/
[3] https://lore.kernel.org/all/SN6PR02MB415726B17D5A6027CD1717E8D4342@SN6PR02MB4157.namprd02.prod.outlook.com/
[4] https://lore.kernel.org/all/SA1PR21MB69213486F821CA5A2C793C81BF342@SA1PR21MB6921.namprd21.prod.outlook.com/

Fixes: 4daace0d8ce8 ("PCI: hv: Add paravirtual PCI front-end for Microsoft Hyper-V VMs")
CC: stable@vger.kernel.org
Reviewed-by: Michael Kelley <mhklinux@outlook.com>
Tested-by: Krister Johansen <kjlx@templeofstupid.com>
Tested-by: Matthew Ruffell <matthew.ruffell@canonical.com>
Signed-off-by: Dexuan Cui <decui@microsoft.com>
---

Changes since v1 (https://lore.kernel.org/all/20260416183529.838321-1-decui@microsoft.com/):
  Fixed a typo in the subject: s/logc/logic/.

  In the commit message, better explained fb_mmio_base is equal to
  low_mmio_base for Gen2 VMs.

  Addressed Michael Kelley's comments:

    In the commit message:
         Changed the "kdump" to "kdump/kexec" since the described
         issue is applicable to both kdump and kexec.

         Provided more detail about the MMIO conflict.

         Described a scenario where Gen1 VMs can also be affected.

    Added a pr_warn() in vmbus_reserve_fb() in case the 'start' is 0.

    Dropped the CVM check in vmbus_reserve_fb(), meaning it now also
    reserves MMIO for CVMs.

  Changed "low_mmio_base >= SZ_4G" to "upper_32_bits(low_mmio_base)"
  to avoid a compilation warning for the i386 build.

  Changed "0x%pa" to "%pa", because %pa already adds a "0x" prefix.
  

Hi Krister, Matthew, sorry -- I'm not adding your Tested-by's since
the code changed, though the change is small. If the v2 looks good
to Michael, please test the patch again. 

Hi Hardik, I'm not adding your Reviewed-by since the patch changed.
Please review the v2. 


Changes since v2 (https://lore.kernel.org/all/20260505004846.193441-1-decui@microsoft.com/):
    Fixed the commit message:
        hv_pci_allocate_bridge_windows() -> hv_allocate_config_window()

    Changed the "kdump" in the comment to "kdump/kexec or CVM" [Michael Kelley]

    Fixed the order of the "[3]" and "[4]" in the commit message.

    Added Krister's Tested-by.
    Added Matthew's Tested-by.
    Added Michael's Reviewed-by.

 drivers/hv/vmbus_drv.c | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index f0d0803d1e16..d73ac5c8dd04 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -2327,8 +2327,8 @@ static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx)
 		return AE_NO_MEMORY;
 
 	/* If this range overlaps the virtual TPM, truncate it. */
-	if (end > VTPM_BASE_ADDRESS && start < VTPM_BASE_ADDRESS)
-		end = VTPM_BASE_ADDRESS;
+	if (end >= VTPM_BASE_ADDRESS && start < VTPM_BASE_ADDRESS)
+		end = VTPM_BASE_ADDRESS - 1;
 
 	new_res->name = "hyperv mmio";
 	new_res->flags = IORESOURCE_MEM;
@@ -2395,6 +2395,7 @@ static void vmbus_mmio_remove(void)
 static void __maybe_unused vmbus_reserve_fb(void)
 {
 	resource_size_t start = 0, size;
+	resource_size_t low_mmio_base;
 	struct pci_dev *pdev;
 
 	if (efi_enabled(EFI_BOOT)) {
@@ -2402,6 +2403,24 @@ static void __maybe_unused vmbus_reserve_fb(void)
 		if (IS_ENABLED(CONFIG_SYSFB)) {
 			start = sysfb_primary_display.screen.lfb_base;
 			size = max_t(__u32, sysfb_primary_display.screen.lfb_size, 0x800000);
+
+			low_mmio_base = hyperv_mmio->start;
+			if (!low_mmio_base || upper_32_bits(low_mmio_base) ||
+			    (start && start < low_mmio_base)) {
+				pr_warn("Unexpected low mmio base %pa\n", &low_mmio_base);
+			} else {
+				/*
+				 * If the kdump/kexec or CVM kernel's lfb_base
+				 * is 0, fall back to the low mmio base.
+				 */
+				if (!start)
+					start = low_mmio_base;
+				/*
+				 * Reserve half of the space below 4GB for high
+				 * resolutions, but cap the reservation to 128MB.
+				 */
+				size = min((SZ_4G - start) / 2, SZ_128M);
+			}
 		}
 	} else {
 		/* Gen1 VM: get FB base from PCI */
@@ -2422,8 +2441,10 @@ static void __maybe_unused vmbus_reserve_fb(void)
 		pci_dev_put(pdev);
 	}
 
-	if (!start)
+	if (!start) {
+		pr_warn("Unexpected framebuffer mmio base of zero\n");
 		return;
+	}
 
 	/*
 	 * Make a claim for the frame buffer in the resource tree under the
@@ -2433,6 +2454,8 @@ static void __maybe_unused vmbus_reserve_fb(void)
 	 */
 	for (; !fb_mmio && (size >= 0x100000); size >>= 1)
 		fb_mmio = __request_region(hyperv_mmio, start, size, fb_mmio_name, 0);
+
+	pr_info("hv_mmio=%pR,%pR fb=%pR\n", hyperv_mmio, hyperv_mmio->sibling, fb_mmio);
 }
 
 /**
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH v3] mshv: Simplify GPA map/unmap hypercall helpers
From: Stanislav Kinsburskii @ 2026-05-07 20:42 UTC (permalink / raw)
  To: Michael Kelley
  Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org,
	decui@microsoft.com, longli@microsoft.com,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <SN6PR02MB4157F8C28F454788714CFEF2D43C2@SN6PR02MB4157.namprd02.prod.outlook.com>

On Thu, May 07, 2026 at 07:09:13PM +0000, Michael Kelley wrote:
> From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> > 
> > Clean up hv_do_map_gpa_hcall() and hv_call_unmap_gpa_pages() after the
> > preceding bug-fix patches:
> > 
> > Move "done += completed" before the status checks so that pages mapped
> > by a partially-successful batch are included in the error cleanup unmap.
> > Previously these mappings were leaked on failure.
> > 
> > While here, improve type safety and readability:
> >  - Change "int done" to "u64 done" to match the u64 page_count it is
> >    compared against, avoiding signed/unsigned comparison hazards.
> >  - Use u64 for loop iteration and batch size variables consistently.
> >  - Add proper braces to the for-loop body in hv_do_map_gpa_hcall().
> >  - Remove unnecessary "ret" variable from hv_call_unmap_gpa_pages().
> >  - Simplify the error-path unmap to use "done << large_shift" directly
> >    instead of mutating done in place.
> > 
> > v3: aligned changes to 80 columns
> > v2: replaced min with min_t
> > 
> > Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
> > Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> 
> Question about "packaging" of this patch. To apply cleanly, it
> needs the previous two fixes applied.  As such, shouldn't it be
> the 3rd patch in patch set that includes the other two?
> 
> Also, there are changes in the previous two fixes that get undone
> or changed by this patch (such as applying large_shift in the error
> path of hv_do_map_gpa_hcall()). With a little more coordination
> between the three patches, there could be less code churn and
> the patches would overall be smaller.
> 

Indeed. I should have done it as a series, but I didn't.
So, to avoid any additional churn with the rest of the patches coming on
top (a series of fixes), I'd rather keep this one separate.

Thanks,
Stanislav

> Michael
> 
> > ---
> >  drivers/hv/mshv_root_hv_call.c |   56 +++++++++++++++-------------------------
> >  1 file changed, 21 insertions(+), 35 deletions(-)
> > 
> > diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
> > index e5992c324904a..e1f9e28d5a19b 100644
> > --- a/drivers/hv/mshv_root_hv_call.c
> > +++ b/drivers/hv/mshv_root_hv_call.c
> > @@ -195,8 +195,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64
> > page_struct_count,
> >  	struct hv_input_map_gpa_pages *input_page;
> >  	u64 status, *pfnlist;
> >  	unsigned long irq_flags, large_shift = 0;
> > -	int ret = 0, done = 0;
> > -	u64 page_count = page_struct_count;
> > +	u64 done = 0, page_count = page_struct_count;
> > +	int ret = 0;
> > 
> >  	if (page_count == 0 || (pages && mmio_spa))
> >  		return -EINVAL;
> > @@ -213,8 +213,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64
> > page_struct_count,
> >  	}
> > 
> >  	while (done < page_count) {
> > -		ulong i, completed, remain = page_count - done;
> > -		int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE);
> > +		u64 i, completed, remain = page_count - done;
> > +		u64 rep_count = min_t(u64, remain, HV_MAP_GPA_BATCH_SIZE);
> > 
> >  		local_irq_save(irq_flags);
> >  		input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > @@ -224,23 +224,14 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64
> > page_struct_count,
> >  		input_page->map_flags = flags;
> >  		pfnlist = input_page->source_gpa_page_list;
> > 
> > -		for (i = 0; i < rep_count; i++)
> > -			if (flags & HV_MAP_GPA_NO_ACCESS) {
> > +		for (i = 0; i < rep_count; i++) {
> > +			if (flags & HV_MAP_GPA_NO_ACCESS)
> >  				pfnlist[i] = 0;
> > -			} else if (pages) {
> > -				u64 index = (done + i) << large_shift;
> > -
> > -				if (index >= page_struct_count) {
> > -					ret = -EINVAL;
> > -					break;
> > -				}
> > -				pfnlist[i] = page_to_pfn(pages[index]);
> > -			} else {
> > +			else if (pages)
> > +				pfnlist[i] = page_to_pfn(pages[(done + i) <<
> > +							 large_shift]);
> > +			else
> >  				pfnlist[i] = mmio_spa + done + i;
> > -			}
> > -		if (ret) {
> > -			local_irq_restore(irq_flags);
> > -			break;
> >  		}
> > 
> >  		status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0,
> > @@ -248,29 +239,26 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64
> > page_struct_count,
> >  		local_irq_restore(irq_flags);
> > 
> >  		completed = hv_repcomp(status);
> > +		done += completed;
> > 
> >  		if (hv_result_needs_memory(status)) {
> >  			ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
> >  						    HV_MAP_GPA_DEPOSIT_PAGES);
> >  			if (ret)
> >  				break;
> > -
> >  		} else if (!hv_result_success(status)) {
> >  			ret = hv_result_to_errno(status);
> >  			break;
> >  		}
> > -
> > -		done += completed;
> >  	}
> > 
> >  	if (ret && done) {
> >  		u32 unmap_flags = 0;
> > 
> > -		if (flags & HV_MAP_GPA_LARGE_PAGE) {
> > +		if (flags & HV_MAP_GPA_LARGE_PAGE)
> >  			unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
> > -			done <<= large_shift;
> > -		}
> > -		hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags);
> > +		hv_call_unmap_gpa_pages(partition_id, gfn,
> > +					done << large_shift, unmap_flags);
> >  	}
> > 
> >  	return ret;
> > @@ -305,7 +293,7 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64
> > page_count_4k,
> >  	struct hv_input_unmap_gpa_pages *input_page;
> >  	u64 status, page_count = page_count_4k;
> >  	unsigned long irq_flags, large_shift = 0;
> > -	int ret = 0, done = 0;
> > +	u64 done = 0;
> > 
> >  	if (page_count == 0)
> >  		return -EINVAL;
> > @@ -319,8 +307,8 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64
> > page_count_4k,
> >  	}
> > 
> >  	while (done < page_count) {
> > -		ulong completed, remain = page_count - done;
> > -		int rep_count = min(remain, HV_UMAP_GPA_PAGES);
> > +		u64 completed, remain = page_count - done;
> > +		u64 rep_count = min_t(u64, remain, HV_UMAP_GPA_PAGES);
> > 
> >  		local_irq_save(irq_flags);
> >  		input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
> > @@ -333,15 +321,13 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64
> > page_count_4k,
> >  		local_irq_restore(irq_flags);
> > 
> >  		completed = hv_repcomp(status);
> > -		if (!hv_result_success(status)) {
> > -			ret = hv_result_to_errno(status);
> > -			break;
> > -		}
> > -
> >  		done += completed;
> > +
> > +		if (!hv_result_success(status))
> > +			return hv_result_to_errno(status);
> >  	}
> > 
> > -	return ret;
> > +	return 0;
> >  }
> > 
> >  int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn,
> > 
> > 
> 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox