Linux-HyperV List

Linux-HyperV List
 help / color / mirror / Atom feed

* [PATCH net-next v7 3/6] net: mana: Introduce GIC context with refcounting for interrupt management
From: Long Li @ 2026-05-07 19:12 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui, shradhagupta
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260507191237.438671-1-longli@microsoft.com>

To allow Ethernet EQs to use dedicated or shared MSI-X vectors and RDMA
EQs to share the same MSI-X, introduce a GIC (GDMA IRQ Context) with
reference counting. This allows the driver to create an interrupt context
on an assigned or unassigned MSI-X vector and share it across multiple
EQ consumers.

Signed-off-by: Long Li <longli@microsoft.com>
---
 .../net/ethernet/microsoft/mana/gdma_main.c   | 159 ++++++++++++++++++
 include/net/mana/gdma.h                       |  12 ++
 2 files changed, 171 insertions(+)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 3aa96329f359..5930ab817056 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -1615,6 +1615,164 @@ static irqreturn_t mana_gd_intr(int irq, void *arg)
 	return IRQ_HANDLED;
 }
 
+void mana_gd_put_gic(struct gdma_context *gc, bool use_msi_bitmap, int msi)
+{
+	struct pci_dev *dev = to_pci_dev(gc->dev);
+	struct msi_map irq_map;
+	struct gdma_irq_context *gic;
+	int irq;
+
+	mutex_lock(&gc->gic_mutex);
+
+	gic = xa_load(&gc->irq_contexts, msi);
+	if (WARN_ON(!gic)) {
+		mutex_unlock(&gc->gic_mutex);
+		return;
+	}
+
+	if (use_msi_bitmap)
+		gic->bitmap_refs--;
+
+	if (use_msi_bitmap && gic->bitmap_refs == 0)
+		clear_bit(msi, gc->msi_bitmap);
+
+	if (!refcount_dec_and_test(&gic->refcount))
+		goto out;
+
+	irq = gic->irq;
+
+	irq_update_affinity_hint(irq, NULL);
+	free_irq(irq, gic);
+
+	if (gic->dyn_msix) {
+		irq_map.virq = irq;
+		irq_map.index = msi;
+		pci_msix_free_irq(dev, irq_map);
+	}
+
+	xa_erase(&gc->irq_contexts, msi);
+	kfree(gic);
+
+out:
+	mutex_unlock(&gc->gic_mutex);
+}
+EXPORT_SYMBOL_NS(mana_gd_put_gic, "NET_MANA");
+
+/*
+ * Get a GIC (GDMA IRQ Context) on an MSI vector.
+ * An MSI can be shared between different EQs. This function supports either
+ * picking a free MSI from the bitmap or using a caller-supplied MSI index.
+ *
+ * @use_msi_bitmap:
+ * True if the MSI is assigned by this function from a free slot in the bitmap.
+ * False if the MSI index is passed in through *msi_requested.
+ */
+struct gdma_irq_context *mana_gd_get_gic(struct gdma_context *gc,
+					 bool use_msi_bitmap,
+					 int *msi_requested)
+{
+	struct gdma_irq_context *gic;
+	struct pci_dev *dev = to_pci_dev(gc->dev);
+	struct msi_map irq_map = { };
+	int irq;
+	int msi;
+	int err;
+
+	mutex_lock(&gc->gic_mutex);
+
+	if (use_msi_bitmap) {
+		msi = find_first_zero_bit(gc->msi_bitmap, gc->num_msix_usable);
+		if (msi >= gc->num_msix_usable) {
+			dev_err(gc->dev, "No free MSI vectors available\n");
+			gic = NULL;
+			goto out;
+		}
+		*msi_requested = msi;
+	} else {
+		msi = *msi_requested;
+	}
+
+	gic = xa_load(&gc->irq_contexts, msi);
+	if (gic) {
+		refcount_inc(&gic->refcount);
+		if (use_msi_bitmap) {
+			gic->bitmap_refs++;
+			set_bit(msi, gc->msi_bitmap);
+		}
+		goto out;
+	}
+
+	irq = pci_irq_vector(dev, msi);
+	if (irq == -EINVAL) {
+		irq_map = pci_msix_alloc_irq_at(dev, msi, NULL);
+		if (!irq_map.virq) {
+			err = irq_map.index;
+			dev_err(gc->dev,
+				"Failed to alloc irq_map msi %d err %d\n",
+				msi, err);
+			gic = NULL;
+			goto out;
+		}
+		irq = irq_map.virq;
+		msi = irq_map.index;
+	}
+
+	gic = kzalloc(sizeof(*gic), GFP_KERNEL);
+	if (!gic) {
+		if (irq_map.virq)
+			pci_msix_free_irq(dev, irq_map);
+		goto out;
+	}
+
+	gic->handler = mana_gd_process_eq_events;
+	gic->msi = msi;
+	gic->irq = irq;
+	INIT_LIST_HEAD(&gic->eq_list);
+	spin_lock_init(&gic->lock);
+
+	if (!gic->msi)
+		snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_hwc@pci:%s",
+			 pci_name(dev));
+	else
+		snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_msi%d@pci:%s",
+			 gic->msi, pci_name(dev));
+
+	err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
+	if (err) {
+		dev_err(gc->dev, "Failed to request irq %d %s\n",
+			irq, gic->name);
+		kfree(gic);
+		gic = NULL;
+		if (irq_map.virq)
+			pci_msix_free_irq(dev, irq_map);
+		goto out;
+	}
+
+	gic->dyn_msix = !!irq_map.virq;
+	refcount_set(&gic->refcount, 1);
+	gic->bitmap_refs = use_msi_bitmap ? 1 : 0;
+
+	err = xa_err(xa_store(&gc->irq_contexts, msi, gic, GFP_KERNEL));
+	if (err) {
+		dev_err(gc->dev, "Failed to store irq context for msi %d: %d\n",
+			msi, err);
+		free_irq(irq, gic);
+		kfree(gic);
+		gic = NULL;
+		if (irq_map.virq)
+			pci_msix_free_irq(dev, irq_map);
+		goto out;
+	}
+
+	if (use_msi_bitmap)
+		set_bit(msi, gc->msi_bitmap);
+
+out:
+	mutex_unlock(&gc->gic_mutex);
+	return gic;
+}
+EXPORT_SYMBOL_NS(mana_gd_get_gic, "NET_MANA");
+
 int mana_gd_alloc_res_map(u32 res_avail, struct gdma_resource *r)
 {
 	r->map = bitmap_zalloc(res_avail, GFP_KERNEL);
@@ -2104,6 +2262,7 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto release_region;
 
 	mutex_init(&gc->eq_test_event_mutex);
+	mutex_init(&gc->gic_mutex);
 	pci_set_drvdata(pdev, gc);
 	gc->bar0_pa = pci_resource_start(pdev, 0);
 	gc->bar0_size = pci_resource_len(pdev, 0);
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 9c05b1e15c3e..fbe3c1427b45 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -388,6 +388,11 @@ struct gdma_irq_context {
 	spinlock_t lock;
 	struct list_head eq_list;
 	char name[MANA_IRQ_NAME_SZ];
+	unsigned int msi;
+	unsigned int irq;
+	refcount_t refcount;
+	unsigned int bitmap_refs;
+	bool dyn_msix;
 };
 
 enum gdma_context_flags {
@@ -449,6 +454,9 @@ struct gdma_context {
 
 	unsigned long		flags;
 
+	/* Protect access to GIC context */
+	struct mutex		gic_mutex;
+
 	/* Indicate if this device is sharing MSI for EQs on MANA */
 	bool msi_sharing;
 
@@ -1026,6 +1034,10 @@ int mana_gd_resume(struct pci_dev *pdev);
 
 bool mana_need_log(struct gdma_context *gc, int err);
 
+struct gdma_irq_context *mana_gd_get_gic(struct gdma_context *gc,
+					 bool use_msi_bitmap,
+					 int *msi_requested);
+void mana_gd_put_gic(struct gdma_context *gc, bool use_msi_bitmap, int msi);
 int mana_gd_query_device_cfg(struct gdma_context *gc, u32 proto_major_ver,
 			     u32 proto_minor_ver, u32 proto_micro_ver,
 			     u16 *max_num_vports, u8 *bm_hostmode);
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v7 2/6] net: mana: Query device capabilities and configure MSI-X sharing for EQs
From: Long Li @ 2026-05-07 19:12 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui, shradhagupta
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260507191237.438671-1-longli@microsoft.com>

When querying the device, adjust the max number of queues to allow
dedicated MSI-X vectors for each vPort. The number of queues per vPort
is clamped to no less than MANA_DEF_NUM_QUEUES. MSI-X sharing among
vPorts is disabled by default and is only enabled when there are not
enough MSI-X vectors for dedicated allocation.

Rename mana_query_device_cfg() to mana_gd_query_device_cfg() as it is
used at GDMA device probe time for querying device capabilities.

Signed-off-by: Long Li <longli@microsoft.com>
---
 .../net/ethernet/microsoft/mana/gdma_main.c   | 56 ++++++++++++++++++-
 drivers/net/ethernet/microsoft/mana/mana_en.c | 37 +++++++-----
 include/net/mana/gdma.h                       | 13 ++++-
 3 files changed, 87 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index f3316e929175..3aa96329f359 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -149,6 +149,9 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
 	struct gdma_context *gc = pci_get_drvdata(pdev);
 	struct gdma_query_max_resources_resp resp = {};
 	struct gdma_general_req req = {};
+	unsigned int max_num_queues;
+	u8 bm_hostmode;
+	u16 num_ports;
 	int err;
 
 	mana_gd_init_req_hdr(&req.hdr, GDMA_QUERY_MAX_RESOURCES,
@@ -197,6 +200,40 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
 	if (gc->max_num_queues == 0)
 		return -ENOSPC;
 
+	err = mana_gd_query_device_cfg(gc, MANA_MAJOR_VERSION, MANA_MINOR_VERSION,
+				       MANA_MICRO_VERSION, &num_ports, &bm_hostmode);
+	if (err)
+		return err;
+
+	if (!num_ports)
+		return -EINVAL;
+
+	/*
+	 * Adjust gc->max_num_queues returned from the SOC to allow dedicated
+	 * MSIx for each vPort. Clamp to no less than MANA_DEF_NUM_QUEUES.
+	 */
+	max_num_queues = (gc->num_msix_usable - 1) / num_ports;
+	max_num_queues = rounddown_pow_of_two(max(max_num_queues, 1U));
+	if (max_num_queues < MANA_DEF_NUM_QUEUES)
+		max_num_queues = MANA_DEF_NUM_QUEUES;
+
+	/*
+	 * Use dedicated MSIx for EQs whenever possible, use MSIx sharing for
+	 * Ethernet EQs when (max_num_queues * num_ports > num_msix_usable - 1)
+	 */
+	max_num_queues = min(gc->max_num_queues, max_num_queues);
+	if (max_num_queues * num_ports > gc->num_msix_usable - 1)
+		gc->msi_sharing = true;
+
+	/* If MSI is shared, use max allowed value */
+	if (gc->msi_sharing)
+		gc->max_num_queues_vport = min(gc->num_msix_usable - 1, gc->max_num_queues);
+	else
+		gc->max_num_queues_vport = max_num_queues;
+
+	dev_info(gc->dev, "MSI sharing mode %d max queues %d\n",
+		 gc->msi_sharing, gc->max_num_queues);
+
 	return 0;
 }
 
@@ -1859,6 +1896,7 @@ static int mana_gd_setup_hwc_irqs(struct pci_dev *pdev)
 		/* Need 1 interrupt for HWC */
 		max_irqs = min(num_online_cpus(), MANA_MAX_NUM_QUEUES) + 1;
 		min_irqs = 2;
+		gc->msi_sharing = true;
 	}
 
 	nvec = pci_alloc_irq_vectors(pdev, min_irqs, max_irqs, PCI_IRQ_MSIX);
@@ -1937,6 +1975,8 @@ static void mana_gd_remove_irqs(struct pci_dev *pdev)
 
 	pci_free_irq_vectors(pdev);
 
+	bitmap_free(gc->msi_bitmap);
+	gc->msi_bitmap = NULL;
 	gc->max_num_msix = 0;
 	gc->num_msix_usable = 0;
 }
@@ -1971,6 +2011,10 @@ static int mana_gd_setup(struct pci_dev *pdev)
 	if (err)
 		goto destroy_hwc;
 
+	err = mana_gd_detect_devices(pdev);
+	if (err)
+		goto destroy_hwc;
+
 	err = mana_gd_query_max_resources(pdev);
 	if (err)
 		goto destroy_hwc;
@@ -1981,9 +2025,15 @@ static int mana_gd_setup(struct pci_dev *pdev)
 		goto destroy_hwc;
 	}
 
-	err = mana_gd_detect_devices(pdev);
-	if (err)
-		goto destroy_hwc;
+	if (!gc->msi_sharing) {
+		gc->msi_bitmap = bitmap_zalloc(gc->num_msix_usable, GFP_KERNEL);
+		if (!gc->msi_bitmap) {
+			err = -ENOMEM;
+			goto destroy_hwc;
+		}
+		/* Set bit for HWC */
+		set_bit(0, gc->msi_bitmap);
+	}
 
 	dev_dbg(&pdev->dev, "mana gdma setup successful\n");
 	return 0;
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index a13204b3ee79..2f106d6f5be4 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1007,10 +1007,9 @@ static int mana_init_port_context(struct mana_port_context *apc)
 	return !apc->rxqs ? -ENOMEM : 0;
 }
 
-static int mana_send_request(struct mana_context *ac, void *in_buf,
-			     u32 in_len, void *out_buf, u32 out_len)
+static int gdma_mana_send_request(struct gdma_context *gc, void *in_buf,
+				  u32 in_len, void *out_buf, u32 out_len)
 {
-	struct gdma_context *gc = ac->gdma_dev->gdma_context;
 	struct gdma_resp_hdr *resp = out_buf;
 	struct gdma_req_hdr *req = in_buf;
 	struct device *dev = gc->dev;
@@ -1044,6 +1043,14 @@ static int mana_send_request(struct mana_context *ac, void *in_buf,
 	return 0;
 }
 
+static int mana_send_request(struct mana_context *ac, void *in_buf,
+			     u32 in_len, void *out_buf, u32 out_len)
+{
+	struct gdma_context *gc = ac->gdma_dev->gdma_context;
+
+	return gdma_mana_send_request(gc, in_buf, in_len, out_buf, out_len);
+}
+
 static int mana_verify_resp_hdr(const struct gdma_resp_hdr *resp_hdr,
 				const enum mana_command_code expected_code,
 				const u32 min_size)
@@ -1177,11 +1184,10 @@ static void mana_pf_deregister_filter(struct mana_port_context *apc)
 			   err, resp.hdr.status);
 }
 
-static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver,
-				 u32 proto_minor_ver, u32 proto_micro_ver,
-				 u16 *max_num_vports, u8 *bm_hostmode)
+int mana_gd_query_device_cfg(struct gdma_context *gc, u32 proto_major_ver,
+			     u32 proto_minor_ver, u32 proto_micro_ver,
+			     u16 *max_num_vports, u8 *bm_hostmode)
 {
-	struct gdma_context *gc = ac->gdma_dev->gdma_context;
 	struct mana_query_device_cfg_resp resp = {};
 	struct mana_query_device_cfg_req req = {};
 	struct device *dev = gc->dev;
@@ -1196,7 +1202,7 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver,
 	req.proto_minor_ver = proto_minor_ver;
 	req.proto_micro_ver = proto_micro_ver;
 
-	err = mana_send_request(ac, &req, sizeof(req), &resp, sizeof(resp));
+	err = gdma_mana_send_request(gc, &req, sizeof(req), &resp, sizeof(resp));
 	if (err) {
 		dev_err(dev, "Failed to query config: %d", err);
 		return err;
@@ -1230,8 +1236,6 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver,
 	else
 		*bm_hostmode = 0;
 
-	debugfs_create_u16("adapter-MTU", 0400, gc->mana_pci_debugfs, &gc->adapter_mtu);
-
 	return 0;
 }
 
@@ -3415,7 +3419,7 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
 	int err;
 
 	ndev = alloc_etherdev_mq(sizeof(struct mana_port_context),
-				 gc->max_num_queues);
+				 gc->max_num_queues_vport);
 	if (!ndev)
 		return -ENOMEM;
 
@@ -3424,9 +3428,9 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
 	apc = netdev_priv(ndev);
 	apc->ac = ac;
 	apc->ndev = ndev;
-	apc->max_queues = gc->max_num_queues;
+	apc->max_queues = gc->max_num_queues_vport;
 	/* Use MANA_DEF_NUM_QUEUES as default, still honoring the HW limit */
-	apc->num_queues = min(gc->max_num_queues, MANA_DEF_NUM_QUEUES);
+	apc->num_queues = min(gc->max_num_queues_vport, MANA_DEF_NUM_QUEUES);
 	apc->tx_queue_size = DEF_TX_BUFFERS_PER_QUEUE;
 	apc->rx_queue_size = DEF_RX_BUFFERS_PER_QUEUE;
 	apc->port_handle = INVALID_MANA_HANDLE;
@@ -3690,13 +3694,16 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 
 	INIT_DELAYED_WORK(&ac->gf_stats_work, mana_gf_stats_work_handler);
 
-	err = mana_query_device_cfg(ac, MANA_MAJOR_VERSION, MANA_MINOR_VERSION,
-				    MANA_MICRO_VERSION, &num_ports, &bm_hostmode);
+	err = mana_gd_query_device_cfg(gc, MANA_MAJOR_VERSION, MANA_MINOR_VERSION,
+				       MANA_MICRO_VERSION, &num_ports, &bm_hostmode);
 	if (err)
 		goto out;
 
 	ac->bm_hostmode = bm_hostmode;
 
+	debugfs_create_u16("adapter-MTU", 0400,
+			   gc->mana_pci_debugfs, &gc->adapter_mtu);
+
 	if (!resuming) {
 		ac->num_ports = num_ports;
 	} else {
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 6d836060976a..9c05b1e15c3e 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -399,8 +399,10 @@ struct gdma_context {
 	struct device		*dev;
 	struct dentry		*mana_pci_debugfs;
 
-	/* Per-vPort max number of queues */
+	/* Hardware max number of queues */
 	unsigned int		max_num_queues;
+	/* Per-vPort max number of queues */
+	unsigned int		max_num_queues_vport;
 	unsigned int		max_num_msix;
 	unsigned int		num_msix_usable;
 	struct xarray		irq_contexts;
@@ -446,6 +448,12 @@ struct gdma_context {
 	struct workqueue_struct *service_wq;
 
 	unsigned long		flags;
+
+	/* Indicate if this device is sharing MSI for EQs on MANA */
+	bool msi_sharing;
+
+	/* Bitmap tracks where MSI is allocated when it is not shared for EQs */
+	unsigned long *msi_bitmap;
 };
 
 static inline bool mana_gd_is_mana(struct gdma_dev *gd)
@@ -1018,4 +1026,7 @@ int mana_gd_resume(struct pci_dev *pdev);
 
 bool mana_need_log(struct gdma_context *gc, int err);
 
+int mana_gd_query_device_cfg(struct gdma_context *gc, u32 proto_major_ver,
+			     u32 proto_minor_ver, u32 proto_micro_ver,
+			     u16 *max_num_vports, u8 *bm_hostmode);
 #endif /* _GDMA_H */
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v7 1/6] net: mana: Create separate EQs for each vPort
From: Long Li @ 2026-05-07 19:12 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui, shradhagupta
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260507191237.438671-1-longli@microsoft.com>

To prepare for assigning vPorts to dedicated MSI-X vectors, remove EQ
sharing among the vPorts and create dedicated EQs for each vPort.

Move the EQ definition from struct mana_context to struct mana_port_context
and update related support functions. Export mana_create_eq() and
mana_destroy_eq() for use by the MANA RDMA driver.

Signed-off-by: Long Li <longli@microsoft.com>
---
 drivers/infiniband/hw/mana/main.c             |  19 ++-
 drivers/infiniband/hw/mana/qp.c               |  16 ++-
 drivers/net/ethernet/microsoft/mana/mana_en.c | 111 ++++++++++--------
 include/net/mana/mana.h                       |   7 +-
 4 files changed, 98 insertions(+), 55 deletions(-)

diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
index ac5e75dd3494..8000ab6e8beb 100644
--- a/drivers/infiniband/hw/mana/main.c
+++ b/drivers/infiniband/hw/mana/main.c
@@ -20,8 +20,10 @@ void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd,
 	pd->vport_use_count--;
 	WARN_ON(pd->vport_use_count < 0);
 
-	if (!pd->vport_use_count)
+	if (!pd->vport_use_count) {
+		mana_destroy_eq(mpc);
 		mana_uncfg_vport(mpc);
+	}
 
 	mutex_unlock(&pd->vport_mutex);
 }
@@ -55,15 +57,22 @@ int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd,
 		return err;
 	}
 
-	mutex_unlock(&pd->vport_mutex);
 
 	pd->tx_shortform_allowed = mpc->tx_shortform_allowed;
 	pd->tx_vp_offset = mpc->tx_vp_offset;
+	err = mana_create_eq(mpc);
+	if (err) {
+		mana_uncfg_vport(mpc);
+		pd->vport_use_count--;
+	}
 
-	ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n",
-		  mpc->port_handle, pd->pdn, doorbell_id);
+	mutex_unlock(&pd->vport_mutex);
 
-	return 0;
+	if (!err)
+		ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n",
+			  mpc->port_handle, pd->pdn, doorbell_id);
+
+	return err;
 }
 
 int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
index 645581359cee..6f1043383e8c 100644
--- a/drivers/infiniband/hw/mana/qp.c
+++ b/drivers/infiniband/hw/mana/qp.c
@@ -168,7 +168,15 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
 		cq_spec.gdma_region = cq->queue.gdma_region;
 		cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE;
 		cq_spec.modr_ctx_id = 0;
-		eq = &mpc->ac->eqs[cq->comp_vector];
+		/* EQs are created when a raw QP configures the vport.
+		 * A raw QP must be created before creating rwq_ind_tbl.
+		 */
+		if (!mpc->eqs) {
+			ret = -EINVAL;
+			i--;
+			goto fail;
+		}
+		eq = &mpc->eqs[cq->comp_vector % mpc->num_queues];
 		cq_spec.attached_eq = eq->eq->id;
 
 		ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ,
@@ -317,7 +325,11 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd,
 	cq_spec.queue_size = send_cq->cqe * COMP_ENTRY_SIZE;
 	cq_spec.modr_ctx_id = 0;
 	eq_vec = send_cq->comp_vector;
-	eq = &mpc->ac->eqs[eq_vec];
+	if (!mpc->eqs) {
+		err = -EINVAL;
+		goto err_destroy_queue;
+	}
+	eq = &mpc->eqs[eq_vec % mpc->num_queues];
 	cq_spec.attached_eq = eq->eq->id;
 
 	err = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_SQ, &wq_spec,
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 462a457e7d53..a13204b3ee79 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1615,78 +1615,83 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type,
 }
 EXPORT_SYMBOL_NS(mana_destroy_wq_obj, "NET_MANA");
 
-static void mana_destroy_eq(struct mana_context *ac)
+void mana_destroy_eq(struct mana_port_context *apc)
 {
+	struct mana_context *ac = apc->ac;
 	struct gdma_context *gc = ac->gdma_dev->gdma_context;
 	struct gdma_queue *eq;
 	int i;
 
-	if (!ac->eqs)
+	if (!apc->eqs)
 		return;
 
-	debugfs_remove_recursive(ac->mana_eqs_debugfs);
-	ac->mana_eqs_debugfs = NULL;
+	debugfs_remove_recursive(apc->mana_eqs_debugfs);
+	apc->mana_eqs_debugfs = NULL;
 
-	for (i = 0; i < gc->max_num_queues; i++) {
-		eq = ac->eqs[i].eq;
+	for (i = 0; i < apc->num_queues; i++) {
+		eq = apc->eqs[i].eq;
 		if (!eq)
 			continue;
 
 		mana_gd_destroy_queue(gc, eq);
 	}
 
-	kfree(ac->eqs);
-	ac->eqs = NULL;
+	kfree(apc->eqs);
+	apc->eqs = NULL;
 }
+EXPORT_SYMBOL_NS(mana_destroy_eq, "NET_MANA");
 
-static void mana_create_eq_debugfs(struct mana_context *ac, int i)
+static void mana_create_eq_debugfs(struct mana_port_context *apc, int i)
 {
-	struct mana_eq eq = ac->eqs[i];
+	struct mana_eq eq = apc->eqs[i];
 	char eqnum[32];
 
 	sprintf(eqnum, "eq%d", i);
-	eq.mana_eq_debugfs = debugfs_create_dir(eqnum, ac->mana_eqs_debugfs);
+	eq.mana_eq_debugfs = debugfs_create_dir(eqnum, apc->mana_eqs_debugfs);
 	debugfs_create_u32("head", 0400, eq.mana_eq_debugfs, &eq.eq->head);
 	debugfs_create_u32("tail", 0400, eq.mana_eq_debugfs, &eq.eq->tail);
 	debugfs_create_file("eq_dump", 0400, eq.mana_eq_debugfs, eq.eq, &mana_dbg_q_fops);
 }
 
-static int mana_create_eq(struct mana_context *ac)
+int mana_create_eq(struct mana_port_context *apc)
 {
-	struct gdma_dev *gd = ac->gdma_dev;
+	struct gdma_dev *gd = apc->ac->gdma_dev;
 	struct gdma_context *gc = gd->gdma_context;
 	struct gdma_queue_spec spec = {};
 	int err;
 	int i;
 
-	ac->eqs = kzalloc_objs(struct mana_eq, gc->max_num_queues);
-	if (!ac->eqs)
+	WARN_ON(apc->eqs);
+	apc->eqs = kzalloc_objs(struct mana_eq, apc->num_queues);
+	if (!apc->eqs)
 		return -ENOMEM;
 
 	spec.type = GDMA_EQ;
 	spec.monitor_avl_buf = false;
 	spec.queue_size = EQ_SIZE;
 	spec.eq.callback = NULL;
-	spec.eq.context = ac->eqs;
+	spec.eq.context = apc->eqs;
 	spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE;
 
-	ac->mana_eqs_debugfs = debugfs_create_dir("EQs", gc->mana_pci_debugfs);
+	apc->mana_eqs_debugfs = debugfs_create_dir("EQs",
+						    apc->mana_port_debugfs);
 
-	for (i = 0; i < gc->max_num_queues; i++) {
+	for (i = 0; i < apc->num_queues; i++) {
 		spec.eq.msix_index = (i + 1) % gc->num_msix_usable;
-		err = mana_gd_create_mana_eq(gd, &spec, &ac->eqs[i].eq);
+		err = mana_gd_create_mana_eq(gd, &spec, &apc->eqs[i].eq);
 		if (err) {
 			dev_err(gc->dev, "Failed to create EQ %d : %d\n", i, err);
 			goto out;
 		}
-		mana_create_eq_debugfs(ac, i);
+		mana_create_eq_debugfs(apc, i);
 	}
 
 	return 0;
 out:
-	mana_destroy_eq(ac);
+	mana_destroy_eq(apc);
 	return err;
 }
+EXPORT_SYMBOL_NS(mana_create_eq, "NET_MANA");
 
 static int mana_fence_rq(struct mana_port_context *apc, struct mana_rxq *rxq)
 {
@@ -2451,7 +2456,7 @@ static int mana_create_txq(struct mana_port_context *apc,
 		spec.monitor_avl_buf = false;
 		spec.queue_size = cq_size;
 		spec.cq.callback = mana_schedule_napi;
-		spec.cq.parent_eq = ac->eqs[i].eq;
+		spec.cq.parent_eq = apc->eqs[i].eq;
 		spec.cq.context = cq;
 		err = mana_gd_create_mana_wq_cq(gd, &spec, &cq->gdma_cq);
 		if (err)
@@ -2844,13 +2849,12 @@ static void mana_create_rxq_debugfs(struct mana_port_context *apc, int idx)
 static int mana_add_rx_queues(struct mana_port_context *apc,
 			      struct net_device *ndev)
 {
-	struct mana_context *ac = apc->ac;
 	struct mana_rxq *rxq;
 	int err = 0;
 	int i;
 
 	for (i = 0; i < apc->num_queues; i++) {
-		rxq = mana_create_rxq(apc, i, &ac->eqs[i], ndev);
+		rxq = mana_create_rxq(apc, i, &apc->eqs[i], ndev);
 		if (!rxq) {
 			err = -ENOMEM;
 			netdev_err(ndev, "Failed to create rxq %d : %d\n", i, err);
@@ -2869,9 +2873,8 @@ static int mana_add_rx_queues(struct mana_port_context *apc,
 	return err;
 }
 
-static void mana_destroy_vport(struct mana_port_context *apc)
+static void mana_destroy_rxqs(struct mana_port_context *apc)
 {
-	struct gdma_dev *gd = apc->ac->gdma_dev;
 	struct mana_rxq *rxq;
 	u32 rxq_idx;
 
@@ -2883,8 +2886,12 @@ static void mana_destroy_vport(struct mana_port_context *apc)
 		mana_destroy_rxq(apc, rxq, true);
 		apc->rxqs[rxq_idx] = NULL;
 	}
+}
+
+static void mana_destroy_vport(struct mana_port_context *apc)
+{
+	struct gdma_dev *gd = apc->ac->gdma_dev;
 
-	mana_destroy_txq(apc);
 	mana_uncfg_vport(apc);
 
 	if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode)
@@ -2905,11 +2912,7 @@ static int mana_create_vport(struct mana_port_context *apc,
 			return err;
 	}
 
-	err = mana_cfg_vport(apc, gd->pdid, gd->doorbell);
-	if (err)
-		return err;
-
-	return mana_create_txq(apc, net);
+	return mana_cfg_vport(apc, gd->pdid, gd->doorbell);
 }
 
 static int mana_rss_table_alloc(struct mana_port_context *apc)
@@ -3195,21 +3198,36 @@ int mana_alloc_queues(struct net_device *ndev)
 
 	err = mana_create_vport(apc, ndev);
 	if (err) {
-		netdev_err(ndev, "Failed to create vPort %u : %d\n", apc->port_idx, err);
+		netdev_err(ndev, "Failed to create vPort %u : %d\n",
+			   apc->port_idx, err);
 		return err;
 	}
 
+	err = mana_create_eq(apc);
+	if (err) {
+		netdev_err(ndev, "Failed to create EQ on vPort %u: %d\n",
+			   apc->port_idx, err);
+		goto destroy_vport;
+	}
+
+	err = mana_create_txq(apc, ndev);
+	if (err) {
+		netdev_err(ndev, "Failed to create TXQ on vPort %u: %d\n",
+			   apc->port_idx, err);
+		goto destroy_eq;
+	}
+
 	err = netif_set_real_num_tx_queues(ndev, apc->num_queues);
 	if (err) {
 		netdev_err(ndev,
 			   "netif_set_real_num_tx_queues () failed for ndev with num_queues %u : %d\n",
 			   apc->num_queues, err);
-		goto destroy_vport;
+		goto destroy_txq;
 	}
 
 	err = mana_add_rx_queues(apc, ndev);
 	if (err)
-		goto destroy_vport;
+		goto destroy_rxq;
 
 	apc->rss_state = apc->num_queues > 1 ? TRI_STATE_TRUE : TRI_STATE_FALSE;
 
@@ -3218,7 +3236,7 @@ int mana_alloc_queues(struct net_device *ndev)
 		netdev_err(ndev,
 			   "netif_set_real_num_rx_queues () failed for ndev with num_queues %u : %d\n",
 			   apc->num_queues, err);
-		goto destroy_vport;
+		goto destroy_rxq;
 	}
 
 	mana_rss_table_init(apc);
@@ -3226,19 +3244,25 @@ int mana_alloc_queues(struct net_device *ndev)
 	err = mana_config_rss(apc, TRI_STATE_TRUE, true, true);
 	if (err) {
 		netdev_err(ndev, "Failed to configure RSS table: %d\n", err);
-		goto destroy_vport;
+		goto destroy_rxq;
 	}
 
 	if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode) {
 		err = mana_pf_register_filter(apc);
 		if (err)
-			goto destroy_vport;
+			goto destroy_rxq;
 	}
 
 	mana_chn_setxdp(apc, mana_xdp_get(apc));
 
 	return 0;
 
+destroy_rxq:
+	mana_destroy_rxqs(apc);
+destroy_txq:
+	mana_destroy_txq(apc);
+destroy_eq:
+	mana_destroy_eq(apc);
 destroy_vport:
 	mana_destroy_vport(apc);
 	return err;
@@ -3343,6 +3367,9 @@ static int mana_dealloc_queues(struct net_device *ndev)
 	mana_fence_rqs(apc);
 
 	/* Even in err case, still need to cleanup the vPort */
+	mana_destroy_rxqs(apc);
+	mana_destroy_txq(apc);
+	mana_destroy_eq(apc);
 	mana_destroy_vport(apc);
 
 	return 0;
@@ -3663,12 +3690,6 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 
 	INIT_DELAYED_WORK(&ac->gf_stats_work, mana_gf_stats_work_handler);
 
-	err = mana_create_eq(ac);
-	if (err) {
-		dev_err(dev, "Failed to create EQs: %d\n", err);
-		goto out;
-	}
-
 	err = mana_query_device_cfg(ac, MANA_MAJOR_VERSION, MANA_MINOR_VERSION,
 				    MANA_MICRO_VERSION, &num_ports, &bm_hostmode);
 	if (err)
@@ -3808,8 +3829,6 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
 		free_netdev(ndev);
 	}
 
-	mana_destroy_eq(ac);
-
 	if (ac->per_port_queue_reset_wq) {
 		destroy_workqueue(ac->per_port_queue_reset_wq);
 		ac->per_port_queue_reset_wq = NULL;
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index aa90a858c8e3..c8e7d16f6685 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -480,8 +480,6 @@ struct mana_context {
 	u8 bm_hostmode;
 
 	struct mana_ethtool_hc_stats hc_stats;
-	struct mana_eq *eqs;
-	struct dentry *mana_eqs_debugfs;
 	struct workqueue_struct *per_port_queue_reset_wq;
 	/* Workqueue for querying hardware stats */
 	struct delayed_work gf_stats_work;
@@ -501,6 +499,9 @@ struct mana_port_context {
 
 	u8 mac_addr[ETH_ALEN];
 
+	struct mana_eq *eqs;
+	struct dentry *mana_eqs_debugfs;
+
 	enum TRI_STATE rss_state;
 
 	mana_handle_t default_rxobj;
@@ -1034,6 +1035,8 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type,
 int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
 		   u32 doorbell_pg_id);
 void mana_uncfg_vport(struct mana_port_context *apc);
+int mana_create_eq(struct mana_port_context *apc);
+void mana_destroy_eq(struct mana_port_context *apc);
 
 struct net_device *mana_get_primary_netdev(struct mana_context *ac,
 					   u32 port_index,
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v7 0/6] net: mana: Per-vPort EQ and MSI-X interrupt management
From: Long Li @ 2026-05-07 19:12 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui, shradhagupta
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel

This series adds per-vPort Event Queue (EQ) allocation and MSI-X interrupt
management for the MANA driver. Previously, all vPorts shared a single set
of EQs. This change enables dedicated EQs per vPort with support for both
dedicated and shared MSI-X vector allocation modes.

Patch 1 moves EQ ownership from mana_context to per-vPort mana_port_context
and exports create/destroy functions for the RDMA driver. Also adds EQ
create/destroy calls to mana_ib_cfg_vport/uncfg_vport so RDMA vPorts get
their own EQs.

Patch 2 adds device capability queries to determine whether MSI-X vectors
should be dedicated per-vPort or shared. When the number of available MSI-X
vectors is insufficient for dedicated allocation, the driver enables sharing
mode with bitmap-based vector assignment.

Patch 3 introduces the GIC (GDMA IRQ Context) abstraction with reference
counting, allowing multiple EQs to safely share a single MSI-X vector.

Patch 4 converts the global EQ allocation in probe/resume to use the new
GIC functions.

Patch 5 adds per-vPort GIC lifecycle management, calling get/put on each
EQ creation and destruction during vPort open/close.

Patch 6 extends the same GIC lifecycle management to the RDMA driver's EQ
allocation path.

Changes in v7:
- Rebased on net-next/main
- Patch 1: Guard ibdev_dbg() in mana_ib_cfg_vport() with error check so
  the vport handle is not logged on the failure path
- Patch 1: Fix checkpatch line length warning in debugfs_create_dir() call
- Patch 2: Use rounddown_pow_of_two() instead of roundup_pow_of_two() when
  computing per-vPort queue count to avoid unnecessarily forcing shared
  MSI-X mode in borderline configurations
- Patch 2: Call mana_gd_setup_remaining_irqs() unconditionally to ensure
  irq_contexts are populated in both dedicated and shared MSI-X modes,
  fixing bisectability between patches 2 and 5
- Patch 2: Fix checkpatch line length warning in debugfs_create_u16() call
- Patch 3: Use cached gic->irq instead of pci_irq_vector() lookup in
  mana_gd_put_gic() for consistency with the allocation path
- Patch 3: Fix checkpatch line length warning in mana_gd_get_gic()
  declaration
- Patch 5: Fix unsigned int* to int* pointer type mismatch when calling
  mana_gd_get_gic() by using a local int variable for the MSI index
- Patch 6: Fix same unsigned int* to int* pointer type mismatch in RDMA
  EQ creation path

Changes in v6:
- Rebased on net-next/main (v7.1-rc1)

Changes in v5:
- Rebased on net-next/main

Changes in v4:
- Rebased on net-next/main 7.0-rc4
- Patch 2: Use MANA_DEF_NUM_QUEUES instead of hardcoded 16 for
  max_num_queues clamping
- Patch 3: Track dyn_msix in GIC context instead of re-checking
  pci_msix_can_alloc_dyn() on each call; improved remove_irqs iteration
  to skip unallocated entries

Changes in v3:
- Rebased on net-next/main
- Patch 1: Added NULL check for mpc->eqs in mana_ib_create_qp_rss() to
  prevent NULL pointer dereference when RSS QP is created before a raw QP
  has configured the vport and allocated EQs

Changes in v2:
- Rebased on net-next/main (adapted to kzalloc_objs/kzalloc_obj macros,
  new GDMA_DRV_CAP_FLAG definitions)
- Patch 2: Fixed misleading comment for max_num_queues vs
  max_num_queues_vport in gdma.h
- Patch 3: Fixed spelling typo in gdma_main.c ("difference" -> "different")

Long Li (6):
  net: mana: Create separate EQs for each vPort
  net: mana: Query device capabilities and configure MSI-X sharing for
    EQs
  net: mana: Introduce GIC context with refcounting for interrupt
    management
  net: mana: Use GIC functions to allocate global EQs
  net: mana: Allocate interrupt context for each EQ when creating vPort
  RDMA/mana_ib: Allocate interrupt contexts on EQs

 drivers/infiniband/hw/mana/main.c             |  60 +++-
 drivers/infiniband/hw/mana/qp.c               |  16 +-
 .../net/ethernet/microsoft/mana/gdma_main.c   | 297 +++++++++++++-----
 drivers/net/ethernet/microsoft/mana/mana_en.c | 168 ++++++----
 include/net/mana/gdma.h                       |  33 +-
 include/net/mana/mana.h                       |   7 +-
 6 files changed, 425 insertions(+), 156 deletions(-)

-- 
2.43.0

^ permalink raw reply

* RE: [PATCH v3] mshv: Simplify GPA map/unmap hypercall helpers
From: Michael Kelley @ 2026-05-07 19:09 UTC (permalink / raw)
  To: Stanislav Kinsburskii, kys@microsoft.com, haiyangz@microsoft.com,
	wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com
  Cc: linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <177756065245.17889.140699174692055235.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> 
> Clean up hv_do_map_gpa_hcall() and hv_call_unmap_gpa_pages() after the
> preceding bug-fix patches:
> 
> Move "done += completed" before the status checks so that pages mapped
> by a partially-successful batch are included in the error cleanup unmap.
> Previously these mappings were leaked on failure.
> 
> While here, improve type safety and readability:
>  - Change "int done" to "u64 done" to match the u64 page_count it is
>    compared against, avoiding signed/unsigned comparison hazards.
>  - Use u64 for loop iteration and batch size variables consistently.
>  - Add proper braces to the for-loop body in hv_do_map_gpa_hcall().
>  - Remove unnecessary "ret" variable from hv_call_unmap_gpa_pages().
>  - Simplify the error-path unmap to use "done << large_shift" directly
>    instead of mutating done in place.
> 
> v3: aligned changes by 80 columns
> v2: replaced min with min_t
> 
> Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
> Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>

Question about "packaging" of this patch. To apply cleanly, it
needs the previous two fixes applied.  As such, shouldn't it be
the 3rd patch in a patch set that includes the other two?

Also, there are changes in the previous two fixes that get undone
or changed by this patch (such as applying large_shift in the error
path of hv_do_map_gpa_hcall()). With a little more coordination
between the three patches, there could be less code churn and
the patches would overall be smaller.

Michael

> ---
>  drivers/hv/mshv_root_hv_call.c |   56 +++++++++++++++-------------------------
>  1 file changed, 21 insertions(+), 35 deletions(-)
> 
> diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
> index e5992c324904a..e1f9e28d5a19b 100644
> --- a/drivers/hv/mshv_root_hv_call.c
> +++ b/drivers/hv/mshv_root_hv_call.c
> @@ -195,8 +195,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64
> page_struct_count,
>  	struct hv_input_map_gpa_pages *input_page;
>  	u64 status, *pfnlist;
>  	unsigned long irq_flags, large_shift = 0;
> -	int ret = 0, done = 0;
> -	u64 page_count = page_struct_count;
> +	u64 done = 0, page_count = page_struct_count;
> +	int ret = 0;
> 
>  	if (page_count == 0 || (pages && mmio_spa))
>  		return -EINVAL;
> @@ -213,8 +213,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64
> page_struct_count,
>  	}
> 
>  	while (done < page_count) {
> -		ulong i, completed, remain = page_count - done;
> -		int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE);
> +		u64 i, completed, remain = page_count - done;
> +		u64 rep_count = min_t(u64, remain, HV_MAP_GPA_BATCH_SIZE);
> 
>  		local_irq_save(irq_flags);
>  		input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
> @@ -224,23 +224,14 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64
> page_struct_count,
>  		input_page->map_flags = flags;
>  		pfnlist = input_page->source_gpa_page_list;
> 
> -		for (i = 0; i < rep_count; i++)
> -			if (flags & HV_MAP_GPA_NO_ACCESS) {
> +		for (i = 0; i < rep_count; i++) {
> +			if (flags & HV_MAP_GPA_NO_ACCESS)
>  				pfnlist[i] = 0;
> -			} else if (pages) {
> -				u64 index = (done + i) << large_shift;
> -
> -				if (index >= page_struct_count) {
> -					ret = -EINVAL;
> -					break;
> -				}
> -				pfnlist[i] = page_to_pfn(pages[index]);
> -			} else {
> +			else if (pages)
> +				pfnlist[i] = page_to_pfn(pages[(done + i) <<
> +							 large_shift]);
> +			else
>  				pfnlist[i] = mmio_spa + done + i;
> -			}
> -		if (ret) {
> -			local_irq_restore(irq_flags);
> -			break;
>  		}
> 
>  		status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0,
> @@ -248,29 +239,26 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64
> page_struct_count,
>  		local_irq_restore(irq_flags);
> 
>  		completed = hv_repcomp(status);
> +		done += completed;
> 
>  		if (hv_result_needs_memory(status)) {
>  			ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
>  						    HV_MAP_GPA_DEPOSIT_PAGES);
>  			if (ret)
>  				break;
> -
>  		} else if (!hv_result_success(status)) {
>  			ret = hv_result_to_errno(status);
>  			break;
>  		}
> -
> -		done += completed;
>  	}
> 
>  	if (ret && done) {
>  		u32 unmap_flags = 0;
> 
> -		if (flags & HV_MAP_GPA_LARGE_PAGE) {
> +		if (flags & HV_MAP_GPA_LARGE_PAGE)
>  			unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
> -			done <<= large_shift;
> -		}
> -		hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags);
> +		hv_call_unmap_gpa_pages(partition_id, gfn,
> +					done << large_shift, unmap_flags);
>  	}
> 
>  	return ret;
> @@ -305,7 +293,7 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64
> page_count_4k,
>  	struct hv_input_unmap_gpa_pages *input_page;
>  	u64 status, page_count = page_count_4k;
>  	unsigned long irq_flags, large_shift = 0;
> -	int ret = 0, done = 0;
> +	u64 done = 0;
> 
>  	if (page_count == 0)
>  		return -EINVAL;
> @@ -319,8 +307,8 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64
> page_count_4k,
>  	}
> 
>  	while (done < page_count) {
> -		ulong completed, remain = page_count - done;
> -		int rep_count = min(remain, HV_UMAP_GPA_PAGES);
> +		u64 completed, remain = page_count - done;
> +		u64 rep_count = min_t(u64, remain, HV_UMAP_GPA_PAGES);
> 
>  		local_irq_save(irq_flags);
>  		input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
> @@ -333,15 +321,13 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64
> page_count_4k,
>  		local_irq_restore(irq_flags);
> 
>  		completed = hv_repcomp(status);
> -		if (!hv_result_success(status)) {
> -			ret = hv_result_to_errno(status);
> -			break;
> -		}
> -
>  		done += completed;
> +
> +		if (!hv_result_success(status))
> +			return hv_result_to_errno(status);
>  	}
> 
> -	return ret;
> +	return 0;
>  }
> 
>  int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn,
> 
> 


^ permalink raw reply

* RE: [PATCH v3] mshv: Simplify GPA map/unmap hypercall helpers
From: Michael Kelley @ 2026-05-07 18:21 UTC (permalink / raw)
  To: Stanislav Kinsburskii, kys@microsoft.com, haiyangz@microsoft.com,
	wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com
  Cc: linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <177756065245.17889.140699174692055235.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Thursday, April 30, 2026 7:52 AM
> 
> Clean up hv_do_map_gpa_hcall() and hv_call_unmap_gpa_pages() after the
> preceding bug-fix patches:
> 
> Move "done += completed" before the status checks so that pages mapped
> by a partially-successful batch are included in the error cleanup unmap.
> Previously these mappings were leaked on failure.
> 
> While here, improve type safety and readability:
>  - Change "int done" to "u64 done" to match the u64 page_count it is
>    compared against, avoiding signed/unsigned comparison hazards.
>  - Use u64 for loop iteration and batch size variables consistently.
>  - Add proper braces to the for-loop body in hv_do_map_gpa_hcall().
>  - Remove unnecessary "ret" variable from hv_call_unmap_gpa_pages().
>  - Simplify the error-path unmap to use "done << large_shift" directly
>    instead of mutating done in place.
> 
> v3: aligned changes by 80 columns
> v2: replaced min with min_t
> 
> Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to
> VMMs")
> Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
> ---
>  drivers/hv/mshv_root_hv_call.c |   56 +++++++++++++++-------------------------
>  1 file changed, 21 insertions(+), 35 deletions(-)
> 
> diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
> index e5992c324904a..e1f9e28d5a19b 100644
> --- a/drivers/hv/mshv_root_hv_call.c
> +++ b/drivers/hv/mshv_root_hv_call.c
> @@ -195,8 +195,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
>  	struct hv_input_map_gpa_pages *input_page;
>  	u64 status, *pfnlist;
>  	unsigned long irq_flags, large_shift = 0;
> -	int ret = 0, done = 0;
> -	u64 page_count = page_struct_count;
> +	u64 done = 0, page_count = page_struct_count;
> +	int ret = 0;
> 
>  	if (page_count == 0 || (pages && mmio_spa))
>  		return -EINVAL;
> @@ -213,8 +213,8 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
>  	}
> 
>  	while (done < page_count) {
> -		ulong i, completed, remain = page_count - done;
> -		int rep_count = min(remain, HV_MAP_GPA_BATCH_SIZE);
> +		u64 i, completed, remain = page_count - done;
> +		u64 rep_count = min_t(u64, remain, HV_MAP_GPA_BATCH_SIZE);
> 
>  		local_irq_save(irq_flags);
>  		input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
> @@ -224,23 +224,14 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
>  		input_page->map_flags = flags;
>  		pfnlist = input_page->source_gpa_page_list;
> 
> -		for (i = 0; i < rep_count; i++)
> -			if (flags & HV_MAP_GPA_NO_ACCESS) {
> +		for (i = 0; i < rep_count; i++) {
> +			if (flags & HV_MAP_GPA_NO_ACCESS)
>  				pfnlist[i] = 0;
> -			} else if (pages) {
> -				u64 index = (done + i) << large_shift;
> -
> -				if (index >= page_struct_count) {
> -					ret = -EINVAL;
> -					break;
> -				}
> -				pfnlist[i] = page_to_pfn(pages[index]);
> -			} else {
> +			else if (pages)
> +				pfnlist[i] = page_to_pfn(pages[(done + i) <<
> +							 large_shift]);
> +			else
>  				pfnlist[i] = mmio_spa + done + i;
> -			}
> -		if (ret) {
> -			local_irq_restore(irq_flags);
> -			break;
>  		}
> 
>  		status = hv_do_rep_hypercall(HVCALL_MAP_GPA_PAGES, rep_count, 0,
> @@ -248,29 +239,26 @@ static int hv_do_map_gpa_hcall(u64 partition_id, u64 gfn, u64 page_struct_count,
>  		local_irq_restore(irq_flags);
> 
>  		completed = hv_repcomp(status);
> +		done += completed;

A further cleanup: local variable "completed" is only used in these
two statements.  Drop the local variable and just do:

		done += hv_repcomp(status); 

> 
>  		if (hv_result_needs_memory(status)) {
>  			ret = hv_call_deposit_pages(NUMA_NO_NODE, partition_id,
>  						    HV_MAP_GPA_DEPOSIT_PAGES);
>  			if (ret)
>  				break;
> -
>  		} else if (!hv_result_success(status)) {
>  			ret = hv_result_to_errno(status);
>  			break;
>  		}
> -
> -		done += completed;
>  	}
> 
>  	if (ret && done) {
>  		u32 unmap_flags = 0;
> 
> -		if (flags & HV_MAP_GPA_LARGE_PAGE) {
> +		if (flags & HV_MAP_GPA_LARGE_PAGE)
>  			unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
> -			done <<= large_shift;
> -		}
> -		hv_call_unmap_gpa_pages(partition_id, gfn, done, unmap_flags);
> +		hv_call_unmap_gpa_pages(partition_id, gfn,
> +					done << large_shift, unmap_flags);
>  	}
> 
>  	return ret;
> @@ -305,7 +293,7 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
>  	struct hv_input_unmap_gpa_pages *input_page;
>  	u64 status, page_count = page_count_4k;
>  	unsigned long irq_flags, large_shift = 0;
> -	int ret = 0, done = 0;
> +	u64 done = 0;
> 
>  	if (page_count == 0)
>  		return -EINVAL;
> @@ -319,8 +307,8 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
>  	}
> 
>  	while (done < page_count) {
> -		ulong completed, remain = page_count - done;
> -		int rep_count = min(remain, HV_UMAP_GPA_PAGES);
> +		u64 completed, remain = page_count - done;
> +		u64 rep_count = min_t(u64, remain, HV_UMAP_GPA_PAGES);
> 
>  		local_irq_save(irq_flags);
>  		input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
> @@ -333,15 +321,13 @@ int hv_call_unmap_gpa_pages(u64 partition_id, u64 gfn, u64 page_count_4k,
>  		local_irq_restore(irq_flags);
> 
>  		completed = hv_repcomp(status);
> -		if (!hv_result_success(status)) {
> -			ret = hv_result_to_errno(status);
> -			break;
> -		}
> -
>  		done += completed;

Same here. Drop "completed" and just do:

		done += hv_repcomp(status);


> +
> +		if (!hv_result_success(status))
> +			return hv_result_to_errno(status);
>  	}
> 
> -	return ret;
> +	return 0;
>  }
> 
>  int hv_call_get_gpa_access_states(u64 partition_id, u32 count, u64 gpa_base_pfn,
> 
> 

Michael


^ permalink raw reply

* [PATCH v4 18/18] mshv: Fix missing error code on VP allocation failure
From: Stanislav Kinsburskii @ 2026-05-07 15:44 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177816592843.21765.4364464279247150355.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

In mshv_partition_ioctl_create_vp(), when kzalloc for the VP struct
fails, the code jumps to the cleanup path without setting ret. At that
point ret is 0 from the preceding successful mshv_vp_stats_map() call,
so the function returns success to userspace despite having failed to
create the VP. No fd is installed and no VP is registered in pt_vp_array,
but userspace has no way to know the operation failed.

Set ret to -ENOMEM before jumping to the cleanup path.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_root_main.c |    4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 1c18d1c1f7947..03c65ff6a7397 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1189,8 +1189,10 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
 		goto unmap_ghcb_page;
 
 	vp = kzalloc_obj(*vp);
-	if (!vp)
+	if (!vp) {
+		ret = -ENOMEM;
 		goto unmap_stats_pages;
+	}
 
 	vp->vp_partition = mshv_partition_get(partition);
 	if (!vp->vp_partition) {



^ permalink raw reply related

* [PATCH v4 17/18] mshv: Publish VP to pt_vp_array before installing the file descriptor
From: Stanislav Kinsburskii @ 2026-05-07 15:44 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177816592843.21765.4364464279247150355.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

mshv_partition_ioctl_create_vp() called anon_inode_getfd() before
publishing the new VP into partition->pt_vp_array.  anon_inode_getfd()
includes fd_install(), so the fd was live in current->files before the
publish ran.

A concurrent MSHV_RUN_VP ioctl on that fd does not serialise against the
in-progress MSHV_CREATE_VP — it takes vp->vp_mutex, not the partition
mutex.  Once the VP starts running and traps, mshv_intercept_isr() can look
up partition->pt_vp_array[vp_index] and observe NULL, silently dropping the
intercept message.

Split the fd creation: reserve an fd with get_unused_fd_flags(), create the
file with anon_inode_getfile(), publish the VP via smp_store_release(), and
finally call fd_install() as the userspace-visibility commit point.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_root_main.c |   29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index e32f6e0f9f637..1c18d1c1f7947 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1142,6 +1142,8 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
 	struct mshv_vp *vp;
 	struct page *intercept_msg_page, *register_page, *ghcb_page;
 	struct hv_stats_page *stats_pages[2];
+	struct file *file;
+	int fd;
 	long ret;
 
 	if (copy_from_user(&args, arg, sizeof(args)))
@@ -1214,14 +1216,18 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
 	if (ret)
 		goto put_partition;
 
-	/*
-	 * Keep anon_inode_getfd last: it installs fd in the file struct and
-	 * thus makes the state accessible in user space.
-	 */
-	ret = anon_inode_getfd("mshv_vp", &mshv_vp_fops, vp,
-			       O_RDWR | O_CLOEXEC);
-	if (ret < 0)
+	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+	if (fd < 0) {
+		ret = fd;
 		goto remove_debugfs_vp;
+	}
+
+	file = anon_inode_getfile("mshv_vp", &mshv_vp_fops, vp,
+				  O_RDWR | O_CLOEXEC);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto put_unused_vp_fd;
+	}
 
 	/* already exclusive with the partition mutex for all ioctls */
 	partition->pt_vp_count++;
@@ -1233,8 +1239,17 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
 	 */
 	smp_store_release(&partition->pt_vp_array[args.vp_index], vp);
 
+	/*
+	 * fd_install() is the userspace-visibility commit point.  Must be the
+	 * last operation that can fail or be observed.
+	 */
+	fd_install(fd, file);
+	ret = fd;
+
 	goto out;
 
+put_unused_vp_fd:
+	put_unused_fd(fd);
 remove_debugfs_vp:
 	mshv_debugfs_vp_remove(vp);
 put_partition:



^ permalink raw reply related

* [PATCH v4 16/18] mshv: Validate scheduler message bounds from hypervisor
From: Stanislav Kinsburskii @ 2026-05-07 15:44 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177816592843.21765.4364464279247150355.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

handle_pair_message() iterates up to msg->vp_count without verifying it
against HV_MESSAGE_MAX_PARTITION_VP_PAIR_COUNT. Since vp_count is read
from untrusted hypervisor data, a malformed message with a large value
would cause out-of-bounds reads from the partition_ids and vp_indexes
arrays.

handle_bitset_message() iterates over set bits in valid_bank_mask (up to
64) and advances bank_contents for each one. However, the payload buffer
only has space for 16 bank entries. A valid_bank_mask with more than 16
bits set causes bank_contents to read beyond the message buffer.

Fix both by adding bounds validation:
- Clamp vp_count to HV_MESSAGE_MAX_PARTITION_VP_PAIR_COUNT
- Track banks consumed and stop before exceeding buffer capacity

Fixes: 621191d709b1 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_synic.c |   20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
index 89207aad7cf1f..5d509299f14d7 100644
--- a/drivers/hv/mshv_synic.c
+++ b/drivers/hv/mshv_synic.c
@@ -190,7 +190,9 @@ static void kick_vp(struct mshv_vp *vp)
 static void
 handle_bitset_message(const struct hv_vp_signal_bitset_scheduler_message *msg)
 {
-	int bank_idx, vps_signaled = 0, bank_mask_size;
+	int bank_idx, vps_signaled = 0, bank_mask_size, banks_used = 0;
+	const int max_banks = sizeof(msg->vp_bitset.bitset_buffer) /
+			      sizeof(u64) - 2; /* subtract format + mask */
 	struct mshv_partition *partition;
 	const struct hv_vpset *vpset;
 	const u64 *bank_contents;
@@ -230,6 +232,11 @@ handle_bitset_message(const struct hv_vp_signal_bitset_scheduler_message *msg)
 		if (bank_idx == bank_mask_size)
 			break;
 
+		if (unlikely(banks_used >= max_banks)) {
+			pr_debug("valid_bank_mask exceeds buffer capacity\n");
+			goto unlock_out;
+		}
+
 		while (true) {
 			struct mshv_vp *vp;
 
@@ -258,6 +265,7 @@ handle_bitset_message(const struct hv_vp_signal_bitset_scheduler_message *msg)
 		}
 
 		bank_contents++;
+		banks_used++;
 	}
 
 unlock_out:
@@ -274,10 +282,18 @@ handle_pair_message(const struct hv_vp_signal_pair_scheduler_message *msg)
 	struct mshv_partition *partition = NULL;
 	struct mshv_vp *vp;
 	int idx;
+	u8 vp_count = msg->vp_count;
+
+	if (unlikely(vp_count > HV_MESSAGE_MAX_PARTITION_VP_PAIR_COUNT)) {
+		pr_debug("pair message vp_count %u exceeds max %lu\n",
+			 vp_count,
+			 (unsigned long)HV_MESSAGE_MAX_PARTITION_VP_PAIR_COUNT);
+		return;
+	}
 
 	rcu_read_lock();
 
-	for (idx = 0; idx < msg->vp_count; idx++) {
+	for (idx = 0; idx < vp_count; idx++) {
 		u64 partition_id = msg->partition_ids[idx];
 		u32 vp_index = msg->vp_indexes[idx];
 



^ permalink raw reply related

* [PATCH v4 15/18] mshv: Defer mshv_vp free to an RCU grace period
From: Stanislav Kinsburskii @ 2026-05-07 15:44 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177816592843.21765.4364464279247150355.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

destroy_partition() frees mshv_vp with plain kfree() while ISR readers
walk pt_vp_array[] under rcu_read_lock().  On non-root schedulers,
where drain_all_vps() does not run, an in-flight intercept ISR can
observe a non-NULL pt_vp_array slot and dereference freed memory in
kick_vp().  On the root scheduler the same race exists in a narrower
form: drain_vp_signals() synchronises on kick_vp()'s kicked_by_hv flag
but not on its wake_up() tail, so the wait-queue lock embedded in vp
can still be held when destroy_partition() reaches kfree(vp).

Add struct rcu_head vp_rcu to struct mshv_vp, clear the pt_vp_array
slot before the free, and use kfree_rcu() so the actual kfree happens
after a grace period.  drain_all_vps() is retained because it serves a
separate purpose (telling the hypervisor to stop signalling and
reconciling signal counts) that kfree_rcu() does not address.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_root.h      |    1 +
 drivers/hv/mshv_root_main.c |    5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index b6961a6d9a98b..e19a84ea07905 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -35,6 +35,7 @@ static_assert(HV_HYP_PAGE_SIZE == MSHV_HV_PAGE_SIZE);
 #define MSHV_PIN_PAGES_BATCH_SIZE	(0x10000000ULL / HV_HYP_PAGE_SIZE)

 struct mshv_vp {
+	struct rcu_head vp_rcu;
 	u32 vp_index;
 	struct mshv_partition *vp_partition;
 	struct mutex vp_mutex;
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 381aa86c5b90e..e32f6e0f9f637 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -30,6 +30,7 @@
 #include <linux/panic_notifier.h>
 #include <linux/vmalloc.h>
 #include <linux/rseq.h>
+#include <linux/rcupdate.h>

 #include "mshv_eventfd.h"
 #include "mshv.h"
@@ -1915,9 +1916,9 @@ static void destroy_partition(struct mshv_partition *partition)
 				vp->vp_ghcb_page = NULL;
 			}

-			kfree(vp);
-
 			partition->pt_vp_array[i] = NULL;
+
+			kfree_rcu(vp, vp_rcu);
 		}

 		mshv_debugfs_partition_remove(partition);

^ permalink raw reply related

* [PATCH v4 14/18] mshv: Order pt_vp_array publish against irqfd assertion path
From: Stanislav Kinsburskii @ 2026-05-07 15:44 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177816592843.21765.4364464279247150355.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

mshv_partition_ioctl_create_vp() initialises a VP struct (allocations,
mutex_init, init_waitqueue_head, page mappings) and then publishes the
pointer into partition->pt_vp_array.  Several ISR paths read this array
locklessly: the intercept ISR, the two scheduler ISRs, and
mshv_try_assert_irq_fast() on the irqfd fast path.

Of these, only mshv_try_assert_irq_fast() can structurally race the
publish.  It runs from an eventfd waker without holding pt_mutex, and
MSHV_IRQFD does not require the target lapic_apic_id (== vp_index) to
refer to an existing VP at registration time.  A user can therefore
register an irqfd targeting a yet-to-be-created VP, then trigger
mshv_try_assert_irq_fast() concurrently with MSHV_CREATE_VP for the
same index.  On weakly-ordered architectures the reader can observe a
non-NULL pointer in pt_vp_array before the initialising stores to the
VP struct become visible, leading to use of partially-initialised
fields (e.g. vp_register_page).

The other ISR readers cannot reach this race: the hypervisor will not
generate intercept or scheduler messages for a VP that has never been
told to run, and the user can only call MSHV_RUN_VP on the VP fd
returned by MSHV_CREATE_VP, which by construction is returned after
the publish.  Leave those readers as plain loads.

Use smp_store_release() in mshv_partition_ioctl_create_vp() to publish
the pointer, and pair it with smp_load_acquire() in
mshv_try_assert_irq_fast().  On x86 these compile to plain accesses
under TSO; on ARM64 they emit one-instruction acquire/release barriers,
acceptable on this fast path.

The destroy-side path (destroy_partition() clearing pt_vp_array[i] to
NULL after kfree(vp)) has a separate ordering and lifetime concern
that is out of scope here.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_eventfd.c   |    9 ++++++++-
 drivers/hv/mshv_root_main.c |    8 +++++++-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 11a6006f80194..5f0dd243e1445 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -197,7 +197,14 @@ static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd,
 	if (irq->lapic_apic_id >= MSHV_MAX_VPS)
 		return -EINVAL;

-	vp = partition->pt_vp_array[irq->lapic_apic_id];
+	/*
+	 * Pairs with smp_store_release() in mshv_partition_ioctl_create_vp().
+	 * MSHV_IRQFD does not require the target lapic_apic_id to refer to an
+	 * existing VP, so this read can race a concurrent VP creation; the
+	 * acquire ensures that a non-NULL pointer implies the VP's
+	 * initialising stores are visible.
+	 */
+	vp = smp_load_acquire(&partition->pt_vp_array[irq->lapic_apic_id]);
 	if (!vp)
 		return -EINVAL;

diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 7e4252b6bc65c..381aa86c5b90e 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1224,7 +1224,13 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,

 	/* already exclusive with the partition mutex for all ioctls */
 	partition->pt_vp_count++;
-	partition->pt_vp_array[args.vp_index] = vp;
+	/*
+	 * Pairs with smp_load_acquire() in mshv_try_assert_irq_fast(), which
+	 * can run concurrently from an irqfd waker without holding pt_mutex.
+	 * The release ensures the VP's initialising stores are visible to any
+	 * reader that observes a non-NULL pointer in pt_vp_array.
+	 */
+	smp_store_release(&partition->pt_vp_array[args.vp_index], vp);

 	goto out;

^ permalink raw reply related

* [PATCH v4 13/18] mshv: Add missing vp_index bounds check in intercept ISR
From: Stanislav Kinsburskii @ 2026-05-07 15:44 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177816592843.21765.4364464279247150355.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

mshv_intercept_isr() reads vp_index from the SynIC intercept message
payload and uses it directly to index into partition->pt_vp_array without
validating that vp_index < MSHV_MAX_VPS.

Mshv treats the Microsoft Hypervisor as trusted, so a malformed vp_index is
not a security concern; the threat model does not include a malicious
hypervisor. A hypervisor bug that placed an out-of-range value here would,
however, cause an out-of-bounds read of pt_vp_array in hardirq context,
manifesting as random memory corruption or a host crash with no clear
signal pointing back to the hypervisor as the source.

handle_bitset_message() and handle_pair_message() already perform this
defensive check on hypervisor-supplied vp_index values, with an explicit
"This shouldn't happen, but just in case" comment.  Add the same check to
mshv_intercept_isr() for consistency, turning a potential silent corruption
into a debuggable pr_debug message.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_synic.c |    5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
index bac890cd2b468..89207aad7cf1f 100644
--- a/drivers/hv/mshv_synic.c
+++ b/drivers/hv/mshv_synic.c
@@ -387,6 +387,11 @@ mshv_intercept_isr(struct hv_message *msg)
 	 */
 	vp_index =
 	       ((struct hv_opaque_intercept_message *)msg->u.payload)->vp_index;
+	/* This shouldn't happen, but just in case. */
+	if (unlikely(vp_index >= MSHV_MAX_VPS)) {
+		pr_debug("VP index %u out of bounds\n", vp_index);
+		goto unlock_out;
+	}
 	vp = partition->pt_vp_array[vp_index];
 	if (unlikely(!vp)) {
 		pr_debug("failed to find VP %u\n", vp_index);

^ permalink raw reply related

* [PATCH v4 12/18] mshv: Use kfree_rcu in mshv_portid_free
From: Stanislav Kinsburskii @ 2026-05-07 15:44 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177816592843.21765.4364464279247150355.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

mshv_portid_free() uses synchronize_rcu() followed by kfree() to
reclaim port table entries. This blocks the caller until a full RCU
grace period elapses, which is unnecessary since the same module already
uses the non-blocking kfree_rcu() pattern in mshv_port_table_fini().

Replace with kfree_rcu() to avoid the blocking wait and keep the
reclamation strategy consistent across the file.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_portid_table.c |    3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/hv/mshv_portid_table.c b/drivers/hv/mshv_portid_table.c
index d87a82e399e96..42d21b92b88fd 100644
--- a/drivers/hv/mshv_portid_table.c
+++ b/drivers/hv/mshv_portid_table.c
@@ -62,8 +62,7 @@ mshv_portid_free(int port_id)
 	WARN_ON(!info);
 	idr_unlock(&port_table_idr);

-	synchronize_rcu();
-	kfree(info);
+	kfree_rcu(info, portbl_rcu);
 }

 /*

^ permalink raw reply related

* [PATCH v4 11/18] mshv: Fix sleeping under spinlock in mshv_portid_alloc
From: Stanislav Kinsburskii @ 2026-05-07 15:43 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177816592843.21765.4364464279247150355.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

idr_alloc() is called with GFP_KERNEL inside idr_lock(), which holds a
spinlock. GFP_KERNEL allows the allocator to sleep, triggering a
sleeping-while-atomic bug.

Fix by using idr_preload(GFP_KERNEL) before taking the lock to
pre-allocate memory in a sleepable context, then idr_alloc() with
GFP_NOWAIT inside the spinlock-protected section.

Fixes: 621191d709b1 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_portid_table.c |    6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/hv/mshv_portid_table.c b/drivers/hv/mshv_portid_table.c
index 4cdf8e9575390..d87a82e399e96 100644
--- a/drivers/hv/mshv_portid_table.c
+++ b/drivers/hv/mshv_portid_table.c
@@ -40,12 +40,14 @@ mshv_port_table_fini(void)
 int
 mshv_portid_alloc(struct port_table_info *info)
 {
-	int ret = 0;
+	int ret;
 
+	idr_preload(GFP_KERNEL);
 	idr_lock(&port_table_idr);
 	ret = idr_alloc(&port_table_idr, info, PORTID_MIN,
-			PORTID_MAX, GFP_KERNEL);
+			PORTID_MAX, GFP_NOWAIT);
 	idr_unlock(&port_table_idr);
+	idr_preload_end();
 
 	return ret;
 }



^ permalink raw reply related

* [PATCH v4 10/18] mshv: portid_table: Make mshv_portid_lookup() RCU-aware by contract
From: Stanislav Kinsburskii @ 2026-05-07 15:43 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177816592843.21765.4364464279247150355.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

mshv_portid_lookup() previously took rcu_read_lock() internally, ran
idr_find(), released the read lock, and copied the struct contents
into a caller-supplied buffer.  This had two problems.

1. The struct copy ran outside the read section, racing with
   mshv_portid_free() which does idr_remove + synchronize_rcu + kfree.
   Because the copy was not covered by any read-side critical section,
   synchronize_rcu() could see the readers as already drained and
   return while the copy was still in progress, letting the copy read
   memory that the writer had already kfree()'d.

2. The only consumer, mshv_doorbell_isr(), then dispatched a callback
   using fields of the snapshot — entirely outside any RCU read
   section.  The callback's data argument and any field it touches
   were therefore safe only because mshv_isr() runs from
   sysvec_hyperv_callback, a non-threaded system vector that
   synchronize_rcu() implicitly waits for via the hardirq quiescent-
   state coupling.  That protection is real today but undocumented and
   fragile: a future move of mshv_isr() to a threaded context, or a
   future caller that registers a doorbell with a shorter-lived data
   pointer, would silently expose a use-after-free.

Make the contract explicit instead of implicit.  mshv_portid_lookup()
now returns a pointer to the table entry and requires the caller to
hold rcu_read_lock for the entire lifetime of that pointer.  The
contract is annotated with __must_hold(RCU) so sparse flags any
direct caller that forgets it.  The sole caller, mshv_doorbell_isr(),
takes rcu_read_lock around the whole drain loop, so the lookup, the
field reads, and the doorbell_cb dispatch all run inside one
read-side critical section.  synchronize_rcu() in mshv_portid_free()
now genuinely waits for any in-flight callback before kfree() runs,
without relying on hardirq context for correctness.

This also drops the by-value struct copy: entries are publish-once
(populated before idr_alloc) and free-once (after synchronize_rcu),
so a pointer dereferenced inside the read section gives a stable
view of the contents without copying.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_portid_table.c |   22 +++++++---------------
 drivers/hv/mshv_root.h         |    2 +-
 drivers/hv/mshv_synic.c        |   15 +++++++++------
 3 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/drivers/hv/mshv_portid_table.c b/drivers/hv/mshv_portid_table.c
index c349af1f0aaac..4cdf8e9575390 100644
--- a/drivers/hv/mshv_portid_table.c
+++ b/drivers/hv/mshv_portid_table.c
@@ -64,20 +64,12 @@ mshv_portid_free(int port_id)
 	kfree(info);
 }

-int
-mshv_portid_lookup(int port_id, struct port_table_info *info)
+/*
+ * Caller must hold rcu_read_lock for the entire lifetime of the
+ * returned pointer.  Returns NULL if @port_id is not in the table.
+ */
+struct port_table_info *mshv_portid_lookup(int port_id)
+	__must_hold(RCU)
 {
-	struct port_table_info *_info;
-	int ret = -ENOENT;
-
-	rcu_read_lock();
-	_info = idr_find(&port_table_idr, port_id);
-	rcu_read_unlock();
-
-	if (_info) {
-		*info = *_info;
-		ret = 0;
-	}
-
-	return ret;
+	return idr_find(&port_table_idr, port_id);
 }
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index 2e6c4414740cc..b6961a6d9a98b 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -241,7 +241,7 @@ void mshv_irqfd_routing_update(struct mshv_partition *partition);

 void mshv_port_table_fini(void);
 int mshv_portid_alloc(struct port_table_info *info);
-int mshv_portid_lookup(int port_id, struct port_table_info *info);
+struct port_table_info *mshv_portid_lookup(int port_id) __must_hold(RCU);
 void mshv_portid_free(int port_id);

 int mshv_register_doorbell(u64 partition_id, doorbell_cb_t doorbell_cb,
diff --git a/drivers/hv/mshv_synic.c b/drivers/hv/mshv_synic.c
index 43f1bcbbf2d34..bac890cd2b468 100644
--- a/drivers/hv/mshv_synic.c
+++ b/drivers/hv/mshv_synic.c
@@ -114,24 +114,27 @@ mshv_doorbell_isr(struct hv_message *msg)
 	if (notification->sint_index != HV_SYNIC_DOORBELL_SINT_INDEX)
 		return false;

+	rcu_read_lock();
 	while ((port = synic_event_ring_get_queued_port(HV_SYNIC_DOORBELL_SINT_INDEX))) {
-		struct port_table_info ptinfo = { 0 };
+		struct port_table_info *ptinfo;

-		if (mshv_portid_lookup(port, &ptinfo)) {
+		ptinfo = mshv_portid_lookup(port);
+		if (!ptinfo) {
 			pr_debug("Failed to get port info from port_table!\n");
 			continue;
 		}

-		if (ptinfo.hv_port_type != HV_PORT_TYPE_DOORBELL) {
+		if (ptinfo->hv_port_type != HV_PORT_TYPE_DOORBELL) {
 			pr_debug("Not a doorbell port!, port: %d, port_type: %d\n",
-				 port, ptinfo.hv_port_type);
+				 port, ptinfo->hv_port_type);
 			continue;
 		}

 		/* Invoke the callback */
-		ptinfo.hv_port_doorbell.doorbell_cb(port,
-						 ptinfo.hv_port_doorbell.data);
+		ptinfo->hv_port_doorbell.doorbell_cb(port,
+						 ptinfo->hv_port_doorbell.data);
 	}
+	rcu_read_unlock();

 	return true;
 }

^ permalink raw reply related

* [PATCH v4 09/18] mshv: Fix duplicate GSI detection for GSI 0
From: Stanislav Kinsburskii @ 2026-05-07 15:43 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177816592843.21765.4364464279247150355.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

The duplicate routing entry check in mshv_update_routing_table() uses
guest_irq_num != 0 to detect whether a GSI slot is already occupied.
This fails for GSI 0 because its guest_irq_num is 0 both when the slot
is unused (zero-initialized) and when legitimately assigned. As a
result, duplicate entries for GSI 0 are silently accepted, with the
second entry overwriting the first — corrupting the routing table
without any error reported to userspace.

While GSI 0 (legacy timer) is unlikely to appear in MSI-based routing
in practice, the check is semantically wrong — it conflates
"uninitialized" with "GSI number 0." Use girq_entry_valid instead,
which is explicitly set to true when an entry is populated and remains
zero for unused slots regardless of the GSI number.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_irq.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/hv/mshv_irq.c b/drivers/hv/mshv_irq.c
index 59584a132ca9f..db05512db5548 100644
--- a/drivers/hv/mshv_irq.c
+++ b/drivers/hv/mshv_irq.c
@@ -88,7 +88,7 @@ int mshv_update_routing_table(struct mshv_partition *partition,
 		/*
 		 * Allow only one to one mapping between GSI and MSI routing.
 		 */
-		if (girq->guest_irq_num != 0) {
+		if (girq->girq_entry_valid) {
 			r = -EINVAL;
 			goto out;
 		}

^ permalink raw reply related

* [PATCH v4 08/18] mshv: Fix level-triggered check on uninitialized data
From: Stanislav Kinsburskii @ 2026-05-07 15:43 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177816592843.21765.4364464279247150355.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

In mshv_irqfd_assign(), the level-triggered validation for resample
irqfds checks irqfd_lapic_irq.lapic_control.level_triggered before
mshv_irqfd_update() has populated the field. Since the irqfd struct is
zero-allocated, level_triggered is always 0 at that point, causing the
check to always reject resample irqfds with -EINVAL. This makes
level-triggered interrupt resampling — used to avoid interrupt storms
with assigned devices — completely non-functional.

Move the check after the mshv_irqfd_update() call, which resolves the
IRQ routing entry and populates irqfd_lapic_irq with the actual trigger
mode.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_eventfd.c |   25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index c24069dff9702..11a6006f80194 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -491,6 +491,19 @@ static int mshv_irqfd_assign(struct mshv_partition *pt,
 	init_poll_funcptr(&irqfd->irqfd_polltbl, mshv_irqfd_queue_proc);
 
 	spin_lock_irq(&pt->pt_irqfds_lock);
+	ret = 0;
+	hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) {
+		if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx)
+			continue;
+		/* This fd is used for another irq already. */
+		ret = -EBUSY;
+		spin_unlock_irq(&pt->pt_irqfds_lock);
+		goto fail;
+	}
+
+	idx = srcu_read_lock(&pt->pt_irq_srcu);
+	mshv_irqfd_update(pt, irqfd);
+
 #if IS_ENABLED(CONFIG_X86)
 	if (args->flags & BIT(MSHV_IRQFD_BIT_RESAMPLE) &&
 	    !irqfd->irqfd_lapic_irq.lapic_control.level_triggered) {
@@ -499,22 +512,12 @@ static int mshv_irqfd_assign(struct mshv_partition *pt,
 		 * Otherwise return with failure
 		 */
 		spin_unlock_irq(&pt->pt_irqfds_lock);
+		srcu_read_unlock(&pt->pt_irq_srcu, idx);
 		ret = -EINVAL;
 		goto fail;
 	}
 #endif
-	ret = 0;
-	hlist_for_each_entry(tmp, &pt->pt_irqfds_list, irqfd_hnode) {
-		if (irqfd->irqfd_eventfd_ctx != tmp->irqfd_eventfd_ctx)
-			continue;
-		/* This fd is used for another irq already. */
-		ret = -EBUSY;
-		spin_unlock_irq(&pt->pt_irqfds_lock);
-		goto fail;
-	}
 
-	idx = srcu_read_lock(&pt->pt_irq_srcu);
-	mshv_irqfd_update(pt, irqfd);
 	hlist_add_head(&irqfd->irqfd_hnode, &pt->pt_irqfds_list);
 	spin_unlock_irq(&pt->pt_irqfds_lock);
 



^ permalink raw reply related

* [PATCH v4 07/18] mshv: Consolidate irqfd interrupt injection paths
From: Stanislav Kinsburskii @ 2026-05-07 15:43 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177816592843.21765.4364464279247150355.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

irqfd interrupt injection had divergent seqcount snapshot scaffolding in
three places, and inconsistent validity checks between the fast and slow
assert paths:

1. mshv_irqfd_wakeup() snapshotted irqfd_lapic_irq under seqcount, then on
fast-path failure called mshv_assert_irq_slow(), which re-snapshotted both
irqfd_girq_ent and irqfd_lapic_irq under seqcount again — wasteful and
duplicative.

2. The girq_entry_valid check only existed in the slow path.  The fast path
would happily accept a zero-initialised mshv_lapic_irq when routing was not
yet configured for the GSI, potentially injecting vector 0 to VP 0.

3. The slow path's validity predicate was 'guest_irq_num &&
!girq_entry_valid', which short-circuits for GSI 0 (guest_irq_num == 0) and
so bypasses the validity check entirely for that GSI.

4. mshv_irqfd_resampler_ack() read irqfd_lapic_irq.lapic_control without
seqcount protection, which could observe a stale or transient value while
mshv_irqfd_update() was concurrently rewriting irqfd_lapic_irq via
mshv_copy_girq_info()'s memset-and-fill sequence.

Introduce mshv_irqfd_snapshot() that takes a consistent snapshot of both
irqfd_girq_ent and irqfd_lapic_irq inside the seqcount loop; girq_ent is
optional so the resampler ack path can snapshot only the LAPIC IRQ.

Use the helper from mshv_irqfd_resampler_ack() (closes 4),
mshv_irqfd_wakeup() and mshv_irqfd_assign()'s EPOLLIN replay path,
replacing the three ad-hoc seqcount loops.

Move the validity check into mshv_irqfd_wakeup() before either injection
path runs, so the fast path no longer accepts an unrouted irqfd (closes
2).  Use !girq_entry_valid as the condition (closes 3).  Change
mshv_assert_irq_slow() to take a pre-snapshotted const struct
mshv_lapic_irq pointer, eliminating its internal seqcount and SRCU
scaffolding (closes 1).

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_eventfd.c |   90 ++++++++++++++++++++++++++-------------------
 1 file changed, 52 insertions(+), 38 deletions(-)

diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 25bdc5e678849..c24069dff9702 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -74,6 +74,27 @@ static inline bool hv_should_clear_interrupt(enum hv_interrupt_type type)
 }
 #endif
 
+/*
+ * Snapshot per-irqfd routing state under seqcount protection so callers
+ * see a consistent point-in-time view of irqfd_girq_ent and
+ * irqfd_lapic_irq even if mshv_irqfd_update() runs concurrently.
+ *
+ * @girq_ent may be NULL when the caller only needs the LAPIC IRQ.
+ */
+static void mshv_irqfd_snapshot(struct mshv_irqfd *irqfd,
+				struct mshv_guest_irq_ent *girq_ent,
+				struct mshv_lapic_irq *irq)
+{
+	unsigned int seq;
+
+	do {
+		seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
+		if (girq_ent)
+			*girq_ent = irqfd->irqfd_girq_ent;
+		*irq = irqfd->irqfd_lapic_irq;
+	} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
+}
+
 static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian)
 {
 	struct mshv_irqfd_resampler *resampler;
@@ -90,7 +111,11 @@ static void mshv_irqfd_resampler_ack(struct mshv_irq_ack_notifier *mian)
 	hlist_for_each_entry_srcu(irqfd, &resampler->rsmplr_irqfd_list,
 				 irqfd_resampler_hnode,
 				 srcu_read_lock_held(&partition->pt_irq_srcu)) {
-		if (hv_should_clear_interrupt(irqfd->irqfd_lapic_irq.lapic_control.interrupt_type))
+		struct mshv_lapic_irq irq;
+
+		mshv_irqfd_snapshot(irqfd, NULL, &irq);
+
+		if (hv_should_clear_interrupt(irq.lapic_control.interrupt_type))
 			hv_call_clear_virtual_interrupt(partition->pt_id);
 
 		eventfd_signal(irqfd->irqfd_resamplefd);
@@ -198,37 +223,14 @@ static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd,
 }
 #endif
 
-static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd)
+static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd,
+				 const struct mshv_lapic_irq *irq)
 {
 	struct mshv_partition *partition = irqfd->irqfd_partn;
-	struct mshv_guest_irq_ent girq_ent;
-	struct mshv_lapic_irq irq;
-	unsigned int seq;
-	int idx;
-
-	idx = srcu_read_lock(&partition->pt_irq_srcu);
-
-	do {
-		seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
-		girq_ent = irqfd->irqfd_girq_ent;
-		irq = irqfd->irqfd_lapic_irq;
-	} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
-
-#if IS_ENABLED(CONFIG_X86)
-	WARN_ON(irqfd->irqfd_resampler &&
-		!irq.lapic_control.level_triggered);
-#endif
-
-	if (girq_ent.guest_irq_num && !girq_ent.girq_entry_valid) {
-		srcu_read_unlock(&partition->pt_irq_srcu, idx);
-		return;
-	}
 
 	hv_call_assert_virtual_interrupt(partition->pt_id,
-					 irq.lapic_vector, irq.lapic_apic_id,
-					 irq.lapic_control);
-
-	srcu_read_unlock(&partition->pt_irq_srcu, idx);
+					 irq->lapic_vector, irq->lapic_apic_id,
+					 irq->lapic_control);
 }
 
 static void mshv_irqfd_resampler_shutdown(struct mshv_irqfd *irqfd)
@@ -308,26 +310,31 @@ static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
 						irqfd_wait);
 	__poll_t flags = key_to_poll(key);
 	int idx;
-	unsigned int seq;
 	struct mshv_partition *pt = irqfd->irqfd_partn;
 	int ret = 0;
 
 	if (flags & EPOLLIN) {
+		struct mshv_guest_irq_ent girq_ent;
 		struct mshv_lapic_irq irq;
 		u64 cnt;
 
 		eventfd_ctx_do_read(irqfd->irqfd_eventfd_ctx, &cnt);
 		idx = srcu_read_lock(&pt->pt_irq_srcu);
-		do {
-			seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
-			irq = irqfd->irqfd_lapic_irq;
-		} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
+		mshv_irqfd_snapshot(irqfd, &girq_ent, &irq);
+
+		if (!girq_ent.girq_entry_valid)
+			goto out_unlock;
+
+#if IS_ENABLED(CONFIG_X86)
+		WARN_ON(irqfd->irqfd_resampler &&
+			!irq.lapic_control.level_triggered);
+#endif
 
 		/* An event has been signaled, raise an interrupt */
-		ret = mshv_try_assert_irq_fast(irqfd, &irq);
-		if (ret)
-			mshv_assert_irq_slow(irqfd);
+		if (mshv_try_assert_irq_fast(irqfd, &irq))
+			mshv_assert_irq_slow(irqfd, &irq);
 
+out_unlock:
 		srcu_read_unlock(&pt->pt_irq_srcu, idx);
 
 		ret = 1;
@@ -517,8 +524,15 @@ static int mshv_irqfd_assign(struct mshv_partition *pt,
 	 */
 	events = vfs_poll(fd_file(f), &irqfd->irqfd_polltbl);
 
-	if (events & EPOLLIN)
-		mshv_assert_irq_slow(irqfd);
+	if (events & EPOLLIN) {
+		struct mshv_guest_irq_ent girq_ent;
+		struct mshv_lapic_irq irq;
+
+		mshv_irqfd_snapshot(irqfd, &girq_ent, &irq);
+
+		if (girq_ent.girq_entry_valid)
+			mshv_assert_irq_slow(irqfd, &irq);
+	}
 
 	srcu_read_unlock(&pt->pt_irq_srcu, idx);
 	return 0;



^ permalink raw reply related

* [PATCH v4 06/18] mshv: Fix broken seqcount read protection
From: Stanislav Kinsburskii @ 2026-05-07 15:43 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177816592843.21765.4364464279247150355.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

mshv_irqfd_update() writes both irqfd_girq_ent and irqfd_lapic_irq as a
logical unit under seqcount write protection.  Readers must snapshot these
fields inside the seqcount begin/retry loop to obtain a consistent
point-in-time view; otherwise a concurrent update can produce a torn read
where one field comes from the old state and the other from the new.

Both mshv_assert_irq_slow() and mshv_irqfd_wakeup() got this wrong: the
seqcount loop bodies were empty (just spinning until a stable sequence was
observed), and all reads of the protected fields happened after the loop
with no protection from concurrent writes. If mshv_irqfd_update() races
with interrupt assertion, the caller may use a stale or mixed
vector/apic_id/control combination, delivering an interrupt to the wrong
vCPU, with the wrong vector, or with the wrong trigger mode.  This can
cause spurious or lost interrupts in the guest, or a stuck interrupt line
in the level-triggered case.

Fix mshv_assert_irq_slow() by snapshotting both irqfd_girq_ent and
irqfd_lapic_irq into local variables inside the seqcount loop, then using
those locals for the validity check, the resampler WARN_ON() and the
hypercall.  Reorder the function so the seqcount loop runs first and every
subsequent read of the protected fields is satisfied from the snapshot.

Fix mshv_irqfd_wakeup() by snapshotting irqfd_lapic_irq inside its seqcount
loop and passing the snapshot to mshv_try_assert_irq_fast(), so the fast
path operates on the consistent copy rather than reading the field directly
outside seqcount protection.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_eventfd.c |   44 +++++++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index b398e58411dd7..25bdc5e678849 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -151,10 +151,10 @@ static int mshv_vp_irq_set_vector(struct mshv_vp *vp, u32 vector)
  * Try to raise irq for guest via shared vector array. hyp does the actual
  * inject of the interrupt.
  */
-static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
+static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd,
+				    const struct mshv_lapic_irq *irq)
 {
 	struct mshv_partition *partition = irqfd->irqfd_partn;
-	struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
 	struct mshv_vp *vp;
 
 	if (!(ms_hyperv.ext_features &
@@ -191,7 +191,8 @@ static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
 	return 0;
 }
 #else /* CONFIG_X86_64 */
-static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
+static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd,
+				    const struct mshv_lapic_irq *irq)
 {
 	return -EOPNOTSUPP;
 }
@@ -200,30 +201,33 @@ static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
 static void mshv_assert_irq_slow(struct mshv_irqfd *irqfd)
 {
 	struct mshv_partition *partition = irqfd->irqfd_partn;
-	struct mshv_lapic_irq *irq = &irqfd->irqfd_lapic_irq;
+	struct mshv_guest_irq_ent girq_ent;
+	struct mshv_lapic_irq irq;
 	unsigned int seq;
 	int idx;
 
+	idx = srcu_read_lock(&partition->pt_irq_srcu);
+
+	do {
+		seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
+		girq_ent = irqfd->irqfd_girq_ent;
+		irq = irqfd->irqfd_lapic_irq;
+	} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
+
 #if IS_ENABLED(CONFIG_X86)
 	WARN_ON(irqfd->irqfd_resampler &&
-		!irq->lapic_control.level_triggered);
+		!irq.lapic_control.level_triggered);
 #endif
 
-	idx = srcu_read_lock(&partition->pt_irq_srcu);
-	if (irqfd->irqfd_girq_ent.guest_irq_num) {
-		if (!irqfd->irqfd_girq_ent.girq_entry_valid) {
-			srcu_read_unlock(&partition->pt_irq_srcu, idx);
-			return;
-		}
-
-		do {
-			seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
-		} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
+	if (girq_ent.guest_irq_num && !girq_ent.girq_entry_valid) {
+		srcu_read_unlock(&partition->pt_irq_srcu, idx);
+		return;
 	}
 
-	hv_call_assert_virtual_interrupt(irqfd->irqfd_partn->pt_id,
-					 irq->lapic_vector, irq->lapic_apic_id,
-					 irq->lapic_control);
+	hv_call_assert_virtual_interrupt(partition->pt_id,
+					 irq.lapic_vector, irq.lapic_apic_id,
+					 irq.lapic_control);
+
 	srcu_read_unlock(&partition->pt_irq_srcu, idx);
 }
 
@@ -309,16 +313,18 @@ static int mshv_irqfd_wakeup(wait_queue_entry_t *wait, unsigned int mode,
 	int ret = 0;
 
 	if (flags & EPOLLIN) {
+		struct mshv_lapic_irq irq;
 		u64 cnt;
 
 		eventfd_ctx_do_read(irqfd->irqfd_eventfd_ctx, &cnt);
 		idx = srcu_read_lock(&pt->pt_irq_srcu);
 		do {
 			seq = read_seqcount_begin(&irqfd->irqfd_irqe_sc);
+			irq = irqfd->irqfd_lapic_irq;
 		} while (read_seqcount_retry(&irqfd->irqfd_irqe_sc, seq));
 
 		/* An event has been signaled, raise an interrupt */
-		ret = mshv_try_assert_irq_fast(irqfd);
+		ret = mshv_try_assert_irq_fast(irqfd, &irq);
 		if (ret)
 			mshv_assert_irq_slow(irqfd);
 



^ permalink raw reply related

* [PATCH v4 05/18] mshv: irqfd: Reject routing updates that invalidate resampler binding
From: Stanislav Kinsburskii @ 2026-05-07 15:43 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177816592843.21765.4364464279247150355.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

A resampler-bound irqfd is only meaningful for level-triggered interrupts,
because the resampler signals userspace on guest EOI and edge-triggered
interrupts have no EOI to react to.  mshv_irqfd_assign() already rejects
the combination at registration time, but nothing prevented a subsequent
MSHV_SET_MSI_ROUTING ioctl from flipping the cached
lapic_control.level_triggered bit to edge-triggered while the resampler was
still attached. Once that happened, the WARN_ON() in mshv_assert_irq_slow()
was the only signal of an inconsistency the kernel had no way to recover
from: the resampler stayed wired in but no EOI ever arrived, so userspace
never saw a resample signal and the guest interrupt could become stuck.

Add mshv_irqfd_validate_routing() and call it from
mshv_update_routing_table() before publishing the new table. It walks
pt_irqfds_list looking for resampler-bound irqfds whose new routing entry
would be valid but not level-triggered, and returns -EINVAL to reject the
routing change atomically. The check is x86-only because
lapic_control.level_triggered is only populated on x86; on ARM64
mshv_copy_girq_info() unconditionally sets asserted = 1 and the invariant
does not apply.

The validator briefly takes pt_irqfds_lock for the list walk and drops it
before mshv_irqfd_routing_update() reacquires it.  This is safe because the
partition ioctl dispatcher holds pt_mutex for the duration of every
partition ioctl, so MSHV_IRQFD (which calls mshv_irqfd_assign()) cannot run
concurrently with MSHV_SET_MSI_ROUTING; no new resampler-bound irqfd can be
inserted between validation and refresh.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_irq.c |   44 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/drivers/hv/mshv_irq.c b/drivers/hv/mshv_irq.c
index b3142c84dcbc2..59584a132ca9f 100644
--- a/drivers/hv/mshv_irq.c
+++ b/drivers/hv/mshv_irq.c
@@ -14,7 +14,44 @@
 #include "mshv.h"
 #include "mshv_root.h"

-/* called from the ioctl code, user wants to update the guest irq table */
+static int mshv_irqfd_validate_routing(struct mshv_partition *pt,
+				       struct mshv_girq_routing_table *new)
+{
+	int r = 0;
+#if IS_ENABLED(CONFIG_X86)
+	struct mshv_irqfd *irqfd;
+
+	if (!new)
+		return 0;
+
+	spin_lock_irq(&pt->pt_irqfds_lock);
+	hlist_for_each_entry(irqfd, &pt->pt_irqfds_list, irqfd_hnode) {
+		struct mshv_guest_irq_ent ent = {};
+		struct mshv_lapic_irq lirq;
+
+		if (!irqfd->irqfd_resampler)
+			continue;
+
+		if (irqfd->irqfd_irqnum < new->num_rt_entries)
+			ent = new->mshv_girq_info_tbl[irqfd->irqfd_irqnum];
+
+		mshv_copy_girq_info(&ent, &lirq);
+
+		if (ent.girq_entry_valid &&
+		    !lirq.lapic_control.level_triggered) {
+			r = -EINVAL;
+			break;
+		}
+	}
+	spin_unlock_irq(&pt->pt_irqfds_lock);
+#endif
+	return r;
+}
+
+/*
+ * Called from the ioctl code, user wants to update the guest irq table.
+ * Serialized with mshv_irqfd_assign by partition mutex.
+ */
 int mshv_update_routing_table(struct mshv_partition *partition,
 			      const struct mshv_user_irq_entry *ue,
 			      unsigned int numents)
@@ -65,6 +102,11 @@ int mshv_update_routing_table(struct mshv_partition *partition,

 swap_routes:
 	mutex_lock(&partition->pt_irq_lock);
+	r = mshv_irqfd_validate_routing(partition, new);
+	if (r) {
+		mutex_unlock(&partition->pt_irq_lock);
+		goto out;
+	}
 	old = rcu_dereference_protected(partition->pt_girq_tbl, 1);
 	rcu_assign_pointer(partition->pt_girq_tbl, new);
 	mshv_irqfd_routing_update(partition);

^ permalink raw reply related

* [PATCH v4 04/18] mshv: Add NULL check for vp in mshv_try_assert_irq_fast
From: Stanislav Kinsburskii @ 2026-05-07 15:43 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177816592843.21765.4364464279247150355.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

mshv_try_assert_irq_fast() dereferences the vp pointer obtained from
pt_vp_array[lapic_apic_id] without checking for NULL or validating that
lapic_apic_id is within bounds. A spurious interrupt from the hypervisor
targeting a non-existent VP (or one not yet created) causes a NULL
pointer dereference and crashes the host.

Add a bounds check on lapic_apic_id against MSHV_MAX_VPS and a NULL
check on the vp pointer before dereferencing.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_eventfd.c |    5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 5995a62aff8d8..b398e58411dd7 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -169,7 +169,12 @@ static int mshv_try_assert_irq_fast(struct mshv_irqfd *irqfd)
 		return -EOPNOTSUPP;
 #endif
 
+	if (irq->lapic_apic_id >= MSHV_MAX_VPS)
+		return -EINVAL;
+
 	vp = partition->pt_vp_array[irq->lapic_apic_id];
+	if (!vp)
+		return -EINVAL;
 
 	if (!vp->vp_register_page)
 		return -EOPNOTSUPP;



^ permalink raw reply related

* [PATCH v4 03/18] mshv: Fix race in mshv_irqfd_deassign
From: Stanislav Kinsburskii @ 2026-05-07 15:43 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177816592843.21765.4364464279247150355.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

mshv_irqfd_deactivate() and the hlist traversal of pt_irqfds_list
require pt->pt_irqfds_lock to be held, but mshv_irqfd_deassign()
omits it. This races with the EPOLLHUP path in mshv_irqfd_wakeup(),
which does take the lock before calling mshv_irqfd_deactivate().

Additionally, mshv_irqfd_deactivate() uses hlist_del() which poisons
the node pointers rather than resetting them. Since
mshv_irqfd_is_active() relies on hlist_unhashed() (checks pprev ==
NULL), a poisoned node still appears active. If a concurrent path calls
mshv_irqfd_deactivate() again on the same irqfd, the guard fails to
prevent a double hlist_del() on poisoned pointers.

Fix both issues:
- Add the missing spin_lock_irq/spin_unlock_irq around the list
  traversal in mshv_irqfd_deassign(), matching mshv_irqfd_release().
- Use hlist_del_init() instead of hlist_del() so the node is properly
  marked as unhashed after removal, making the is_active guard reliable.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_eventfd.c |    5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/hv/mshv_eventfd.c b/drivers/hv/mshv_eventfd.c
index 90959f639dc32..5995a62aff8d8 100644
--- a/drivers/hv/mshv_eventfd.c
+++ b/drivers/hv/mshv_eventfd.c
@@ -284,7 +284,7 @@ static void mshv_irqfd_deactivate(struct mshv_irqfd *irqfd)
 	if (!mshv_irqfd_is_active(irqfd))
 		return;
 
-	hlist_del(&irqfd->irqfd_hnode);
+	hlist_del_init(&irqfd->irqfd_hnode);
 
 	queue_work(irqfd_cleanup_wq, &irqfd->irqfd_shutdown);
 }
@@ -541,13 +541,14 @@ static int mshv_irqfd_deassign(struct mshv_partition *pt,
 	if (IS_ERR(eventfd))
 		return PTR_ERR(eventfd);
 
+	spin_lock_irq(&pt->pt_irqfds_lock);
 	hlist_for_each_entry_safe(irqfd, n, &pt->pt_irqfds_list,
 				  irqfd_hnode) {
 		if (irqfd->irqfd_eventfd_ctx == eventfd &&
 		    irqfd->irqfd_irqnum == args->gsi)
-
 			mshv_irqfd_deactivate(irqfd);
 	}
+	spin_unlock_irq(&pt->pt_irqfds_lock);
 
 	eventfd_ctx_put(eventfd);
 



^ permalink raw reply related

* [PATCH v4 02/18] mshv: Fix mshv_prepare_pinned_region error path for unencrypted partitions
From: Stanislav Kinsburskii @ 2026-05-07 15:43 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177816592843.21765.4364464279247150355.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

mshv_prepare_pinned_region() returns 0 (success) when mshv_region_map()
fails on an unencrypted partition. The condition on the error path:

    if (ret && mshv_partition_encrypted(partition))

only handles map failures for encrypted partitions — if the partition is
not encrypted and the map fails, execution falls through to 'return 0',
silently ignoring the error.

Additionally, calling mshv_region_invalidate() inline on map failure
zeroes the mreg_pages array before the caller's cleanup path
(mshv_region_destroy) can call mshv_region_unmap(). Since unmap skips
pages where mreg_pages[offset] is NULL, this can leave stale SLAT
mappings for partially-mapped pages.

Fix by returning immediately on success and falling through to the error
return on failure. For unencrypted partitions, the caller's
mshv_region_destroy() handles unmap followed by invalidate in the
correct order. For encrypted partitions where re-sharing fails, zero
the page array without unpinning — the pages are inaccessible to the
host and must not be unpinned, but zeroing prevents
mshv_region_destroy() from attempting to unpin them.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_root_main.c |   26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 665d565899c15..7e4252b6bc65c 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1360,32 +1360,38 @@ static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
 			pt_err(partition,
 			       "Failed to unshare memory region (guest_pfn: %llu): %d\n",
 			       region->start_gfn, ret);
-			goto invalidate_region;
+			goto err_out;
 		}
 	}

 	ret = mshv_region_map(region);
-	if (ret && mshv_partition_encrypted(partition)) {
+	if (ret)
+		goto share_region;
+
+	return 0;
+
+share_region:
+	if (mshv_partition_encrypted(partition)) {
 		int shrc;

 		shrc = mshv_region_share(region);
 		if (!shrc)
-			goto invalidate_region;
+			goto err_out;

 		pt_err(partition,
 		       "Failed to share memory region (guest_pfn: %llu): %d\n",
 		       region->start_gfn, shrc);
 		/*
-		 * Don't unpin if marking shared failed because pages are no
-		 * longer mapped in the host, ie root, anymore.
+		 * Re-sharing failed — the pages remain inaccessible to the
+		 * host.  Zero the page array so that mshv_region_destroy()
+		 * won't attempt to unpin them (leaking the page references
+		 * is intentional; unpinning host-inaccessible pages would be
+		 * unsafe).
 		 */
+		memset(region->mreg_pages, 0,
+		       region->nr_pages * sizeof(region->mreg_pages[0]));
 		goto err_out;
 	}
-
-	return 0;
-
-invalidate_region:
-	mshv_region_invalidate(region);
 err_out:
 	return ret;
 }

^ permalink raw reply related

* [PATCH v4 01/18] mshv: Fix IRQ leak and type hazards in hv_call_modify_spa_host_access
From: Stanislav Kinsburskii @ 2026-05-07 15:43 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
In-Reply-To: <177816592843.21765.4364464279247150355.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

The bounds check inside the PFN-filling loop can return -EINVAL while
interrupts are disabled via local_irq_save(), leaking IRQ state.

Remove the check — it is redundant because the loop invariant
(done + i < page_count == page_struct_count >> large_shift) guarantees
(done + i) << large_shift < page_struct_count always holds.

While here, fix type mismatches: change 'int done' to 'u64 done' and
use u64 for loop and batch-size variables so they match the u64
page_count they are compared against.

Fixes: 621191d709b14 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_root_hv_call.c |   18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
index 129456bd72aba..cc580225e9e45 100644
--- a/drivers/hv/mshv_root_hv_call.c
+++ b/drivers/hv/mshv_root_hv_call.c
@@ -1042,7 +1042,7 @@ int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
 {
 	struct hv_input_modify_sparse_spa_page_host_access *input_page;
 	u64 status;
-	int done = 0;
+	u64 done = 0;
 	unsigned long irq_flags, large_shift = 0;
 	u64 page_count = page_struct_count;
 	u16 code = acquire ? HVCALL_ACQUIRE_SPARSE_SPA_PAGE_HOST_ACCESS :
@@ -1059,9 +1059,9 @@ int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
 	}
 
 	while (done < page_count) {
-		ulong i, completed, remain = page_count - done;
-		int rep_count = min(remain,
-				    HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT);
+		u64 i, completed, remain = page_count - done;
+		u64 rep_count = min_t(u64, remain,
+				      HV_MODIFY_SPARSE_SPA_PAGE_HOST_ACCESS_MAX_PAGE_COUNT);
 
 		local_irq_save(irq_flags);
 		input_page = *this_cpu_ptr(hyperv_pcpu_input_arg);
@@ -1075,15 +1075,9 @@ int hv_call_modify_spa_host_access(u64 partition_id, struct page **pages,
 		input_page->flags = flags;
 		input_page->host_access = host_access;
 
-		for (i = 0; i < rep_count; i++) {
-			u64 index = (done + i) << large_shift;
-
-			if (index >= page_struct_count)
-				return -EINVAL;
-
+		for (i = 0; i < rep_count; i++)
 			input_page->spa_page_list[i] =
-						page_to_pfn(pages[index]);
-		}
+				page_to_pfn(pages[(done + i) << large_shift]);
 
 		status = hv_do_rep_hypercall(code, rep_count, 0, input_page,
 					     NULL);



^ permalink raw reply related

* [PATCH v4 00/18] mshv: Bug fixes across the mshv_root module
From: Stanislav Kinsburskii @ 2026-05-07 15:42 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel

This series addresses bugs found during a continued review of the
mshv_root module, most of which were introduced by commit 621191d709b14
("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs").

Changes in v4:
- Dropped the following patches as the issues they fix don't happen in
  practice:
    - mshv: Fix potential integer overflow in mshv_region_create
    - mshv: Fix potential u64 overflow in region overlap check
    - mshv: Add defensive synchronize_srcu in irqfd shutdown

- Added new fixes:
    - mshv: irqfd: Reject routing updates that invalidate resampler binding
    - mshv: Fix sleeping under spinlock in mshv_portid_alloc
    - mshv: Order pt_vp_array publish against irqfd assertion path
    - mshv: Defer mshv_vp free to an RCU grace period
    - mshv: Publish VP to pt_vp_array before installing the file descriptor

- Replaced:
    - mshv: Fix use-after-RCU in mshv_portid_lookup 
      by
      mshv: portid_table: Make mshv_portid_lookup() RCU-aware by contract
    - mshv: Add store/load ordering for VP array publish
      by
      mshv: Order pt_vp_array publish against irqfd assertion path

Changes in v3:
- "Fix mshv_prepare_pinned_region error path for unencrypted
  partitions": removed inline mshv_region_invalidate() to prevent
  zeroing mreg_pages before mshv_region_destroy() can unmap partial
  SLAT mappings; for encrypted share-failure, memset the page array
  without unpinning (pages are host-inaccessible).
- "Consolidate irqfd interrupt injection paths": fixed data race in
  mshv_irqfd_assign EPOLLIN path — girq_ent is now snapshotted inside
  the seqcount loop (matching mshv_irqfd_wakeup) to prevent a
  concurrent routing update from injecting vector 0 to VP 0.
- "Add missing vp_index bounds check in intercept ISR": added
  array_index_nospec() after the bounds check to prevent speculative
  out-of-bounds array access.
- "Add store/load ordering for VP array publish": added missing
  smp_load_acquire in mshv_try_assert_irq_fast.

Changes in v2:
- Added 8 new patches addressing issues found by Sashiko (automated
  review) covering the irqfd, portid, scheduler message, and VP
  lifecycle paths.
- Consolidated the irqfd fast/slow injection paths to eliminate
  duplicated seqcount reads and fix the GSI 0 validity bypass.
- Added memory ordering for the lockless VP array.

The bugs fixed range from data corruption and use-after-free to silent
functional failures and sleeping-while-atomic:

 Memory region management:
  - Integer overflow on userspace-controlled allocation size
    (mshv_region_create; fix dropped in v4)
  - Silent success on map failure for unencrypted partitions
    (mshv_prepare_pinned_region)
  - u64 overflow in region overlap check allowing overlapping mappings
    (fix dropped in v4)

 IRQ/eventfd path:
  - IRQ state leak and type truncation in hypercall helpers
  - Missing locking and hlist_del vs hlist_del_init race in irqfd
    deassign
  - Defensive synchronize_srcu in irqfd shutdown (follows KVM pattern;
    dropped in v4)
  - NULL pointer dereference on spurious interrupt to non-existent VP
    (mshv_try_assert_irq_fast)
  - Broken seqcount read protection — torn reads of interrupt routing
  - Duplicated and inconsistent validity checks between fast/slow
    injection paths; fast path could inject vector 0 spuriously
  - Level-triggered check on uninitialized data making interrupt
    resampling completely non-functional
  - Duplicate GSI 0 detection using the wrong predicate

 Port ID table:
  - Use-after-RCU in mshv_portid_lookup (dereference outside read-side
    critical section)
  - Sleeping under spinlock in mshv_portid_alloc (GFP_KERNEL inside
    idr_lock)
  - Use kfree_rcu for deferred free without blocking

 SynIC / ISR paths:
  - Missing VP index bounds check in intercept ISR (OOB in interrupt
    context from untrusted hypervisor data)
  - Missing store/load ordering for VP array publish — lockless ISR
    readers could observe partially-initialized VP
  - Missing bounds validation in scheduler messages
    (handle_pair_message vp_count, handle_bitset_message bank_mask)

 Miscellaneous:
  - Missing error code on VP allocation failure (silent success to
    userspace)

Kudos to Claude and Sashiko for assisting with analysis and
implementation.

---

Stanislav Kinsburskii (18):
      mshv: Fix IRQ leak and type hazards in hv_call_modify_spa_host_access
      mshv: Fix mshv_prepare_pinned_region error path for unencrypted partitions
      mshv: Fix race in mshv_irqfd_deassign
      mshv: Add NULL check for vp in mshv_try_assert_irq_fast
      mshv: irqfd: Reject routing updates that invalidate resampler binding
      mshv: Fix broken seqcount read protection
      mshv: Consolidate irqfd interrupt injection paths
      mshv: Fix level-triggered check on uninitialized data
      mshv: Fix duplicate GSI detection for GSI 0
      mshv: portid_table: Make mshv_portid_lookup() RCU-aware by contract
      mshv: Fix sleeping under spinlock in mshv_portid_alloc
      mshv: Use kfree_rcu in mshv_portid_free
      mshv: Add missing vp_index bounds check in intercept ISR
      mshv: Order pt_vp_array publish against irqfd assertion path
      mshv: Defer mshv_vp free to an RCU grace period
      mshv: Validate scheduler message bounds from hypervisor
      mshv: Publish VP to pt_vp_array before installing the file descriptor
      mshv: Fix missing error code on VP allocation failure


 drivers/hv/mshv_eventfd.c      |  136 +++++++++++++++++++++++++---------------
 drivers/hv/mshv_irq.c          |   46 +++++++++++++-
 drivers/hv/mshv_portid_table.c |   31 ++++-----
 drivers/hv/mshv_root.h         |    3 +
 drivers/hv/mshv_root_hv_call.c |   18 ++---
 drivers/hv/mshv_root_main.c    |   72 +++++++++++++++------
 drivers/hv/mshv_synic.c        |   40 +++++++++---
 7 files changed, 233 insertions(+), 113 deletions(-)


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox