Linux-HyperV List

Linux-HyperV List
 help / color / mirror / Atom feed

* [PATCH v3 08/11] mshv: Move pinned region setup to mshv_regions.c
From: Stanislav Kinsburskii @ 2026-05-13 18:51 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, skinsburskii
  Cc: linux-hyperv, linux-kernel
In-Reply-To: <177869813422.18773.515308662914055136.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

Move mshv_prepare_pinned_region() from mshv_root_main.c to
mshv_regions.c and rename it to mshv_map_pinned_region(). This
co-locates the pinned region logic with the rest of the memory region
operations.

Make mshv_region_pin(), mshv_region_map(), mshv_region_share(),
mshv_region_unshare(), and mshv_region_invalidate() static, as they are
no longer called outside of mshv_regions.c.

No functional change.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_regions.c   |   83 ++++++++++++++++++++++++++++++++++++++++---
 drivers/hv/mshv_root.h      |    6 +--
 drivers/hv/mshv_root_main.c |   75 +--------------------------------------
 3 files changed, 80 insertions(+), 84 deletions(-)

diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
index 057fc83895d37..d9e33e9ef8550 100644
--- a/drivers/hv/mshv_regions.c
+++ b/drivers/hv/mshv_regions.c
@@ -240,7 +240,7 @@ static int mshv_region_chunk_share(struct mshv_mem_region *region,
 					      flags, true);
 }
 
-int mshv_region_share(struct mshv_mem_region *region)
+static int mshv_region_share(struct mshv_mem_region *region)
 {
 	u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED;
 
@@ -266,7 +266,7 @@ static int mshv_region_chunk_unshare(struct mshv_mem_region *region,
 					      flags, false);
 }
 
-int mshv_region_unshare(struct mshv_mem_region *region)
+static int mshv_region_unshare(struct mshv_mem_region *region)
 {
 	u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE;
 
@@ -306,7 +306,7 @@ static int mshv_region_remap_pfns(struct mshv_mem_region *region,
 					 mshv_region_chunk_remap);
 }
 
-int mshv_region_map(struct mshv_mem_region *region)
+static int mshv_region_map(struct mshv_mem_region *region)
 {
 	u32 map_flags = region->hv_map_flags;
 
@@ -330,12 +330,12 @@ static void mshv_region_invalidate_pfns(struct mshv_mem_region *region,
 	mshv_region_init_pfns_range(region, pfn_offset, pfn_count);
 }
 
-void mshv_region_invalidate(struct mshv_mem_region *region)
+static void mshv_region_invalidate(struct mshv_mem_region *region)
 {
 	mshv_region_invalidate_pfns(region, 0, region->nr_pfns);
 }
 
-int mshv_region_pin(struct mshv_mem_region *region)
+static int mshv_region_pin(struct mshv_mem_region *region)
 {
 	u64 done_count, nr_pfns, i;
 	unsigned long *pfns;
@@ -691,3 +691,76 @@ bool mshv_region_movable_init(struct mshv_mem_region *region)
 
 	return true;
 }
+
+/**
+ * mshv_map_pinned_region - Pin and map memory regions
+ * @region: Pointer to the memory region structure
+ *
+ * This function processes memory regions that are explicitly marked as pinned.
+ * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based
+ * population. The function ensures the region is properly populated, handles
+ * encryption requirements for SNP partitions if applicable, maps the region,
+ * and performs necessary sharing or eviction operations based on the mapping
+ * result.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int mshv_map_pinned_region(struct mshv_mem_region *region)
+{
+	struct mshv_partition *partition = region->partition;
+	int ret;
+
+	ret = mshv_region_pin(region);
+	if (ret) {
+		pt_err(partition, "Failed to pin memory region: %d\n",
+		       ret);
+		goto err_out;
+	}
+
+	/*
+	 * For an SNP partition it is a requirement that for every memory region
+	 * that we are going to map for this partition we should make sure that
+	 * host access to that region is released. This is ensured by doing an
+	 * additional hypercall which will update the SLAT to release host
+	 * access to guest memory regions.
+	 */
+	if (mshv_partition_encrypted(partition)) {
+		ret = mshv_region_unshare(region);
+		if (ret) {
+			pt_err(partition,
+			       "Failed to unshare memory region (guest_pfn: %llu): %d\n",
+			       region->start_gfn, ret);
+			goto err_out;
+		}
+	}
+
+	ret = mshv_region_map(region);
+	if (ret)
+		goto share_region;
+
+	return 0;
+
+share_region:
+	if (mshv_partition_encrypted(partition)) {
+		int shrc;
+
+		shrc = mshv_region_share(region);
+		if (!shrc)
+			goto err_out;
+
+		pt_err(partition,
+		       "Failed to share memory region (guest_pfn: %llu): %d\n",
+		       region->start_gfn, shrc);
+		/*
+		 * Re-sharing failed — the pages remain inaccessible to the
+		 * host.  Zero the page array so that mshv_region_destroy()
+		 * won't attempt to unpin them (leaking the page references
+		 * is intentional; unpinning host-inaccessible pages would be
+		 * unsafe).
+		 */
+		mshv_region_init_pfns(region);
+		goto err_out;
+	}
+err_out:
+	return ret;
+}
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index 34c9b57c50f47..2a4eff27917f2 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -371,16 +371,12 @@ extern u8 * __percpu *hv_synic_eventring_tail;
 
 struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
 					   u64 uaddr, u32 flags);
-int mshv_region_share(struct mshv_mem_region *region);
-int mshv_region_unshare(struct mshv_mem_region *region);
-int mshv_region_map(struct mshv_mem_region *region);
-void mshv_region_invalidate(struct mshv_mem_region *region);
 void mshv_region_init_pfns(struct mshv_mem_region *region);
-int mshv_region_pin(struct mshv_mem_region *region);
 void mshv_region_put(struct mshv_mem_region *region);
 int mshv_region_get(struct mshv_mem_region *region);
 bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn);
 void mshv_region_movable_fini(struct mshv_mem_region *region);
 bool mshv_region_movable_init(struct mshv_mem_region *region);
+int mshv_map_pinned_region(struct mshv_mem_region *region);
 
 #endif /* _MSHV_ROOT_H_ */
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 986cb9a4c428e..4af2b98738ee2 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1346,79 +1346,6 @@ static int mshv_partition_create_region(struct mshv_partition *partition,
 	return 0;
 }
 
-/**
- * mshv_prepare_pinned_region - Pin and map memory regions
- * @region: Pointer to the memory region structure
- *
- * This function processes memory regions that are explicitly marked as pinned.
- * Pinned regions are preallocated, mapped upfront, and do not rely on fault-based
- * population. The function ensures the region is properly populated, handles
- * encryption requirements for SNP partitions if applicable, maps the region,
- * and performs necessary sharing or eviction operations based on the mapping
- * result.
- *
- * Return: 0 on success, negative error code on failure.
- */
-static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
-{
-	struct mshv_partition *partition = region->partition;
-	int ret;
-
-	ret = mshv_region_pin(region);
-	if (ret) {
-		pt_err(partition, "Failed to pin memory region: %d\n",
-		       ret);
-		goto err_out;
-	}
-
-	/*
-	 * For an SNP partition it is a requirement that for every memory region
-	 * that we are going to map for this partition we should make sure that
-	 * host access to that region is released. This is ensured by doing an
-	 * additional hypercall which will update the SLAT to release host
-	 * access to guest memory regions.
-	 */
-	if (mshv_partition_encrypted(partition)) {
-		ret = mshv_region_unshare(region);
-		if (ret) {
-			pt_err(partition,
-			       "Failed to unshare memory region (guest_pfn: %llu): %d\n",
-			       region->start_gfn, ret);
-			goto err_out;
-		}
-	}
-
-	ret = mshv_region_map(region);
-	if (ret)
-		goto share_region;
-
-	return 0;
-
-share_region:
-	if (mshv_partition_encrypted(partition)) {
-		int shrc;
-
-		shrc = mshv_region_share(region);
-		if (!shrc)
-			goto err_out;
-
-		pt_err(partition,
-		       "Failed to share memory region (guest_pfn: %llu): %d\n",
-		       region->start_gfn, shrc);
-		/*
-		 * Re-sharing failed — the pages remain inaccessible to the
-		 * host.  Zero the page array so that mshv_region_destroy()
-		 * won't attempt to unpin them (leaking the page references
-		 * is intentional; unpinning host-inaccessible pages would be
-		 * unsafe).
-		 */
-		mshv_region_init_pfns(region);
-		goto err_out;
-	}
-err_out:
-	return ret;
-}
-
 /*
  * This maps two things: guest RAM and for pci passthru mmio space.
  *
@@ -1461,7 +1388,7 @@ mshv_map_user_memory(struct mshv_partition *partition,
 
 	switch (region->mreg_type) {
 	case MSHV_REGION_TYPE_MEM_PINNED:
-		ret = mshv_prepare_pinned_region(region);
+		ret = mshv_map_pinned_region(region);
 		break;
 	case MSHV_REGION_TYPE_MEM_MOVABLE:
 		/*



^ permalink raw reply related

* [PATCH v3 09/11] mshv: Map populated pages on movable region creation
From: Stanislav Kinsburskii @ 2026-05-13 18:51 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, skinsburskii
  Cc: linux-hyperv, linux-kernel
In-Reply-To: <177869813422.18773.515308662914055136.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

Map any populated pages into the hypervisor upfront when creating a
movable region, rather than waiting for faults. Previously, movable
regions were created with all pages marked as HV_MAP_GPA_NO_ACCESS
regardless of whether the userspace mapping contained populated pages.

This guarantees that if the caller passes a populated mapping, those
present pages will be mapped into the hypervisor immediately during
region creation instead of being faulted in later.

Pages that are present but not writable in host page tables (e.g.
shared zero pages) are left as no-access mappings to preserve copy-on-write
semantics; they will be faulted in on demand.

The region is processed in bounded chunks to avoid soft lockups and
livelock from concurrent invalidations.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_regions.c   |  126 +++++++++++++++++++++++++++++++++----------
 drivers/hv/mshv_root.h      |    1 
 drivers/hv/mshv_root_main.c |   10 ---
 3 files changed, 98 insertions(+), 39 deletions(-)

diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
index d9e33e9ef8550..85f8b7bddf939 100644
--- a/drivers/hv/mshv_regions.c
+++ b/drivers/hv/mshv_regions.c
@@ -17,8 +17,12 @@
 
 #include "mshv_root.h"
 
+/* Process memory regions in chunks to avoid soft lockups and livelock */
+#define MSHV_MAX_PFN_BATCH				(SZ_2M / PAGE_SIZE)
+
 #define MSHV_MAP_FAULT_IN_PAGES				\
 	(PTRS_PER_PMD * max_t(unsigned long, 1, PAGE_SIZE / HV_HYP_PAGE_SIZE))
+
 #define MSHV_INVALID_PFN				ULONG_MAX
 
 static inline bool mshv_pfn_valid(unsigned long pfn)
@@ -450,13 +454,16 @@ int mshv_region_get(struct mshv_mem_region *region)
 /**
  * mshv_region_hmm_fault_and_lock - Fault in pages across VMAs and lock
  *                                  the memory region
- * @region: Pointer to the memory region structure
- * @start : Starting virtual address of the range to fault (inclusive)
- * @end   : Ending virtual address of the range to fault (exclusive)
- * @pfns  : Output array for page frame numbers with HMM flags
+ * @region  : Pointer to the memory region structure
+ * @start   : Starting virtual address of the range to fault (inclusive)
+ * @end     : Ending virtual address of the range to fault (exclusive)
+ * @pfns    : Output array for page frame numbers with HMM flags
+ * @do_fault: If true, fault in missing pages; if false, snapshot only
+ *            pages already present in page tables
  *
- * Iterates through VMAs covering [start, end), faulting in pages via
- * hmm_range_fault() for each VMA segment.  Write faults are requested
+ * Iterates through VMAs covering [start, end), collecting page frame
+ * numbers via hmm_range_fault() for each VMA segment.  When @do_fault
+ * is true, missing pages are faulted in and write faults are requested
  * only when both the VMA and the hypervisor mapping permit writes, to
  * avoid breaking copy-on-write semantics on read-only mappings.
  *
@@ -469,7 +476,8 @@ int mshv_region_get(struct mshv_mem_region *region)
 static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region,
 					  unsigned long start,
 					  unsigned long end,
-					  unsigned long *pfns)
+					  unsigned long *pfns,
+					  bool do_fault)
 {
 	struct hmm_range range = {
 		.notifier = &region->mreg_mni,
@@ -491,18 +499,22 @@ static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region,
 		range.hmm_pfns = pfns;
 		range.start = start;
 		range.end = min(vma->vm_end, end);
-		range.default_flags = HMM_PFN_REQ_FAULT;
-		/*
-		 * Only request writable pages from HMM when both the
-		 * VMA and the hypervisor mapping allow writes.  Without
-		 * this, hmm_range_fault() would trigger COW on read-only
-		 * mappings (e.g. shared zero pages, file-backed pages),
-		 * breaking copy-on-write semantics and potentially
-		 * granting the guest write access to shared host pages.
-		 */
-		if ((vma->vm_flags & VM_WRITE) &&
-		    (region->hv_map_flags & HV_MAP_GPA_WRITABLE))
-			range.default_flags |= HMM_PFN_REQ_WRITE;
+		range.default_flags = 0;
+		if (do_fault) {
+			range.default_flags = HMM_PFN_REQ_FAULT;
+			/*
+			 * Only request writable pages from HMM when both
+			 * the VMA and the hypervisor mapping allow writes.
+			 * Without this, hmm_range_fault() would trigger
+			 * COW on read-only mappings (e.g. shared zero
+			 * pages, file-backed pages), breaking
+			 * copy-on-write semantics and potentially granting
+			 * the guest write access to shared host pages.
+			 */
+			if ((vma->vm_flags & VM_WRITE) &&
+			    (region->hv_map_flags & HV_MAP_GPA_WRITABLE))
+				range.default_flags |= HMM_PFN_REQ_WRITE;
+		}
 
 		ret = hmm_range_fault(&range);
 		if (ret)
@@ -527,19 +539,33 @@ static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region,
 }
 
 /**
- * mshv_region_range_fault - Handle memory range faults for a given region.
- * @region: Pointer to the memory region structure.
- * @pfn_offset: Offset of the page within the region.
- * @pfn_count: Number of pages to handle.
+ * mshv_region_collect_and_map - Collect PFNs for a user range and map them
+ * @region    : memory region being processed
+ * @pfn_offset: PFNs offset within the region
+ * @pfn_count : number of PFNs to process
+ * @do_fault  : if true, fault in missing pages;
+ *              if false, collect only present pages
  *
- * This function resolves memory faults for a specified range of pages
- * within a memory region. It uses HMM (Heterogeneous Memory Management)
- * to fault in the required pages and updates the region's page array.
+ * Collects PFNs for the specified portion of @region from the
+ * corresponding userspace VMAs and maps them into the hypervisor. The
+ * behavior depends on @do_fault:
  *
- * Return: 0 on success, negative error code on failure.
+ * - true: Fault in missing pages from userspace, ensuring all pages in the
+ *   range are present. Used for on-demand page population.
+ * - false: Collect PFNs only for pages already present in userspace,
+ *   leaving missing pages as invalid PFN markers.
+ *   Used for initial region setup.
+ *
+ * Collected PFNs are stored in region->mreg_pfns[] with HMM bookkeeping
+ * flags cleared, then the range is mapped into the hypervisor. Present
+ * PFNs get mapped with region access permissions; missing PFNs (invalid
+ * entries) get mapped with no-access permissions.
+ *
+ * Return: 0 on success, negative errno on failure.
  */
-static int mshv_region_range_fault(struct mshv_mem_region *region,
-				   u64 pfn_offset, u64 pfn_count)
+static int mshv_region_collect_and_map(struct mshv_mem_region *region,
+				       u64 pfn_offset, u64 pfn_count,
+				       bool do_fault)
 {
 	unsigned long start, end;
 	unsigned long *pfns;
@@ -555,7 +581,7 @@ static int mshv_region_range_fault(struct mshv_mem_region *region,
 
 	do {
 		ret = mshv_region_hmm_fault_and_lock(region, start, end,
-						     pfns);
+						     pfns, do_fault);
 	} while (ret == -EBUSY);
 
 	if (ret)
@@ -564,6 +590,11 @@ static int mshv_region_range_fault(struct mshv_mem_region *region,
 	for (i = 0; i < pfn_count; i++) {
 		if (!(pfns[i] & HMM_PFN_VALID))
 			continue;
+		/* Skip read-only pages to avoid bypassing COW */
+		if (!do_fault &&
+		    (region->hv_map_flags & HV_MAP_GPA_WRITABLE) &&
+		    !(pfns[i] & HMM_PFN_WRITE))
+			continue;
 		/* Drop HMM_PFN_* flags to ensure PFNs are valid. */
 		region->mreg_pfns[pfn_offset + i] = pfns[i] & ~HMM_PFN_FLAGS;
 	}
@@ -577,6 +608,13 @@ static int mshv_region_range_fault(struct mshv_mem_region *region,
 	return ret;
 }
 
+static int mshv_region_range_fault(struct mshv_mem_region *region,
+				   u64 pfn_offset, u64 pfn_count)
+{
+	return mshv_region_collect_and_map(region, pfn_offset, pfn_count,
+					   true);
+}
+
 bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn)
 {
 	u64 pfn_offset, pfn_count;
@@ -764,3 +802,31 @@ int mshv_map_pinned_region(struct mshv_mem_region *region)
 err_out:
 	return ret;
 }
+
+/*
+ * mshv_map_movable_region - Map a movable memory region to the hypervisor
+ * @region: The memory region to map
+ *
+ * Maps the entire movable region by processing it in bounded chunks to avoid
+ * soft lockups from holding mmap_read_lock() too long and to prevent livelock
+ * if concurrent memory invalidations force restarts.
+ *
+ * Returns: 0 on success, negative error code on failure.
+ */
+int mshv_map_movable_region(struct mshv_mem_region *region)
+{
+	u64 pfn, count;
+	int ret;
+
+	for (pfn = 0; pfn < region->nr_pfns; pfn += MSHV_MAX_PFN_BATCH) {
+		count = min_t(u64, MSHV_MAX_PFN_BATCH, region->nr_pfns - pfn);
+
+		ret = mshv_region_collect_and_map(region, pfn, count, false);
+		if (ret)
+			return ret;
+
+		cond_resched();
+	}
+
+	return 0;
+}
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index 2a4eff27917f2..0f4fc57a14cd0 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -378,5 +378,6 @@ bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn);
 void mshv_region_movable_fini(struct mshv_mem_region *region);
 bool mshv_region_movable_init(struct mshv_mem_region *region);
 int mshv_map_pinned_region(struct mshv_mem_region *region);
+int mshv_map_movable_region(struct mshv_mem_region *region);
 
 #endif /* _MSHV_ROOT_H_ */
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 4af2b98738ee2..e38438c539c5d 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1391,15 +1391,7 @@ mshv_map_user_memory(struct mshv_partition *partition,
 		ret = mshv_map_pinned_region(region);
 		break;
 	case MSHV_REGION_TYPE_MEM_MOVABLE:
-		/*
-		 * For movable memory regions, remap with no access to let
-		 * the hypervisor track dirty pages, enabling pre-copy live
-		 * migration.
-		 */
-		ret = hv_call_map_ram_pfns(partition->pt_id,
-					   region->start_gfn,
-					   region->nr_pfns,
-					   HV_MAP_GPA_NO_ACCESS, NULL);
+		ret = mshv_map_movable_region(region);
 		break;
 	case MSHV_REGION_TYPE_MMIO:
 		ret = hv_call_map_mmio_pfns(partition->pt_id,



^ permalink raw reply related

* [PATCH v3 10/11] mshv: Extract MMIO region mapping into separate function
From: Stanislav Kinsburskii @ 2026-05-13 18:51 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, skinsburskii
  Cc: linux-hyperv, linux-kernel
In-Reply-To: <177869813422.18773.515308662914055136.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

Extract the MMIO region mapping logic from mshv_map_user_memory() into
a dedicated mshv_map_mmio_region() function. This improves code
organization and consistency with the existing mshv_map_pinned_region()
and mshv_map_movable_region() functions.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_regions.c   |    8 ++++++++
 drivers/hv/mshv_root.h      |    2 ++
 drivers/hv/mshv_root_main.c |    5 +----
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
index 85f8b7bddf939..7bcfba9ebac12 100644
--- a/drivers/hv/mshv_regions.c
+++ b/drivers/hv/mshv_regions.c
@@ -830,3 +830,11 @@ int mshv_map_movable_region(struct mshv_mem_region *region)
 
 	return 0;
 }
+
+int mshv_map_mmio_region(struct mshv_mem_region *region,
+			 unsigned long mmio_pfn)
+{
+	return hv_call_map_mmio_pfns(region->partition->pt_id,
+				     region->start_gfn,
+				     mmio_pfn, region->nr_pfns);
+}
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index 0f4fc57a14cd0..b091db06390b0 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -379,5 +379,7 @@ void mshv_region_movable_fini(struct mshv_mem_region *region);
 bool mshv_region_movable_init(struct mshv_mem_region *region);
 int mshv_map_pinned_region(struct mshv_mem_region *region);
 int mshv_map_movable_region(struct mshv_mem_region *region);
+int mshv_map_mmio_region(struct mshv_mem_region *region,
+			 unsigned long mmio_pfn);
 
 #endif /* _MSHV_ROOT_H_ */
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index e38438c539c5d..d5197559d7650 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1394,10 +1394,7 @@ mshv_map_user_memory(struct mshv_partition *partition,
 		ret = mshv_map_movable_region(region);
 		break;
 	case MSHV_REGION_TYPE_MMIO:
-		ret = hv_call_map_mmio_pfns(partition->pt_id,
-					    region->start_gfn,
-					    mmio_pfn,
-					    region->nr_pfns);
+		ret = mshv_map_mmio_region(region, mmio_pfn);
 		break;
 	}
 



^ permalink raw reply related

* [PATCH v3 11/11] mshv: Add tracepoint for map GPA hypercall
From: Stanislav Kinsburskii @ 2026-05-13 18:51 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, skinsburskii
  Cc: linux-hyperv, linux-kernel
In-Reply-To: <177869813422.18773.515308662914055136.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

Add tracing for GPA mapping hypercalls to aid in debugging memory
management issues in child partitions. The tracepoint captures both
successful and failed mapping attempts, including the number of pages
successfully mapped before any failure occurred.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_root_hv_call.c |    3 +++
 drivers/hv/mshv_trace.h        |   36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/drivers/hv/mshv_root_hv_call.c b/drivers/hv/mshv_root_hv_call.c
index 00d37c39cbf26..ccf297fbfae3a 100644
--- a/drivers/hv/mshv_root_hv_call.c
+++ b/drivers/hv/mshv_root_hv_call.c
@@ -250,6 +250,9 @@ static int hv_do_map_pfns(u64 partition_id, u64 gfn, u64 pfns_count,
 		}
 	}
 
+	trace_mshv_map_pfns(partition_id, gfn, pfns_count, page_count,
+			    flags, mmio_spa, done, ret);
+
 	if (ret && done) {
 		u32 unmap_flags = 0;
 
diff --git a/drivers/hv/mshv_trace.h b/drivers/hv/mshv_trace.h
index e7280c47e579a..619c4563d4211 100644
--- a/drivers/hv/mshv_trace.h
+++ b/drivers/hv/mshv_trace.h
@@ -538,6 +538,42 @@ TRACE_EVENT(mshv_handle_gpa_intercept,
 	    )
 );
 
+TRACE_EVENT(mshv_map_pfns,
+	    TP_PROTO(u64 partition_id, u64 gfn, u64 pfn_count, u64 page_count, u32 flags,
+		     u64 mmio_spa, u64 done, int ret),
+	    TP_ARGS(partition_id, gfn, pfn_count, page_count, flags, mmio_spa, done, ret),
+	    TP_STRUCT__entry(
+		    __field(u64, partition_id)
+		    __field(u64, gfn)
+		    __field(u64, pfn_count)
+		    __field(u64, page_count)
+		    __field(u32, flags)
+		    __field(u64, mmio_spa)
+		    __field(u64, done)
+		    __field(int, ret)
+	    ),
+	    TP_fast_assign(
+		    __entry->partition_id = partition_id;
+		    __entry->gfn = gfn;
+		    __entry->pfn_count = pfn_count;
+		    __entry->page_count = page_count;
+		    __entry->flags = flags;
+		    __entry->mmio_spa = mmio_spa;
+		    __entry->done = done;
+		    __entry->ret = ret;
+	    ),
+	    TP_printk("partition_id=%llu gfn=0x%llx pfn_count=%llu page_count=%llu flags=0x%x mmio_spa=0x%llx done=%llu ret=%d",
+		    __entry->partition_id,
+		    __entry->gfn,
+		    __entry->pfn_count,
+		    __entry->page_count,
+		    __entry->flags,
+		    __entry->mmio_spa,
+		    __entry->done,
+		    __entry->ret
+	    )
+);
+
 #endif /* _MSHV_TRACE_H_ */
 
 /* This part must be outside protection */



^ permalink raw reply related

* [PATCH v4 0/6] mshv: Reduce memory consumption for unpinned regions
From: Stanislav Kinsburskii @ 2026-05-13 18:54 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, skinsburskii
  Cc: linux-hyperv, linux-kernel

This series reduces memory consumption for unpinned regions by avoiding
PFN array allocation. A 1GB unpinned region currently wastes 2MB for an
unused PFN array that HMM-managed regions don't need.


v4:
 - Rebased on the latest state of the tree.
 - Dropped "mshv: Improve code readability with handler function typedef"
   as redundant.

v3:
- Fix missing unmap/remap of pages before the first huge page.

v2:
- Improved commit message
- Fixed invalid vfree(region->mreg_pfns) call for MMIO-backed regions
- Fixed unpinning of already-released pages in the error path during
  pinned region creation
- Removed redundant mshv_map_region helper in favor of the new
  optimized mapping logic

---

Stanislav Kinsburskii (6):
      mshv: Consolidate region creation and mapping
      mshv: Rename mshv_mem_region to mshv_region
      mshv: Optimize region unmap and invalidation with large pages
      mshv: Pass pfns array explicitly through processing chain
      mshv: Simplify pfn array handling in region processing
      mshv: Allocate pfns array only for pinned regions


 drivers/hv/mshv_regions.c   |  346 +++++++++++++++++++++++++++----------------
 drivers/hv/mshv_root.h      |   28 ++-
 drivers/hv/mshv_root_main.c |   81 ++++------
 3 files changed, 265 insertions(+), 190 deletions(-)


^ permalink raw reply

* [PATCH v4 1/6] mshv: Consolidate region creation and mapping
From: Stanislav Kinsburskii @ 2026-05-13 18:54 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, skinsburskii
  Cc: linux-hyperv, linux-kernel
In-Reply-To: <177869833161.19379.1357399549586915698.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

Consolidate region type detection and initialization into
mshv_region_create() to simplify the region creation flow. Move type
determination logic (MMIO/pinned/movable) earlier in the process and
initialize type-specific fields during creation rather than after.

Movable region initialization now fails explicitly if
mmu_interval_notifier_insert() returns an error, rather than silently
falling back to pinned memory. This fail-fast approach makes
configuration issues more visible.

This eliminates the need for mshv_region_movable_init/fini() by
handling MMU interval notifier setup directly in the constructor and
teardown in the destructor. Region mapping is also unified through a
single mshv_map_region() dispatcher that routes to the appropriate
type-specific handler, reducing the exported API surface by 5
functions.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_regions.c   |   83 ++++++++++++++++++++++++++++---------------
 drivers/hv/mshv_root.h      |   15 ++++----
 drivers/hv/mshv_root_main.c |   61 ++++++++++++--------------------
 3 files changed, 85 insertions(+), 74 deletions(-)

diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
index 7bcfba9ebac12..894eb4dcad93f 100644
--- a/drivers/hv/mshv_regions.c
+++ b/drivers/hv/mshv_regions.c
@@ -25,6 +25,8 @@
 
 #define MSHV_INVALID_PFN				ULONG_MAX
 
+static const struct mmu_interval_notifier_ops mshv_region_mni_ops;
+
 static inline bool mshv_pfn_valid(unsigned long pfn)
 {
 	return pfn != MSHV_INVALID_PFN;
@@ -200,15 +202,21 @@ static int mshv_region_process_range(struct mshv_mem_region *region,
 	return 0;
 }
 
-struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pfns,
-					   u64 uaddr, u32 flags)
+struct mshv_mem_region *mshv_region_create(struct mshv_partition *partition,
+					   enum mshv_region_type type,
+					   u64 guest_pfn, u64 nr_pfns,
+					   u64 uaddr, u32 flags,
+					   unsigned long mmio_pfn)
 {
 	struct mshv_mem_region *region;
+	int ret = 0;
 
 	region = vzalloc(struct_size(region, mreg_pfns, nr_pfns));
 	if (!region)
 		return ERR_PTR(-ENOMEM);
 
+	region->partition = partition;
+	region->mreg_type = type;
 	region->nr_pfns = nr_pfns;
 	region->start_gfn = guest_pfn;
 	region->start_uaddr = uaddr;
@@ -220,9 +228,33 @@ struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pfns,
 
 	mshv_region_init_pfns(region);
 
+	mutex_init(&region->mreg_mutex);
 	kref_init(&region->mreg_refcount);
 
+	switch (type) {
+	case MSHV_REGION_TYPE_MEM_MOVABLE:
+		ret = mmu_interval_notifier_insert(&region->mreg_mni,
+						   current->mm, uaddr,
+						   nr_pfns << HV_HYP_PAGE_SHIFT,
+						   &mshv_region_mni_ops);
+		break;
+	case MSHV_REGION_TYPE_MEM_PINNED:
+		break;
+	case MSHV_REGION_TYPE_MMIO:
+		region->mreg_mmio_pfn = mmio_pfn;
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	if (ret)
+		goto free_region;
+
 	return region;
+
+free_region:
+	vfree(region);
+	return ERR_PTR(ret);
 }
 
 static int mshv_region_chunk_share(struct mshv_mem_region *region,
@@ -422,7 +454,7 @@ static void mshv_region_destroy(struct kref *ref)
 	int ret;
 
 	if (region->mreg_type == MSHV_REGION_TYPE_MEM_MOVABLE)
-		mshv_region_movable_fini(region);
+		mmu_interval_notifier_remove(&region->mreg_mni);
 
 	if (mshv_partition_encrypted(partition)) {
 		ret = mshv_region_share(region);
@@ -709,27 +741,6 @@ static const struct mmu_interval_notifier_ops mshv_region_mni_ops = {
 	.invalidate = mshv_region_interval_invalidate,
 };
 
-void mshv_region_movable_fini(struct mshv_mem_region *region)
-{
-	mmu_interval_notifier_remove(&region->mreg_mni);
-}
-
-bool mshv_region_movable_init(struct mshv_mem_region *region)
-{
-	int ret;
-
-	ret = mmu_interval_notifier_insert(&region->mreg_mni, current->mm,
-					   region->start_uaddr,
-					   region->nr_pfns << HV_HYP_PAGE_SHIFT,
-					   &mshv_region_mni_ops);
-	if (ret)
-		return false;
-
-	mutex_init(&region->mreg_mutex);
-
-	return true;
-}
-
 /**
  * mshv_map_pinned_region - Pin and map memory regions
  * @region: Pointer to the memory region structure
@@ -743,7 +754,7 @@ bool mshv_region_movable_init(struct mshv_mem_region *region)
  *
  * Return: 0 on success, negative error code on failure.
  */
-int mshv_map_pinned_region(struct mshv_mem_region *region)
+static int mshv_map_pinned_region(struct mshv_mem_region *region)
 {
 	struct mshv_partition *partition = region->partition;
 	int ret;
@@ -813,7 +824,7 @@ int mshv_map_pinned_region(struct mshv_mem_region *region)
  *
  * Returns: 0 on success, negative error code on failure.
  */
-int mshv_map_movable_region(struct mshv_mem_region *region)
+static int mshv_map_movable_region(struct mshv_mem_region *region)
 {
 	u64 pfn, count;
 	int ret;
@@ -831,10 +842,24 @@ int mshv_map_movable_region(struct mshv_mem_region *region)
 	return 0;
 }
 
-int mshv_map_mmio_region(struct mshv_mem_region *region,
-			 unsigned long mmio_pfn)
+static int mshv_map_mmio_region(struct mshv_mem_region *region)
 {
 	return hv_call_map_mmio_pfns(region->partition->pt_id,
 				     region->start_gfn,
-				     mmio_pfn, region->nr_pfns);
+				     region->mreg_mmio_pfn,
+				     region->nr_pfns);
+}
+
+int mshv_map_region(struct mshv_mem_region *region)
+{
+	switch (region->mreg_type) {
+	case MSHV_REGION_TYPE_MEM_PINNED:
+		return mshv_map_pinned_region(region);
+	case MSHV_REGION_TYPE_MEM_MOVABLE:
+		return mshv_map_movable_region(region);
+	case MSHV_REGION_TYPE_MMIO:
+		return mshv_map_mmio_region(region);
+	default:
+		return -EINVAL;
+	}
 }
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index b091db06390b0..40fcf57219ece 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -93,6 +93,7 @@ struct mshv_mem_region {
 	enum mshv_region_type mreg_type;
 	struct mmu_interval_notifier mreg_mni;
 	struct mutex mreg_mutex;	/* protects region PFNs remapping */
+	u64 mreg_mmio_pfn;
 	unsigned long mreg_pfns[];
 };
 
@@ -369,17 +370,15 @@ extern struct mshv_root mshv_root;
 extern enum hv_scheduler_type hv_scheduler_type;
 extern u8 * __percpu *hv_synic_eventring_tail;
 
-struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
-					   u64 uaddr, u32 flags);
+struct mshv_mem_region *mshv_region_create(struct mshv_partition *partition,
+					   enum mshv_region_type type,
+					   u64 guest_pfn, u64 nr_pfns,
+					   u64 uaddr, u32 flags,
+					   unsigned long mmio_pfn);
 void mshv_region_init_pfns(struct mshv_mem_region *region);
 void mshv_region_put(struct mshv_mem_region *region);
 int mshv_region_get(struct mshv_mem_region *region);
 bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn);
-void mshv_region_movable_fini(struct mshv_mem_region *region);
-bool mshv_region_movable_init(struct mshv_mem_region *region);
-int mshv_map_pinned_region(struct mshv_mem_region *region);
-int mshv_map_movable_region(struct mshv_mem_region *region);
-int mshv_map_mmio_region(struct mshv_mem_region *region,
-			 unsigned long mmio_pfn);
+int mshv_map_region(struct mshv_mem_region *region);
 
 #endif /* _MSHV_ROOT_H_ */
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index d5197559d7650..68609f2452b3a 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -1309,11 +1309,14 @@ static void mshv_async_hvcall_handler(void *data, u64 *status)
  */
 static int mshv_partition_create_region(struct mshv_partition *partition,
 					struct mshv_user_mem_region *mem,
-					struct mshv_mem_region **regionpp,
-					bool is_mmio)
+					struct mshv_mem_region **regionpp)
 {
 	struct mshv_mem_region *rg;
+	enum mshv_region_type type;
 	u64 nr_pfns = HVPFN_DOWN(mem->size);
+	struct vm_area_struct *vma;
+	ulong mmio_pfn;
+	bool is_mmio;
 
 	/* Reject overlapping regions */
 	spin_lock(&partition->pt_mem_regions_lock);
@@ -1326,20 +1329,27 @@ static int mshv_partition_create_region(struct mshv_partition *partition,
 	}
 	spin_unlock(&partition->pt_mem_regions_lock);
 
-	rg = mshv_region_create(mem->guest_pfn, nr_pfns,
-				mem->userspace_addr, mem->flags);
-	if (IS_ERR(rg))
-		return PTR_ERR(rg);
+	mmap_read_lock(current->mm);
+	vma = vma_lookup(current->mm, mem->userspace_addr);
+	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
+	mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
+	mmap_read_unlock(current->mm);
+
+	if (!vma)
+		return -EINVAL;
 
 	if (is_mmio)
-		rg->mreg_type = MSHV_REGION_TYPE_MMIO;
-	else if (mshv_partition_encrypted(partition) ||
-		 !mshv_region_movable_init(rg))
-		rg->mreg_type = MSHV_REGION_TYPE_MEM_PINNED;
+		type = MSHV_REGION_TYPE_MMIO;
+	else if (mshv_partition_encrypted(partition))
+		type = MSHV_REGION_TYPE_MEM_PINNED;
 	else
-		rg->mreg_type = MSHV_REGION_TYPE_MEM_MOVABLE;
+		type = MSHV_REGION_TYPE_MEM_MOVABLE;
 
-	rg->partition = partition;
+	rg = mshv_region_create(partition, type, mem->guest_pfn, nr_pfns,
+				mem->userspace_addr, mem->flags,
+				mmio_pfn);
+	if (IS_ERR(rg))
+		return PTR_ERR(rg);
 
 	*regionpp = rg;
 
@@ -1363,40 +1373,17 @@ mshv_map_user_memory(struct mshv_partition *partition,
 		     struct mshv_user_mem_region *mem)
 {
 	struct mshv_mem_region *region;
-	struct vm_area_struct *vma;
-	bool is_mmio;
-	ulong mmio_pfn;
 	long ret;
 
 	if (mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
 	    !access_ok((const void __user *)mem->userspace_addr, mem->size))
 		return -EINVAL;
 
-	mmap_read_lock(current->mm);
-	vma = vma_lookup(current->mm, mem->userspace_addr);
-	is_mmio = vma ? !!(vma->vm_flags & (VM_IO | VM_PFNMAP)) : 0;
-	mmio_pfn = is_mmio ? vma->vm_pgoff : 0;
-	mmap_read_unlock(current->mm);
-
-	if (!vma)
-		return -EINVAL;
-
-	ret = mshv_partition_create_region(partition, mem, &region,
-					   is_mmio);
+	ret = mshv_partition_create_region(partition, mem, &region);
 	if (ret)
 		return ret;
 
-	switch (region->mreg_type) {
-	case MSHV_REGION_TYPE_MEM_PINNED:
-		ret = mshv_map_pinned_region(region);
-		break;
-	case MSHV_REGION_TYPE_MEM_MOVABLE:
-		ret = mshv_map_movable_region(region);
-		break;
-	case MSHV_REGION_TYPE_MMIO:
-		ret = mshv_map_mmio_region(region, mmio_pfn);
-		break;
-	}
+	ret = mshv_map_region(region);
 
 	trace_mshv_map_user_memory(partition->pt_id, region->start_uaddr,
 				   region->start_gfn, region->nr_pfns,



^ permalink raw reply related

* [PATCH v4 2/6] mshv: Rename mshv_mem_region to mshv_region
From: Stanislav Kinsburskii @ 2026-05-13 18:54 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, skinsburskii
  Cc: linux-hyperv, linux-kernel
In-Reply-To: <177869833161.19379.1357399549586915698.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

The mshv_mem_region structure represents guest address space regions,
which can be either RAM-backed memory or memory-mapped IO regions
without physical backing. The "mem_" prefix incorrectly suggests the
structure only handles memory regions, creating confusion about its
actual purpose.

Remove the "mem_" prefix to align with existing function naming
(mshv_region_map, mshv_region_pin, etc.) and accurately reflect that
this structure manages arbitrary guest address space mappings
regardless of their backing type.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_regions.c   |   76 ++++++++++++++++++++++---------------------
 drivers/hv/mshv_root.h      |   22 ++++++------
 drivers/hv/mshv_root_main.c |   22 ++++++------
 3 files changed, 60 insertions(+), 60 deletions(-)

diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
index 894eb4dcad93f..f81951ae3f808 100644
--- a/drivers/hv/mshv_regions.c
+++ b/drivers/hv/mshv_regions.c
@@ -32,7 +32,7 @@ static inline bool mshv_pfn_valid(unsigned long pfn)
 	return pfn != MSHV_INVALID_PFN;
 }
 
-static void mshv_region_init_pfns_range(struct mshv_mem_region *region,
+static void mshv_region_init_pfns_range(struct mshv_region *region,
 					u64 pfn_offset, u64 pfn_count)
 {
 	u64 i;
@@ -41,7 +41,7 @@ static void mshv_region_init_pfns_range(struct mshv_mem_region *region,
 		region->mreg_pfns[i] = MSHV_INVALID_PFN;
 }
 
-void mshv_region_init_pfns(struct mshv_mem_region *region)
+void mshv_region_init_pfns(struct mshv_region *region)
 {
 	mshv_region_init_pfns_range(region, 0, region->nr_pfns);
 }
@@ -106,7 +106,7 @@ static int mshv_chunk_stride(unsigned long pfn, u64 gfn, u64 pfn_count)
  * Return: Length of the run in PFNs, or a negative errno from
  *         mshv_chunk_stride() if the backing folio order is unsupported.
  */
-static long mshv_region_chunk_size(struct mshv_mem_region *region,
+static long mshv_region_chunk_size(struct mshv_region *region,
 				   u64 pfn_offset, u64 pfn_count,
 				   bool *huge_page)
 {
@@ -164,10 +164,10 @@ static long mshv_region_chunk_size(struct mshv_mem_region *region,
  *
  * Returns 0 on success, or a negative error code on failure.
  */
-static int mshv_region_process_range(struct mshv_mem_region *region,
+static int mshv_region_process_range(struct mshv_region *region,
 				     u32 flags,
 				     u64 pfn_offset, u64 pfn_count,
-				     int (*handler)(struct mshv_mem_region *region,
+				     int (*handler)(struct mshv_region *region,
 						    u32 flags,
 						    u64 pfn_offset,
 						    u64 pfn_count,
@@ -202,13 +202,13 @@ static int mshv_region_process_range(struct mshv_mem_region *region,
 	return 0;
 }
 
-struct mshv_mem_region *mshv_region_create(struct mshv_partition *partition,
-					   enum mshv_region_type type,
-					   u64 guest_pfn, u64 nr_pfns,
-					   u64 uaddr, u32 flags,
-					   unsigned long mmio_pfn)
+struct mshv_region *mshv_region_create(struct mshv_partition *partition,
+				       enum mshv_region_type type,
+				       u64 guest_pfn, u64 nr_pfns,
+				       u64 uaddr, u32 flags,
+				       unsigned long mmio_pfn)
 {
-	struct mshv_mem_region *region;
+	struct mshv_region *region;
 	int ret = 0;
 
 	region = vzalloc(struct_size(region, mreg_pfns, nr_pfns));
@@ -257,7 +257,7 @@ struct mshv_mem_region *mshv_region_create(struct mshv_partition *partition,
 	return ERR_PTR(ret);
 }
 
-static int mshv_region_chunk_share(struct mshv_mem_region *region,
+static int mshv_region_chunk_share(struct mshv_region *region,
 				   u32 flags,
 				   u64 pfn_offset, u64 pfn_count,
 				   bool huge_page)
@@ -276,7 +276,7 @@ static int mshv_region_chunk_share(struct mshv_mem_region *region,
 					      flags, true);
 }
 
-static int mshv_region_share(struct mshv_mem_region *region)
+static int mshv_region_share(struct mshv_region *region)
 {
 	u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED;
 
@@ -285,7 +285,7 @@ static int mshv_region_share(struct mshv_mem_region *region)
 					 mshv_region_chunk_share);
 }
 
-static int mshv_region_chunk_unshare(struct mshv_mem_region *region,
+static int mshv_region_chunk_unshare(struct mshv_region *region,
 				     u32 flags,
 				     u64 pfn_offset, u64 pfn_count,
 				     bool huge_page)
@@ -302,7 +302,7 @@ static int mshv_region_chunk_unshare(struct mshv_mem_region *region,
 					      flags, false);
 }
 
-static int mshv_region_unshare(struct mshv_mem_region *region)
+static int mshv_region_unshare(struct mshv_region *region)
 {
 	u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE;
 
@@ -311,7 +311,7 @@ static int mshv_region_unshare(struct mshv_mem_region *region)
 					 mshv_region_chunk_unshare);
 }
 
-static int mshv_region_chunk_remap(struct mshv_mem_region *region,
+static int mshv_region_chunk_remap(struct mshv_region *region,
 				   u32 flags,
 				   u64 pfn_offset, u64 pfn_count,
 				   bool huge_page)
@@ -333,7 +333,7 @@ static int mshv_region_chunk_remap(struct mshv_mem_region *region,
 				    region->mreg_pfns + pfn_offset);
 }
 
-static int mshv_region_remap_pfns(struct mshv_mem_region *region,
+static int mshv_region_remap_pfns(struct mshv_region *region,
 				  u32 map_flags,
 				  u64 pfn_offset, u64 pfn_count)
 {
@@ -342,7 +342,7 @@ static int mshv_region_remap_pfns(struct mshv_mem_region *region,
 					 mshv_region_chunk_remap);
 }
 
-static int mshv_region_map(struct mshv_mem_region *region)
+static int mshv_region_map(struct mshv_region *region)
 {
 	u32 map_flags = region->hv_map_flags;
 
@@ -350,7 +350,7 @@ static int mshv_region_map(struct mshv_mem_region *region)
 				      0, region->nr_pfns);
 }
 
-static void mshv_region_invalidate_pfns(struct mshv_mem_region *region,
+static void mshv_region_invalidate_pfns(struct mshv_region *region,
 					u64 pfn_offset, u64 pfn_count)
 {
 	u64 i;
@@ -366,12 +366,12 @@ static void mshv_region_invalidate_pfns(struct mshv_mem_region *region,
 	mshv_region_init_pfns_range(region, pfn_offset, pfn_count);
 }
 
-static void mshv_region_invalidate(struct mshv_mem_region *region)
+static void mshv_region_invalidate(struct mshv_region *region)
 {
 	mshv_region_invalidate_pfns(region, 0, region->nr_pfns);
 }
 
-static int mshv_region_pin(struct mshv_mem_region *region)
+static int mshv_region_pin(struct mshv_region *region)
 {
 	u64 done_count, nr_pfns, i;
 	unsigned long *pfns;
@@ -426,7 +426,7 @@ static int mshv_region_pin(struct mshv_mem_region *region)
 	return ret < 0 ? ret : -ENOMEM;
 }
 
-static int mshv_region_chunk_unmap(struct mshv_mem_region *region,
+static int mshv_region_chunk_unmap(struct mshv_region *region,
 				   u32 flags,
 				   u64 pfn_offset, u64 pfn_count,
 				   bool huge_page)
@@ -439,7 +439,7 @@ static int mshv_region_chunk_unmap(struct mshv_mem_region *region,
 				  pfn_count, flags);
 }
 
-static int mshv_region_unmap(struct mshv_mem_region *region)
+static int mshv_region_unmap(struct mshv_region *region)
 {
 	return mshv_region_process_range(region, 0,
 					 0, region->nr_pfns,
@@ -448,8 +448,8 @@ static int mshv_region_unmap(struct mshv_mem_region *region)
 
 static void mshv_region_destroy(struct kref *ref)
 {
-	struct mshv_mem_region *region =
-		container_of(ref, struct mshv_mem_region, mreg_refcount);
+	struct mshv_region *region =
+		container_of(ref, struct mshv_region, mreg_refcount);
 	struct mshv_partition *partition = region->partition;
 	int ret;
 
@@ -473,12 +473,12 @@ static void mshv_region_destroy(struct kref *ref)
 	vfree(region);
 }
 
-void mshv_region_put(struct mshv_mem_region *region)
+void mshv_region_put(struct mshv_region *region)
 {
 	kref_put(&region->mreg_refcount, mshv_region_destroy);
 }
 
-int mshv_region_get(struct mshv_mem_region *region)
+int mshv_region_get(struct mshv_region *region)
 {
 	return kref_get_unless_zero(&region->mreg_refcount);
 }
@@ -505,7 +505,7 @@ int mshv_region_get(struct mshv_mem_region *region)
  *
  * Return: 0 on success, negative error code on failure.
  */
-static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region,
+static int mshv_region_hmm_fault_and_lock(struct mshv_region *region,
 					  unsigned long start,
 					  unsigned long end,
 					  unsigned long *pfns,
@@ -595,7 +595,7 @@ static int mshv_region_hmm_fault_and_lock(struct mshv_mem_region *region,
  *
  * Return: 0 on success, negative errno on failure.
  */
-static int mshv_region_collect_and_map(struct mshv_mem_region *region,
+static int mshv_region_collect_and_map(struct mshv_region *region,
 				       u64 pfn_offset, u64 pfn_count,
 				       bool do_fault)
 {
@@ -640,14 +640,14 @@ static int mshv_region_collect_and_map(struct mshv_mem_region *region,
 	return ret;
 }
 
-static int mshv_region_range_fault(struct mshv_mem_region *region,
+static int mshv_region_range_fault(struct mshv_region *region,
 				   u64 pfn_offset, u64 pfn_count)
 {
 	return mshv_region_collect_and_map(region, pfn_offset, pfn_count,
 					   true);
 }
 
-bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn)
+bool mshv_region_handle_gfn_fault(struct mshv_region *region, u64 gfn)
 {
 	u64 pfn_offset, pfn_count;
 	int ret;
@@ -693,9 +693,9 @@ static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni,
 					    const struct mmu_notifier_range *range,
 					    unsigned long cur_seq)
 {
-	struct mshv_mem_region *region = container_of(mni,
-						      struct mshv_mem_region,
-						      mreg_mni);
+	struct mshv_region *region = container_of(mni,
+						  struct mshv_region,
+						  mreg_mni);
 	u64 pfn_offset, pfn_count;
 	unsigned long mstart, mend;
 	int ret = -EPERM;
@@ -754,7 +754,7 @@ static const struct mmu_interval_notifier_ops mshv_region_mni_ops = {
  *
  * Return: 0 on success, negative error code on failure.
  */
-static int mshv_map_pinned_region(struct mshv_mem_region *region)
+static int mshv_map_pinned_region(struct mshv_region *region)
 {
 	struct mshv_partition *partition = region->partition;
 	int ret;
@@ -824,7 +824,7 @@ static int mshv_map_pinned_region(struct mshv_mem_region *region)
  *
  * Returns: 0 on success, negative error code on failure.
  */
-static int mshv_map_movable_region(struct mshv_mem_region *region)
+static int mshv_map_movable_region(struct mshv_region *region)
 {
 	u64 pfn, count;
 	int ret;
@@ -842,7 +842,7 @@ static int mshv_map_movable_region(struct mshv_mem_region *region)
 	return 0;
 }
 
-static int mshv_map_mmio_region(struct mshv_mem_region *region)
+static int mshv_map_mmio_region(struct mshv_region *region)
 {
 	return hv_call_map_mmio_pfns(region->partition->pt_id,
 				     region->start_gfn,
@@ -850,7 +850,7 @@ static int mshv_map_mmio_region(struct mshv_mem_region *region)
 				     region->nr_pfns);
 }
 
-int mshv_map_region(struct mshv_mem_region *region)
+int mshv_map_region(struct mshv_region *region)
 {
 	switch (region->mreg_type) {
 	case MSHV_REGION_TYPE_MEM_PINNED:
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index 40fcf57219ece..e9bd18013b486 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -82,7 +82,7 @@ enum mshv_region_type {
 	MSHV_REGION_TYPE_MMIO
 };
 
-struct mshv_mem_region {
+struct mshv_region {
 	struct hlist_node hnode;
 	struct kref mreg_refcount;
 	u64 nr_pfns;
@@ -370,15 +370,15 @@ extern struct mshv_root mshv_root;
 extern enum hv_scheduler_type hv_scheduler_type;
 extern u8 * __percpu *hv_synic_eventring_tail;
 
-struct mshv_mem_region *mshv_region_create(struct mshv_partition *partition,
-					   enum mshv_region_type type,
-					   u64 guest_pfn, u64 nr_pfns,
-					   u64 uaddr, u32 flags,
-					   unsigned long mmio_pfn);
-void mshv_region_init_pfns(struct mshv_mem_region *region);
-void mshv_region_put(struct mshv_mem_region *region);
-int mshv_region_get(struct mshv_mem_region *region);
-bool mshv_region_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn);
-int mshv_map_region(struct mshv_mem_region *region);
+struct mshv_region *mshv_region_create(struct mshv_partition *partition,
+				       enum mshv_region_type type,
+				       u64 guest_pfn, u64 nr_pfns,
+				       u64 uaddr, u32 flags,
+				       unsigned long mmio_pfn);
+void mshv_region_init_pfns(struct mshv_region *region);
+void mshv_region_put(struct mshv_region *region);
+int mshv_region_get(struct mshv_region *region);
+bool mshv_region_handle_gfn_fault(struct mshv_region *region, u64 gfn);
+int mshv_map_region(struct mshv_region *region);
 
 #endif /* _MSHV_ROOT_H_ */
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 68609f2452b3a..0e9d14d1d6167 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -612,10 +612,10 @@ static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
 static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
 	      "sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");
 
-static struct mshv_mem_region *
+static struct mshv_region *
 mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
 {
-	struct mshv_mem_region *region;
+	struct mshv_region *region;
 
 	hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
 		if (gfn >= region->start_gfn &&
@@ -626,10 +626,10 @@ mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
 	return NULL;
 }
 
-static struct mshv_mem_region *
+static struct mshv_region *
 mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
 {
-	struct mshv_mem_region *region;
+	struct mshv_region *region;
 
 	spin_lock(&p->pt_mem_regions_lock);
 	region = mshv_partition_region_by_gfn(p, gfn);
@@ -656,7 +656,7 @@ mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
 static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
 {
 	struct mshv_partition *p = vp->vp_partition;
-	struct mshv_mem_region *region;
+	struct mshv_region *region;
 	bool ret = false;
 	u64 gfn;
 #if defined(CONFIG_X86_64)
@@ -933,7 +933,7 @@ mshv_vp_ioctl_translate_gva(struct mshv_vp *vp, void __user *user_args)
 			return ret;
 
 		if (mshv_gpa_fault_retryable(result.result_code)) {
-			struct mshv_mem_region *region;
+			struct mshv_region *region;
 			bool faulted;
 
 			region = mshv_partition_region_by_gfn_get(partition,
@@ -1309,9 +1309,9 @@ static void mshv_async_hvcall_handler(void *data, u64 *status)
  */
 static int mshv_partition_create_region(struct mshv_partition *partition,
 					struct mshv_user_mem_region *mem,
-					struct mshv_mem_region **regionpp)
+					struct mshv_region **regionpp)
 {
-	struct mshv_mem_region *rg;
+	struct mshv_region *rg;
 	enum mshv_region_type type;
 	u64 nr_pfns = HVPFN_DOWN(mem->size);
 	struct vm_area_struct *vma;
@@ -1372,7 +1372,7 @@ static long
 mshv_map_user_memory(struct mshv_partition *partition,
 		     struct mshv_user_mem_region *mem)
 {
-	struct mshv_mem_region *region;
+	struct mshv_region *region;
 	long ret;
 
 	if (mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP) ||
@@ -1408,7 +1408,7 @@ static long
 mshv_unmap_user_memory(struct mshv_partition *partition,
 		       struct mshv_user_mem_region *mem)
 {
-	struct mshv_mem_region *region;
+	struct mshv_region *region;
 
 	if (!(mem->flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
 		return -EINVAL;
@@ -1780,7 +1780,7 @@ remove_partition(struct mshv_partition *partition)
 static void destroy_partition(struct mshv_partition *partition)
 {
 	struct mshv_vp *vp;
-	struct mshv_mem_region *region;
+	struct mshv_region *region;
 	struct hlist_node *n;
 	int i;
 



^ permalink raw reply related

* [PATCH v4 3/6] mshv: Optimize region unmap and invalidation with large pages
From: Stanislav Kinsburskii @ 2026-05-13 18:54 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, skinsburskii
  Cc: linux-hyperv, linux-kernel
In-Reply-To: <177869833161.19379.1357399549586915698.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

Region unmapping and no-access remapping do not require per-PFN
iteration: all GFNs in a region are guaranteed mapped (valid PFNs
with access permissions, holes with no-access), so they can be
processed in bulk.

Split GFN ranges into large-page-aligned chunks and use
HV_MAP_GPA_LARGE_PAGE / HV_UNMAP_GPA_LARGE_PAGE flags for the
aligned portions. This eliminates PFN traversal and reduces
hypercalls for large regions.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_regions.c |   77 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 64 insertions(+), 13 deletions(-)

diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
index f81951ae3f808..cb42ee49c2e2f 100644
--- a/drivers/hv/mshv_regions.c
+++ b/drivers/hv/mshv_regions.c
@@ -20,11 +20,17 @@
 /* Process memory regions in chunks to avoid soft lockups and livelock */
 #define MSHV_MAX_PFN_BATCH				(SZ_2M / PAGE_SIZE)
 
+/* Hypervisor base pages per large page (2 MiB / 4 KiB) */
+#define HV_PFNS_PER_LARGE_PAGE				(SZ_2M / HV_HYP_PAGE_SIZE)
+
 #define MSHV_MAP_FAULT_IN_PAGES				\
 	(PTRS_PER_PMD * max_t(unsigned long, 1, PAGE_SIZE / HV_HYP_PAGE_SIZE))
 
 #define MSHV_INVALID_PFN				ULONG_MAX
 
+typedef int (*gfn_handler_t)(struct mshv_region *region,
+			      u64 gfn, u64 count, u32 flags);
+
 static const struct mmu_interval_notifier_ops mshv_region_mni_ops;
 
 static inline bool mshv_pfn_valid(unsigned long pfn)
@@ -426,24 +432,54 @@ static int mshv_region_pin(struct mshv_region *region)
 	return ret < 0 ? ret : -ENOMEM;
 }
 
-static int mshv_region_chunk_unmap(struct mshv_region *region,
-				   u32 flags,
-				   u64 pfn_offset, u64 pfn_count,
-				   bool huge_page)
+/*
+ * Split a GFN range into head (unaligned), large-page-aligned middle,
+ * and tail, invoking @fn for each non-empty piece.
+ */
+static int mshv_region_for_each_gfn_chunk(struct mshv_region *region,
+					  u64 gfn, u64 nr_pfns,
+					  u32 base_flags, u32 large_flag,
+					  gfn_handler_t fn)
 {
-	if (huge_page)
-		flags |= HV_UNMAP_GPA_LARGE_PAGE;
+	u64 head, aligned, tail;
+	int ret;
+
+	head = min(ALIGN(gfn, HV_PFNS_PER_LARGE_PAGE) - gfn, nr_pfns);
+	aligned = ALIGN_DOWN(nr_pfns - head, HV_PFNS_PER_LARGE_PAGE);
+	tail = nr_pfns - head - aligned;
+
+	if (head) {
+		ret = fn(region, gfn, head, base_flags);
+		if (ret)
+			return ret;
+	}
+	if (aligned) {
+		ret = fn(region, gfn + head, aligned,
+			 base_flags | large_flag);
+		if (ret)
+			return ret;
+	}
+	if (tail) {
+		ret = fn(region, gfn + head + aligned, tail, base_flags);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
 
+static int mshv_unmap_gfns(struct mshv_region *region,
+			   u64 gfn, u64 count, u32 flags)
+{
 	return hv_call_unmap_pfns(region->partition->pt_id,
-				  region->start_gfn + pfn_offset,
-				  pfn_count, flags);
+				  gfn, count, flags);
 }
 
 static int mshv_region_unmap(struct mshv_region *region)
 {
-	return mshv_region_process_range(region, 0,
-					 0, region->nr_pfns,
-					 mshv_region_chunk_unmap);
+	return mshv_region_for_each_gfn_chunk(region, region->start_gfn,
+					      region->nr_pfns,
+					      0, HV_UNMAP_GPA_LARGE_PAGE,
+					      mshv_unmap_gfns);
 }
 
 static void mshv_region_destroy(struct kref *ref)
@@ -671,6 +707,22 @@ bool mshv_region_handle_gfn_fault(struct mshv_region *region, u64 gfn)
 	return !ret;
 }
 
+static int mshv_map_no_access_gfns(struct mshv_region *region,
+				   u64 gfn, u64 count, u32 flags)
+{
+	return hv_call_map_ram_pfns(region->partition->pt_id,
+				    gfn, count, flags, NULL);
+}
+
+static int mshv_region_map_no_access(struct mshv_region *region,
+				     u64 pfn_offset, u64 pfn_count)
+{
+	return mshv_region_for_each_gfn_chunk(region,
+			region->start_gfn + pfn_offset, pfn_count,
+			HV_MAP_GPA_NO_ACCESS, HV_MAP_GPA_LARGE_PAGE,
+			mshv_map_no_access_gfns);
+}
+
 /**
  * mshv_region_interval_invalidate - Invalidate a range of memory region
  * @mni: Pointer to the mmu_interval_notifier structure
@@ -714,8 +766,7 @@ static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni,
 
 	mmu_interval_set_seq(mni, cur_seq);
 
-	ret = mshv_region_remap_pfns(region, HV_MAP_GPA_NO_ACCESS,
-				     pfn_offset, pfn_count);
+	ret = mshv_region_map_no_access(region, pfn_offset, pfn_count);
 	if (ret)
 		goto out_unlock;
 



^ permalink raw reply related

* [PATCH v4 4/6] mshv: Pass pfns array explicitly through processing chain
From: Stanislav Kinsburskii @ 2026-05-13 18:54 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, skinsburskii
  Cc: linux-hyperv, linux-kernel
In-Reply-To: <177869833161.19379.1357399549586915698.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

The current implementation relies on accessing region->pfns directly
within the pfn processing chain, making it difficult to use these
handlers with alternative pfn sources. This tight coupling limits
flexibility when processing pfns from different locations, such as
temporary arrays or external sources.

By threading the pfns pointer through the entire processing chain
(mshv_region_process_range, mshv_region_chunk_size, and all
handlers), we decouple the processing logic from the storage location.
This enables future enhancements like processing pfns from multiple
sources or implementing more sophisticated memory management strategies
without duplicating the core processing logic.

No functional change intended.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_regions.c |   50 +++++++++++++++++++++++++++++----------------
 1 file changed, 32 insertions(+), 18 deletions(-)

diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
index cb42ee49c2e2f..87204b2b48290 100644
--- a/drivers/hv/mshv_regions.c
+++ b/drivers/hv/mshv_regions.c
@@ -31,6 +31,10 @@
 typedef int (*gfn_handler_t)(struct mshv_region *region,
 			      u64 gfn, u64 count, u32 flags);
 
+typedef int (*pfn_handler_t)(struct mshv_region *region, u32 flags,
+			     u64 pfn_offset, u64 pfn_count,
+			     unsigned long *pfns, bool huge_page);
+
 static const struct mmu_interval_notifier_ops mshv_region_mni_ops;
 
 static inline bool mshv_pfn_valid(unsigned long pfn)
@@ -98,6 +102,7 @@ static int mshv_chunk_stride(unsigned long pfn, u64 gfn, u64 pfn_count)
  * @region    : Memory region whose PFN array is being walked.
  * @pfn_offset: Offset into region->mreg_pfns at which to start.
  * @pfn_count : Upper bound on the run length.
+ * @pfns      : Pointer to an array of PFNs corresponding to the region.
  * @huge_page : Out-parameter set to true if the run may be dispatched
  *              as a 2 MiB chunk; false for 4 KiB-stride dispatch.
  *
@@ -114,12 +119,13 @@ static int mshv_chunk_stride(unsigned long pfn, u64 gfn, u64 pfn_count)
  */
 static long mshv_region_chunk_size(struct mshv_region *region,
 				   u64 pfn_offset, u64 pfn_count,
-				   bool *huge_page)
+				   unsigned long *pfns, bool *huge_page)
 {
-	unsigned long *pfns = region->mreg_pfns + pfn_offset;
 	u64 gfn = region->start_gfn + pfn_offset;
 	u64 count = 0, stride;
 
+	pfns += pfn_offset;
+
 	if (!mshv_pfn_valid(pfns[0])) {
 		for (count = 1; count < pfn_count; count++) {
 			if (mshv_pfn_valid(pfns[count]))
@@ -158,6 +164,7 @@ static long mshv_region_chunk_size(struct mshv_region *region,
  * @flags     : Flags to pass to the handler.
  * @pfn_offset: Offset into the region's PFNs array to start processing.
  * @pfn_count : Number of PFNs to process.
+ * @pfns      : Pointer to an array of PFNs corresponding to the region.
  * @handler   : Callback function to handle each chunk of contiguous
  *              valid PFNs.
  *
@@ -173,11 +180,8 @@ static long mshv_region_chunk_size(struct mshv_region *region,
 static int mshv_region_process_range(struct mshv_region *region,
 				     u32 flags,
 				     u64 pfn_offset, u64 pfn_count,
-				     int (*handler)(struct mshv_region *region,
-						    u32 flags,
-						    u64 pfn_offset,
-						    u64 pfn_count,
-						    bool huge_page))
+				     unsigned long *pfns,
+				     pfn_handler_t handler)
 {
 	u64 end;
 	long ret;
@@ -193,11 +197,12 @@ static int mshv_region_process_range(struct mshv_region *region,
 		long count;
 
 		count = mshv_region_chunk_size(region, pfn_offset, pfn_count,
-					       &huge_page);
+					       pfns, &huge_page);
 		if (count < 0)
 			return count;
 
-		ret = handler(region, flags, pfn_offset, count, huge_page);
+		ret = handler(region, flags, pfn_offset, count, pfns,
+			      huge_page);
 		if (ret < 0)
 			return ret;
 
@@ -266,16 +271,17 @@ struct mshv_region *mshv_region_create(struct mshv_partition *partition,
 static int mshv_region_chunk_share(struct mshv_region *region,
 				   u32 flags,
 				   u64 pfn_offset, u64 pfn_count,
+				   unsigned long *pfns,
 				   bool huge_page)
 {
-	if (!mshv_pfn_valid(region->mreg_pfns[pfn_offset]))
+	if (!mshv_pfn_valid(pfns[pfn_offset]))
 		return -EINVAL;
 
 	if (huge_page)
 		flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
 
 	return hv_call_modify_spa_host_access(region->partition->pt_id,
-					      region->mreg_pfns + pfn_offset,
+					      pfns + pfn_offset,
 					      pfn_count,
 					      HV_MAP_GPA_READABLE |
 					      HV_MAP_GPA_WRITABLE,
@@ -288,22 +294,24 @@ static int mshv_region_share(struct mshv_region *region)
 
 	return mshv_region_process_range(region, flags,
 					 0, region->nr_pfns,
+					 region->mreg_pfns,
 					 mshv_region_chunk_share);
 }
 
 static int mshv_region_chunk_unshare(struct mshv_region *region,
 				     u32 flags,
 				     u64 pfn_offset, u64 pfn_count,
+				     unsigned long *pfns,
 				     bool huge_page)
 {
-	if (!mshv_pfn_valid(region->mreg_pfns[pfn_offset]))
+	if (!mshv_pfn_valid(pfns[pfn_offset]))
 		return -EINVAL;
 
 	if (huge_page)
 		flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
 
 	return hv_call_modify_spa_host_access(region->partition->pt_id,
-					      region->mreg_pfns + pfn_offset,
+					      pfns + pfn_offset,
 					      pfn_count, 0,
 					      flags, false);
 }
@@ -314,12 +322,14 @@ static int mshv_region_unshare(struct mshv_region *region)
 
 	return mshv_region_process_range(region, flags,
 					 0, region->nr_pfns,
+					 region->mreg_pfns,
 					 mshv_region_chunk_unshare);
 }
 
 static int mshv_region_chunk_remap(struct mshv_region *region,
 				   u32 flags,
 				   u64 pfn_offset, u64 pfn_count,
+				   unsigned long *pfns,
 				   bool huge_page)
 {
 	/*
@@ -327,7 +337,7 @@ static int mshv_region_chunk_remap(struct mshv_region *region,
 	 * hypervisor track dirty pages, enabling precopy live
 	 * migration.
 	 */
-	if (!mshv_pfn_valid(region->mreg_pfns[pfn_offset]))
+	if (!mshv_pfn_valid(pfns[pfn_offset]))
 		flags = HV_MAP_GPA_NO_ACCESS;
 
 	if (huge_page)
@@ -336,15 +346,17 @@ static int mshv_region_chunk_remap(struct mshv_region *region,
 	return hv_call_map_ram_pfns(region->partition->pt_id,
 				    region->start_gfn + pfn_offset,
 				    pfn_count, flags,
-				    region->mreg_pfns + pfn_offset);
+				    pfns + pfn_offset);
 }
 
 static int mshv_region_remap_pfns(struct mshv_region *region,
 				  u32 map_flags,
-				  u64 pfn_offset, u64 pfn_count)
+				  u64 pfn_offset, u64 pfn_count,
+				  unsigned long *pfns)
 {
 	return mshv_region_process_range(region, map_flags,
 					 pfn_offset, pfn_count,
+					 pfns,
 					 mshv_region_chunk_remap);
 }
 
@@ -353,7 +365,8 @@ static int mshv_region_map(struct mshv_region *region)
 	u32 map_flags = region->hv_map_flags;
 
 	return mshv_region_remap_pfns(region, map_flags,
-				      0, region->nr_pfns);
+				      0, region->nr_pfns,
+				      region->mreg_pfns);
 }
 
 static void mshv_region_invalidate_pfns(struct mshv_region *region,
@@ -668,7 +681,8 @@ static int mshv_region_collect_and_map(struct mshv_region *region,
 	}
 
 	ret = mshv_region_remap_pfns(region, region->hv_map_flags,
-				     pfn_offset, pfn_count);
+				     pfn_offset, pfn_count,
+				     region->mreg_pfns);
 
 	mutex_unlock(&region->mreg_mutex);
 out:



^ permalink raw reply related

* [PATCH v4 5/6] mshv: Simplify pfn array handling in region processing
From: Stanislav Kinsburskii @ 2026-05-13 18:55 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, skinsburskii
  Cc: linux-hyperv, linux-kernel
In-Reply-To: <177869833161.19379.1357399549586915698.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

The current code requires passing both the full pfn array and an offset
parameter to region processing functions, forcing callees to manually
index into arrays. This approach is inflexible and makes it difficult
to work with different sources of pfn arrays.

Upcoming changes will need to pass pfn arrays obtained from the HMM
framework directly to these functions. The HMM framework returns arrays
that represent specific ranges rather than full region arrays with
offsets, making the current offset-based indexing pattern incompatible.

Refactor by having callers pass pre-offset pointers to pfn arrays and
removing offset-based indexing from callees. This allows functions to
work with any pfn array starting at index 0, regardless of its source,
and prepares the code for HMM integration.

No functional change intended.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_regions.c |   41 +++++++++++++++++++----------------------
 1 file changed, 19 insertions(+), 22 deletions(-)

diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
index 87204b2b48290..e20db61e9829f 100644
--- a/drivers/hv/mshv_regions.c
+++ b/drivers/hv/mshv_regions.c
@@ -99,14 +99,13 @@ static int mshv_chunk_stride(unsigned long pfn, u64 gfn, u64 pfn_count)
 
 /**
  * mshv_region_chunk_size - Length of the next contiguous PFN run in a region.
- * @region    : Memory region whose PFN array is being walked.
- * @pfn_offset: Offset into region->mreg_pfns at which to start.
- * @pfn_count : Upper bound on the run length.
- * @pfns      : Pointer to an array of PFNs corresponding to the region.
- * @huge_page : Out-parameter set to true if the run may be dispatched
+ * @gfn      : GFN corresponding to the start of the PFN run.
+ * @pfn_count: Upper bound on the run length.
+ * @pfns     : PFN array starting at the chunk's first PFN.
+ * @huge_page: Out-parameter set to true if the run may be dispatched
  *              as a 2 MiB chunk; false for 4 KiB-stride dispatch.
  *
- * Returns the length of the longest contiguous run starting at @pfn_offset
+ * Returns the length of the longest contiguous run starting at at @pfns[0]
  * that shares the classification of the first PFN: either a same-stride run of
  * valid PFNs (4 KiB or 2 MiB) or a hole of invalid PFNs. A hole that is
  * huge-page aligned in @gfn space and at least PTRS_PER_PMD entries long is
@@ -117,15 +116,11 @@ static int mshv_chunk_stride(unsigned long pfn, u64 gfn, u64 pfn_count)
  * Return: Length of the run in PFNs, or a negative errno from
  *         mshv_chunk_stride() if the backing folio order is unsupported.
  */
-static long mshv_region_chunk_size(struct mshv_region *region,
-				   u64 pfn_offset, u64 pfn_count,
+static long mshv_region_chunk_size(u64 gfn, u64 pfn_count,
 				   unsigned long *pfns, bool *huge_page)
 {
-	u64 gfn = region->start_gfn + pfn_offset;
 	u64 count = 0, stride;
 
-	pfns += pfn_offset;
-
 	if (!mshv_pfn_valid(pfns[0])) {
 		for (count = 1; count < pfn_count; count++) {
 			if (mshv_pfn_valid(pfns[count]))
@@ -162,7 +157,7 @@ static long mshv_region_chunk_size(struct mshv_region *region,
  * mshv_region_process_range - Processes a range of PFNs in a region.
  * @region    : Pointer to the memory region structure.
  * @flags     : Flags to pass to the handler.
- * @pfn_offset: Offset into the region's PFNs array to start processing.
+ * @pfn_offset: Offset into the region's PFN's array to start processing.
  * @pfn_count : Number of PFNs to process.
  * @pfns      : Pointer to an array of PFNs corresponding to the region.
  * @handler   : Callback function to handle each chunk of contiguous
@@ -183,6 +178,7 @@ static int mshv_region_process_range(struct mshv_region *region,
 				     unsigned long *pfns,
 				     pfn_handler_t handler)
 {
+	u64 gfn = region->start_gfn + pfn_offset;
 	u64 end;
 	long ret;
 
@@ -196,7 +192,7 @@ static int mshv_region_process_range(struct mshv_region *region,
 		bool huge_page;
 		long count;
 
-		count = mshv_region_chunk_size(region, pfn_offset, pfn_count,
+		count = mshv_region_chunk_size(gfn, pfn_count,
 					       pfns, &huge_page);
 		if (count < 0)
 			return count;
@@ -208,6 +204,8 @@ static int mshv_region_process_range(struct mshv_region *region,
 
 		pfn_offset += count;
 		pfn_count -= count;
+		pfns += count;
+		gfn += count;
 	}
 
 	return 0;
@@ -274,15 +272,14 @@ static int mshv_region_chunk_share(struct mshv_region *region,
 				   unsigned long *pfns,
 				   bool huge_page)
 {
-	if (!mshv_pfn_valid(pfns[pfn_offset]))
+	if (!mshv_pfn_valid(pfns[0]))
 		return -EINVAL;
 
 	if (huge_page)
 		flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
 
 	return hv_call_modify_spa_host_access(region->partition->pt_id,
-					      pfns + pfn_offset,
-					      pfn_count,
+					      pfns, pfn_count,
 					      HV_MAP_GPA_READABLE |
 					      HV_MAP_GPA_WRITABLE,
 					      flags, true);
@@ -304,15 +301,15 @@ static int mshv_region_chunk_unshare(struct mshv_region *region,
 				     unsigned long *pfns,
 				     bool huge_page)
 {
-	if (!mshv_pfn_valid(pfns[pfn_offset]))
+	if (!mshv_pfn_valid(pfns[0]))
 		return -EINVAL;
 
 	if (huge_page)
 		flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
 
 	return hv_call_modify_spa_host_access(region->partition->pt_id,
-					      pfns + pfn_offset,
-					      pfn_count, 0,
+					      pfns, pfn_count,
+					      0,
 					      flags, false);
 }
 
@@ -337,7 +334,7 @@ static int mshv_region_chunk_remap(struct mshv_region *region,
 	 * hypervisor track dirty pages, enabling precopy live
 	 * migration.
 	 */
-	if (!mshv_pfn_valid(pfns[pfn_offset]))
+	if (!mshv_pfn_valid(pfns[0]))
 		flags = HV_MAP_GPA_NO_ACCESS;
 
 	if (huge_page)
@@ -346,7 +343,7 @@ static int mshv_region_chunk_remap(struct mshv_region *region,
 	return hv_call_map_ram_pfns(region->partition->pt_id,
 				    region->start_gfn + pfn_offset,
 				    pfn_count, flags,
-				    pfns + pfn_offset);
+				    pfns);
 }
 
 static int mshv_region_remap_pfns(struct mshv_region *region,
@@ -682,7 +679,7 @@ static int mshv_region_collect_and_map(struct mshv_region *region,
 
 	ret = mshv_region_remap_pfns(region, region->hv_map_flags,
 				     pfn_offset, pfn_count,
-				     region->mreg_pfns);
+				     region->mreg_pfns + pfn_offset);
 
 	mutex_unlock(&region->mreg_mutex);
 out:



^ permalink raw reply related

* [PATCH v4 6/6] mshv: Allocate pfns array only for pinned regions
From: Stanislav Kinsburskii @ 2026-05-13 18:55 UTC (permalink / raw)
  To: kys, haiyangz, wei.liu, decui, longli, skinsburskii
  Cc: linux-hyperv, linux-kernel
In-Reply-To: <177869833161.19379.1357399549586915698.stgit@skinsburskii-cloud-desktop.internal.cloudapp.net>

The per-region pfns array is only used by pinned regions, where it records
the host PFNs returned by pin_user_pages() so they can later be unpinned,
shared, and unshared. Movable regions get their PFNs from HMM on every
fault, and MMIO regions need only a single base PFN. Keeping a 2 MiB-per-1
GiB flexible array around for region types that never read it is pure
overhead.

Convert mreg_pfns to a pointer and allocate it (via vmalloc_array, since it
can be large) only for MSHV_REGION_TYPE_MEM_PINNED. Place it in a union
with mreg_mmio_pfn so MMIO regions reuse the storage for the device base
PFN. Movable regions leave the pointer NULL.

With the flexible array gone, the struct itself becomes fixed-size and the
main allocation can move from vzalloc() to kzalloc().

Gate every mreg_pfns access on mreg_type == MSHV_REGION_TYPE_MEM_PINNED:
share/unshare/invalidate_pfns short-circuit for other types, and the
destructor frees the array only for pinned regions. A NULL check would not
be safe here because the union also stores MMIO regions' mmio_pfn, which is
typically non-zero.

The movable-region fault path no longer copies HMM-collected PFNs into
mreg_pfns; instead it post-processes the temporary HMM array in place
(stamping skipped slots with MSHV_INVALID_PFN) and hands it directly to the
remap helper. Movable regions are now stateless from the kernel's point of
view; the hypervisor's SLAT is the source of truth.

Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
 drivers/hv/mshv_regions.c |   77 +++++++++++++++++++++++----------------------
 drivers/hv/mshv_root.h    |    7 ++--
 2 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
index e20db61e9829f..a4bfec9279ede 100644
--- a/drivers/hv/mshv_regions.c
+++ b/drivers/hv/mshv_regions.c
@@ -42,8 +42,8 @@ static inline bool mshv_pfn_valid(unsigned long pfn)
 	return pfn != MSHV_INVALID_PFN;
 }
 
-static void mshv_region_init_pfns_range(struct mshv_region *region,
-					u64 pfn_offset, u64 pfn_count)
+static void mshv_region_init_pfns(struct mshv_region *region,
+				  u64 pfn_offset, u64 pfn_count)
 {
 	u64 i;
 
@@ -51,11 +51,6 @@ static void mshv_region_init_pfns_range(struct mshv_region *region,
 		region->mreg_pfns[i] = MSHV_INVALID_PFN;
 }
 
-void mshv_region_init_pfns(struct mshv_region *region)
-{
-	mshv_region_init_pfns_range(region, 0, region->nr_pfns);
-}
-
 /**
  * mshv_chunk_stride - Compute stride for mapping guest memory
  * @pfn      : The PFN to check for huge page backing
@@ -220,7 +215,7 @@ struct mshv_region *mshv_region_create(struct mshv_partition *partition,
 	struct mshv_region *region;
 	int ret = 0;
 
-	region = vzalloc(struct_size(region, mreg_pfns, nr_pfns));
+	region = kzalloc_obj(struct mshv_region);
 	if (!region)
 		return ERR_PTR(-ENOMEM);
 
@@ -235,8 +230,6 @@ struct mshv_region *mshv_region_create(struct mshv_partition *partition,
 	if (flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE))
 		region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE;
 
-	mshv_region_init_pfns(region);
-
 	mutex_init(&region->mreg_mutex);
 	kref_init(&region->mreg_refcount);
 
@@ -248,6 +241,12 @@ struct mshv_region *mshv_region_create(struct mshv_partition *partition,
 						   &mshv_region_mni_ops);
 		break;
 	case MSHV_REGION_TYPE_MEM_PINNED:
+		region->mreg_pfns = vmalloc_array(nr_pfns, sizeof(unsigned long));
+		if (!region->mreg_pfns) {
+			ret = -ENOMEM;
+			break;
+		}
+		mshv_region_init_pfns(region, 0, region->nr_pfns);
 		break;
 	case MSHV_REGION_TYPE_MMIO:
 		region->mreg_mmio_pfn = mmio_pfn;
@@ -262,7 +261,7 @@ struct mshv_region *mshv_region_create(struct mshv_partition *partition,
 	return region;
 
 free_region:
-	vfree(region);
+	kfree(region);
 	return ERR_PTR(ret);
 }
 
@@ -289,6 +288,9 @@ static int mshv_region_share(struct mshv_region *region)
 {
 	u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED;
 
+	if (region->mreg_type != MSHV_REGION_TYPE_MEM_PINNED)
+		return -EINVAL;
+
 	return mshv_region_process_range(region, flags,
 					 0, region->nr_pfns,
 					 region->mreg_pfns,
@@ -317,6 +319,9 @@ static int mshv_region_unshare(struct mshv_region *region)
 {
 	u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE;
 
+	if (region->mreg_type != MSHV_REGION_TYPE_MEM_PINNED)
+		return -EINVAL;
+
 	return mshv_region_process_range(region, flags,
 					 0, region->nr_pfns,
 					 region->mreg_pfns,
@@ -357,29 +362,20 @@ static int mshv_region_remap_pfns(struct mshv_region *region,
 					 mshv_region_chunk_remap);
 }
 
-static int mshv_region_map(struct mshv_region *region)
-{
-	u32 map_flags = region->hv_map_flags;
-
-	return mshv_region_remap_pfns(region, map_flags,
-				      0, region->nr_pfns,
-				      region->mreg_pfns);
-}
-
 static void mshv_region_invalidate_pfns(struct mshv_region *region,
 					u64 pfn_offset, u64 pfn_count)
 {
 	u64 i;
 
-	for (i = pfn_offset; i < pfn_offset + pfn_count; i++) {
-		if (!mshv_pfn_valid(region->mreg_pfns[i]))
-			continue;
+	if (region->mreg_type != MSHV_REGION_TYPE_MEM_PINNED)
+		return;
 
-		if (region->mreg_type == MSHV_REGION_TYPE_MEM_PINNED)
+	for (i = pfn_offset; i < pfn_offset + pfn_count; i++) {
+		if (mshv_pfn_valid(region->mreg_pfns[i]))
 			unpin_user_page(pfn_to_page(region->mreg_pfns[i]));
 	}
 
-	mshv_region_init_pfns_range(region, pfn_offset, pfn_count);
+	mshv_region_init_pfns(region, pfn_offset, pfn_count);
 }
 
 static void mshv_region_invalidate(struct mshv_region *region)
@@ -516,7 +512,9 @@ static void mshv_region_destroy(struct kref *ref)
 
 	mshv_region_invalidate(region);
 
-	vfree(region);
+	if (region->mreg_type == MSHV_REGION_TYPE_MEM_PINNED)
+		vfree(region->mreg_pfns);
+	kfree(region);
 }
 
 void mshv_region_put(struct mshv_region *region)
@@ -634,10 +632,9 @@ static int mshv_region_hmm_fault_and_lock(struct mshv_region *region,
  *   leaving missing pages as invalid PFN markers.
  *   Used for initial region setup.
  *
- * Collected PFNs are stored in region->mreg_pfns[] with HMM bookkeeping
- * flags cleared, then the range is mapped into the hypervisor. Present
- * PFNs get mapped with region access permissions; missing PFNs (invalid
- * entries) get mapped with no-access permissions.
+ * HMM bookkeeping flags are stripped from collected PFNs before mapping.
+ * Present PFNs get mapped with region access permissions; missing PFNs
+ * (marked as MSHV_INVALID_PFN) get mapped with no-access permissions.
  *
  * Return: 0 on success, negative errno on failure.
  */
@@ -666,20 +663,24 @@ static int mshv_region_collect_and_map(struct mshv_region *region,
 		goto out;
 
 	for (i = 0; i < pfn_count; i++) {
-		if (!(pfns[i] & HMM_PFN_VALID))
+		if (!(pfns[i] & HMM_PFN_VALID)) {
+			pfns[i] = MSHV_INVALID_PFN;
 			continue;
+		}
 		/* Skip read-only pages to avoid bypassing COW */
 		if (!do_fault &&
 		    (region->hv_map_flags & HV_MAP_GPA_WRITABLE) &&
-		    !(pfns[i] & HMM_PFN_WRITE))
+		    !(pfns[i] & HMM_PFN_WRITE)) {
+			pfns[i] = MSHV_INVALID_PFN;
 			continue;
+		}
 		/* Drop HMM_PFN_* flags to ensure PFNs are valid. */
-		region->mreg_pfns[pfn_offset + i] = pfns[i] & ~HMM_PFN_FLAGS;
+		pfns[i] &= ~HMM_PFN_FLAGS;
 	}
 
 	ret = mshv_region_remap_pfns(region, region->hv_map_flags,
 				     pfn_offset, pfn_count,
-				     region->mreg_pfns + pfn_offset);
+				     pfns);
 
 	mutex_unlock(&region->mreg_mutex);
 out:
@@ -781,8 +782,6 @@ static bool mshv_region_interval_invalidate(struct mmu_interval_notifier *mni,
 	if (ret)
 		goto out_unlock;
 
-	mshv_region_invalidate_pfns(region, pfn_offset, pfn_count);
-
 	mutex_unlock(&region->mreg_mutex);
 
 	return true;
@@ -845,7 +844,9 @@ static int mshv_map_pinned_region(struct mshv_region *region)
 		}
 	}
 
-	ret = mshv_region_map(region);
+	ret = mshv_region_remap_pfns(region, region->hv_map_flags,
+				     0, region->nr_pfns,
+				     region->mreg_pfns);
 	if (ret)
 		goto share_region;
 
@@ -869,7 +870,7 @@ static int mshv_map_pinned_region(struct mshv_region *region)
 		 * is intentional; unpinning host-inaccessible pages would be
 		 * unsafe).
 		 */
-		mshv_region_init_pfns(region);
+		mshv_region_init_pfns(region, 0, region->nr_pfns);
 		goto err_out;
 	}
 err_out:
diff --git a/drivers/hv/mshv_root.h b/drivers/hv/mshv_root.h
index e9bd18013b486..d79dfaac88af9 100644
--- a/drivers/hv/mshv_root.h
+++ b/drivers/hv/mshv_root.h
@@ -93,8 +93,10 @@ struct mshv_region {
 	enum mshv_region_type mreg_type;
 	struct mmu_interval_notifier mreg_mni;
 	struct mutex mreg_mutex;	/* protects region PFNs remapping */
-	u64 mreg_mmio_pfn;
-	unsigned long mreg_pfns[];
+	union {
+		unsigned long *mreg_pfns;
+		u64 mreg_mmio_pfn;
+	};
 };
 
 struct mshv_irq_ack_notifier {
@@ -375,7 +377,6 @@ struct mshv_region *mshv_region_create(struct mshv_partition *partition,
 				       u64 guest_pfn, u64 nr_pfns,
 				       u64 uaddr, u32 flags,
 				       unsigned long mmio_pfn);
-void mshv_region_init_pfns(struct mshv_region *region);
 void mshv_region_put(struct mshv_region *region);
 int mshv_region_get(struct mshv_region *region);
 bool mshv_region_handle_gfn_fault(struct mshv_region *region, u64 gfn);



^ permalink raw reply related

* Re: [EXTERNAL] Re: [PATCH net-next] net: mana: Add handler for sriov configure
From: Bjorn Helgaas @ 2026-05-13 19:05 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Haiyang Zhang, Haiyang Zhang, Paul Rosswurm,
	linux-hyperv@vger.kernel.org, netdev@vger.kernel.org,
	KY Srinivasan, Wei Liu, Dexuan Cui, Long Li, Andrew Lunn,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Bjorn Helgaas, Simon Horman, Shradha Gupta, Dipayaan Roy,
	Erni Sri Satya Vennela, linux-kernel@vger.kernel.org,
	linux-pci@vger.kernel.org
In-Reply-To: <20260513184749.GI15586@unreal>

On Wed, May 13, 2026 at 09:47:49PM +0300, Leon Romanovsky wrote:
> On Fri, May 08, 2026 at 06:10:29PM -0500, Bjorn Helgaas wrote:
> > On Fri, May 08, 2026 at 10:47:14PM +0000, Haiyang Zhang wrote:
> > > > On Fri, May 08, 2026 at 03:04:06PM -0700, Haiyang Zhang wrote:
> > > > > From: Haiyang Zhang <haiyangz@microsoft.com>
> > > > >
> > > > > Add callback function for the pci_driver, sriov_configure.
> > > > >
> > > > > Also disable VF autoprobe when it runs as PF driver on bare metal,
> > > > > since the hardware side may not have the VF ready immediately.
> > > > >
> > > > > Export pci_vf_drivers_autoprobe() so the driver can toggle the VF
> > > > > autoprobe flag.
> > > > 
> > > > Technically pci_vf_drivers_autoprobe() doesn't *toggle* the autoprobe
> > > > flag.  That would mean setting it to the opposite of its current
> > > > value.
> > > > 
> > > > Here I would say "so the driver can prevent autoprobing of the VFs",
> > > > which is the intent.
> > > Thanks, I will change the wording.
> > > 
> > > > 
> > > > Out of curiosity, how do the VFs eventually get probed?  I guess
> > > > there's some other mechanism that tells you when they're ready, and
> > > > you manually use sysfs 'sriov_drivers_autoprobe' to enable probing,
> > > > then bind drivers to them via sysfs?
> > > We have a user program talking to the Azure backplane to get that information.
> > > @Paul Rosswurm, do you have more details?
> > > 
> > > 
> > > > The prevention of autoprobing sounds like a critical part of this
> > > > change; might be worth saying something in the subject, because "add
> > > > sriov configure" doesn't include much information.
> > > How about "Add handler for sriov configure with VF autoprobe off"?
> > 
> > OK by me :)
> 
> I believe it is the wrong decision to allow toggling a user‑visible knob  
> without the user’s awareness. In this case, they can either disable  
> autoprobe on the PF or rely on EPROBE_DEFER. In all cases, the same
> functionality can be achieved without changing PCI autoprobe code.

OK, Haiyang, can you drop my ack please?  If Leon's solutions don't
work for you, continue this conversation and we can explore
alternatives.

Bjorn

^ permalink raw reply

* Re: [PATCH v3 06/10] RDMA/qedr: Replace qedr_ib_copy_to_udata() with ib_respond_udata()
From: sashiko-bot @ 2026-05-13 19:07 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: linux-hyperv
In-Reply-To: <6-v3-4effdebad75a+e1-rdma_udata_rep_jgg@nvidia.com>

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [High] In `qedr_create_user_qp`, the error code from `qedr_copy_qp_uresp` is overwritten by the return value of `rdma_destroy_qp` in the error cleanup path, causing a false success return.
--

commit 5dc8fd66d59366118ca19c878df5216ffd4a4a52
Author: Jason Gunthorpe <jgg@ziepe.ca>

RDMA/qedr: Replace qedr_ib_copy_to_udata() with ib_respond_udata()

This replaces qedr_ib_copy_to_udata() with ib_respond_udata() to handle
copying data to user-space, removing the local inline wrapper.

> diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
> index 3b86ea1cf8888..79190c5b8b50b 100644
> --- a/drivers/infiniband/hw/qedr/verbs.c
> +++ b/drivers/infiniband/hw/qedr/verbs.c
[ ... ]
> @@ -1311,13 +1294,7 @@ static int qedr_copy_qp_uresp(struct qedr_dev *dev,
>  	uresp->atomic_supported = dev->atomic_cap != IB_ATOMIC_NONE;
>  	uresp->qp_id = qp->qp_id;
>  
> -	rc = qedr_ib_copy_to_udata(udata, uresp, sizeof(*uresp));
> -	if (rc)
> -		DP_ERR(dev,
> -		       "create qp: failed a copy to user space with qp icid=0x%x.\n",
> -		       qp->icid);
> -
> -	return rc;
> +	return ib_respond_udata(udata, *uresp);

This isn't a bug introduced by your patch, but does the error handling
in qedr_create_user_qp() overwrite the return value of this function?

If qedr_copy_qp_uresp() or subsequent qedr_db_recovery_add() calls fail,
the function branches to the err label:

drivers/infiniband/hw/qedr/verbs.c:qedr_create_user_qp() {
    ...
	if (udata) {
		rc = qedr_copy_qp_uresp(dev, qp, udata, &uresp);
		if (rc)
			goto err;
	}
    ...
err:
	rc = dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp);
	if (rc)
		DP_ERR(dev, "create qp: fatal fault. rc=%d", rc);

err1:
	qedr_cleanup_user(dev, ctx, qp);
	return rc;
}

Since destroying the newly allocated QP typically succeeds and returns 0,
would this unconditionally overwrite the original error code in rc, returning
a false success to user-space?

If user-space subsequently attempts to post work requests to this destroyed
QP, could it lead to hardware undefined behavior, PCIe errors, or system
crashes?

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/0-v3-4effdebad75a+e1-rdma_udata_rep_jgg@nvidia.com?part=6

^ permalink raw reply

* Re: [PATCH V3 10/11] mshv: Populate mmio mappings for PCI passthru
From: sashiko-bot @ 2026-05-13 19:23 UTC (permalink / raw)
  To: Mukesh R; +Cc: linux-pci, linux-hyperv
In-Reply-To: <20260512020259.1678627-11-mrathor@linux.microsoft.com>

Thank you for your contribution! Sashiko AI review found 5 potential issue(s) to consider:
- [Critical] Unvalidated physical memory mapping and integer overflow allow mapping arbitrary host RAM.
- [Critical] Missing MMU notifier registration and TOCTOU race in MMIO fault handling.
- [High] Architecture page size mismatch corrupts physical address mappings on ARM64.
- [High] Hardcoded write permissions bypass host read-only restrictions for MMIO regions.
- [Medium] Uninitialized stack variable `pfnmap_args`.
--

Note: The format of this report is altered to provide a free-form summary of
the findings, rather than using traditional inline diff quotes, to adhere to
recitation restrictions.

commit f5a60b27c18f019e0a418a3f28dcff85e1128ed4
Author: Mukesh R <mrathor@linux.microsoft.com>
Subject: mshv: Populate mmio mappings for PCI passthru

This commit adds support for mapping mmio memory to the guest upon receiving
an unmapped guest physical address intercept. It looks up the physical frame
number from the driver and asks the hypervisor to establish the map.

While reviewing this code, a few questions came up regarding the handling of
the memory mappings.

When the memory region is created from the user memory structure, it appears
only the start address is validated against the virtual memory area. The page
count is taken directly from user input. Later, in the intercept handler, a
signed 32-bit integer is used to loop over the 64-bit page count. If a user
provides an excessively large size, could the loop counter overflow to a
negative value and prematurely terminate the validation loop? Would this
bypass the ram checks and allow mapping arbitrary host memory into the
guest, while also potentially causing a lockup without a scheduling point?

Is there a race condition when fetching the physical frame number? The memory
map read lock is dropped after resolving the address, and the hypercall is
made without holding it. Since there does not seem to be a memory management
unit interval notifier sequence check here, what happens if the host virtual
memory area is unmapped or modified concurrently? Could the guest retain
access to stale or reallocated host memory?

Does the physical frame number calculation work correctly on architectures
with varying page sizes? On arm64, the kernel page size might be 64KB,
meaning the value returned by the pfnmap functions is based on 64KB units.
However, the handler subtracts a 4KB-based offset from it. Will this
result in an incorrect base physical address being passed to the
hypervisor?

Will this logic handle read-only device memory properly? The code
unconditionally requests a write fault resolution and then hardcodes the
writable flag for the hypervisor map call. If userspace maps a read-only
memory region, will the fault fixup fail? If it does succeed, would the
guest be granted write access to a read-only region?

Does the pfnmap arguments structure need zero-initialization? The structure
is declared on the stack and only the virtual memory area and address
fields are explicitly populated. Could uninitialized padding or other
structure members contain stack garbage that might affect the pfnmap
operations?

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260512020259.1678627-1-mrathor@linux.microsoft.com?part=10

^ permalink raw reply

* Re: [PATCH V3 11/11] mshv: Mark mem regions as non-movable upfront if device passthru
From: sashiko-bot @ 2026-05-13 20:00 UTC (permalink / raw)
  To: Mukesh R; +Cc: linux-pci, linux-hyperv
In-Reply-To: <20260512020259.1678627-12-mrathor@linux.microsoft.com>

Thank you for your contribution! Sashiko AI review found 3 potential issue(s) to consider:
- [Critical] Failing `copy_to_user` during device creation bypasses the `pt_regions_pinned` enforcement while leaving the device fully accessible.
- [High] Device creation does not validate or pin pre-existing memory regions, allowing an unsafe mix of movable regions and device passthrough.
- [High] Missing synchronization between device creation and region creation allows concurrent bypass of the pinning constraint.
--

commit add8cd40c15400cf2b19c8993ce14d4b4a7adef6
Author: Mukesh R <mrathor@linux.microsoft.com>

mshv: Mark mem regions as non-movable upfront if device passthru

This commit ensures that if a VM is started with a device attached, its
memory regions are marked as non-movable upfront to allow the use of SLAT
for IOMMU. It also adds a boot parameter to disable movable pages globally.

> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> index a7864463961bc..ac71534733bd8 100644
> --- a/drivers/hv/mshv_root_main.c
> +++ b/drivers/hv/mshv_root_main.c

[ ... ]

> @@ -1303,6 +1307,12 @@ static void mshv_async_hvcall_handler(void *data, u64 *status)
>  	*status = partition->async_hypercall_status;
>  }
>  
> +static bool mshv_do_pt_regions_pinned(struct mshv_partition *pt)
> +{
> +	return pt->pt_regions_pinned || mshv_partition_encrypted(pt) ||
> +	       hv_no_movbl_pgs;
> +}

Is there a potential race condition here between device creation and region
creation?

pt->pt_regions_pinned is written in mshv_partition_ioctl_create_device()
without holding any locks. Concurrently, mshv_partition_create_region()
reads this flag via mshv_do_pt_regions_pinned() without synchronization.

If a region is created while a concurrent device creation thread is about to
set pt_regions_pinned to true, could the region incorrectly be instantiated
as movable while a passthrough device is active?

[ ... ]

> @@ -1808,6 +1818,9 @@ static long mshv_partition_ioctl_create_device(struct mshv_partition *partition,
>  	if (copy_to_user(uarg, &devargk, sizeof(devargk)))
>  		return -EFAULT;    /* cleanup in mshv_device_fop_release() */
>  
> +	/* For now, all regions must be pinned if there is device passthru. */
> +	partition->pt_regions_pinned = true;

If the copy_to_user() fails, the function returns early and skips setting
pt_regions_pinned to true. However, anon_inode_getfd() has already installed
the file descriptor in the process's file descriptor table, meaning the
device remains fully accessible. Can this allow userspace to bypass the
pt_regions_pinned enforcement?

Additionally, when a device is attached, this sets pt_regions_pinned to true
for future memory region creations. Does this handle the case where a
partition already has existing memory regions?

If userspace creates movable regions first and then attaches a device, is
there a risk that page migration could cause the device to DMA into incorrect
physical memory?

>  	return 0;
>  
>  undo_out:

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260512020259.1678627-1-mrathor@linux.microsoft.com?part=11

^ permalink raw reply

* Re: [PATCH v3 10/10] RDMA: Replace memset with = {} pattern for ib_respond_udata()
From: sashiko-bot @ 2026-05-13 20:59 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: linux-hyperv
In-Reply-To: <10-v3-4effdebad75a+e1-rdma_udata_rep_jgg@nvidia.com>

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [High] Removal of `memset` in `qedr_copy_qp_uresp` introduces a kernel stack information leak to user-space due to uninitialized struct padding bytes.
--

commit 5d1e218c098790b95c443563dc2cc0974927c81a
Author: Jason Gunthorpe <jgg@ziepe.ca>

RDMA: Replace memset with = {} pattern for ib_respond_udata()

This patch replaces multiple instances of memset() used for zeroing
response structures with the aggregate initialization pattern = {}. It
also removes a redundant memset() in qedr_copy_qp_uresp().

> diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
> index 79190c5b8b50b..1af908275ca72 100644
> --- a/drivers/infiniband/hw/qedr/verbs.c
> +++ b/drivers/infiniband/hw/qedr/verbs.c
[ ... ]
> @@ -1283,8 +1281,6 @@ static int qedr_copy_qp_uresp(struct qedr_dev *dev,
>  			      struct qedr_qp *qp, struct ib_udata *udata,
>  			      struct qedr_create_qp_uresp *uresp)
>  {
> -	memset(uresp, 0, sizeof(*uresp));
> -
>  	if (qedr_qp_has_sq(qp))
>  		qedr_copy_sq_uresp(dev, uresp, qp);

Does this change expose kernel stack memory to userspace?

The commit message notes that qedr_copy_qp_uresp() is already called with
zeroed memory. If the caller is using aggregate initialization (= {}) on
the stack, this may not completely zero the memory.

Unlike memset(), the C standard does not guarantee that = {} will zero out
implicit padding bytes. The compiler might optimize the initialization by
only zeroing the named fields.

Looking at struct qedr_create_qp_uresp, there appear to be 4 bytes of
implicit padding due to field alignment:

include/uapi/rdma/qedr-abi.h:struct qedr_create_qp_uresp {
	__u32 qp_id;
	__u32 atomic_supported;

	/* SQ */
	__u32 sq_db_offset;
	__u16 sq_icid;
        <--- 2 bytes of padding here to align rq_db_offset

	/* RQ */
	__u32 rq_db_offset;
	__u16 rq_icid;
        <--- 2 bytes of padding here to align rq_db2_offset

	__u32 rq_db2_offset;
	__u32 reserved;
...

Since ib_respond_udata() copies sizeof(*uresp) to userspace, could the
removal of memset() cause residual stack data in these padding holes to be
leaked?

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/0-v3-4effdebad75a+e1-rdma_udata_rep_jgg@nvidia.com?part=10

^ permalink raw reply

* [PATCH net-next v9 0/6] net: mana: Per-vPort EQ and MSI-X interrupt management
From: Long Li @ 2026-05-13 22:09 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui, shradhagupta
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel

This series moves EQ ownership from the shared mana_context to per-vPort
mana_port_context, enabling each vPort to have dedicated MSI-X vectors
when the hardware provides enough vectors. When vectors are limited, the
driver falls back to sharing MSI-X among vPorts.

The series introduces a GDMA IRQ Context (GIC) abstraction with reference
counting to manage interrupt context lifecycle. This allows both Ethernet
and RDMA EQs to dynamically acquire dedicated or shared MSI-X vectors at
vPort creation time rather than pre-allocating all vectors at probe time.

This series touches both the net and RDMA MANA drivers and is intended
to go through the net-next tree. The patches are available on a shared
branch for both netdev and RDMA maintainers to review.

The following changes since commit 73d587ae684d176fac9db94173f77d78a794ea4f:

  net: ethtool: fix missing closing paren in rings_reply_size() (2026-05-11 18:42:25 -0700)

are available in the Git repository at:

  https://github.com/longlimsft/linux.git tags/mana-eq-msi-v9

for you to fetch changes up to 8249f52c3a065d92d24f27ab12c0b4d197ba14c4:

  RDMA/mana_ib: Allocate interrupt contexts on EQs

Changes in v9:
- RSS QPs now take a vport reference via pd->vport_use_count to ensure
  EQs outlive all QP consumers. EQs are only destroyed when the last
  QP (raw or RSS) on the PD releases its reference (patch 1)
- Serialize mana_set_channels() against RDMA vport configuration via
  apc->vport_mutex when the port is down. When the port is up, Ethernet
  owns the vport exclusively so no locking is needed (patch 1)
- Change WARN_ON(apc->eqs) to bail out with -EEXIST to prevent
  leaking prior EQ array if invariant is violated (patch 1)
- Only commit pd->tx_shortform_allowed and pd->tx_vp_offset after
  mana_create_eq() succeeds (patch 1)
- Reset gc->msi_sharing at the top of mana_gd_query_max_resources()
  so it is recomputed from current hardware state on resume (patch 2)
- Fix reverse Christmas tree variable declaration ordering (patches
  1, 3, 5)

Changes in v8:
- Fix comment to reference per-vPort queue count instead of
  gc->max_num_queues (patch 2)
- Remove duplicate irq_update_affinity_hint() calls from error paths
  and mana_gd_remove_irqs(); the clearing is now centralized in
  mana_gd_put_gic() (patch 4)
- Note the IRQ name change (mana_q -> mana_msi) in the commit
  message (patch 4)
- Remove dead conditional write to spec.eq.msix_index (patch 5)
- Document GIC ownership contract and msix_index invariant change
  in commit message (patch 5)
- Populate eq.irq on RDMA EQs for consistency with the Ethernet
  path (patch 6)
- Document BIT(6) relocation and capability flag semantics in
  commit message (patch 6)
- Fix checkpatch --strict alignment and line length warnings

Changes in v7:
- Use rounddown_pow_of_two() instead of roundup_pow_of_two() when
  computing per-vPort queue count to avoid unnecessarily forcing shared
  MSI-X mode (patch 2)
- Call mana_gd_setup_remaining_irqs() unconditionally to ensure
  irq_contexts are populated in both dedicated and shared MSI-X modes,
  fixing bisectability between patches 2 and 5 (patch 2)
- Guard ibdev_dbg() in mana_ib_cfg_vport() with error check so the
  vport handle is not logged on the failure path (patch 1)
- Use cached gic->irq instead of pci_irq_vector() lookup in
  mana_gd_put_gic() for consistency with the allocation path (patch 3)
- Fix unsigned int* to int* pointer type mismatch when calling
  mana_gd_get_gic() by using a local int variable for the MSI index
  (patches 5, 6)

Changes in v6:
- Rebased on net-next/main (v7.1-rc1)

Changes in v5:
- Rebased on net-next/main

Changes in v4:
- Rebased on net-next/main 7.0-rc4
- Patch 2: Use MANA_DEF_NUM_QUEUES instead of hardcoded 16 for
  max_num_queues clamping
- Patch 3: Track dyn_msix in GIC context instead of re-checking
  pci_msix_can_alloc_dyn() on each call; improved remove_irqs iteration
  to skip unallocated entries

Changes in v3:
- Rebased on net-next/main
- Patch 1: Added NULL check for mpc->eqs in mana_ib_create_qp_rss() to
  prevent NULL pointer dereference when RSS QP is created before a raw QP
  has configured the vport and allocated EQs

Changes in v2:
- Rebased on net-next/main (adapted to kzalloc_objs/kzalloc_obj macros,
  new GDMA_DRV_CAP_FLAG definitions)
- Patch 2: Fixed misleading comment for max_num_queues vs
  max_num_queues_vport in gdma.h
- Patch 3: Fixed spelling typo in gdma_main.c ("difference" -> "different")

Long Li (6):
  net: mana: Create separate EQs for each vPort
  net: mana: Query device capabilities and configure MSI-X sharing for
    EQs
  net: mana: Introduce GIC context with refcounting for interrupt
    management
  net: mana: Use GIC functions to allocate global EQs
  net: mana: Allocate interrupt context for each EQ when creating vPort
  RDMA/mana_ib: Allocate interrupt contexts on EQs

 drivers/infiniband/hw/mana/main.c             |  67 +++-
 drivers/infiniband/hw/mana/qp.c               |  37 +-
 .../net/ethernet/microsoft/mana/gdma_main.c   | 323 +++++++++++++-----
 drivers/net/ethernet/microsoft/mana/mana_en.c | 170 +++++----
 .../ethernet/microsoft/mana/mana_ethtool.c    |  27 +-
 include/net/mana/gdma.h                       |  33 +-
 include/net/mana/mana.h                       |   7 +-
 7 files changed, 488 insertions(+), 176 deletions(-)

-- 
2.43.0

^ permalink raw reply

* [PATCH net-next v9 1/6] net: mana: Create separate EQs for each vPort
From: Long Li @ 2026-05-13 22:09 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui, shradhagupta
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260513220956.402058-1-longli@microsoft.com>

To prepare for assigning vPorts to dedicated MSI-X vectors, remove EQ
sharing among the vPorts and create dedicated EQs for each vPort.

Move the EQ definition from struct mana_context to struct mana_port_context
and update related support functions. Export mana_create_eq() and
mana_destroy_eq() for use by the MANA RDMA driver.

RSS QPs now take a vport reference via pd->vport_use_count to ensure
EQs outlive all QP consumers. The vport must already be configured by
a raw QP before an RSS QP can be created. EQs are only destroyed when
the last QP (raw or RSS) on the PD releases its reference.

Serialize mana_set_channels() against RDMA vport configuration to
prevent num_queues from changing while RDMA holds EQs sized to the
current value. When the port is down, apc->vport_mutex is held for
the entire operation since mana_detach()/mana_attach() do not take
vport_mutex in that case. When the port is up, Ethernet owns the
vport exclusively so no additional locking is needed.

Signed-off-by: Long Li <longli@microsoft.com>
---
 drivers/infiniband/hw/mana/main.c             |  24 ++--
 drivers/infiniband/hw/mana/qp.c               |  37 +++++-
 drivers/net/ethernet/microsoft/mana/mana_en.c | 112 +++++++++++-------
 .../ethernet/microsoft/mana/mana_ethtool.c    |  27 ++++-
 include/net/mana/mana.h                       |   7 +-
 5 files changed, 145 insertions(+), 62 deletions(-)

diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
index ac5e75dd3494..6159bd03a021 100644
--- a/drivers/infiniband/hw/mana/main.c
+++ b/drivers/infiniband/hw/mana/main.c
@@ -20,8 +20,10 @@ void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd,
 	pd->vport_use_count--;
 	WARN_ON(pd->vport_use_count < 0);
 
-	if (!pd->vport_use_count)
+	if (!pd->vport_use_count) {
+		mana_destroy_eq(mpc);
 		mana_uncfg_vport(mpc);
+	}
 
 	mutex_unlock(&pd->vport_mutex);
 }
@@ -55,15 +57,23 @@ int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd,
 		return err;
 	}
 
-	mutex_unlock(&pd->vport_mutex);
 
-	pd->tx_shortform_allowed = mpc->tx_shortform_allowed;
-	pd->tx_vp_offset = mpc->tx_vp_offset;
+	err = mana_create_eq(mpc);
+	if (err) {
+		mana_uncfg_vport(mpc);
+		pd->vport_use_count--;
+	} else {
+		pd->tx_shortform_allowed = mpc->tx_shortform_allowed;
+		pd->tx_vp_offset = mpc->tx_vp_offset;
+	}
 
-	ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n",
-		  mpc->port_handle, pd->pdn, doorbell_id);
+	mutex_unlock(&pd->vport_mutex);
 
-	return 0;
+	if (!err)
+		ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n",
+			  mpc->port_handle, pd->pdn, doorbell_id);
+
+	return err;
 }
 
 int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c
index 0fbcf449c134..108ec4c5ce51 100644
--- a/drivers/infiniband/hw/mana/qp.c
+++ b/drivers/infiniband/hw/mana/qp.c
@@ -79,6 +79,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
 				 struct ib_qp_init_attr *attr,
 				 struct ib_udata *udata)
 {
+	struct mana_ib_pd *mana_pd = container_of(pd, struct mana_ib_pd, ibpd);
 	struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp);
 	struct mana_ib_dev *mdev =
 		container_of(pd->device, struct mana_ib_dev, ib_dev);
@@ -155,6 +156,18 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
 
 	qp->port = port;
 
+	/* Take a reference on the vport to ensure EQs outlive this QP.
+	 * The vport must already be configured by a raw QP.
+	 */
+	mutex_lock(&mana_pd->vport_mutex);
+	if (!mana_pd->vport_use_count) {
+		mutex_unlock(&mana_pd->vport_mutex);
+		ret = -EINVAL;
+		goto fail;
+	}
+	mana_pd->vport_use_count++;
+	mutex_unlock(&mana_pd->vport_mutex);
+
 	for (i = 0; i < ind_tbl_size; i++) {
 		struct mana_obj_spec wq_spec = {};
 		struct mana_obj_spec cq_spec = {};
@@ -171,13 +184,13 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
 		cq_spec.gdma_region = cq->queue.gdma_region;
 		cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE;
 		cq_spec.modr_ctx_id = 0;
-		eq = &mpc->ac->eqs[cq->comp_vector];
+		eq = &mpc->eqs[cq->comp_vector % mpc->num_queues];
 		cq_spec.attached_eq = eq->eq->id;
 
 		ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ,
 					 &wq_spec, &cq_spec, &wq->rx_object);
 		if (ret)
-			goto fail;
+			goto free_vport;
 
 		/* The GDMA regions are now owned by the WQ object */
 		wq->queue.gdma_region = GDMA_INVALID_DMA_REGION;
@@ -199,7 +212,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
 		ret = mana_ib_install_cq_cb(mdev, cq);
 		if (ret) {
 			mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object);
-			goto fail;
+			goto free_vport;
 		}
 	}
 	resp.num_entries = i;
@@ -210,7 +223,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
 					 ucmd.rx_hash_key_len,
 					 ucmd.rx_hash_key);
 	if (ret)
-		goto fail;
+		goto free_vport;
 
 	ret = ib_copy_to_udata(udata, &resp, sizeof(resp));
 	if (ret) {
@@ -226,7 +239,7 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
 
 err_disable_vport_rx:
 	mana_disable_vport_rx(mpc);
-fail:
+free_vport:
 	while (i-- > 0) {
 		ibwq = ind_tbl->ind_tbl[i];
 		ibcq = ibwq->cq;
@@ -237,6 +250,9 @@ static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
 		mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object);
 	}
 
+	mana_ib_uncfg_vport(mdev, mana_pd, port);
+
+fail:
 	kfree(mana_ind_table);
 
 	return ret;
@@ -321,7 +337,11 @@ static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd,
 	cq_spec.queue_size = send_cq->cqe * COMP_ENTRY_SIZE;
 	cq_spec.modr_ctx_id = 0;
 	eq_vec = send_cq->comp_vector;
-	eq = &mpc->ac->eqs[eq_vec];
+	if (!mpc->eqs) {
+		err = -EINVAL;
+		goto err_destroy_queue;
+	}
+	eq = &mpc->eqs[eq_vec % mpc->num_queues];
 	cq_spec.attached_eq = eq->eq->id;
 
 	err = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_SQ, &wq_spec,
@@ -785,14 +805,17 @@ static int mana_ib_destroy_qp_rss(struct mana_ib_qp *qp,
 {
 	struct mana_ib_dev *mdev =
 		container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev);
+	struct ib_pd *ibpd = qp->ibqp.pd;
 	struct mana_port_context *mpc;
 	struct net_device *ndev;
+	struct mana_ib_pd *pd;
 	struct mana_ib_wq *wq;
 	struct ib_wq *ibwq;
 	int i;
 
 	ndev = mana_ib_get_netdev(qp->ibqp.device, qp->port);
 	mpc = netdev_priv(ndev);
+	pd = container_of(ibpd, struct mana_ib_pd, ibpd);
 
 	/* Disable vPort RX steering before destroying RX WQ objects.
 	 * Otherwise firmware still routes traffic to the destroyed queues,
@@ -817,6 +840,8 @@ static int mana_ib_destroy_qp_rss(struct mana_ib_qp *qp,
 		mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object);
 	}
 
+	mana_ib_uncfg_vport(mdev, pd, qp->port);
+
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index b2faa7cf398f..f1f6f7940b61 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1615,78 +1615,84 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type,
 }
 EXPORT_SYMBOL_NS(mana_destroy_wq_obj, "NET_MANA");
 
-static void mana_destroy_eq(struct mana_context *ac)
+void mana_destroy_eq(struct mana_port_context *apc)
 {
+	struct mana_context *ac = apc->ac;
 	struct gdma_context *gc = ac->gdma_dev->gdma_context;
 	struct gdma_queue *eq;
 	int i;
 
-	if (!ac->eqs)
+	if (!apc->eqs)
 		return;
 
-	debugfs_remove_recursive(ac->mana_eqs_debugfs);
-	ac->mana_eqs_debugfs = NULL;
+	debugfs_remove_recursive(apc->mana_eqs_debugfs);
+	apc->mana_eqs_debugfs = NULL;
 
-	for (i = 0; i < gc->max_num_queues; i++) {
-		eq = ac->eqs[i].eq;
+	for (i = 0; i < apc->num_queues; i++) {
+		eq = apc->eqs[i].eq;
 		if (!eq)
 			continue;
 
 		mana_gd_destroy_queue(gc, eq);
 	}
 
-	kfree(ac->eqs);
-	ac->eqs = NULL;
+	kfree(apc->eqs);
+	apc->eqs = NULL;
 }
+EXPORT_SYMBOL_NS(mana_destroy_eq, "NET_MANA");
 
-static void mana_create_eq_debugfs(struct mana_context *ac, int i)
+static void mana_create_eq_debugfs(struct mana_port_context *apc, int i)
 {
-	struct mana_eq eq = ac->eqs[i];
+	struct mana_eq eq = apc->eqs[i];
 	char eqnum[32];
 
 	sprintf(eqnum, "eq%d", i);
-	eq.mana_eq_debugfs = debugfs_create_dir(eqnum, ac->mana_eqs_debugfs);
+	eq.mana_eq_debugfs = debugfs_create_dir(eqnum, apc->mana_eqs_debugfs);
 	debugfs_create_u32("head", 0400, eq.mana_eq_debugfs, &eq.eq->head);
 	debugfs_create_u32("tail", 0400, eq.mana_eq_debugfs, &eq.eq->tail);
 	debugfs_create_file("eq_dump", 0400, eq.mana_eq_debugfs, eq.eq, &mana_dbg_q_fops);
 }
 
-static int mana_create_eq(struct mana_context *ac)
+int mana_create_eq(struct mana_port_context *apc)
 {
-	struct gdma_dev *gd = ac->gdma_dev;
+	struct gdma_dev *gd = apc->ac->gdma_dev;
 	struct gdma_context *gc = gd->gdma_context;
 	struct gdma_queue_spec spec = {};
 	int err;
 	int i;
 
-	ac->eqs = kzalloc_objs(struct mana_eq, gc->max_num_queues);
-	if (!ac->eqs)
+	if (WARN_ON(apc->eqs))
+		return -EEXIST;
+	apc->eqs = kzalloc_objs(struct mana_eq, apc->num_queues);
+	if (!apc->eqs)
 		return -ENOMEM;
 
 	spec.type = GDMA_EQ;
 	spec.monitor_avl_buf = false;
 	spec.queue_size = EQ_SIZE;
 	spec.eq.callback = NULL;
-	spec.eq.context = ac->eqs;
+	spec.eq.context = apc->eqs;
 	spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE;
 
-	ac->mana_eqs_debugfs = debugfs_create_dir("EQs", gc->mana_pci_debugfs);
+	apc->mana_eqs_debugfs =
+		debugfs_create_dir("EQs", apc->mana_port_debugfs);
 
-	for (i = 0; i < gc->max_num_queues; i++) {
+	for (i = 0; i < apc->num_queues; i++) {
 		spec.eq.msix_index = (i + 1) % gc->num_msix_usable;
-		err = mana_gd_create_mana_eq(gd, &spec, &ac->eqs[i].eq);
+		err = mana_gd_create_mana_eq(gd, &spec, &apc->eqs[i].eq);
 		if (err) {
 			dev_err(gc->dev, "Failed to create EQ %d : %d\n", i, err);
 			goto out;
 		}
-		mana_create_eq_debugfs(ac, i);
+		mana_create_eq_debugfs(apc, i);
 	}
 
 	return 0;
 out:
-	mana_destroy_eq(ac);
+	mana_destroy_eq(apc);
 	return err;
 }
+EXPORT_SYMBOL_NS(mana_create_eq, "NET_MANA");
 
 static int mana_fence_rq(struct mana_port_context *apc, struct mana_rxq *rxq)
 {
@@ -2451,7 +2457,7 @@ static int mana_create_txq(struct mana_port_context *apc,
 		spec.monitor_avl_buf = false;
 		spec.queue_size = cq_size;
 		spec.cq.callback = mana_schedule_napi;
-		spec.cq.parent_eq = ac->eqs[i].eq;
+		spec.cq.parent_eq = apc->eqs[i].eq;
 		spec.cq.context = cq;
 		err = mana_gd_create_mana_wq_cq(gd, &spec, &cq->gdma_cq);
 		if (err)
@@ -2844,13 +2850,12 @@ static void mana_create_rxq_debugfs(struct mana_port_context *apc, int idx)
 static int mana_add_rx_queues(struct mana_port_context *apc,
 			      struct net_device *ndev)
 {
-	struct mana_context *ac = apc->ac;
 	struct mana_rxq *rxq;
 	int err = 0;
 	int i;
 
 	for (i = 0; i < apc->num_queues; i++) {
-		rxq = mana_create_rxq(apc, i, &ac->eqs[i], ndev);
+		rxq = mana_create_rxq(apc, i, &apc->eqs[i], ndev);
 		if (!rxq) {
 			err = -ENOMEM;
 			netdev_err(ndev, "Failed to create rxq %d : %d\n", i, err);
@@ -2869,9 +2874,8 @@ static int mana_add_rx_queues(struct mana_port_context *apc,
 	return err;
 }
 
-static void mana_destroy_vport(struct mana_port_context *apc)
+static void mana_destroy_rxqs(struct mana_port_context *apc)
 {
-	struct gdma_dev *gd = apc->ac->gdma_dev;
 	struct mana_rxq *rxq;
 	u32 rxq_idx;
 
@@ -2883,8 +2887,12 @@ static void mana_destroy_vport(struct mana_port_context *apc)
 		mana_destroy_rxq(apc, rxq, true);
 		apc->rxqs[rxq_idx] = NULL;
 	}
+}
+
+static void mana_destroy_vport(struct mana_port_context *apc)
+{
+	struct gdma_dev *gd = apc->ac->gdma_dev;
 
-	mana_destroy_txq(apc);
 	mana_uncfg_vport(apc);
 
 	if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode)
@@ -2905,11 +2913,7 @@ static int mana_create_vport(struct mana_port_context *apc,
 			return err;
 	}
 
-	err = mana_cfg_vport(apc, gd->pdid, gd->doorbell);
-	if (err)
-		return err;
-
-	return mana_create_txq(apc, net);
+	return mana_cfg_vport(apc, gd->pdid, gd->doorbell);
 }
 
 static int mana_rss_table_alloc(struct mana_port_context *apc)
@@ -3195,21 +3199,36 @@ int mana_alloc_queues(struct net_device *ndev)
 
 	err = mana_create_vport(apc, ndev);
 	if (err) {
-		netdev_err(ndev, "Failed to create vPort %u : %d\n", apc->port_idx, err);
+		netdev_err(ndev, "Failed to create vPort %u : %d\n",
+			   apc->port_idx, err);
 		return err;
 	}
 
+	err = mana_create_eq(apc);
+	if (err) {
+		netdev_err(ndev, "Failed to create EQ on vPort %u: %d\n",
+			   apc->port_idx, err);
+		goto destroy_vport;
+	}
+
+	err = mana_create_txq(apc, ndev);
+	if (err) {
+		netdev_err(ndev, "Failed to create TXQ on vPort %u: %d\n",
+			   apc->port_idx, err);
+		goto destroy_eq;
+	}
+
 	err = netif_set_real_num_tx_queues(ndev, apc->num_queues);
 	if (err) {
 		netdev_err(ndev,
 			   "netif_set_real_num_tx_queues () failed for ndev with num_queues %u : %d\n",
 			   apc->num_queues, err);
-		goto destroy_vport;
+		goto destroy_txq;
 	}
 
 	err = mana_add_rx_queues(apc, ndev);
 	if (err)
-		goto destroy_vport;
+		goto destroy_rxq;
 
 	apc->rss_state = apc->num_queues > 1 ? TRI_STATE_TRUE : TRI_STATE_FALSE;
 
@@ -3218,7 +3237,7 @@ int mana_alloc_queues(struct net_device *ndev)
 		netdev_err(ndev,
 			   "netif_set_real_num_rx_queues () failed for ndev with num_queues %u : %d\n",
 			   apc->num_queues, err);
-		goto destroy_vport;
+		goto destroy_rxq;
 	}
 
 	mana_rss_table_init(apc);
@@ -3226,19 +3245,25 @@ int mana_alloc_queues(struct net_device *ndev)
 	err = mana_config_rss(apc, TRI_STATE_TRUE, true, true);
 	if (err) {
 		netdev_err(ndev, "Failed to configure RSS table: %d\n", err);
-		goto destroy_vport;
+		goto destroy_rxq;
 	}
 
 	if (gd->gdma_context->is_pf && !apc->ac->bm_hostmode) {
 		err = mana_pf_register_filter(apc);
 		if (err)
-			goto destroy_vport;
+			goto destroy_rxq;
 	}
 
 	mana_chn_setxdp(apc, mana_xdp_get(apc));
 
 	return 0;
 
+destroy_rxq:
+	mana_destroy_rxqs(apc);
+destroy_txq:
+	mana_destroy_txq(apc);
+destroy_eq:
+	mana_destroy_eq(apc);
 destroy_vport:
 	mana_destroy_vport(apc);
 	return err;
@@ -3343,6 +3368,9 @@ static int mana_dealloc_queues(struct net_device *ndev)
 	mana_fence_rqs(apc);
 
 	/* Even in err case, still need to cleanup the vPort */
+	mana_destroy_rxqs(apc);
+	mana_destroy_txq(apc);
+	mana_destroy_eq(apc);
 	mana_destroy_vport(apc);
 
 	return 0;
@@ -3663,12 +3691,6 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 
 	INIT_DELAYED_WORK(&ac->gf_stats_work, mana_gf_stats_work_handler);
 
-	err = mana_create_eq(ac);
-	if (err) {
-		dev_err(dev, "Failed to create EQs: %d\n", err);
-		goto out;
-	}
-
 	err = mana_query_device_cfg(ac, MANA_MAJOR_VERSION, MANA_MINOR_VERSION,
 				    MANA_MICRO_VERSION, &num_ports, &bm_hostmode);
 	if (err)
@@ -3808,8 +3830,6 @@ void mana_remove(struct gdma_dev *gd, bool suspending)
 		free_netdev(ndev);
 	}
 
-	mana_destroy_eq(ac);
-
 	if (ac->per_port_queue_reset_wq) {
 		destroy_workqueue(ac->per_port_queue_reset_wq);
 		ac->per_port_queue_reset_wq = NULL;
diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
index 04350973e19e..e121834d17f3 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c
@@ -454,18 +454,40 @@ static int mana_set_coalesce(struct net_device *ndev,
 	return err;
 }
 
+/* mana_set_channels - change the number of queues on a port
+ *
+ * Returns -EBUSY if the port is down and RDMA holds the vport with
+ * EQs sized to the current num_queues.
+ */
 static int mana_set_channels(struct net_device *ndev,
 			     struct ethtool_channels *channels)
 {
 	struct mana_port_context *apc = netdev_priv(ndev);
 	unsigned int new_count = channels->combined_count;
 	unsigned int old_count = apc->num_queues;
+	bool locked = false;
 	int err;
 
+	/* When the port is down, hold vport_mutex for the entire
+	 * operation to serialize against RDMA's mana_cfg_vport().
+	 * This is safe because mana_detach()/mana_attach() skip
+	 * vport teardown/setup when port_st_save is false.
+	 * When the port is up, Ethernet owns the vport exclusively
+	 * so no locking against RDMA is needed.
+	 */
+	if (!apc->port_is_up) {
+		mutex_lock(&apc->vport_mutex);
+		if (apc->vport_use_count) {
+			mutex_unlock(&apc->vport_mutex);
+			return -EBUSY;
+		}
+		locked = true;
+	}
+
 	err = mana_pre_alloc_rxbufs(apc, ndev->mtu, new_count);
 	if (err) {
 		netdev_err(ndev, "Insufficient memory for new allocations");
-		return err;
+		goto unlock;
 	}
 
 	err = mana_detach(ndev, false);
@@ -483,6 +505,9 @@ static int mana_set_channels(struct net_device *ndev,
 
 out:
 	mana_pre_dealloc_rxbufs(apc);
+unlock:
+	if (locked)
+		mutex_unlock(&apc->vport_mutex);
 	return err;
 }
 
diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index aa90a858c8e3..c8e7d16f6685 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -480,8 +480,6 @@ struct mana_context {
 	u8 bm_hostmode;
 
 	struct mana_ethtool_hc_stats hc_stats;
-	struct mana_eq *eqs;
-	struct dentry *mana_eqs_debugfs;
 	struct workqueue_struct *per_port_queue_reset_wq;
 	/* Workqueue for querying hardware stats */
 	struct delayed_work gf_stats_work;
@@ -501,6 +499,9 @@ struct mana_port_context {
 
 	u8 mac_addr[ETH_ALEN];
 
+	struct mana_eq *eqs;
+	struct dentry *mana_eqs_debugfs;
+
 	enum TRI_STATE rss_state;
 
 	mana_handle_t default_rxobj;
@@ -1034,6 +1035,8 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type,
 int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id,
 		   u32 doorbell_pg_id);
 void mana_uncfg_vport(struct mana_port_context *apc);
+int mana_create_eq(struct mana_port_context *apc);
+void mana_destroy_eq(struct mana_port_context *apc);
 
 struct net_device *mana_get_primary_netdev(struct mana_context *ac,
 					   u32 port_index,
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v9 2/6] net: mana: Query device capabilities and configure MSI-X sharing for EQs
From: Long Li @ 2026-05-13 22:09 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui, shradhagupta
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260513220956.402058-1-longli@microsoft.com>

When querying the device, adjust the max number of queues to allow
dedicated MSI-X vectors for each vPort. The per-vPort queue count is
clamped towards MANA_DEF_NUM_QUEUES but will not exceed the hardware
maximum reported by the device.

MSI-X sharing among vPorts is enabled when there are not enough MSI-X
vectors for dedicated allocation, or when the platform does not support
dynamic MSI-X allocation (in which case all vectors are pre-allocated
at probe time and sharing is always used). The msi_sharing flag is
reset at the top of mana_gd_query_max_resources() so it is recomputed
from current hardware state on each probe or resume cycle.

A device reporting zero ports now results in a fatal probe error since
the per-vPort MSI-X math requires at least one port.

Rename mana_query_device_cfg() to mana_gd_query_device_cfg() as it is
used at GDMA device probe time for querying device capabilities.

Signed-off-by: Long Li <longli@microsoft.com>
---
 .../net/ethernet/microsoft/mana/gdma_main.c   | 66 ++++++++++++++++++-
 drivers/net/ethernet/microsoft/mana/mana_en.c | 40 ++++++-----
 include/net/mana/gdma.h                       | 13 +++-
 3 files changed, 100 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 3bc3fff55999..bbd055849e36 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -179,8 +179,18 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
 	struct gdma_context *gc = pci_get_drvdata(pdev);
 	struct gdma_query_max_resources_resp resp = {};
 	struct gdma_general_req req = {};
+	unsigned int max_num_queues;
+	u8 bm_hostmode;
+	u16 num_ports;
 	int err;
 
+	/* Reset msi_sharing so it is recomputed from current hardware
+	 * state. On resume, num_online_cpus() or num_msix_usable may
+	 * have changed, making dedicated MSI-X feasible where it was
+	 * not before.
+	 */
+	gc->msi_sharing = false;
+
 	mana_gd_init_req_hdr(&req.hdr, GDMA_QUERY_MAX_RESOURCES,
 			     sizeof(req), sizeof(resp));
 
@@ -227,6 +237,43 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev)
 	if (gc->max_num_queues == 0)
 		return -ENOSPC;
 
+	err = mana_gd_query_device_cfg(gc, MANA_MAJOR_VERSION,
+				       MANA_MINOR_VERSION,
+				       MANA_MICRO_VERSION,
+				       &num_ports, &bm_hostmode);
+	if (err)
+		return err;
+
+	if (!num_ports)
+		return -EINVAL;
+
+	/*
+	 * Adjust the per-vPort max queue count to allow dedicated
+	 * MSIx for each vPort. Clamp to no less than MANA_DEF_NUM_QUEUES.
+	 */
+	max_num_queues = (gc->num_msix_usable - 1) / num_ports;
+	max_num_queues = rounddown_pow_of_two(max(max_num_queues, 1U));
+	if (max_num_queues < MANA_DEF_NUM_QUEUES)
+		max_num_queues = MANA_DEF_NUM_QUEUES;
+
+	/*
+	 * Use dedicated MSIx for EQs whenever possible, use MSIx sharing for
+	 * Ethernet EQs when (max_num_queues * num_ports > num_msix_usable - 1).
+	 */
+	max_num_queues = min(gc->max_num_queues, max_num_queues);
+	if (max_num_queues * num_ports > gc->num_msix_usable - 1)
+		gc->msi_sharing = true;
+
+	/* If MSI is shared, use max allowed value */
+	if (gc->msi_sharing)
+		gc->max_num_queues_vport = min(gc->num_msix_usable - 1,
+					       gc->max_num_queues);
+	else
+		gc->max_num_queues_vport = max_num_queues;
+
+	dev_info(gc->dev, "MSI sharing mode %d max queues %d\n",
+		 gc->msi_sharing, gc->max_num_queues);
+
 	return 0;
 }
 
@@ -1889,6 +1936,7 @@ static int mana_gd_setup_hwc_irqs(struct pci_dev *pdev)
 		/* Need 1 interrupt for HWC */
 		max_irqs = min(num_online_cpus(), MANA_MAX_NUM_QUEUES) + 1;
 		min_irqs = 2;
+		gc->msi_sharing = true;
 	}
 
 	nvec = pci_alloc_irq_vectors(pdev, min_irqs, max_irqs, PCI_IRQ_MSIX);
@@ -1967,6 +2015,8 @@ static void mana_gd_remove_irqs(struct pci_dev *pdev)
 
 	pci_free_irq_vectors(pdev);
 
+	bitmap_free(gc->msi_bitmap);
+	gc->msi_bitmap = NULL;
 	gc->max_num_msix = 0;
 	gc->num_msix_usable = 0;
 }
@@ -2001,6 +2051,10 @@ static int mana_gd_setup(struct pci_dev *pdev)
 	if (err)
 		goto destroy_hwc;
 
+	err = mana_gd_detect_devices(pdev);
+	if (err)
+		goto destroy_hwc;
+
 	err = mana_gd_query_max_resources(pdev);
 	if (err)
 		goto destroy_hwc;
@@ -2011,9 +2065,15 @@ static int mana_gd_setup(struct pci_dev *pdev)
 		goto destroy_hwc;
 	}
 
-	err = mana_gd_detect_devices(pdev);
-	if (err)
-		goto destroy_hwc;
+	if (!gc->msi_sharing) {
+		gc->msi_bitmap = bitmap_zalloc(gc->num_msix_usable, GFP_KERNEL);
+		if (!gc->msi_bitmap) {
+			err = -ENOMEM;
+			goto destroy_hwc;
+		}
+		/* Set bit for HWC */
+		set_bit(0, gc->msi_bitmap);
+	}
 
 	dev_dbg(&pdev->dev, "mana gdma setup successful\n");
 	return 0;
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index f1f6f7940b61..3ee74e7e300c 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1007,10 +1007,9 @@ static int mana_init_port_context(struct mana_port_context *apc)
 	return !apc->rxqs ? -ENOMEM : 0;
 }
 
-static int mana_send_request(struct mana_context *ac, void *in_buf,
-			     u32 in_len, void *out_buf, u32 out_len)
+static int gdma_mana_send_request(struct gdma_context *gc, void *in_buf,
+				  u32 in_len, void *out_buf, u32 out_len)
 {
-	struct gdma_context *gc = ac->gdma_dev->gdma_context;
 	struct gdma_resp_hdr *resp = out_buf;
 	struct gdma_req_hdr *req = in_buf;
 	struct device *dev = gc->dev;
@@ -1044,6 +1043,14 @@ static int mana_send_request(struct mana_context *ac, void *in_buf,
 	return 0;
 }
 
+static int mana_send_request(struct mana_context *ac, void *in_buf,
+			     u32 in_len, void *out_buf, u32 out_len)
+{
+	struct gdma_context *gc = ac->gdma_dev->gdma_context;
+
+	return gdma_mana_send_request(gc, in_buf, in_len, out_buf, out_len);
+}
+
 static int mana_verify_resp_hdr(const struct gdma_resp_hdr *resp_hdr,
 				const enum mana_command_code expected_code,
 				const u32 min_size)
@@ -1177,11 +1184,10 @@ static void mana_pf_deregister_filter(struct mana_port_context *apc)
 			   err, resp.hdr.status);
 }
 
-static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver,
-				 u32 proto_minor_ver, u32 proto_micro_ver,
-				 u16 *max_num_vports, u8 *bm_hostmode)
+int mana_gd_query_device_cfg(struct gdma_context *gc, u32 proto_major_ver,
+			     u32 proto_minor_ver, u32 proto_micro_ver,
+			     u16 *max_num_vports, u8 *bm_hostmode)
 {
-	struct gdma_context *gc = ac->gdma_dev->gdma_context;
 	struct mana_query_device_cfg_resp resp = {};
 	struct mana_query_device_cfg_req req = {};
 	struct device *dev = gc->dev;
@@ -1196,7 +1202,8 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver,
 	req.proto_minor_ver = proto_minor_ver;
 	req.proto_micro_ver = proto_micro_ver;
 
-	err = mana_send_request(ac, &req, sizeof(req), &resp, sizeof(resp));
+	err = gdma_mana_send_request(gc, &req, sizeof(req),
+				     &resp, sizeof(resp));
 	if (err) {
 		dev_err(dev, "Failed to query config: %d", err);
 		return err;
@@ -1230,8 +1237,6 @@ static int mana_query_device_cfg(struct mana_context *ac, u32 proto_major_ver,
 	else
 		*bm_hostmode = 0;
 
-	debugfs_create_u16("adapter-MTU", 0400, gc->mana_pci_debugfs, &gc->adapter_mtu);
-
 	return 0;
 }
 
@@ -3416,7 +3421,7 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
 	int err;
 
 	ndev = alloc_etherdev_mq(sizeof(struct mana_port_context),
-				 gc->max_num_queues);
+				 gc->max_num_queues_vport);
 	if (!ndev)
 		return -ENOMEM;
 
@@ -3425,9 +3430,9 @@ static int mana_probe_port(struct mana_context *ac, int port_idx,
 	apc = netdev_priv(ndev);
 	apc->ac = ac;
 	apc->ndev = ndev;
-	apc->max_queues = gc->max_num_queues;
+	apc->max_queues = gc->max_num_queues_vport;
 	/* Use MANA_DEF_NUM_QUEUES as default, still honoring the HW limit */
-	apc->num_queues = min(gc->max_num_queues, MANA_DEF_NUM_QUEUES);
+	apc->num_queues = min(gc->max_num_queues_vport, MANA_DEF_NUM_QUEUES);
 	apc->tx_queue_size = DEF_TX_BUFFERS_PER_QUEUE;
 	apc->rx_queue_size = DEF_RX_BUFFERS_PER_QUEUE;
 	apc->port_handle = INVALID_MANA_HANDLE;
@@ -3691,13 +3696,18 @@ int mana_probe(struct gdma_dev *gd, bool resuming)
 
 	INIT_DELAYED_WORK(&ac->gf_stats_work, mana_gf_stats_work_handler);
 
-	err = mana_query_device_cfg(ac, MANA_MAJOR_VERSION, MANA_MINOR_VERSION,
-				    MANA_MICRO_VERSION, &num_ports, &bm_hostmode);
+	err = mana_gd_query_device_cfg(gc, MANA_MAJOR_VERSION,
+				       MANA_MINOR_VERSION,
+				       MANA_MICRO_VERSION,
+				       &num_ports, &bm_hostmode);
 	if (err)
 		goto out;
 
 	ac->bm_hostmode = bm_hostmode;
 
+	debugfs_create_u16("adapter-MTU", 0400,
+			   gc->mana_pci_debugfs, &gc->adapter_mtu);
+
 	if (!resuming) {
 		ac->num_ports = num_ports;
 	} else {
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 6d836060976a..9c05b1e15c3e 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -399,8 +399,10 @@ struct gdma_context {
 	struct device		*dev;
 	struct dentry		*mana_pci_debugfs;
 
-	/* Per-vPort max number of queues */
+	/* Hardware max number of queues */
 	unsigned int		max_num_queues;
+	/* Per-vPort max number of queues */
+	unsigned int		max_num_queues_vport;
 	unsigned int		max_num_msix;
 	unsigned int		num_msix_usable;
 	struct xarray		irq_contexts;
@@ -446,6 +448,12 @@ struct gdma_context {
 	struct workqueue_struct *service_wq;
 
 	unsigned long		flags;
+
+	/* Indicate if this device is sharing MSI for EQs on MANA */
+	bool msi_sharing;
+
+	/* Bitmap tracks where MSI is allocated when it is not shared for EQs */
+	unsigned long *msi_bitmap;
 };
 
 static inline bool mana_gd_is_mana(struct gdma_dev *gd)
@@ -1018,4 +1026,7 @@ int mana_gd_resume(struct pci_dev *pdev);
 
 bool mana_need_log(struct gdma_context *gc, int err);
 
+int mana_gd_query_device_cfg(struct gdma_context *gc, u32 proto_major_ver,
+			     u32 proto_minor_ver, u32 proto_micro_ver,
+			     u16 *max_num_vports, u8 *bm_hostmode);
 #endif /* _GDMA_H */
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v9 3/6] net: mana: Introduce GIC context with refcounting for interrupt management
From: Long Li @ 2026-05-13 22:09 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui, shradhagupta
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260513220956.402058-1-longli@microsoft.com>

To allow Ethernet EQs to use dedicated or shared MSI-X vectors and RDMA
EQs to share the same MSI-X, introduce a GIC (GDMA IRQ Context) with
reference counting. This allows the driver to create an interrupt context
on an assigned or unassigned MSI-X vector and share it across multiple
EQ consumers.

Signed-off-by: Long Li <longli@microsoft.com>
---
 .../net/ethernet/microsoft/mana/gdma_main.c   | 159 ++++++++++++++++++
 include/net/mana/gdma.h                       |  12 ++
 2 files changed, 171 insertions(+)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index bbd055849e36..fdd2ef24414b 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -1655,6 +1655,164 @@ static irqreturn_t mana_gd_intr(int irq, void *arg)
 	return IRQ_HANDLED;
 }
 
+void mana_gd_put_gic(struct gdma_context *gc, bool use_msi_bitmap, int msi)
+{
+	struct pci_dev *dev = to_pci_dev(gc->dev);
+	struct gdma_irq_context *gic;
+	struct msi_map irq_map;
+	int irq;
+
+	mutex_lock(&gc->gic_mutex);
+
+	gic = xa_load(&gc->irq_contexts, msi);
+	if (WARN_ON(!gic)) {
+		mutex_unlock(&gc->gic_mutex);
+		return;
+	}
+
+	if (use_msi_bitmap)
+		gic->bitmap_refs--;
+
+	if (use_msi_bitmap && gic->bitmap_refs == 0)
+		clear_bit(msi, gc->msi_bitmap);
+
+	if (!refcount_dec_and_test(&gic->refcount))
+		goto out;
+
+	irq = gic->irq;
+
+	irq_update_affinity_hint(irq, NULL);
+	free_irq(irq, gic);
+
+	if (gic->dyn_msix) {
+		irq_map.virq = irq;
+		irq_map.index = msi;
+		pci_msix_free_irq(dev, irq_map);
+	}
+
+	xa_erase(&gc->irq_contexts, msi);
+	kfree(gic);
+
+out:
+	mutex_unlock(&gc->gic_mutex);
+}
+EXPORT_SYMBOL_NS(mana_gd_put_gic, "NET_MANA");
+
+/*
+ * Get a GIC (GDMA IRQ Context) on a MSI vector
+ * a MSI can be shared between different EQs, this function supports setting
+ * up separate MSIs using a bitmap, or directly using the MSI index
+ *
+ * @use_msi_bitmap:
+ * True if MSI is assigned by this function on available slots from bitmap.
+ * False if MSI is passed from *msi_requested
+ */
+struct gdma_irq_context *mana_gd_get_gic(struct gdma_context *gc,
+					 bool use_msi_bitmap,
+					 int *msi_requested)
+{
+	struct pci_dev *dev = to_pci_dev(gc->dev);
+	struct gdma_irq_context *gic;
+	struct msi_map irq_map = { };
+	int irq;
+	int msi;
+	int err;
+
+	mutex_lock(&gc->gic_mutex);
+
+	if (use_msi_bitmap) {
+		msi = find_first_zero_bit(gc->msi_bitmap, gc->num_msix_usable);
+		if (msi >= gc->num_msix_usable) {
+			dev_err(gc->dev, "No free MSI vectors available\n");
+			gic = NULL;
+			goto out;
+		}
+		*msi_requested = msi;
+	} else {
+		msi = *msi_requested;
+	}
+
+	gic = xa_load(&gc->irq_contexts, msi);
+	if (gic) {
+		refcount_inc(&gic->refcount);
+		if (use_msi_bitmap) {
+			gic->bitmap_refs++;
+			set_bit(msi, gc->msi_bitmap);
+		}
+		goto out;
+	}
+
+	irq = pci_irq_vector(dev, msi);
+	if (irq == -EINVAL) {
+		irq_map = pci_msix_alloc_irq_at(dev, msi, NULL);
+		if (!irq_map.virq) {
+			err = irq_map.index;
+			dev_err(gc->dev,
+				"Failed to alloc irq_map msi %d err %d\n",
+				msi, err);
+			gic = NULL;
+			goto out;
+		}
+		irq = irq_map.virq;
+		msi = irq_map.index;
+	}
+
+	gic = kzalloc(sizeof(*gic), GFP_KERNEL);
+	if (!gic) {
+		if (irq_map.virq)
+			pci_msix_free_irq(dev, irq_map);
+		goto out;
+	}
+
+	gic->handler = mana_gd_process_eq_events;
+	gic->msi = msi;
+	gic->irq = irq;
+	INIT_LIST_HEAD(&gic->eq_list);
+	spin_lock_init(&gic->lock);
+
+	if (!gic->msi)
+		snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_hwc@pci:%s",
+			 pci_name(dev));
+	else
+		snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_msi%d@pci:%s",
+			 gic->msi, pci_name(dev));
+
+	err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
+	if (err) {
+		dev_err(gc->dev, "Failed to request irq %d %s\n",
+			irq, gic->name);
+		kfree(gic);
+		gic = NULL;
+		if (irq_map.virq)
+			pci_msix_free_irq(dev, irq_map);
+		goto out;
+	}
+
+	gic->dyn_msix = !!irq_map.virq;
+	refcount_set(&gic->refcount, 1);
+	gic->bitmap_refs = use_msi_bitmap ? 1 : 0;
+
+	err = xa_err(xa_store(&gc->irq_contexts, msi, gic, GFP_KERNEL));
+	if (err) {
+		dev_err(gc->dev, "Failed to store irq context for msi %d: %d\n",
+			msi, err);
+		free_irq(irq, gic);
+		kfree(gic);
+		gic = NULL;
+		if (irq_map.virq)
+			pci_msix_free_irq(dev, irq_map);
+		goto out;
+	}
+
+	if (use_msi_bitmap)
+		set_bit(msi, gc->msi_bitmap);
+
+out:
+	mutex_unlock(&gc->gic_mutex);
+	return gic;
+}
+EXPORT_SYMBOL_NS(mana_gd_get_gic, "NET_MANA");
+
 int mana_gd_alloc_res_map(u32 res_avail, struct gdma_resource *r)
 {
 	r->map = bitmap_zalloc(res_avail, GFP_KERNEL);
@@ -2144,6 +2302,7 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto release_region;
 
 	mutex_init(&gc->eq_test_event_mutex);
+	mutex_init(&gc->gic_mutex);
 	pci_set_drvdata(pdev, gc);
 	gc->bar0_pa = pci_resource_start(pdev, 0);
 	gc->bar0_size = pci_resource_len(pdev, 0);
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 9c05b1e15c3e..fbe3c1427b45 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -388,6 +388,11 @@ struct gdma_irq_context {
 	spinlock_t lock;
 	struct list_head eq_list;
 	char name[MANA_IRQ_NAME_SZ];
+	unsigned int msi;
+	unsigned int irq;
+	refcount_t refcount;
+	unsigned int bitmap_refs;
+	bool dyn_msix;
 };
 
 enum gdma_context_flags {
@@ -449,6 +454,9 @@ struct gdma_context {
 
 	unsigned long		flags;
 
+	/* Protect access to GIC context */
+	struct mutex		gic_mutex;
+
 	/* Indicate if this device is sharing MSI for EQs on MANA */
 	bool msi_sharing;
 
@@ -1026,6 +1034,10 @@ int mana_gd_resume(struct pci_dev *pdev);
 
 bool mana_need_log(struct gdma_context *gc, int err);
 
+struct gdma_irq_context *mana_gd_get_gic(struct gdma_context *gc,
+					 bool use_msi_bitmap,
+					 int *msi_requested);
+void mana_gd_put_gic(struct gdma_context *gc, bool use_msi_bitmap, int msi);
 int mana_gd_query_device_cfg(struct gdma_context *gc, u32 proto_major_ver,
 			     u32 proto_minor_ver, u32 proto_micro_ver,
 			     u16 *max_num_vports, u8 *bm_hostmode);
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v9 4/6] net: mana: Use GIC functions to allocate global EQs
From: Long Li @ 2026-05-13 22:09 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui, shradhagupta
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260513220956.402058-1-longli@microsoft.com>

Replace the GDMA global interrupt setup code with the new GIC allocation
and release functions for managing interrupt contexts.

This changes the per-queue interrupt names in /proc/interrupts from
mana_q0, mana_q1, ... to mana_msi1, mana_msi2, ... to reflect the
MSI-X index rather than a zero-based queue number. The HWC interrupt
name (mana_hwc) is unchanged.

Signed-off-by: Long Li <longli@microsoft.com>
---
 .../net/ethernet/microsoft/mana/gdma_main.c   | 96 +++----------------
 1 file changed, 13 insertions(+), 83 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index fdd2ef24414b..e1a0e897b1b9 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -1915,7 +1915,7 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
 	struct gdma_context *gc = pci_get_drvdata(pdev);
 	struct gdma_irq_context *gic;
 	bool skip_first_cpu = false;
-	int *irqs, irq, err, i;
+	int *irqs, err, i;
 
 	irqs = kmalloc_objs(int, nvec);
 	if (!irqs)
@@ -1928,30 +1928,13 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
 	 * further used in irq_setup()
 	 */
 	for (i = 1; i <= nvec; i++) {
-		gic = kzalloc_obj(*gic);
+		gic = mana_gd_get_gic(gc, false, &i);
 		if (!gic) {
 			err = -ENOMEM;
 			goto free_irq;
 		}
-		gic->handler = mana_gd_process_eq_events;
-		INIT_LIST_HEAD(&gic->eq_list);
-		spin_lock_init(&gic->lock);
-
-		snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s",
-			 i - 1, pci_name(pdev));
-
-		/* one pci vector is already allocated for HWC */
-		irqs[i - 1] = pci_irq_vector(pdev, i);
-		if (irqs[i - 1] < 0) {
-			err = irqs[i - 1];
-			goto free_current_gic;
-		}
-
-		err = request_irq(irqs[i - 1], mana_gd_intr, 0, gic->name, gic);
-		if (err)
-			goto free_current_gic;
 
-		xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL);
+		irqs[i - 1] = gic->irq;
 	}
 
 	/*
@@ -1973,20 +1956,9 @@ static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec)
 	kfree(irqs);
 	return 0;
 
-free_current_gic:
-	kfree(gic);
 free_irq:
-	for (i -= 1; i > 0; i--) {
-		irq = pci_irq_vector(pdev, i);
-		gic = xa_load(&gc->irq_contexts, i);
-		if (WARN_ON(!gic))
-			continue;
-
-		irq_update_affinity_hint(irq, NULL);
-		free_irq(irq, gic);
-		xa_erase(&gc->irq_contexts, i);
-		kfree(gic);
-	}
+	for (i -= 1; i > 0; i--)
+		mana_gd_put_gic(gc, false, i);
 	kfree(irqs);
 	return err;
 }
@@ -1995,7 +1967,7 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev, int nvec)
 {
 	struct gdma_context *gc = pci_get_drvdata(pdev);
 	struct gdma_irq_context *gic;
-	int *irqs, *start_irqs, irq;
+	int *irqs, *start_irqs;
 	unsigned int cpu;
 	int err, i;
 
@@ -2006,34 +1978,13 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev, int nvec)
 	start_irqs = irqs;
 
 	for (i = 0; i < nvec; i++) {
-		gic = kzalloc_obj(*gic);
+		gic = mana_gd_get_gic(gc, false, &i);
 		if (!gic) {
 			err = -ENOMEM;
 			goto free_irq;
 		}
 
-		gic->handler = mana_gd_process_eq_events;
-		INIT_LIST_HEAD(&gic->eq_list);
-		spin_lock_init(&gic->lock);
-
-		if (!i)
-			snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_hwc@pci:%s",
-				 pci_name(pdev));
-		else
-			snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s",
-				 i - 1, pci_name(pdev));
-
-		irqs[i] = pci_irq_vector(pdev, i);
-		if (irqs[i] < 0) {
-			err = irqs[i];
-			goto free_current_gic;
-		}
-
-		err = request_irq(irqs[i], mana_gd_intr, 0, gic->name, gic);
-		if (err)
-			goto free_current_gic;
-
-		xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL);
+		irqs[i] = gic->irq;
 	}
 
 	/* If number of IRQ is one extra than number of online CPUs,
@@ -2062,20 +2013,9 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev, int nvec)
 	kfree(start_irqs);
 	return 0;
 
-free_current_gic:
-	kfree(gic);
 free_irq:
-	for (i -= 1; i >= 0; i--) {
-		irq = pci_irq_vector(pdev, i);
-		gic = xa_load(&gc->irq_contexts, i);
-		if (WARN_ON(!gic))
-			continue;
-
-		irq_update_affinity_hint(irq, NULL);
-		free_irq(irq, gic);
-		xa_erase(&gc->irq_contexts, i);
-		kfree(gic);
-	}
+	for (i -= 1; i >= 0; i--)
+		mana_gd_put_gic(gc, false, i);
 
 	kfree(start_irqs);
 	return err;
@@ -2149,26 +2089,16 @@ static int mana_gd_setup_remaining_irqs(struct pci_dev *pdev)
 static void mana_gd_remove_irqs(struct pci_dev *pdev)
 {
 	struct gdma_context *gc = pci_get_drvdata(pdev);
-	struct gdma_irq_context *gic;
-	int irq, i;
+	int i;
 
 	if (gc->max_num_msix < 1)
 		return;
 
 	for (i = 0; i < gc->max_num_msix; i++) {
-		irq = pci_irq_vector(pdev, i);
-		if (irq < 0)
-			continue;
-
-		gic = xa_load(&gc->irq_contexts, i);
-		if (WARN_ON(!gic))
+		if (!xa_load(&gc->irq_contexts, i))
 			continue;
 
-		/* Need to clear the hint before free_irq */
-		irq_update_affinity_hint(irq, NULL);
-		free_irq(irq, gic);
-		xa_erase(&gc->irq_contexts, i);
-		kfree(gic);
+		mana_gd_put_gic(gc, false, i);
 	}
 
 	pci_free_irq_vectors(pdev);
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v9 5/6] net: mana: Allocate interrupt context for each EQ when creating vPort
From: Long Li @ 2026-05-13 22:09 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui, shradhagupta
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260513220956.402058-1-longli@microsoft.com>

Use GIC functions to create a dedicated interrupt context or acquire a
shared interrupt context for each EQ when setting up a vPort.

The caller now owns the GIC reference across the EQ create/destroy
lifecycle: mana_create_eq() calls mana_gd_get_gic() before creating
each EQ and mana_destroy_eq() calls mana_gd_put_gic() after destroying
it. The msix_index invalidation is moved from mana_gd_deregister_irq()
to the mana_gd_create_eq() error path so that mana_destroy_eq() can
read the index before teardown.

Signed-off-by: Long Li <longli@microsoft.com>
---
 .../net/ethernet/microsoft/mana/gdma_main.c    |  2 +-
 drivers/net/ethernet/microsoft/mana/mana_en.c  | 18 +++++++++++++++++-
 include/net/mana/gdma.h                        |  1 +
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index e1a0e897b1b9..53281fef2ccd 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -894,7 +894,6 @@ static void mana_gd_deregister_irq(struct gdma_queue *queue)
 	}
 	spin_unlock_irqrestore(&gic->lock, flags);
 
-	queue->eq.msix_index = INVALID_PCI_MSIX_INDEX;
 	synchronize_rcu();
 }
 
@@ -1009,6 +1008,7 @@ static int mana_gd_create_eq(struct gdma_dev *gd,
 out:
 	dev_err(dev, "Failed to create EQ: %d\n", err);
 	mana_gd_destroy_eq(gc, false, queue);
+	queue->eq.msix_index = INVALID_PCI_MSIX_INDEX;
 	return err;
 }
 
diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
index 3ee74e7e300c..433ec88d0d69 100644
--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
+++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
@@ -1625,6 +1625,7 @@ void mana_destroy_eq(struct mana_port_context *apc)
 	struct mana_context *ac = apc->ac;
 	struct gdma_context *gc = ac->gdma_dev->gdma_context;
 	struct gdma_queue *eq;
+	unsigned int msi;
 	int i;
 
 	if (!apc->eqs)
@@ -1638,7 +1639,9 @@ void mana_destroy_eq(struct mana_port_context *apc)
 		if (!eq)
 			continue;
 
+		msi = eq->eq.msix_index;
 		mana_gd_destroy_queue(gc, eq);
+		mana_gd_put_gic(gc, !gc->msi_sharing, msi);
 	}
 
 	kfree(apc->eqs);
@@ -1655,6 +1658,7 @@ static void mana_create_eq_debugfs(struct mana_port_context *apc, int i)
 	eq.mana_eq_debugfs = debugfs_create_dir(eqnum, apc->mana_eqs_debugfs);
 	debugfs_create_u32("head", 0400, eq.mana_eq_debugfs, &eq.eq->head);
 	debugfs_create_u32("tail", 0400, eq.mana_eq_debugfs, &eq.eq->tail);
+	debugfs_create_u32("irq", 0400, eq.mana_eq_debugfs, &eq.eq->eq.irq);
 	debugfs_create_file("eq_dump", 0400, eq.mana_eq_debugfs, eq.eq, &mana_dbg_q_fops);
 }
 
@@ -1663,7 +1667,9 @@ int mana_create_eq(struct mana_port_context *apc)
 	struct gdma_dev *gd = apc->ac->gdma_dev;
 	struct gdma_context *gc = gd->gdma_context;
 	struct gdma_queue_spec spec = {};
+	struct gdma_irq_context *gic;
 	int err;
+	int msi;
 	int i;
 
 	if (WARN_ON(apc->eqs))
@@ -1683,12 +1689,22 @@ int mana_create_eq(struct mana_port_context *apc)
 		debugfs_create_dir("EQs", apc->mana_port_debugfs);
 
 	for (i = 0; i < apc->num_queues; i++) {
-		spec.eq.msix_index = (i + 1) % gc->num_msix_usable;
+		msi = (i + 1) % gc->num_msix_usable;
+
+		gic = mana_gd_get_gic(gc, !gc->msi_sharing, &msi);
+		if (!gic) {
+			err = -ENOMEM;
+			goto out;
+		}
+		spec.eq.msix_index = msi;
+
 		err = mana_gd_create_mana_eq(gd, &spec, &apc->eqs[i].eq);
 		if (err) {
 			dev_err(gc->dev, "Failed to create EQ %d : %d\n", i, err);
+			mana_gd_put_gic(gc, !gc->msi_sharing, msi);
 			goto out;
 		}
+		apc->eqs[i].eq->eq.irq = gic->irq;
 		mana_create_eq_debugfs(apc, i);
 	}
 
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index fbe3c1427b45..6c138cc77407 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -342,6 +342,7 @@ struct gdma_queue {
 			void *context;
 
 			unsigned int msix_index;
+			unsigned int irq;
 
 			u32 log2_throttle_limit;
 		} eq;
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next v9 6/6] RDMA/mana_ib: Allocate interrupt contexts on EQs
From: Long Li @ 2026-05-13 22:09 UTC (permalink / raw)
  To: Long Li, Konstantin Taranov, Jakub Kicinski, David S . Miller,
	Paolo Abeni, Eric Dumazet, Andrew Lunn, Jason Gunthorpe,
	Leon Romanovsky, Haiyang Zhang, K . Y . Srinivasan, Wei Liu,
	Dexuan Cui, shradhagupta
  Cc: Simon Horman, netdev, linux-rdma, linux-hyperv, linux-kernel
In-Reply-To: <20260513220956.402058-1-longli@microsoft.com>

Use the GIC functions to allocate interrupt contexts for RDMA EQs. These
interrupt contexts may be shared with Ethernet EQs when MSI-X vectors
are limited.

The driver now supports allocating dedicated MSI-X for each EQ. Indicate
this capability through driver capability bits. The RDMA EQs pass
use_msi_bitmap=false to share MSI-X vectors with Ethernet, while the
capability flag advertises that the driver supports per-vPort EQ
separation when hardware has sufficient vectors.

Populate eq.irq on all RDMA EQs for consistency with the Ethernet path.

Also relocate the GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE define to its
numeric BIT(6) position among the other capability flags.

Signed-off-by: Long Li <longli@microsoft.com>
---
 drivers/infiniband/hw/mana/main.c | 43 +++++++++++++++++++++++++------
 include/net/mana/gdma.h           |  7 +++--
 2 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c
index 6159bd03a021..47e5322bebca 100644
--- a/drivers/infiniband/hw/mana/main.c
+++ b/drivers/infiniband/hw/mana/main.c
@@ -750,7 +750,8 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev)
 {
 	struct gdma_context *gc = mdev_to_gc(mdev);
 	struct gdma_queue_spec spec = {};
-	int err, i;
+	struct gdma_irq_context *gic;
+	int err, i, msi;
 
 	spec.type = GDMA_EQ;
 	spec.monitor_avl_buf = false;
@@ -758,11 +759,19 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev)
 	spec.eq.callback = mana_ib_event_handler;
 	spec.eq.context = mdev;
 	spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE;
-	spec.eq.msix_index = 0;
+
+	msi = 0;
+	gic = mana_gd_get_gic(gc, false, &msi);
+	if (!gic)
+		return -ENOMEM;
+	spec.eq.msix_index = msi;
 
 	err = mana_gd_create_mana_eq(mdev->gdma_dev, &spec, &mdev->fatal_err_eq);
-	if (err)
+	if (err) {
+		mana_gd_put_gic(gc, false, 0);
 		return err;
+	}
+	mdev->fatal_err_eq->eq.irq = gic->irq;
 
 	mdev->eqs = kzalloc_objs(struct gdma_queue *,
 				 mdev->ib_dev.num_comp_vectors);
@@ -772,32 +781,50 @@ int mana_ib_create_eqs(struct mana_ib_dev *mdev)
 	}
 	spec.eq.callback = NULL;
 	for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) {
-		spec.eq.msix_index = (i + 1) % gc->num_msix_usable;
+		msi = (i + 1) % gc->num_msix_usable;
+
+		gic = mana_gd_get_gic(gc, false, &msi);
+		if (!gic) {
+			err = -ENOMEM;
+			goto destroy_eqs;
+		}
+		spec.eq.msix_index = msi;
+
 		err = mana_gd_create_mana_eq(mdev->gdma_dev, &spec, &mdev->eqs[i]);
-		if (err)
+		if (err) {
+			mana_gd_put_gic(gc, false, msi);
 			goto destroy_eqs;
+		}
+		mdev->eqs[i]->eq.irq = gic->irq;
 	}
 
 	return 0;
 
 destroy_eqs:
-	while (i-- > 0)
+	while (i-- > 0) {
 		mana_gd_destroy_queue(gc, mdev->eqs[i]);
+		mana_gd_put_gic(gc, false, (i + 1) % gc->num_msix_usable);
+	}
 	kfree(mdev->eqs);
 destroy_fatal_eq:
 	mana_gd_destroy_queue(gc, mdev->fatal_err_eq);
+	mana_gd_put_gic(gc, false, 0);
 	return err;
 }
 
 void mana_ib_destroy_eqs(struct mana_ib_dev *mdev)
 {
 	struct gdma_context *gc = mdev_to_gc(mdev);
-	int i;
+	int i, msi;
 
 	mana_gd_destroy_queue(gc, mdev->fatal_err_eq);
+	mana_gd_put_gic(gc, false, 0);
 
-	for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++)
+	for (i = 0; i < mdev->ib_dev.num_comp_vectors; i++) {
 		mana_gd_destroy_queue(gc, mdev->eqs[i]);
+		msi = (i + 1) % gc->num_msix_usable;
+		mana_gd_put_gic(gc, false, msi);
+	}
 
 	kfree(mdev->eqs);
 }
diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
index 6c138cc77407..d84e474309a3 100644
--- a/include/net/mana/gdma.h
+++ b/include/net/mana/gdma.h
@@ -615,6 +615,7 @@ enum {
 #define GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG BIT(3)
 #define GDMA_DRV_CAP_FLAG_1_GDMA_PAGES_4MB_1GB_2GB BIT(4)
 #define GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT BIT(5)
+#define GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6)
 
 /* Driver can handle holes (zeros) in the device list */
 #define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11)
@@ -631,7 +632,8 @@ enum {
 /* Driver detects stalled send queues and recovers them */
 #define GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY BIT(18)
 
-#define GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6)
+/* Driver supports separate EQ/MSIs for each vPort */
+#define GDMA_DRV_CAP_FLAG_1_EQ_MSI_UNSHARE_MULTI_VPORT BIT(19)
 
 /* Driver supports linearizing the skb when num_sge exceeds hardware limit */
 #define GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE BIT(20)
@@ -659,7 +661,8 @@ enum {
 	 GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE | \
 	 GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY | \
 	 GDMA_DRV_CAP_FLAG_1_HANDLE_STALL_SQ_RECOVERY | \
-	 GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECOVERY)
+	 GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECOVERY | \
+	 GDMA_DRV_CAP_FLAG_1_EQ_MSI_UNSHARE_MULTI_VPORT)
 
 #define GDMA_DRV_CAP_FLAGS2 0
 
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH v2 1/2] Drivers: hv: vmbus: Provide option to skip VMBus unload on panic
From: Wei Liu @ 2026-05-13 22:32 UTC (permalink / raw)
  To: Michael Kelley
  Cc: wei.liu@kernel.org, tzimmermann@suse.de, longli@microsoft.com,
	jfalempe@redhat.com, drawat.floss@gmail.com,
	maarten.lankhorst@linux.intel.com, mripard@kernel.org,
	airlied@gmail.com, simona@ffwll.ch, kys@microsoft.com,
	haiyangz@microsoft.com, decui@microsoft.com, ryasuoka@redhat.com,
	dri-devel@lists.freedesktop.org, linux-kernel@vger.kernel.org,
	linux-hyperv@vger.kernel.org, stable@vger.kernel.org
In-Reply-To: <SN6PR02MB415786A3C7D10C22FAB12E3ED4312@SN6PR02MB4157.namprd02.prod.outlook.com>

On Mon, May 04, 2026 at 08:47:50PM +0000, Michael Kelley wrote:
> From: Michael Kelley <mhklkml@zohomail.com> Sent: Tuesday, February 17, 2026 10:24 AM
> > 
> 
> Wei and Thomas --
> 
> This small patch series has been neglected. Patch 2 of the series is here [1].
> 
> Long Li < longli@microsoft.com> has given a Reviewed-by on this patch,
> and Jocelyn Falempe <jfalempe@redhat.com> has given a Reviewed-by
> on Patch 2 of the series, modulo a comment which I have incorporated.
> See [2]. But I neglected to add her R-b when I spun v2 of the series.
> 
> Any reason this can't be picked up as a bug fix for 7.1? I just checked,
> and it applies cleanly to a recent linux-next (20260423). I'd suggest
> going through the hyperv tree, as these two patches should be kept
> together in sequence.
> 

I thought they were not reviewed yet. I have now applied them to hyperv-fixes.

Thanks for the reminder.

Wei

^ permalink raw reply

* RE: [EXTERNAL] Re: [PATCH net-next v7 0/6] net: mana: Per-vPort EQ and MSI-X interrupt management
From: Long Li @ 2026-05-13 22:43 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Konstantin Taranov, David S . Miller, Paolo Abeni, Eric Dumazet,
	Andrew Lunn, Jason Gunthorpe, Leon Romanovsky, Haiyang Zhang,
	KY Srinivasan, Wei Liu, Dexuan Cui,
	shradhagupta@linux.microsoft.com, Simon Horman,
	netdev@vger.kernel.org, linux-rdma@vger.kernel.org,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <20260511191540.630e09b3@kernel.org>

> On Thu,  7 May 2026 12:12:31 -0700 Long Li wrote:
> > This series adds per-vPort Event Queue (EQ) allocation and MSI-X
> > interrupt management for the MANA driver. Previously, all vPorts
> > shared a single set of EQs. This change enables dedicated EQs per
> > vPort with support for both dedicated and shared MSI-X vector allocation
> modes.
> 
> Once all the AI review comments are address / only false positives remain - could
> you pop these patches on a branch and add PR info to the cover letter so that
> both RDMA and netdev can pull this?

I have sent v9 with PR info in the cover letter.

Thanks,
Long

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox