* [PATCH] mshv: Align huge page stride with guest mapping
@ 2025-12-17 0:41 Stanislav Kinsburskii
2025-12-18 19:41 ` Michael Kelley
0 siblings, 1 reply; 18+ messages in thread
From: Stanislav Kinsburskii @ 2025-12-17 0:41 UTC (permalink / raw)
To: kys, haiyangz, wei.liu, decui, longli; +Cc: linux-hyperv, linux-kernel
Ensure that a stride larger than 1 (huge page) is only used when both
the guest frame number (gfn) and the operation size (page_count) are
aligned to the huge page size (PTRS_PER_PMD). This matches the
hypervisor requirement that map/unmap operations for huge pages must be
guest-aligned and cover a full huge page.
Add mshv_chunk_stride() to encapsulate this alignment and page-order
validation, and plumb a huge_page flag into the region chunk handlers.
This prevents issuing large-page map/unmap/share operations that the
hypervisor would reject due to misaligned guest mappings.
Fixes: abceb4297bf8 ("mshv: Fix huge page handling in memory region traversal")
Signed-off-by: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com>
---
drivers/hv/mshv_regions.c | 94 ++++++++++++++++++++++++++++++---------------
1 file changed, 63 insertions(+), 31 deletions(-)
diff --git a/drivers/hv/mshv_regions.c b/drivers/hv/mshv_regions.c
index 30bacba6aec3..29776019bcde 100644
--- a/drivers/hv/mshv_regions.c
+++ b/drivers/hv/mshv_regions.c
@@ -19,6 +19,42 @@
#define MSHV_MAP_FAULT_IN_PAGES PTRS_PER_PMD
+/**
+ * mshv_chunk_stride - Compute stride for mapping guest memory
+ * @page : The page to check for huge page backing
+ * @gfn : Guest frame number for the mapping
+ * @page_count: Total number of pages in the mapping
+ *
+ * Determines the appropriate stride (in pages) for mapping guest memory.
+ * Uses huge page stride if the backing page is huge and the guest mapping
+ * is properly aligned; otherwise falls back to single page stride.
+ *
+ * Return: Stride in pages, or -EINVAL if page order is unsupported.
+ */
+static int mshv_chunk_stride(struct page *page,
+ u64 gfn, u64 page_count)
+{
+ unsigned int page_order;
+
+ page_order = folio_order(page_folio(page));
+ /* The hypervisor only supports 4K and 2M page sizes */
+ if (page_order && page_order != PMD_ORDER)
+ return -EINVAL;
+
+ /*
+ * Default to a single page stride. If page_order is set and both
+ * the guest frame number (gfn) and page_count are huge-page
+ * aligned (PTRS_PER_PMD), use a larger stride so the mapping can
+ * be backed by a huge page in both guest and hypervisor.
+ */
+ if (page_order &&
+ IS_ALIGNED(gfn, PTRS_PER_PMD) &&
+ IS_ALIGNED(page_count, PTRS_PER_PMD))
+ return 1 << page_order;
+
+ return 1;
+}
+
/**
* mshv_region_process_chunk - Processes a contiguous chunk of memory pages
* in a region.
@@ -45,25 +81,23 @@ static long mshv_region_process_chunk(struct mshv_mem_region *region,
int (*handler)(struct mshv_mem_region *region,
u32 flags,
u64 page_offset,
- u64 page_count))
+ u64 page_count,
+ bool huge_page))
{
- u64 count, stride;
- unsigned int page_order;
+ u64 gfn = region->start_gfn + page_offset;
+ u64 count;
struct page *page;
- int ret;
+ int stride, ret;
page = region->pages[page_offset];
if (!page)
return -EINVAL;
- page_order = folio_order(page_folio(page));
- /* The hypervisor only supports 4K and 2M page sizes */
- if (page_order && page_order != PMD_ORDER)
- return -EINVAL;
-
- stride = 1 << page_order;
+ stride = mshv_chunk_stride(page, gfn, page_count);
+ if (stride < 0)
+ return stride;
- /* Start at stride since the first page is validated */
+ /* Start at stride since the first stride is validated */
for (count = stride; count < page_count; count += stride) {
page = region->pages[page_offset + count];
@@ -71,12 +105,13 @@ static long mshv_region_process_chunk(struct mshv_mem_region *region,
if (!page)
break;
- /* Break if page size changes */
- if (page_order != folio_order(page_folio(page)))
+ /* Break if stride size changes */
+ if (stride != mshv_chunk_stride(page, gfn + count,
+ page_count - count))
break;
}
- ret = handler(region, flags, page_offset, count);
+ ret = handler(region, flags, page_offset, count, stride > 1);
if (ret)
return ret;
@@ -108,7 +143,8 @@ static int mshv_region_process_range(struct mshv_mem_region *region,
int (*handler)(struct mshv_mem_region *region,
u32 flags,
u64 page_offset,
- u64 page_count))
+ u64 page_count,
+ bool huge_page))
{
long ret;
@@ -162,11 +198,10 @@ struct mshv_mem_region *mshv_region_create(u64 guest_pfn, u64 nr_pages,
static int mshv_region_chunk_share(struct mshv_mem_region *region,
u32 flags,
- u64 page_offset, u64 page_count)
+ u64 page_offset, u64 page_count,
+ bool huge_page)
{
- struct page *page = region->pages[page_offset];
-
- if (PageHuge(page) || PageTransCompound(page))
+ if (huge_page)
flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
return hv_call_modify_spa_host_access(region->partition->pt_id,
@@ -188,11 +223,10 @@ int mshv_region_share(struct mshv_mem_region *region)
static int mshv_region_chunk_unshare(struct mshv_mem_region *region,
u32 flags,
- u64 page_offset, u64 page_count)
+ u64 page_offset, u64 page_count,
+ bool huge_page)
{
- struct page *page = region->pages[page_offset];
-
- if (PageHuge(page) || PageTransCompound(page))
+ if (huge_page)
flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
return hv_call_modify_spa_host_access(region->partition->pt_id,
@@ -212,11 +246,10 @@ int mshv_region_unshare(struct mshv_mem_region *region)
static int mshv_region_chunk_remap(struct mshv_mem_region *region,
u32 flags,
- u64 page_offset, u64 page_count)
+ u64 page_offset, u64 page_count,
+ bool huge_page)
{
- struct page *page = region->pages[page_offset];
-
- if (PageHuge(page) || PageTransCompound(page))
+ if (huge_page)
flags |= HV_MAP_GPA_LARGE_PAGE;
return hv_call_map_gpa_pages(region->partition->pt_id,
@@ -295,11 +328,10 @@ int mshv_region_pin(struct mshv_mem_region *region)
static int mshv_region_chunk_unmap(struct mshv_mem_region *region,
u32 flags,
- u64 page_offset, u64 page_count)
+ u64 page_offset, u64 page_count,
+ bool huge_page)
{
- struct page *page = region->pages[page_offset];
-
- if (PageHuge(page) || PageTransCompound(page))
+ if (huge_page)
flags |= HV_UNMAP_GPA_LARGE_PAGE;
return hv_call_unmap_gpa_pages(region->partition->pt_id,
^ permalink raw reply related	[flat|nested] 18+ messages in thread

* RE: [PATCH] mshv: Align huge page stride with guest mapping
  2025-12-17  0:41 [PATCH] mshv: Align huge page stride with guest mapping Stanislav Kinsburskii
@ 2025-12-18 19:41 ` Michael Kelley
  2025-12-19 22:53   ` Stanislav Kinsburskii
  0 siblings, 1 reply; 18+ messages in thread

From: Michael Kelley @ 2025-12-18 19:41 UTC (permalink / raw)
To: Stanislav Kinsburskii, kys@microsoft.com, haiyangz@microsoft.com,
	wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com
Cc: linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org

From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Tuesday, December 16, 2025 4:41 PM
>
> Ensure that a stride larger than 1 (huge page) is only used when both
> the guest frame number (gfn) and the operation size (page_count) are
> aligned to the huge page size (PTRS_PER_PMD). This matches the
> hypervisor requirement that map/unmap operations for huge pages must be
> guest-aligned and cover a full huge page.
>
> Add mshv_chunk_stride() to encapsulate this alignment and page-order
> validation, and plumb a huge_page flag into the region chunk handlers.
> This prevents issuing large-page map/unmap/share operations that the
> hypervisor would reject due to misaligned guest mappings.

This code looks good to me on the surface. But I can only make an educated
guess as to the hypervisor behavior in certain situations, and if my guess is
correct there's still a flaw in one case.

Consider the madvise() DONTNEED experiment that I previously called out. [1]
I surmise that the intent of this patch is to make that case work correctly.
When the .invalidate callback is made for the 32 Kbyte range embedded in
a previously mapped 2 Meg page, this new code detects that case. It calls the
hypervisor to remap the 32 Kbyte range for no access, and clears the 8
corresponding entries in the struct page array attached to the mshv region.
The call to the hypervisor is made *without* the HV_MAP_GPA_LARGE_PAGE flag.
Since the mapping was originally done *with* the HV_MAP_GPA_LARGE_PAGE
flag, my guess is that the hypervisor is smart enough to handle this case by
splitting the 2 Meg mapping it created, setting the 32 Kbyte range to no access,
and returning "success". If my guess is correct, there's no problem here.

But then there's a second .invalidate callback for the entire 2 Meg page. Here's
the call stack:

[  194.259337] dump_stack+0x14/0x20
[  194.259339] mhktest_invalidate+0x2a/0x40   [my dummy invalidate callback]
[  194.259342] __mmu_notifier_invalidate_range_start+0x1f4/0x250
[  194.259347] __split_huge_pmd+0x14f/0x170
[  194.259349] unmap_page_range+0x104d/0x1a00
[  194.259358] unmap_single_vma+0x7d/0xc0
[  194.259360] zap_page_range_single_batched+0xe0/0x1c0
[  194.259363] madvise_vma_behavior+0xb01/0xc00
[  194.259366] madvise_do_behavior.part.0+0x3cd/0x4a0
[  194.259375] do_madvise+0xc7/0x170
[  194.259380] __x64_sys_madvise+0x2f/0x40
[  194.259382] x64_sys_call+0x1d77/0x21b0
[  194.259385] do_syscall_64+0x56/0x640
[  194.259388] entry_SYSCALL_64_after_hwframe+0x76/0x7e

In __split_huge_pmd(), the .invalidate callback is made *before* the 2 Meg
page is actually split by the root partition. So mshv_chunk_stride() returns
"512" for the stride, and the hypervisor is called with HV_MAP_GPA_LARGE_PAGE
set. My guess is that the hypervisor returns an error because it has already
split the mapping. The whole point of this patch set is to avoid passing
HV_MAP_GPA_LARGE_PAGE to the hypervisor when the hypervisor mapping
is not a large page mapping, but this looks like a case where it still happens.

My concern is solely from looking at the code and thinking about the problem,
as I don't have an environment where I can test root partition interactions
with the hypervisor. So maybe I'm missing something. Lemme know what you
think .....

Michael

[1] https://lore.kernel.org/linux-hyperv/SN6PR02MB4157978DFAA6C2584D0678E1D4A1A@SN6PR02MB4157.namprd02.prod.outlook.com/

^ permalink raw reply	[flat|nested] 18+ messages in thread
* Re: [PATCH] mshv: Align huge page stride with guest mapping
  2025-12-18 19:41 ` Michael Kelley
@ 2025-12-19 22:53   ` Stanislav Kinsburskii
  2025-12-22 18:25     ` Michael Kelley
  0 siblings, 1 reply; 18+ messages in thread

From: Stanislav Kinsburskii @ 2025-12-19 22:53 UTC (permalink / raw)
To: Michael Kelley
Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org,
	decui@microsoft.com, longli@microsoft.com,
	linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org

On Thu, Dec 18, 2025 at 07:41:24PM +0000, Michael Kelley wrote:
> In __split_huge_pmd(), the .invalidate callback is made *before* the 2 Meg
> page is actually split by the root partition. [...] The whole point of this
> patch set is to avoid passing HV_MAP_GPA_LARGE_PAGE to the hypervisor when
> the hypervisor mapping is not a large page mapping, but this looks like a
> case where it still happens.
>
> My concern is solely from looking at the code and thinking about the problem,
> as I don't have an environment where I can test root partition interactions
> with the hypervisor. So maybe I'm missing something. Lemme know what you
> think .....

Yeah, I see your point: according to this stack, once a part of the page
is invalidated, the folio order remains the same until another invocation
of the same callback - this time for the whole huge page - is made. Thus,
the stride is still reported as the huge page size, even though a part of
the page has already been unmapped.

This indeed looks like a flaw in the current approach, but it's actually
not. The reason is that upon the invalidation callback, the driver
simply remaps the whole huge page with no access (in this case, the PFNs
provided to the hypervisor are zero), and it's fine as the hypervisor
simply drops all the pages from the previous mapping and marks this page
as inaccessible. The only check the hypervisor makes in this case is
that both the GFN and mapping size are huge page aligned (which they are
in this case).

I hope this clarifies the situation. Please let me know if you have any
other questions.

Thanks,
Stanislav

^ permalink raw reply	[flat|nested] 18+ messages in thread
* RE: [PATCH] mshv: Align huge page stride with guest mapping 2025-12-19 22:53 ` Stanislav Kinsburskii @ 2025-12-22 18:25 ` Michael Kelley 2025-12-23 15:51 ` Michael Kelley 2025-12-23 16:27 ` Stanislav Kinsburskii 0 siblings, 2 replies; 18+ messages in thread From: Michael Kelley @ 2025-12-22 18:25 UTC (permalink / raw) To: Stanislav Kinsburskii Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com, linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, December 19, 2025 2:54 PM > > On Thu, Dec 18, 2025 at 07:41:24PM +0000, Michael Kelley wrote: > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Tuesday, > December 16, 2025 4:41 PM > > > > > > Ensure that a stride larger than 1 (huge page) is only used when both > > > the guest frame number (gfn) and the operation size (page_count) are > > > aligned to the huge page size (PTRS_PER_PMD). This matches the > > > hypervisor requirement that map/unmap operations for huge pages must be > > > guest-aligned and cover a full huge page. > > > > > > Add mshv_chunk_stride() to encapsulate this alignment and page-order > > > validation, and plumb a huge_page flag into the region chunk handlers. > > > This prevents issuing large-page map/unmap/share operations that the > > > hypervisor would reject due to misaligned guest mappings. > > > > This code looks good to me on the surface. But I can only make an educated > > guess as to the hypervisor behavior in certain situations, and if my guess is > > correct there's still a flaw in one case. > > > > Consider the madvise() DONTNEED experiment that I previously called out. [1] > > I surmise that the intent of this patch is to make that case work correctly. > > When the .invalidate callback is made for the 32 Kbyte range embedded in > > a previously mapped 2 Meg page, this new code detects that case. 
It calls the > > hypervisor to remap the 32 Kbyte range for no access, and clears the 8 > > corresponding entries in the struct page array attached to the mshv region. The > > call to the hypervisor is made *without* the HV_MAP_GPA_LARGE_PAGE flag. > > Since the mapping was originally done *with* the HV_MAP_GPA_LARGE_PAGE > > flag, my guess is that the hypervisor is smart enough to handle this case by > > splitting the 2 Meg mapping it created, setting the 32 Kbyte range to no access, > > and returning "success". If my guess is correct, there's no problem here. > > > > But then there's a second .invalidate callback for the entire 2 Meg page. Here's > > the call stack: > > > > [ 194.259337] dump_stack+0x14/0x20 > > [ 194.259339] mhktest_invalidate+0x2a/0x40 [my dummy invalidate callback] > > [ 194.259342] __mmu_notifier_invalidate_range_start+0x1f4/0x250 > > [ 194.259347] __split_huge_pmd+0x14f/0x170 > > [ 194.259349] unmap_page_range+0x104d/0x1a00 > > [ 194.259358] unmap_single_vma+0x7d/0xc0 > > [ 194.259360] zap_page_range_single_batched+0xe0/0x1c0 > > [ 194.259363] madvise_vma_behavior+0xb01/0xc00 > > [ 194.259366] madvise_do_behavior.part.0+0x3cd/0x4a0 > > [ 194.259375] do_madvise+0xc7/0x170 > > [ 194.259380] __x64_sys_madvise+0x2f/0x40 > > [ 194.259382] x64_sys_call+0x1d77/0x21b0 > > [ 194.259385] do_syscall_64+0x56/0x640 > > [ 194.259388] entry_SYSCALL_64_after_hwframe+0x76/0x7e > > > > In __split_huge_pmd(), the .invalidate callback is made *before* the 2 Meg > > page is actually split by the root partition. So mshv_chunk_stride() returns "9" > > for the stride, and the hypervisor is called with HV_MAP_GPA_LARGE_PAGE > > set. My guess is that the hypervisor returns an error because it has already > > split the mapping. The whole point of this patch set is to avoid passing > > HV_MAP_GPA_LARGE_PAGE to the hypervisor when the hypervisor mapping > > is not a large page mapping, but this looks like a case where it still happens. 
> > > > My concern is solely from looking at the code and thinking about the problem, > > as I don't have an environment where I can test root partition interactions > > with the hypervisor. So maybe I'm missing something. Lemme know what you > > think ..... > > > > Yeah, I see your point: according to this stack, once a part of the page > is invalidated, the folio order remains the same until another invocation > of the same callback - this time for the whole huge > page - is made. Thus, the stride is still reported as the huge page size, > even though a part of the page has already been unmapped. > > This indeed looks like a flaw in the current approach, but it's actually > not. The reason is that upon the invalidation callback, the driver > simply remaps the whole huge page with no access (in this case, the PFNs > provided to the hypervisor are zero), and it's fine as the hypervisor > simply drops all the pages from the previous mapping and marks this page > as inaccessible. The only check the hypervisor makes in this case is > that both the GFN and mapping size are huge page aligned (which they are > in this case). > > I hope this clarifies the situation. Please let me know if you have any > other questions. Thanks. Yes, this clarifies. My guess about the hypervisor behavior was wrong. Based on what you've said about what the hypervisor does, and further studying MSHV code, here's my recap of the HV_MAP_GPA_LARGE_PAGE flag: 1. The hypervisor uses the flag to determine the granularity (4K or 2M) of the mapping HVCALL_MAP_GPA_PAGES or HVCALL_UNMAP_GPA_PAGES will create/remove. As such, the hypercall "repcount" is in this granularity. GFNs, such as the target_gpa_base input parameter and GFNs in the pfn_array, are always 4K GFNs, but if the flag is set, a GFN is treated as the first 4K GFN in a contiguous 2M range. If the flag is set, the target_gpa_base GFN must be 2M aligned. 2. The hypervisor doesn't care whether any existing mapping is 4K or 2M. 
It always removes an existing mapping, including splitting any 2M mappings if necessary. Then if the operation is to create/re-create a mapping, it creates an appropriate new mapping. My error was in thinking that the flag had to match any existing mapping. But the behavior you've clarified is certainly better. It handles the vagaries of the Linux "mm" subsystem, which in one case in my original experiment (madvise) invalidates the small range, then the 2M range, but the other case (mprotect) invalidates the 2M range, then the small range.

Since there's no documentation for these root partition hypercalls, it sure would be nice if this info could be captured in code comments for some future developer to benefit from. If that's not something you want to worry about, I could submit a patch later to add the code comments (subject to your review, of course).

Separately, in looking at this, I spotted another potential problem with 2 Meg mappings that somewhat depends on hypervisor behavior that I'm not clear on. To create a new region, the user space VMM issues the MSHV_GET_GUEST_MEMORY ioctl, specifying the userspace address, the size, and the guest PFN. The only requirement on these values is that the userspace address and size be page aligned. But suppose a 4 Meg region is specified where the userspace address and the guest PFN have different offsets modulo 2 Meg. The userspace address range gets populated first, and may contain a 2 Meg large page. Then when mshv_chunk_stride() detects a 2 Meg aligned guest PFN so HVCALL_MAP_GPA_PAGES can be told to create a 2 Meg mapping for the guest, the corresponding system PFN in the page array may not be 2 Meg aligned. What does the hypervisor do in this case? It can't create a 2 Meg mapping, right? So does it silently fall back to creating 4K mappings, or does it return an error?
Returning an error would seem to be problematic for movable pages because the error wouldn't occur until the guest VM is running and takes a range fault on the region. Silently falling back to creating 4K mappings has performance implications, though I guess it would work. My question is whether the MSHV_GET_GUEST_MEMORY ioctl should detect this case and return an error immediately.

Michael

> > > > [1] https://lore.kernel.org/linux-hyperv/SN6PR02MB4157978DFAA6C2584D0678E1D4A1A@SN6PR02MB4157.namprd02.prod.outlook.com/

^ permalink raw reply	[flat|nested] 18+ messages in thread
* RE: [PATCH] mshv: Align huge page stride with guest mapping 2025-12-22 18:25 ` Michael Kelley @ 2025-12-23 15:51 ` Michael Kelley 2025-12-23 16:26 ` Stanislav Kinsburskii 2025-12-23 16:27 ` Stanislav Kinsburskii 1 sibling, 1 reply; 18+ messages in thread From: Michael Kelley @ 2025-12-23 15:51 UTC (permalink / raw) To: Stanislav Kinsburskii Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com, linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org From: Michael Kelley Sent: Monday, December 22, 2025 10:25 AM > [snip] > > Separately, in looking at this, I spotted another potential problem with > 2 Meg mappings that somewhat depends on hypervisor behavior that I'm > not clear on. To create a new region, the user space VMM issues the > MSHV_GET_GUEST_MEMORY ioctl, specifying the userspace address, the > size, and the guest PFN. The only requirement on these values is that the > userspace address and size be page aligned. But suppose a 4 Meg region is > specified where the userspace address and the guest PFN have different > offsets modulo 2 Meg. The userspace address range gets populated first, > and may contain a 2 Meg large page. Then when mshv_chunk_stride() > detects a 2 Meg aligned guest PFN so HVCALL_MAP_GPA_PAGES can be told > to create a 2 Meg mapping for the guest, the corresponding system PFN in > the page array may not be 2 Meg aligned. What does the hypervisor do in > this case? It can't create a 2 Meg mapping, right? So does it silently fallback > to creating 4K mappings, or does it return an error? Returning an error would > seem to be problematic for movable pages because the error wouldn't > occur until the guest VM is running and takes a range fault on the region. > Silently falling back to creating 4K mappings has performance implications, > though I guess it would work. My question is whether the > MSHV_GET_GUEST_MEMORY ioctl should detect this case and return an > error immediately. 
>

In thinking about this more, I can answer my own question about the hypervisor behavior. When HV_MAP_GPA_LARGE_PAGE is set, the full list of 4K system PFNs is not provided as an input to the hypercall, so the hypervisor cannot silently fall back to 4K mappings. Assuming sequential PFNs would be wrong, so it must return an error if the alignment of a system PFN isn't on a 2 Meg boundary.

For a pinned region, this error happens in mshv_region_map() as called from mshv_prepare_pinned_region(), so will propagate back to the ioctl. But the error happens only if pin_user_pages_fast() allocates one or more 2 Meg pages. So creating a pinned region where the guest PFN and userspace address have different offsets modulo 2 Meg might or might not succeed.

For a movable region, the error probably can't occur. mshv_region_handle_gfn_fault() builds an aligned 2 Meg chunk around the faulting guest PFN. mshv_region_range_fault() then determines the corresponding userspace addr, which won't be on a 2 Meg boundary, so the allocated memory won't contain a 2 Meg page. With no 2 Meg pages, mshv_region_remap_pages() will always do 4K mappings and will succeed. The downside is that a movable region with a guest PFN and userspace address with different offsets never gets any 2 Meg pages or mappings.

My conclusion is the same -- such misalignment should not be allowed when creating a region that has the potential to use 2 Meg pages. Regions less than 2 Meg in size could be excluded from such a requirement if there is benefit in doing so. It's possible to have regions up to (but not including) 4 Meg where the alignment prevents having a 2 Meg page, and those could also be excluded from the requirement.

Michael
* Re: [PATCH] mshv: Align huge page stride with guest mapping 2025-12-23 15:51 ` Michael Kelley @ 2025-12-23 16:26 ` Stanislav Kinsburskii 2025-12-23 19:17 ` Michael Kelley 0 siblings, 1 reply; 18+ messages in thread From: Stanislav Kinsburskii @ 2025-12-23 16:26 UTC (permalink / raw) To: Michael Kelley Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com, linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org On Tue, Dec 23, 2025 at 03:51:22PM +0000, Michael Kelley wrote: > From: Michael Kelley Sent: Monday, December 22, 2025 10:25 AM > > > [snip] > > > > Separately, in looking at this, I spotted another potential problem with > > 2 Meg mappings that somewhat depends on hypervisor behavior that I'm > > not clear on. To create a new region, the user space VMM issues the > > MSHV_GET_GUEST_MEMORY ioctl, specifying the userspace address, the > > size, and the guest PFN. The only requirement on these values is that the > > userspace address and size be page aligned. But suppose a 4 Meg region is > > specified where the userspace address and the guest PFN have different > > offsets modulo 2 Meg. The userspace address range gets populated first, > > and may contain a 2 Meg large page. Then when mshv_chunk_stride() > > detects a 2 Meg aligned guest PFN so HVCALL_MAP_GPA_PAGES can be told > > to create a 2 Meg mapping for the guest, the corresponding system PFN in > > the page array may not be 2 Meg aligned. What does the hypervisor do in > > this case? It can't create a 2 Meg mapping, right? So does it silently fallback > > to creating 4K mappings, or does it return an error? Returning an error would > > seem to be problematic for movable pages because the error wouldn't > > occur until the guest VM is running and takes a range fault on the region. > > Silently falling back to creating 4K mappings has performance implications, > > though I guess it would work. 
My question is whether the > > MSHV_GET_GUEST_MEMORY ioctl should detect this case and return an > > error immediately. > > > > In thinking about this more, I can answer my own question about the > hypervisor behavior. When HVCALL_MAP_GPA_PAGES is set, the full > list of 4K system PFNs is not provided as an input to the hypercall, so > the hypervisor cannot silently fall back to 4K mappings. Assuming > sequential PFNs would be wrong, so it must return an error if the > alignment of a system PFN isn't on a 2 Meg boundary. > > For a pinned region, this error happens in mshv_region_map() as > called from mshv_prepare_pinned_region(), so will propagate back > to the ioctl. But the error happens only if pin_user_pages_fast() > allocates one or more 2 Meg pages. So creating a pinned region > where the guest PFN and userspace address have different offsets > modulo 2 Meg might or might not succeed. > > For a movable region, the error probably can't occur. > mshv_region_handle_gfn_fault() builds an aligned 2 Meg chunk > around the faulting guest PFN. mshv_region_range_fault() then > determines the corresponding userspace addr, which won't be on > a 2 Meg boundary, so the allocated memory won't contain a 2 Meg > page. With no 2 Meg pages, mshv_region_remap_pages() will > always do 4K mappings and will succeed. The downside is that a > movable region with a guest PFN and userspace address with > different offsets never gets any 2 Meg pages or mappings. > > My conclusion is the same -- such misalignment should not be > allowed when creating a region that has the potential to use 2 Meg > pages. Regions less than 2 Meg in size could be excluded from such > a requirement if there is benefit in doing so. It's possible to have > regions up to (but not including) 4 Meg where the alignment prevents > having a 2 Meg page, and those could also be excluded from the > requirement. > I'm not sure I understand the problem. There are three cases to consider: 1. 
Guest mapping, where page sizes are controlled by the guest.
2. Host mapping, where page sizes are controlled by the host.
3. Hypervisor mapping, where page sizes are controlled by the hypervisor.

The first case is not relevant here and is included for completeness.

The second and third cases (host and hypervisor) share the memory layout, but it is up to each entity to decide which page sizes to use. For example, the host might map the proposed 4M region with only 4K pages, even if a 2M page is available in the middle. In this case, the host will map the memory as represented by 4K pages, but the hypervisor can still discover the 2M page in the middle and adjust its page tables to use a 2M page. This adjustment happens at runtime. Could this be the missing detail here?

Thanks,
Stanislav

> Michael
* RE: [PATCH] mshv: Align huge page stride with guest mapping 2025-12-23 16:26 ` Stanislav Kinsburskii @ 2025-12-23 19:17 ` Michael Kelley 2026-01-02 17:42 ` Stanislav Kinsburskii 0 siblings, 1 reply; 18+ messages in thread From: Michael Kelley @ 2025-12-23 19:17 UTC (permalink / raw) To: Stanislav Kinsburskii Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com, linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Tuesday, December 23, 2025 8:26 AM > > On Tue, Dec 23, 2025 at 03:51:22PM +0000, Michael Kelley wrote: > > From: Michael Kelley Sent: Monday, December 22, 2025 10:25 AM > > > > > [snip] > > > > > > Separately, in looking at this, I spotted another potential problem with > > > 2 Meg mappings that somewhat depends on hypervisor behavior that I'm > > > not clear on. To create a new region, the user space VMM issues the > > > MSHV_GET_GUEST_MEMORY ioctl, specifying the userspace address, the > > > size, and the guest PFN. The only requirement on these values is that the > > > userspace address and size be page aligned. But suppose a 4 Meg region is > > > specified where the userspace address and the guest PFN have different > > > offsets modulo 2 Meg. The userspace address range gets populated first, > > > and may contain a 2 Meg large page. Then when mshv_chunk_stride() > > > detects a 2 Meg aligned guest PFN so HVCALL_MAP_GPA_PAGES can be told > > > to create a 2 Meg mapping for the guest, the corresponding system PFN in > > > the page array may not be 2 Meg aligned. What does the hypervisor do in > > > this case? It can't create a 2 Meg mapping, right? So does it silently fallback > > > to creating 4K mappings, or does it return an error? Returning an error would > > > seem to be problematic for movable pages because the error wouldn't > > > occur until the guest VM is running and takes a range fault on the region. 
> > > Silently falling back to creating 4K mappings has performance implications, > > > though I guess it would work. My question is whether the > > > MSHV_GET_GUEST_MEMORY ioctl should detect this case and return an > > > error immediately. > > > > > > > In thinking about this more, I can answer my own question about the > > hypervisor behavior. When HVCALL_MAP_GPA_PAGES is set, the full > > list of 4K system PFNs is not provided as an input to the hypercall, so > > the hypervisor cannot silently fall back to 4K mappings. Assuming > > sequential PFNs would be wrong, so it must return an error if the > > alignment of a system PFN isn't on a 2 Meg boundary. > > > > For a pinned region, this error happens in mshv_region_map() as > > called from mshv_prepare_pinned_region(), so will propagate back > > to the ioctl. But the error happens only if pin_user_pages_fast() > > allocates one or more 2 Meg pages. So creating a pinned region > > where the guest PFN and userspace address have different offsets > > modulo 2 Meg might or might not succeed. > > > > For a movable region, the error probably can't occur. > > mshv_region_handle_gfn_fault() builds an aligned 2 Meg chunk > > around the faulting guest PFN. mshv_region_range_fault() then > > determines the corresponding userspace addr, which won't be on > > a 2 Meg boundary, so the allocated memory won't contain a 2 Meg > > page. With no 2 Meg pages, mshv_region_remap_pages() will > > always do 4K mappings and will succeed. The downside is that a > > movable region with a guest PFN and userspace address with > > different offsets never gets any 2 Meg pages or mappings. > > > > My conclusion is the same -- such misalignment should not be > > allowed when creating a region that has the potential to use 2 Meg > > pages. Regions less than 2 Meg in size could be excluded from such > > a requirement if there is benefit in doing so. 
It's possible to have
> > regions up to (but not including) 4 Meg where the alignment prevents
> > having a 2 Meg page, and those could also be excluded from the
> > requirement.
> >
>
> I'm not sure I understand the problem.
> There are three cases to consider:
> 1. Guest mapping, where page sizes are controlled by the guest.
> 2. Host mapping, where page sizes are controlled by the host.

And by "host", you mean specifically the Linux instance running in the root partition. It hosts the VMM processes and creates the memory regions for each guest.

> 3. Hypervisor mapping, where page sizes are controlled by the hypervisor.
>
> The first case is not relevant here and is included for completeness.

Agreed.

>
> The second and third cases (host and hypervisor) share the memory layout,

Right. More specifically, they are both operating on the same set of physical memory pages, and hence "share" a set of what I've referred to as "system PFNs" (to distinguish from guest PFNs, or GFNs).

> but it is up
> to each entity to decide which page sizes to use. For example, the host might map the
> proposed 4M region with only 4K pages, even if a 2M page is available in the middle.

Agreed.

> In this case, the host will map the memory as represented by 4K pages, but the hypervisor
> can still discover the 2M page in the middle and adjust its page tables to use a 2M page.

Yes, that's possible, but subject to significant requirements. A 2M page can be used only if the underlying physical memory is a physically contiguous 2M chunk. Furthermore, that contiguous 2M chunk must start on a physical 2M boundary, and the virtual address to which it is being mapped must be on a 2M boundary. In the case of the host, that virtual address is the user space address in the user space process. In the case of the hypervisor, that "virtual address" is the location in guest physical address space; i.e., the guest PFN left-shifted by 12 to be a guest physical address.
These requirements are from the physical processor and its requirements on page table formats as specified by the hardware architecture. Whereas the page table entry for a 4K page contains the entire PFN, the page table entry for a 2M page omits the low order 9 bits of the PFN -- those bits must be zero, which is equivalent to requiring that the PFN be on a 2M boundary. These requirements apply to both host and hypervisor mappings. When MSHV code in the host creates a new pinned region via the ioctl, MSHV code first allocates memory for the region using pin_user_pages_fast(), which returns the system PFN for each page of physical memory that is allocated. If the host, at its discretion, allocates a 2M page, then a series of 512 sequential 4K PFNs is returned for that 2M page, and the first of the 512 sequential PFNs must have its low order 9 bits be zero. Then the MSHV ioctl makes the HVCALL_MAP_GPA_PAGES hypercall for the hypervisor to map the allocated memory into the guest physical address space at a particular guest PFN. If the allocated memory contains a 2M page, mshv_chunk_stride() will see a folio order of 9 for the 2M page, causing the HV_MAP_GPA_LARGE_PAGE flag to be set, which requests that the hypervisor do that mapping as a 2M large page. The hypercall does not have the option of dropping back to 4K page mappings in this case. If the 2M alignment of the system PFN is different from the 2M alignment of the target guest PFN, it's not possible to create the mapping and the hypercall fails. The core problem is that the same 2M of physical memory wants to be mapped by the host as a 2M page and by the hypervisor as a 2M page. That can't be done unless the host alignment (in the VMM virtual address space) and the guest physical address (i.e., the target guest PFN) alignment match and are both on 2M boundaries. Movable regions behave a bit differently because the memory for the region is not allocated on the host "up front" when the region is created. 
The memory is faulted in as the guest runs, and the vagaries of the current MSHV in Linux code are such that 2M pages are never created on the host if the alignments don't match. HV_MAP_GPA_LARGE_PAGE is never passed to the HVCALL_MAP_GPA_PAGES hypercall, so the hypervisor just does 4K mappings, which works even with the misalignment.

>
> This adjustment happens at runtime. Could this be the missing detail here?

Adjustments at runtime are a different topic from the issue I'm raising, though eventually there's some relationship. My issue occurs in the creation of a new region, and the setting up of the initial hypervisor mapping. I haven't thought through the details of adjustments at runtime.

My usual caveats apply -- this is all "thought experiment". If I had the means to do some runtime testing to confirm, I would. It's possible the hypervisor is playing some trick I haven't envisioned, but I'm skeptical of that given the basics of how physical processors work with page tables.

Michael
* Re: [PATCH] mshv: Align huge page stride with guest mapping 2025-12-23 19:17 ` Michael Kelley @ 2026-01-02 17:42 ` Stanislav Kinsburskii 2026-01-02 18:04 ` Michael Kelley 0 siblings, 1 reply; 18+ messages in thread From: Stanislav Kinsburskii @ 2026-01-02 17:42 UTC (permalink / raw) To: Michael Kelley Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com, linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org On Tue, Dec 23, 2025 at 07:17:23PM +0000, Michael Kelley wrote: > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Tuesday, December 23, 2025 8:26 AM > > > > On Tue, Dec 23, 2025 at 03:51:22PM +0000, Michael Kelley wrote: > > > From: Michael Kelley Sent: Monday, December 22, 2025 10:25 AM > > > > > > > [snip] > > > > > > > > Separately, in looking at this, I spotted another potential problem with > > > > 2 Meg mappings that somewhat depends on hypervisor behavior that I'm > > > > not clear on. To create a new region, the user space VMM issues the > > > > MSHV_GET_GUEST_MEMORY ioctl, specifying the userspace address, the > > > > size, and the guest PFN. The only requirement on these values is that the > > > > userspace address and size be page aligned. But suppose a 4 Meg region is > > > > specified where the userspace address and the guest PFN have different > > > > offsets modulo 2 Meg. The userspace address range gets populated first, > > > > and may contain a 2 Meg large page. Then when mshv_chunk_stride() > > > > detects a 2 Meg aligned guest PFN so HVCALL_MAP_GPA_PAGES can be told > > > > to create a 2 Meg mapping for the guest, the corresponding system PFN in > > > > the page array may not be 2 Meg aligned. What does the hypervisor do in > > > > this case? It can't create a 2 Meg mapping, right? So does it silently fallback > > > > to creating 4K mappings, or does it return an error? 
Returning an error would > > > > seem to be problematic for movable pages because the error wouldn't > > > > occur until the guest VM is running and takes a range fault on the region. > > > > Silently falling back to creating 4K mappings has performance implications, > > > > though I guess it would work. My question is whether the > > > > MSHV_GET_GUEST_MEMORY ioctl should detect this case and return an > > > > error immediately. > > > > > > > > > > In thinking about this more, I can answer my own question about the > > > hypervisor behavior. When HVCALL_MAP_GPA_PAGES is set, the full > > > list of 4K system PFNs is not provided as an input to the hypercall, so > > > the hypervisor cannot silently fall back to 4K mappings. Assuming > > > sequential PFNs would be wrong, so it must return an error if the > > > alignment of a system PFN isn't on a 2 Meg boundary. > > > > > > For a pinned region, this error happens in mshv_region_map() as > > > called from mshv_prepare_pinned_region(), so will propagate back > > > to the ioctl. But the error happens only if pin_user_pages_fast() > > > allocates one or more 2 Meg pages. So creating a pinned region > > > where the guest PFN and userspace address have different offsets > > > modulo 2 Meg might or might not succeed. > > > > > > For a movable region, the error probably can't occur. > > > mshv_region_handle_gfn_fault() builds an aligned 2 Meg chunk > > > around the faulting guest PFN. mshv_region_range_fault() then > > > determines the corresponding userspace addr, which won't be on > > > a 2 Meg boundary, so the allocated memory won't contain a 2 Meg > > > page. With no 2 Meg pages, mshv_region_remap_pages() will > > > always do 4K mappings and will succeed. The downside is that a > > > movable region with a guest PFN and userspace address with > > > different offsets never gets any 2 Meg pages or mappings. 
> > > > > > My conclusion is the same -- such misalignment should not be > > > allowed when creating a region that has the potential to use 2 Meg > > > pages. Regions less than 2 Meg in size could be excluded from such > > > a requirement if there is benefit in doing so. It's possible to have > > > regions up to (but not including) 4 Meg where the alignment prevents > > > having a 2 Meg page, and those could also be excluded from the > > > requirement. > > > > > > > I'm not sure I understand the problem. > > There are three cases to consider: > > 1. Guest mapping, where page sizes are controlled by the guest. > > 2. Host mapping, where page sizes are controlled by the host. > > And by "host", you mean specifically the Linux instance running in the > root partition. It hosts the VMM processes and creates the memory > regions for each guest. > > > 3. Hypervisor mapping, where page sizes are controlled by the hypervisor. > > > > The first case is not relevant here and is included for completeness. > > Agreed. > > > > > The second and third cases (host and hypervisor) share the memory layout, > > Right. More specifically, they are both operating on the same set of physical > memory pages, and hence "share" a set of what I've referred to as > "system PFNs" (to distinguish from guest PFNs, or GFNs). > > > but it is up > > to each entity to decide which page sizes to use. For example, the host might map the > > proposed 4M region with only 4K pages, even if a 2M page is available in the middle. > > Agreed. > > > In this case, the host will map the memory as represented by 4K pages, but the hypervisor > > can still discover the 2M page in the middle and adjust its page tables to use a 2M page. > > Yes, that's possible, but subject to significant requirements. A 2M page can be > used only if the underlying physical memory is a physically contiguous 2M chunk. 
> Furthermore, that contiguous 2M chunk must start on a physical 2M boundary, > and the virtual address to which it is being mapped must be on a 2M boundary. > In the case of the host, that virtual address is the user space address in the > user space process. In the case of the hypervisor, that "virtual address" is the > the location in guest physical address space; i.e., the guest PFN left-shifted 9 > to be a guest physical address. > > These requirements are from the physical processor and its requirements on > page table formats as specified by the hardware architecture. Whereas the > page table entry for a 4K page contains the entire PFN, the page table entry > for a 2M page omits the low order 9 bits of the PFN -- those bits must be zero, > which is equivalent to requiring that the PFN be on a 2M boundary. These > requirements apply to both host and hypervisor mappings. > > When MSHV code in the host creates a new pinned region via the ioctl, > MSHV code first allocates memory for the region using pin_user_pages_fast(), > which returns the system PFN for each page of physical memory that is > allocated. If the host, at its discretion, allocates a 2M page, then a series > of 512 sequential 4K PFNs is returned for that 2M page, and the first of > the 512 sequential PFNs must have its low order 9 bits be zero. > > Then the MSHV ioctl makes the HVCALL_MAP_GPA_PAGES hypercall for > the hypervisor to map the allocated memory into the guest physical > address space at a particular guest PFN. If the allocated memory contains > a 2M page, mshv_chunk_stride() will see a folio order of 9 for the 2M page, > causing the HV_MAP_GPA_LARGE_PAGE flag to be set, which requests that > the hypervisor do that mapping as a 2M large page. The hypercall does not > have the option of dropping back to 4K page mappings in this case. 
If
> the 2M alignment of the system PFN is different from the 2M alignment
> of the target guest PFN, it's not possible to create the mapping and the
> hypercall fails.
>
> The core problem is that the same 2M of physical memory wants to be
> mapped by the host as a 2M page and by the hypervisor as a 2M page.
> That can't be done unless the host alignment (in the VMM virtual address
> space) and the guest physical address (i.e., the target guest PFN) alignment
> match and are both on 2M boundaries.
>

But why is it a problem? If both the host and the hypervisor can map a huge page, but the guest can't, it's still a win, no? In other words, if the VMM passes a host huge-page-aligned region as guest-unaligned, it's a VMM problem, not a hypervisor problem. And I don't understand why we would want to prevent such cases.

Thanks,
Stanislav

> Movable regions behave a bit differently because the memory for the
> region is not allocated on the host "up front" when the region is created.
> The memory is faulted in as the guest runs, and the vagaries of the current
> MSHV in Linux code are such that 2M pages are never created on the host
> if the alignments don't match. HV_MAP_GPA_LARGE_PAGE is never passed
> to the HVCALL_MAP_GPA_PAGES hypercall, so the hypervisor just does 4K
> mappings, which works even with the misalignment.
> >
> > This adjustment happens at runtime. Could this be the missing detail here?
>
> Adjustments at runtime are a different topic from the issue I'm raising,
> though eventually there's some relationship. My issue occurs in the
> creation of a new region, and the setting up of the initial hypervisor
> mapping. I haven't thought through the details of adjustments at runtime.
>
> My usual caveats apply -- this is all "thought experiment". If I had the
> means to do some runtime testing to confirm, I would.
It's possible the
> hypervisor is playing some trick I haven't envisioned, but I'm skeptical of
> that given the basics of how physical processors work with page tables.
>
> Michael
* RE: [PATCH] mshv: Align huge page stride with guest mapping 2026-01-02 17:42 ` Stanislav Kinsburskii @ 2026-01-02 18:04 ` Michael Kelley 2026-01-02 20:03 ` Stanislav Kinsburskii 0 siblings, 1 reply; 18+ messages in thread From: Michael Kelley @ 2026-01-02 18:04 UTC (permalink / raw) To: Stanislav Kinsburskii Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com, linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 9:43 AM > > On Tue, Dec 23, 2025 at 07:17:23PM +0000, Michael Kelley wrote: > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Tuesday, > December 23, 2025 8:26 AM > > > > > > On Tue, Dec 23, 2025 at 03:51:22PM +0000, Michael Kelley wrote: > > > > From: Michael Kelley Sent: Monday, December 22, 2025 10:25 AM > > > > > > > > > [snip] > > > > > > > > > > Separately, in looking at this, I spotted another potential problem with > > > > > 2 Meg mappings that somewhat depends on hypervisor behavior that I'm > > > > > not clear on. To create a new region, the user space VMM issues the > > > > > MSHV_GET_GUEST_MEMORY ioctl, specifying the userspace address, the > > > > > size, and the guest PFN. The only requirement on these values is that the > > > > > userspace address and size be page aligned. But suppose a 4 Meg region is > > > > > specified where the userspace address and the guest PFN have different > > > > > offsets modulo 2 Meg. The userspace address range gets populated first, > > > > > and may contain a 2 Meg large page. Then when mshv_chunk_stride() > > > > > detects a 2 Meg aligned guest PFN so HVCALL_MAP_GPA_PAGES can be told > > > > > to create a 2 Meg mapping for the guest, the corresponding system PFN in > > > > > the page array may not be 2 Meg aligned. What does the hypervisor do in > > > > > this case? It can't create a 2 Meg mapping, right? 
So does it silently fallback > > > > > to creating 4K mappings, or does it return an error? Returning an error would > > > > > seem to be problematic for movable pages because the error wouldn't > > > > > occur until the guest VM is running and takes a range fault on the region. > > > > > Silently falling back to creating 4K mappings has performance implications, > > > > > though I guess it would work. My question is whether the > > > > > MSHV_GET_GUEST_MEMORY ioctl should detect this case and return an > > > > > error immediately. > > > > > > > > > > > > > In thinking about this more, I can answer my own question about the > > > > hypervisor behavior. When HVCALL_MAP_GPA_PAGES is set, the full > > > > list of 4K system PFNs is not provided as an input to the hypercall, so > > > > the hypervisor cannot silently fall back to 4K mappings. Assuming > > > > sequential PFNs would be wrong, so it must return an error if the > > > > alignment of a system PFN isn't on a 2 Meg boundary. > > > > > > > > For a pinned region, this error happens in mshv_region_map() as > > > > called from mshv_prepare_pinned_region(), so will propagate back > > > > to the ioctl. But the error happens only if pin_user_pages_fast() > > > > allocates one or more 2 Meg pages. So creating a pinned region > > > > where the guest PFN and userspace address have different offsets > > > > modulo 2 Meg might or might not succeed. > > > > > > > > For a movable region, the error probably can't occur. > > > > mshv_region_handle_gfn_fault() builds an aligned 2 Meg chunk > > > > around the faulting guest PFN. mshv_region_range_fault() then > > > > determines the corresponding userspace addr, which won't be on > > > > a 2 Meg boundary, so the allocated memory won't contain a 2 Meg > > > > page. With no 2 Meg pages, mshv_region_remap_pages() will > > > > always do 4K mappings and will succeed. 
The downside is that a > > > > movable region with a guest PFN and userspace address with > > > > different offsets never gets any 2 Meg pages or mappings. > > > > > > > > My conclusion is the same -- such misalignment should not be > > > > allowed when creating a region that has the potential to use 2 Meg > > > > pages. Regions less than 2 Meg in size could be excluded from such > > > > a requirement if there is benefit in doing so. It's possible to have > > > > regions up to (but not including) 4 Meg where the alignment prevents > > > > having a 2 Meg page, and those could also be excluded from the > > > > requirement. > > > > > > > > > > I'm not sure I understand the problem. > > > There are three cases to consider: > > > 1. Guest mapping, where page sizes are controlled by the guest. > > > 2. Host mapping, where page sizes are controlled by the host. > > > > And by "host", you mean specifically the Linux instance running in the > > root partition. It hosts the VMM processes and creates the memory > > regions for each guest. > > > > > 3. Hypervisor mapping, where page sizes are controlled by the hypervisor. > > > > > > The first case is not relevant here and is included for completeness. > > > > Agreed. > > > > > > > > The second and third cases (host and hypervisor) share the memory layout, > > > > Right. More specifically, they are both operating on the same set of physical > > memory pages, and hence "share" a set of what I've referred to as > > "system PFNs" (to distinguish from guest PFNs, or GFNs). > > > > > but it is up > > > to each entity to decide which page sizes to use. For example, the host might map the > > > proposed 4M region with only 4K pages, even if a 2M page is available in the middle. > > > > Agreed. > > > > > In this case, the host will map the memory as represented by 4K pages, but the hypervisor > > > can still discover the 2M page in the middle and adjust its page tables to use a 2M page. 
> > > > Yes, that's possible, but subject to significant requirements. A 2M page can be > > used only if the underlying physical memory is a physically contiguous 2M chunk. > > Furthermore, that contiguous 2M chunk must start on a physical 2M boundary, > > and the virtual address to which it is being mapped must be on a 2M boundary. > > In the case of the host, that virtual address is the user space address in the > > user space process. In the case of the hypervisor, that "virtual address" is the > > the location in guest physical address space; i.e., the guest PFN left-shifted 9 > > to be a guest physical address. > > > > These requirements are from the physical processor and its requirements on > > page table formats as specified by the hardware architecture. Whereas the > > page table entry for a 4K page contains the entire PFN, the page table entry > > for a 2M page omits the low order 9 bits of the PFN -- those bits must be zero, > > which is equivalent to requiring that the PFN be on a 2M boundary. These > > requirements apply to both host and hypervisor mappings. > > > > When MSHV code in the host creates a new pinned region via the ioctl, > > MSHV code first allocates memory for the region using pin_user_pages_fast(), > > which returns the system PFN for each page of physical memory that is > > allocated. If the host, at its discretion, allocates a 2M page, then a series > > of 512 sequential 4K PFNs is returned for that 2M page, and the first of > > the 512 sequential PFNs must have its low order 9 bits be zero. > > > > Then the MSHV ioctl makes the HVCALL_MAP_GPA_PAGES hypercall for > > the hypervisor to map the allocated memory into the guest physical > > address space at a particular guest PFN. If the allocated memory contains > > a 2M page, mshv_chunk_stride() will see a folio order of 9 for the 2M page, > > causing the HV_MAP_GPA_LARGE_PAGE flag to be set, which requests that > > the hypervisor do that mapping as a 2M large page. 
The hypercall does not > > have the option of dropping back to 4K page mappings in this case. If > > the 2M alignment of the system PFN is different from the 2M alignment > > of the target guest PFN, it's not possible to create the mapping and the > > hypercall fails. > > > > The core problem is that the same 2M of physical memory wants to be > > mapped by the host as a 2M page and by the hypervisor as a 2M page. > > That can't be done unless the host alignment (in the VMM virtual address > > space) and the guest physical address (i.e., the target guest PFN) alignment > > match and are both on 2M boundaries. > > > > But why is it a problem? If both the host and the hypervisor can map a > huge page, but the guest can't, it's still a win, no? > In other words, if VMM passes a host huge page aligned region as a guest > unaligned, it's a VMM problem, not a hypervisor problem. And I don't > understand why we would want to prevent such cases. > Fair enough -- mostly. If you want to allow the misaligned case and live with not getting the 2M mapping in the guest, that works except in the situation that I described above, where the HVCALL_MAP_GPA_PAGES hypercall fails when creating a pinned region. The failure is flaky in that if the Linux in the root partition does not map any of the region as a 2M page, the hypercall succeeds and the MSHV_GET_GUEST_MEMORY ioctl succeeds. But if the root partition happens to map any of the region as a 2M page, the hypercall will fail, and the MSHV_GET_GUEST_MEMORY ioctl will fail. Presumably such flaky behavior is bad for the VMM. One solution is that mshv_chunk_stride() must return a stride > 1 only if both the gfn (which it currently checks) AND the corresponding userspace_addr are 2M aligned. Then the HVCALL_MAP_GPA_PAGES hypercall will never have HV_MAP_GPA_LARGE_PAGE set for the misaligned case, and the failure won't occur. 
Michael > > > Movable regions behave a bit differently because the memory for the > > region is not allocated on the host "up front" when the region is created. > > The memory is faulted in as the guest runs, and the vagaries of the current > > MSHV in Linux code are such that 2M pages are never created on the host > > if the alignments don't match. HV_MAP_GPA_LARGE_PAGE is never passed > > to the HVCALL_MAP_GPA_PAGES hypercall, so the hypervisor just does 4K > > mappings, which works even with the misalignment. > > > > > > > > This adjustment happens at runtime. Could this be the missing detail here? > > > > Adjustments at runtime are a different topic from the issue I'm raising, > > though eventually there's some relationship. My issue occurs in the > > creation of a new region, and the setting up of the initial hypervisor > > mapping. I haven't thought through the details of adjustments at runtime. > > > > My usual caveats apply -- this is all "thought experiment". If I had the > > means do some runtime testing to confirm, I would. It's possible the > > hypervisor is playing some trick I haven't envisioned, but I'm skeptical of > > that given the basics of how physical processors work with page tables. > > > > Michael ^ permalink raw reply [flat|nested] 18+ messages in thread
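The extra check Michael proposes above can be sketched as a standalone C model (illustrative only, not the actual kernel code; the huge_backing flag and the added userspace_addr parameter are hypothetical names): a huge page stride is returned only when the gfn, the page count, and the host userspace address are all 2M aligned.

```c
#include <stdint.h>

#define PAGE_SHIFT   12
#define PTRS_PER_PMD 512	/* 4K pages per 2M huge page */

/*
 * Sketch of the suggested fix: in addition to the gfn and page_count
 * checks already in the patch, require that the host userspace
 * address be 2M aligned before requesting a huge page mapping.
 */
int chunk_stride_with_uaddr(int huge_backing, uint64_t gfn,
			    uint64_t page_count, uint64_t userspace_addr)
{
	uint64_t uaddr_pfn = userspace_addr >> PAGE_SHIFT;

	if (huge_backing &&
	    !(gfn % PTRS_PER_PMD) &&
	    !(page_count % PTRS_PER_PMD) &&
	    !(uaddr_pfn % PTRS_PER_PMD))
		return PTRS_PER_PMD;	/* safe to request a 2M mapping */

	return 1;			/* misaligned: fall back to 4K stride */
}
```

With this shape, a region whose gfn is 2M aligned but whose userspace address is offset by 4K would always get a stride of 1, so HV_MAP_GPA_LARGE_PAGE would never be set for the misaligned case.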
* Re: [PATCH] mshv: Align huge page stride with guest mapping 2026-01-02 18:04 ` Michael Kelley @ 2026-01-02 20:03 ` Stanislav Kinsburskii 2026-01-02 21:13 ` Michael Kelley 0 siblings, 1 reply; 18+ messages in thread From: Stanislav Kinsburskii @ 2026-01-02 20:03 UTC (permalink / raw) To: Michael Kelley Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com, linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org On Fri, Jan 02, 2026 at 06:04:56PM +0000, Michael Kelley wrote: > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 9:43 AM > > > > On Tue, Dec 23, 2025 at 07:17:23PM +0000, Michael Kelley wrote: > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Tuesday, > > December 23, 2025 8:26 AM > > > > > > > > On Tue, Dec 23, 2025 at 03:51:22PM +0000, Michael Kelley wrote: > > > > > From: Michael Kelley Sent: Monday, December 22, 2025 10:25 AM > > > > > > > > > > > [snip] > > > > > > > > > > > > Separately, in looking at this, I spotted another potential problem with > > > > > > 2 Meg mappings that somewhat depends on hypervisor behavior that I'm > > > > > > not clear on. To create a new region, the user space VMM issues the > > > > > > MSHV_GET_GUEST_MEMORY ioctl, specifying the userspace address, the > > > > > > size, and the guest PFN. The only requirement on these values is that the > > > > > > userspace address and size be page aligned. But suppose a 4 Meg region is > > > > > > specified where the userspace address and the guest PFN have different > > > > > > offsets modulo 2 Meg. The userspace address range gets populated first, > > > > > > and may contain a 2 Meg large page. Then when mshv_chunk_stride() > > > > > > detects a 2 Meg aligned guest PFN so HVCALL_MAP_GPA_PAGES can be told > > > > > > to create a 2 Meg mapping for the guest, the corresponding system PFN in > > > > > > the page array may not be 2 Meg aligned. 
What does the hypervisor do in > > > > > > this case? It can't create a 2 Meg mapping, right? So does it silently fallback > > > > > > to creating 4K mappings, or does it return an error? Returning an error would > > > > > > seem to be problematic for movable pages because the error wouldn't > > > > > > occur until the guest VM is running and takes a range fault on the region. > > > > > > Silently falling back to creating 4K mappings has performance implications, > > > > > > though I guess it would work. My question is whether the > > > > > > MSHV_GET_GUEST_MEMORY ioctl should detect this case and return an > > > > > > error immediately. > > > > > > > > > > > > > > > > In thinking about this more, I can answer my own question about the > > > > > hypervisor behavior. When HVCALL_MAP_GPA_PAGES is set, the full > > > > > list of 4K system PFNs is not provided as an input to the hypercall, so > > > > > the hypervisor cannot silently fall back to 4K mappings. Assuming > > > > > sequential PFNs would be wrong, so it must return an error if the > > > > > alignment of a system PFN isn't on a 2 Meg boundary. > > > > > > > > > > For a pinned region, this error happens in mshv_region_map() as > > > > > called from mshv_prepare_pinned_region(), so will propagate back > > > > > to the ioctl. But the error happens only if pin_user_pages_fast() > > > > > allocates one or more 2 Meg pages. So creating a pinned region > > > > > where the guest PFN and userspace address have different offsets > > > > > modulo 2 Meg might or might not succeed. > > > > > > > > > > For a movable region, the error probably can't occur. > > > > > mshv_region_handle_gfn_fault() builds an aligned 2 Meg chunk > > > > > around the faulting guest PFN. mshv_region_range_fault() then > > > > > determines the corresponding userspace addr, which won't be on > > > > > a 2 Meg boundary, so the allocated memory won't contain a 2 Meg > > > > > page. 
With no 2 Meg pages, mshv_region_remap_pages() will > > > > > always do 4K mappings and will succeed. The downside is that a > > > > > movable region with a guest PFN and userspace address with > > > > > different offsets never gets any 2 Meg pages or mappings. > > > > > > > > > > My conclusion is the same -- such misalignment should not be > > > > > allowed when creating a region that has the potential to use 2 Meg > > > > > pages. Regions less than 2 Meg in size could be excluded from such > > > > > a requirement if there is benefit in doing so. It's possible to have > > > > > regions up to (but not including) 4 Meg where the alignment prevents > > > > > having a 2 Meg page, and those could also be excluded from the > > > > > requirement. > > > > > > > > > > > > > I'm not sure I understand the problem. > > > > There are three cases to consider: > > > > 1. Guest mapping, where page sizes are controlled by the guest. > > > > 2. Host mapping, where page sizes are controlled by the host. > > > > > > And by "host", you mean specifically the Linux instance running in the > > > root partition. It hosts the VMM processes and creates the memory > > > regions for each guest. > > > > > > > 3. Hypervisor mapping, where page sizes are controlled by the hypervisor. > > > > > > > > The first case is not relevant here and is included for completeness. > > > > > > Agreed. > > > > > > > > > > > The second and third cases (host and hypervisor) share the memory layout, > > > > > > Right. More specifically, they are both operating on the same set of physical > > > memory pages, and hence "share" a set of what I've referred to as > > > "system PFNs" (to distinguish from guest PFNs, or GFNs). > > > > > > > but it is up > > > > to each entity to decide which page sizes to use. For example, the host might map the > > > > proposed 4M region with only 4K pages, even if a 2M page is available in the middle. > > > > > > Agreed. 
> > > > > > > In this case, the host will map the memory as represented by 4K pages, but the hypervisor > > > > can still discover the 2M page in the middle and adjust its page tables to use a 2M page. > > > > > > Yes, that's possible, but subject to significant requirements. A 2M page can be > > > used only if the underlying physical memory is a physically contiguous 2M chunk. > > > Furthermore, that contiguous 2M chunk must start on a physical 2M boundary, > > > and the virtual address to which it is being mapped must be on a 2M boundary. > > > In the case of the host, that virtual address is the user space address in the > > > user space process. In the case of the hypervisor, that "virtual address" is the > > > the location in guest physical address space; i.e., the guest PFN left-shifted 9 > > > to be a guest physical address. > > > > > > These requirements are from the physical processor and its requirements on > > > page table formats as specified by the hardware architecture. Whereas the > > > page table entry for a 4K page contains the entire PFN, the page table entry > > > for a 2M page omits the low order 9 bits of the PFN -- those bits must be zero, > > > which is equivalent to requiring that the PFN be on a 2M boundary. These > > > requirements apply to both host and hypervisor mappings. > > > > > > When MSHV code in the host creates a new pinned region via the ioctl, > > > MSHV code first allocates memory for the region using pin_user_pages_fast(), > > > which returns the system PFN for each page of physical memory that is > > > allocated. If the host, at its discretion, allocates a 2M page, then a series > > > of 512 sequential 4K PFNs is returned for that 2M page, and the first of > > > the 512 sequential PFNs must have its low order 9 bits be zero. > > > > > > Then the MSHV ioctl makes the HVCALL_MAP_GPA_PAGES hypercall for > > > the hypervisor to map the allocated memory into the guest physical > > > address space at a particular guest PFN. 
If the allocated memory contains > > > a 2M page, mshv_chunk_stride() will see a folio order of 9 for the 2M page, > > > causing the HV_MAP_GPA_LARGE_PAGE flag to be set, which requests that > > > the hypervisor do that mapping as a 2M large page. The hypercall does not > > > have the option of dropping back to 4K page mappings in this case. If > > > the 2M alignment of the system PFN is different from the 2M alignment > > > of the target guest PFN, it's not possible to create the mapping and the > > > hypercall fails. > > > > > > The core problem is that the same 2M of physical memory wants to be > > > mapped by the host as a 2M page and by the hypervisor as a 2M page. > > > That can't be done unless the host alignment (in the VMM virtual address > > > space) and the guest physical address (i.e., the target guest PFN) alignment > > > match and are both on 2M boundaries. > > > > > > > But why is it a problem? If both the host and the hypervisor can map ap > > huge page, but the guest can't, it's still a win, no? > > In other words, if VMM passes a host huge page aligned region as a guest > > unaligned, it's a VMM problem, not a hypervisor problem. And I' don't > > understand why would we want to prevent such cases. > > > > Fair enough -- mostly. If you want to allow the misaligned case and live > with not getting the 2M mapping in the guest, that works except in the > situation that I described above, where the HVCALL_MAP_GPA_PAGES > hypercall fails when creating a pinned region. > > The failure is flakey in that if the Linux in the root partition does not > map any of the region as a 2M page, the hypercall succeeds and the > MSHV_GET_GUEST_MEMORY ioctl succeeds. But if the root partition > happens to map any of the region as a 2M page, the hypercall will fail, > and the MSHV_GET_GUEST_MEMORY ioctl will fail. Presumably such > flakey behavior is bad for the VMM. 
> > One solution is that mshv_chunk_stride() must return a stride > 1 only > if both the gfn (which it currently checks) AND the corresponding > userspace_addr are 2M aligned. Then the HVCALL_MAP_GPA_PAGES > hypercall will never have HV_MAP_GPA_LARGE_PAGE set for the > misaligned case, and the failure won't occur. > I think I see your point, but I also think this issue doesn't exist, because mshv_chunk_stride() returns the huge page stride iff: 1. the folio order is PMD_ORDER and 2. GFN is huge page aligned and 3. the number of 4K pages is huge page aligned. In other words, a host huge page won't be mapped as huge if the page can't be mapped as huge in the guest. And this function is called for both movable and pinned regions, so the hypercall should never fail due to a huge page alignment issue. What am I missing here? Thanks, Stanislav > Michael > > > > > > Movable regions behave a bit differently because the memory for the > > > region is not allocated on the host "up front" when the region is created. > > > The memory is faulted in as the guest runs, and the vagaries of the current > > > MSHV in Linux code are such that 2M pages are never created on the host > > > if the alignments don't match. HV_MAP_GPA_LARGE_PAGE is never passed > > > to the HVCALL_MAP_GPA_PAGES hypercall, so the hypervisor just does 4K > > > mappings, which works even with the misalignment. > > > > > > > > > > > This adjustment happens at runtime. Could this be the missing detail here? > > > > > > Adjustments at runtime are a different topic from the issue I'm raising, > > > though eventually there's some relationship. My issue occurs in the > > > creation of a new region, and the setting up of the initial hypervisor > > > mapping. I haven't thought through the details of adjustments at runtime. > > > > > > My usual caveats apply -- this is all "thought experiment". If I had the > > > means to do some runtime testing to confirm, I would. 
It's possible the > > > hypervisor is playing some trick I haven't envisioned, but I'm skeptical of > > > that given the basics of how physical processors work with page tables. > > > > > > Michael ^ permalink raw reply [flat|nested] 18+ messages in thread
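The three conditions Stanislav describes for the patch's stride function can be modeled in standalone C (illustrative only, not the kernel code; the function name is hypothetical). Notably, the host userspace address is not among the inputs, which is what the rest of the thread turns on.

```c
#include <stdint.h>

#define PMD_ORDER    9
#define PTRS_PER_PMD (1 << PMD_ORDER)	/* 512 */

/*
 * Model of the patch's stride logic as described: a huge page stride
 * is returned iff the folio order is PMD_ORDER, the GFN is huge page
 * aligned, and the 4K page count is huge page aligned.
 */
int stride_per_patch(unsigned int folio_order, uint64_t gfn,
		     uint64_t page_count)
{
	if (folio_order == PMD_ORDER &&
	    !(gfn % PTRS_PER_PMD) &&
	    !(page_count % PTRS_PER_PMD))
		return PTRS_PER_PMD;	/* request a 2M guest mapping */

	return 1;			/* 4K stride */
}
```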
* RE: [PATCH] mshv: Align huge page stride with guest mapping 2026-01-02 20:03 ` Stanislav Kinsburskii @ 2026-01-02 21:13 ` Michael Kelley 2026-01-02 23:35 ` Stanislav Kinsburskii 0 siblings, 1 reply; 18+ messages in thread From: Michael Kelley @ 2026-01-02 21:13 UTC (permalink / raw) To: Stanislav Kinsburskii Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com, linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 12:03 PM > > On Fri, Jan 02, 2026 at 06:04:56PM +0000, Michael Kelley wrote: > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 9:43 AM > > > > > > On Tue, Dec 23, 2025 at 07:17:23PM +0000, Michael Kelley wrote: > > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Tuesday, December 23, 2025 8:26 AM > > > > > > > > > > On Tue, Dec 23, 2025 at 03:51:22PM +0000, Michael Kelley wrote: > > > > > > From: Michael Kelley Sent: Monday, December 22, 2025 10:25 AM > > > > > > > > > > > > > [snip] > > > > > > > > > > > > > > Separately, in looking at this, I spotted another potential problem with > > > > > > > 2 Meg mappings that somewhat depends on hypervisor behavior that I'm > > > > > > > not clear on. To create a new region, the user space VMM issues the > > > > > > > MSHV_GET_GUEST_MEMORY ioctl, specifying the userspace address, the > > > > > > > size, and the guest PFN. The only requirement on these values is that the > > > > > > > userspace address and size be page aligned. But suppose a 4 Meg region is > > > > > > > specified where the userspace address and the guest PFN have different > > > > > > > offsets modulo 2 Meg. The userspace address range gets populated first, > > > > > > > and may contain a 2 Meg large page. 
Then when mshv_chunk_stride() > > > > > > > detects a 2 Meg aligned guest PFN so HVCALL_MAP_GPA_PAGES can be told > > > > > > > to create a 2 Meg mapping for the guest, the corresponding system PFN in > > > > > > > the page array may not be 2 Meg aligned. What does the hypervisor do in > > > > > > > this case? It can't create a 2 Meg mapping, right? So does it silently fallback > > > > > > > to creating 4K mappings, or does it return an error? Returning an error would > > > > > > > seem to be problematic for movable pages because the error wouldn't > > > > > > > occur until the guest VM is running and takes a range fault on the region. > > > > > > > Silently falling back to creating 4K mappings has performance implications, > > > > > > > though I guess it would work. My question is whether the > > > > > > > MSHV_GET_GUEST_MEMORY ioctl should detect this case and return an > > > > > > > error immediately. > > > > > > > > > > > > > > > > > > > In thinking about this more, I can answer my own question about the > > > > > > hypervisor behavior. When HVCALL_MAP_GPA_PAGES is set, the full > > > > > > list of 4K system PFNs is not provided as an input to the hypercall, so > > > > > > the hypervisor cannot silently fall back to 4K mappings. Assuming > > > > > > sequential PFNs would be wrong, so it must return an error if the > > > > > > alignment of a system PFN isn't on a 2 Meg boundary. > > > > > > > > > > > > For a pinned region, this error happens in mshv_region_map() as > > > > > > called from mshv_prepare_pinned_region(), so will propagate back > > > > > > to the ioctl. But the error happens only if pin_user_pages_fast() > > > > > > allocates one or more 2 Meg pages. So creating a pinned region > > > > > > where the guest PFN and userspace address have different offsets > > > > > > modulo 2 Meg might or might not succeed. > > > > > > > > > > > > For a movable region, the error probably can't occur. 
> > > > > > mshv_region_handle_gfn_fault() builds an aligned 2 Meg chunk > > > > > > around the faulting guest PFN. mshv_region_range_fault() then > > > > > > determines the corresponding userspace addr, which won't be on > > > > > > a 2 Meg boundary, so the allocated memory won't contain a 2 Meg > > > > > > page. With no 2 Meg pages, mshv_region_remap_pages() will > > > > > > always do 4K mappings and will succeed. The downside is that a > > > > > > movable region with a guest PFN and userspace address with > > > > > > different offsets never gets any 2 Meg pages or mappings. > > > > > > > > > > > > My conclusion is the same -- such misalignment should not be > > > > > > allowed when creating a region that has the potential to use 2 Meg > > > > > > pages. Regions less than 2 Meg in size could be excluded from such > > > > > > a requirement if there is benefit in doing so. It's possible to have > > > > > > regions up to (but not including) 4 Meg where the alignment prevents > > > > > > having a 2 Meg page, and those could also be excluded from the > > > > > > requirement. > > > > > > > > > > > > > > > > I'm not sure I understand the problem. > > > > > There are three cases to consider: > > > > > 1. Guest mapping, where page sizes are controlled by the guest. > > > > > 2. Host mapping, where page sizes are controlled by the host. > > > > > > > > And by "host", you mean specifically the Linux instance running in the > > > > root partition. It hosts the VMM processes and creates the memory > > > > regions for each guest. > > > > > > > > > 3. Hypervisor mapping, where page sizes are controlled by the hypervisor. > > > > > > > > > > The first case is not relevant here and is included for completeness. > > > > > > > > Agreed. > > > > > > > > > > > > > > The second and third cases (host and hypervisor) share the memory layout, > > > > > > > > Right. 
More specifically, they are both operating on the same set of physical > > > > memory pages, and hence "share" a set of what I've referred to as > > > > "system PFNs" (to distinguish from guest PFNs, or GFNs). > > > > > > > > > but it is up > > > > > to each entity to decide which page sizes to use. For example, the host might map the > > > > > proposed 4M region with only 4K pages, even if a 2M page is available in the middle. > > > > > > > > Agreed. > > > > > > > > > In this case, the host will map the memory as represented by 4K pages, but the hypervisor > > > > > can still discover the 2M page in the middle and adjust its page tables to use a 2M page. > > > > > > > > Yes, that's possible, but subject to significant requirements. A 2M page can be > > > > used only if the underlying physical memory is a physically contiguous 2M chunk. > > > > Furthermore, that contiguous 2M chunk must start on a physical 2M boundary, > > > > and the virtual address to which it is being mapped must be on a 2M boundary. > > > > In the case of the host, that virtual address is the user space address in the > > > > user space process. In the case of the hypervisor, that "virtual address" is the > > > > the location in guest physical address space; i.e., the guest PFN left-shifted 9 > > > > to be a guest physical address. > > > > > > > > These requirements are from the physical processor and its requirements on > > > > page table formats as specified by the hardware architecture. Whereas the > > > > page table entry for a 4K page contains the entire PFN, the page table entry > > > > for a 2M page omits the low order 9 bits of the PFN -- those bits must be zero, > > > > which is equivalent to requiring that the PFN be on a 2M boundary. These > > > > requirements apply to both host and hypervisor mappings. 
> > > > > > > > When MSHV code in the host creates a new pinned region via the ioctl, > > > > MSHV code first allocates memory for the region using pin_user_pages_fast(), > > > > which returns the system PFN for each page of physical memory that is > > > > allocated. If the host, at its discretion, allocates a 2M page, then a series > > > > of 512 sequential 4K PFNs is returned for that 2M page, and the first of > > > > the 512 sequential PFNs must have its low order 9 bits be zero. > > > > > > > > Then the MSHV ioctl makes the HVCALL_MAP_GPA_PAGES hypercall for > > > > the hypervisor to map the allocated memory into the guest physical > > > > address space at a particular guest PFN. If the allocated memory contains > > > > a 2M page, mshv_chunk_stride() will see a folio order of 9 for the 2M page, > > > > causing the HV_MAP_GPA_LARGE_PAGE flag to be set, which requests that > > > > the hypervisor do that mapping as a 2M large page. The hypercall does not > > > > have the option of dropping back to 4K page mappings in this case. If > > > > the 2M alignment of the system PFN is different from the 2M alignment > > > > of the target guest PFN, it's not possible to create the mapping and the > > > > hypercall fails. > > > > > > > > The core problem is that the same 2M of physical memory wants to be > > > > mapped by the host as a 2M page and by the hypervisor as a 2M page. > > > > That can't be done unless the host alignment (in the VMM virtual address > > > > space) and the guest physical address (i.e., the target guest PFN) alignment > > > > match and are both on 2M boundaries. > > > > > > > > > > But why is it a problem? If both the host and the hypervisor can map ap > > > huge page, but the guest can't, it's still a win, no? > > > In other words, if VMM passes a host huge page aligned region as a guest > > > unaligned, it's a VMM problem, not a hypervisor problem. And I' don't > > > understand why would we want to prevent such cases. 
> > > > > > > Fair enough -- mostly. If you want to allow the misaligned case and live > > with not getting the 2M mapping in the guest, that works except in the > > situation that I described above, where the HVCALL_MAP_GPA_PAGES > > hypercall fails when creating a pinned region. > > > > The failure is flaky in that if the Linux in the root partition does not > > map any of the region as a 2M page, the hypercall succeeds and the > > MSHV_GET_GUEST_MEMORY ioctl succeeds. But if the root partition > > happens to map any of the region as a 2M page, the hypercall will fail, > > and the MSHV_GET_GUEST_MEMORY ioctl will fail. Presumably such > > flaky behavior is bad for the VMM. > > > > One solution is that mshv_chunk_stride() must return a stride > 1 only > > if both the gfn (which it currently checks) AND the corresponding > > userspace_addr are 2M aligned. Then the HVCALL_MAP_GPA_PAGES > > hypercall will never have HV_MAP_GPA_LARGE_PAGE set for the > > misaligned case, and the failure won't occur. > > > > I think I see your point, but I also think this issue doesn't exist, > because mshv_chunk_stride() returns the huge page stride iff: > 1. the folio order is PMD_ORDER and > 2. GFN is huge page aligned and > 3. number of 4K pages is huge page aligned. > > In other words, a host huge page won't be mapped as huge if the page > can't be mapped as huge in the guest. OK, I'm not seeing how what you say is true. For pinned regions, the memory is allocated and mapped into the host userspace address first, as done by mshv_prepare_pinned_region() calling mshv_region_pin(), which calls pin_user_pages_fast(). This is all done without considering the GFN or GFN alignment. So one or more 2M pages might be allocated and mapped in the host before any guest mapping is looked at. Agreed? Then mshv_prepare_pinned_region() calls mshv_region_map() to do the guest mapping. This eventually gets down to mshv_chunk_stride(). 
In mshv_chunk_stride() when your conditions #2 and #3 are met, the corresponding struct page argument to mshv_chunk_stride() may be a 4K page that is in the middle of a 2M page instead of at the beginning (if the region is mis-aligned). But the key point is that the 4K page in the middle is part of a folio that will return a folio order of PMD_ORDER. I.e., a folio order of PMD_ORDER is not sufficient to ensure that the struct page arg is at the *start* of a 2M-aligned physical memory range that can be mapped into the guest as a 2M page. The problem does *not* happen with a movable region, but the reasoning is different. hmm_range_fault() is always called with a 2M range aligned to the GFN, which in a mis-aligned region means that the host userspace address is never 2M aligned. So hmm_range_fault() is never able to allocate and map a 2M page. mshv_chunk_stride() will never get a folio order > 1, and the hypercall is never asked to do a 2M mapping. Both host and guest mappings will always be 4K and everything works. Michael > And this function is called for > both movable and pinned region, so the hypercal should never fail due to > huge page alignment issue. > > What do I miss here? > > Thanks, > Stanislav > > > > Michael > > > > > > > > > Movable regions behave a bit differently because the memory for the > > > > region is not allocated on the host "up front" when the region is created. > > > > The memory is faulted in as the guest runs, and the vagaries of the current > > > > MSHV in Linux code are such that 2M pages are never created on the host > > > > if the alignments don't match. HV_MAP_GPA_LARGE_PAGE is never passed > > > > to the HVCALL_MAP_GPA_PAGES hypercall, so the hypervisor just does 4K > > > > mappings, which works even with the misalignment. > > > > > > > > > > > > > > This adjustment happens at runtime. Could this be the missing detail here? 
> > > > > > > > Adjustments at runtime are a different topic from the issue I'm raising, > > > > though eventually there's some relationship. My issue occurs in the > > > > creation of a new region, and the setting up of the initial hypervisor > > > > mapping. I haven't thought through the details of adjustments at runtime. > > > > > > > > My usual caveats apply -- this is all "thought experiment". If I had the > > > > means to do some runtime testing to confirm, I would. It's possible the > > > > hypervisor is playing some trick I haven't envisioned, but I'm skeptical of > > > > that given the basics of how physical processors work with page tables. > > > > > > > > Michael ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] mshv: Align huge page stride with guest mapping 2026-01-02 21:13 ` Michael Kelley @ 2026-01-02 23:35 ` Stanislav Kinsburskii 2026-01-03 1:16 ` Michael Kelley 0 siblings, 1 reply; 18+ messages in thread From: Stanislav Kinsburskii @ 2026-01-02 23:35 UTC (permalink / raw) To: Michael Kelley Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com, linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org On Fri, Jan 02, 2026 at 09:13:31PM +0000, Michael Kelley wrote: > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 12:03 PM > > > > On Fri, Jan 02, 2026 at 06:04:56PM +0000, Michael Kelley wrote: > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 9:43 AM > > > > > > > > On Tue, Dec 23, 2025 at 07:17:23PM +0000, Michael Kelley wrote: > > > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Tuesday, December 23, 2025 8:26 AM > > > > > > > > > > > > On Tue, Dec 23, 2025 at 03:51:22PM +0000, Michael Kelley wrote: > > > > > > > From: Michael Kelley Sent: Monday, December 22, 2025 10:25 AM > > > > > > > > > > > > > > > [snip] > > > > > > > > > > > > > > > > Separately, in looking at this, I spotted another potential problem with > > > > > > > > 2 Meg mappings that somewhat depends on hypervisor behavior that I'm > > > > > > > > not clear on. To create a new region, the user space VMM issues the > > > > > > > > MSHV_GET_GUEST_MEMORY ioctl, specifying the userspace address, the > > > > > > > > size, and the guest PFN. The only requirement on these values is that the > > > > > > > > userspace address and size be page aligned. But suppose a 4 Meg region is > > > > > > > > specified where the userspace address and the guest PFN have different > > > > > > > > offsets modulo 2 Meg. The userspace address range gets populated first, > > > > > > > > and may contain a 2 Meg large page. 
Then when mshv_chunk_stride() > > > > > > > > detects a 2 Meg aligned guest PFN so HVCALL_MAP_GPA_PAGES can be told > > > > > > > > to create a 2 Meg mapping for the guest, the corresponding system PFN in > > > > > > > > the page array may not be 2 Meg aligned. What does the hypervisor do in > > > > > > > > this case? It can't create a 2 Meg mapping, right? So does it silently fall back > > > > > > > > to creating 4K mappings, or does it return an error? Returning an error would > > > > > > > > seem to be problematic for movable pages because the error wouldn't > > > > > > > > occur until the guest VM is running and takes a range fault on the region. > > > > > > > > Silently falling back to creating 4K mappings has performance implications, > > > > > > > > though I guess it would work. My question is whether the > > > > > > > > MSHV_GET_GUEST_MEMORY ioctl should detect this case and return an > > > > > > > > error immediately. > > > > > > > > > > > > > > > In thinking about this more, I can answer my own question about the > > > > > > > hypervisor behavior. When HV_MAP_GPA_LARGE_PAGE is set, the full > > > > > > > list of 4K system PFNs is not provided as an input to the hypercall, so > > > > > > > the hypervisor cannot silently fall back to 4K mappings. Assuming > > > > > > > sequential PFNs would be wrong, so it must return an error if the > > > > > > > alignment of a system PFN isn't on a 2 Meg boundary. > > > > > > > > > > > > > > For a pinned region, this error happens in mshv_region_map() as > > > > > > > called from mshv_prepare_pinned_region(), so will propagate back > > > > > > > to the ioctl.
> > > > > > > mshv_region_handle_gfn_fault() builds an aligned 2 Meg chunk > > > > > > > around the faulting guest PFN. mshv_region_range_fault() then > > > > > > > determines the corresponding userspace addr, which won't be on > > > > > > > a 2 Meg boundary, so the allocated memory won't contain a 2 Meg > > > > > > > page. With no 2 Meg pages, mshv_region_remap_pages() will > > > > > > > always do 4K mappings and will succeed. The downside is that a > > > > > > > movable region with a guest PFN and userspace address with > > > > > > > different offsets never gets any 2 Meg pages or mappings. > > > > > > > > > > > > > > My conclusion is the same -- such misalignment should not be > > > > > > > allowed when creating a region that has the potential to use 2 Meg > > > > > > > pages. Regions less than 2 Meg in size could be excluded from such > > > > > > > a requirement if there is benefit in doing so. It's possible to have > > > > > > > regions up to (but not including) 4 Meg where the alignment prevents > > > > > > > having a 2 Meg page, and those could also be excluded from the > > > > > > > requirement. > > > > > > > > > > > > > > > > > > > I'm not sure I understand the problem. > > > > > > There are three cases to consider: > > > > > > 1. Guest mapping, where page sizes are controlled by the guest. > > > > > > 2. Host mapping, where page sizes are controlled by the host. > > > > > > > > > > And by "host", you mean specifically the Linux instance running in the > > > > > root partition. It hosts the VMM processes and creates the memory > > > > > regions for each guest. > > > > > > > > > > > 3. Hypervisor mapping, where page sizes are controlled by the hypervisor. > > > > > > > > > > > > The first case is not relevant here and is included for completeness. > > > > > > > > > > Agreed. > > > > > > > > > > > > > > > > > The second and third cases (host and hypervisor) share the memory layout, > > > > > > > > > > Right. 
More specifically, they are both operating on the same set of physical > > > > > memory pages, and hence "share" a set of what I've referred to as > > > > > "system PFNs" (to distinguish from guest PFNs, or GFNs). > > > > > > > > > > > but it is up > > > > > > to each entity to decide which page sizes to use. For example, the host might map the > > > > > > proposed 4M region with only 4K pages, even if a 2M page is available in the middle. > > > > > > > > > > Agreed. > > > > > > > > > > > In this case, the host will map the memory as represented by 4K pages, but the hypervisor > > > > > > can still discover the 2M page in the middle and adjust its page tables to use a 2M page. > > > > > > > > > > Yes, that's possible, but subject to significant requirements. A 2M page can be > > > > > used only if the underlying physical memory is a physically contiguous 2M chunk. > > > > > Furthermore, that contiguous 2M chunk must start on a physical 2M boundary, > > > > > and the virtual address to which it is being mapped must be on a 2M boundary. > > > > > In the case of the host, that virtual address is the user space address in the > > > > > user space process. In the case of the hypervisor, that "virtual address" is the > > > > > location in guest physical address space; i.e., the guest PFN left-shifted 12 > > > > > to be a guest physical address.
> > > > > > > > > > When MSHV code in the host creates a new pinned region via the ioctl, > > > > > MSHV code first allocates memory for the region using pin_user_pages_fast(), > > > > > which returns the system PFN for each page of physical memory that is > > > > > allocated. If the host, at its discretion, allocates a 2M page, then a series > > > > > of 512 sequential 4K PFNs is returned for that 2M page, and the first of > > > > > the 512 sequential PFNs must have its low order 9 bits be zero. > > > > > > > > > > Then the MSHV ioctl makes the HVCALL_MAP_GPA_PAGES hypercall for > > > > > the hypervisor to map the allocated memory into the guest physical > > > > > address space at a particular guest PFN. If the allocated memory contains > > > > > a 2M page, mshv_chunk_stride() will see a folio order of 9 for the 2M page, > > > > > causing the HV_MAP_GPA_LARGE_PAGE flag to be set, which requests that > > > > > the hypervisor do that mapping as a 2M large page. The hypercall does not > > > > > have the option of dropping back to 4K page mappings in this case. If > > > > > the 2M alignment of the system PFN is different from the 2M alignment > > > > > of the target guest PFN, it's not possible to create the mapping and the > > > > > hypercall fails. > > > > > > > > > > The core problem is that the same 2M of physical memory wants to be > > > > > mapped by the host as a 2M page and by the hypervisor as a 2M page. > > > > > That can't be done unless the host alignment (in the VMM virtual address > > > > > space) and the guest physical address (i.e., the target guest PFN) alignment > > > > > match and are both on 2M boundaries. > > > > > > > > > > > > > But why is it a problem? If both the host and the hypervisor can map a > > > > huge page, but the guest can't, it's still a win, no? > > > > In other words, if the VMM passes a host huge page aligned region as a guest > > > > unaligned, it's a VMM problem, not a hypervisor problem.
And I don't > > > > understand why we would want to prevent such cases. > > > > > > > Fair enough -- mostly. If you want to allow the misaligned case and live > > > with not getting the 2M mapping in the guest, that works except in the > > > situation that I described above, where the HVCALL_MAP_GPA_PAGES > > > hypercall fails when creating a pinned region. > > > > > > The failure is flakey in that if the Linux in the root partition does not > > > map any of the region as a 2M page, the hypercall succeeds and the > > > MSHV_GET_GUEST_MEMORY ioctl succeeds. But if the root partition > > > happens to map any of the region as a 2M page, the hypercall will fail, > > > and the MSHV_GET_GUEST_MEMORY ioctl will fail. Presumably such > > > flakey behavior is bad for the VMM. > > > > > > One solution is that mshv_chunk_stride() must return a stride > 1 only > > > if both the gfn (which it currently checks) AND the corresponding > > > userspace_addr are 2M aligned. Then the HVCALL_MAP_GPA_PAGES > > > hypercall will never have HV_MAP_GPA_LARGE_PAGE set for the > > > misaligned case, and the failure won't occur. > > > > > > > I think I see your point, but I also think this issue doesn't exist, > > because mshv_chunk_stride() returns a huge page stride iff: > > 1. the folio order is PMD_ORDER and > > 2. the GFN is huge page aligned and > > 3. the number of 4K pages is huge page aligned. > > > > In other words, a host huge page won't be mapped as huge if the page > > can't be mapped as huge in the guest. > > OK, I'm missing how what you say is true. For pinned regions, > > the memory is allocated and mapped into the host userspace address > > first, as done by mshv_prepare_pinned_region() calling mshv_region_pin(), > > which calls pin_user_pages_fast(). This is all done without considering > > the GFN or GFN alignment. So one or more 2M pages might be allocated > > and mapped in the host before any guest mapping is looked at. Agreed? > > > > Agreed.
> Then mshv_prepare_pinned_region() calls mshv_region_map() to do the > guest mapping. This eventually gets down to mshv_chunk_stride(). In > mshv_chunk_stride() when your conditions #2 and #3 are met, the > corresponding struct page argument to mshv_chunk_stride() may be a > 4K page that is in the middle of a 2M page instead of at the beginning > (if the region is mis-aligned). But the key point is that the 4K page in > the middle is part of a folio that will return a folio order of PMD_ORDER. > I.e., a folio order of PMD_ORDER is not sufficient to ensure that the > struct page arg is at the *start* of a 2M-aligned physical memory range > that can be mapped into the guest as a 2M page. > I'm trying to understand how this can even happen, so please bear with me. In other words (and AFAIU), what you are saying is the following: 1. VMM creates a mapping with a huge page(s) (this implies that the virtual address is huge page aligned, the size is huge page aligned and the physical pages are consecutive). 2. VMM tries to create a region via ioctl, but instead of passing the start of the region, it passes an offset into one of the region's huge pages, while at the same time the base GFN and the size are huge page aligned (to meet the #2 and #3 conditions). 3. mshv_chunk_stride() sees a folio order of PMD_ORDER, and tries to map the corresponding pages as huge, which will be rejected by the hypervisor. Is this accurate? A subsequent question: if it is accurate, why does the driver need to support this case? It looks like a VMM bug to me. Also, how should it support it? By rejecting such requests in the ioctl? Thanks, Stanislav > The problem does *not* happen with a movable region, but the reasoning > is different. hmm_range_fault() is always called with a 2M range aligned > to the GFN, which in a mis-aligned region means that the host userspace > address is never 2M aligned. So hmm_range_fault() is never able to allocate > and map a 2M page.
mshv_chunk_stride() will never get a folio order > 1, > and the hypercall is never asked to do a 2M mapping. Both host and guest > mappings will always be 4K and everything works. > > Michael > > > And this function is called for > > both movable and pinned regions, so the hypercall should never fail due to > > a huge page alignment issue. > > > > What do I miss here? > > > > Thanks, > > Stanislav > > > > > > > Michael > > > > > > > > > > > > Movable regions behave a bit differently because the memory for the > > > > > > region is not allocated on the host "up front" when the region is created. > > > > > > The memory is faulted in as the guest runs, and the vagaries of the current > > > > > > MSHV in Linux code are such that 2M pages are never created on the host > > > > > > if the alignments don't match. HV_MAP_GPA_LARGE_PAGE is never passed > > > > > > to the HVCALL_MAP_GPA_PAGES hypercall, so the hypervisor just does 4K > > > > > > mappings, which works even with the misalignment. > > > > > > > > > > > > > > > > > > > > This adjustment happens at runtime. Could this be the missing detail here? > > > > > > > > > > > > Adjustments at runtime are a different topic from the issue I'm raising, > > > > > > though eventually there's some relationship. My issue occurs in the > > > > > > creation of a new region, and the setting up of the initial hypervisor > > > > > > mapping. I haven't thought through the details of adjustments at runtime. > > > > > > > > > > > > My usual caveats apply -- this is all "thought experiment". If I had the > > > > > > means to do some runtime testing to confirm, I would. It's possible the > > > > > > hypervisor is playing some trick I haven't envisioned, but I'm skeptical of > > > > > > that given the basics of how physical processors work with page tables. > > > > > > Michael
* RE: [PATCH] mshv: Align huge page stride with guest mapping 2026-01-02 23:35 ` Stanislav Kinsburskii @ 2026-01-03 1:16 ` Michael Kelley 2026-01-05 17:25 ` Stanislav Kinsburskii 0 siblings, 1 reply; 18+ messages in thread From: Michael Kelley @ 2026-01-03 1:16 UTC (permalink / raw) To: Stanislav Kinsburskii Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com, linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 3:35 PM > > On Fri, Jan 02, 2026 at 09:13:31PM +0000, Michael Kelley wrote: > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 12:03 PM > > > > > > On Fri, Jan 02, 2026 at 06:04:56PM +0000, Michael Kelley wrote: > > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 9:43 AM > > > > > > > > > > On Tue, Dec 23, 2025 at 07:17:23PM +0000, Michael Kelley wrote: > > > > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Tuesday, December 23, 2025 8:26 AM > > > > > > > > > > > > > > On Tue, Dec 23, 2025 at 03:51:22PM +0000, Michael Kelley wrote: > > > > > > > > From: Michael Kelley Sent: Monday, December 22, 2025 10:25 AM > > > > > > > > > > > > > > > > > [snip] > > > > > > > > > > > > > > > > > > Separately, in looking at this, I spotted another potential problem with > > > > > > > > > 2 Meg mappings that somewhat depends on hypervisor behavior that I'm > > > > > > > > > not clear on. To create a new region, the user space VMM issues the > > > > > > > > > MSHV_GET_GUEST_MEMORY ioctl, specifying the userspace address, the > > > > > > > > > size, and the guest PFN. The only requirement on these values is that the > > > > > > > > > userspace address and size be page aligned. 
But suppose a 4 Meg region is > > > > > > > > > specified where the userspace address and the guest PFN have different > > > > > > > > > offsets modulo 2 Meg. The userspace address range gets populated first, > > > > > > > > > and may contain a 2 Meg large page. Then when mshv_chunk_stride() > > > > > > > > > detects a 2 Meg aligned guest PFN so HVCALL_MAP_GPA_PAGES can be told > > > > > > > > > to create a 2 Meg mapping for the guest, the corresponding system PFN in > > > > > > > > > the page array may not be 2 Meg aligned. What does the hypervisor do in > > > > > > > > > this case? It can't create a 2 Meg mapping, right? So does it silently fallback > > > > > > > > > to creating 4K mappings, or does it return an error? Returning an error would > > > > > > > > > seem to be problematic for movable pages because the error wouldn't > > > > > > > > > occur until the guest VM is running and takes a range fault on the region. > > > > > > > > > Silently falling back to creating 4K mappings has performance implications, > > > > > > > > > though I guess it would work. My question is whether the > > > > > > > > > MSHV_GET_GUEST_MEMORY ioctl should detect this case and return an > > > > > > > > > error immediately. > > > > > > > > > > > > > > > > > > > > > > > > > In thinking about this more, I can answer my own question about the > > > > > > > > hypervisor behavior. When HVCALL_MAP_GPA_PAGES is set, the full > > > > > > > > list of 4K system PFNs is not provided as an input to the hypercall, so > > > > > > > > the hypervisor cannot silently fall back to 4K mappings. Assuming > > > > > > > > sequential PFNs would be wrong, so it must return an error if the > > > > > > > > alignment of a system PFN isn't on a 2 Meg boundary. > > > > > > > > > > > > > > > > For a pinned region, this error happens in mshv_region_map() as > > > > > > > > called from mshv_prepare_pinned_region(), so will propagate back > > > > > > > > to the ioctl. 
But the error happens only if pin_user_pages_fast() > > > > > > > > allocates one or more 2 Meg pages. So creating a pinned region > > > > > > > > where the guest PFN and userspace address have different offsets > > > > > > > > modulo 2 Meg might or might not succeed. > > > > > > > > > > > > > > > > For a movable region, the error probably can't occur. > > > > > > > > mshv_region_handle_gfn_fault() builds an aligned 2 Meg chunk > > > > > > > > around the faulting guest PFN. mshv_region_range_fault() then > > > > > > > > determines the corresponding userspace addr, which won't be on > > > > > > > > a 2 Meg boundary, so the allocated memory won't contain a 2 Meg > > > > > > > > page. With no 2 Meg pages, mshv_region_remap_pages() will > > > > > > > > always do 4K mappings and will succeed. The downside is that a > > > > > > > > movable region with a guest PFN and userspace address with > > > > > > > > different offsets never gets any 2 Meg pages or mappings. > > > > > > > > > > > > > > > > My conclusion is the same -- such misalignment should not be > > > > > > > > allowed when creating a region that has the potential to use 2 Meg > > > > > > > > pages. Regions less than 2 Meg in size could be excluded from such > > > > > > > > a requirement if there is benefit in doing so. It's possible to have > > > > > > > > regions up to (but not including) 4 Meg where the alignment prevents > > > > > > > > having a 2 Meg page, and those could also be excluded from the > > > > > > > > requirement. > > > > > > > > > > > > > > > > > > > > > > I'm not sure I understand the problem. > > > > > > > There are three cases to consider: > > > > > > > 1. Guest mapping, where page sizes are controlled by the guest. > > > > > > > 2. Host mapping, where page sizes are controlled by the host. > > > > > > > > > > > > And by "host", you mean specifically the Linux instance running in the > > > > > > root partition. 
It hosts the VMM processes and creates the memory > > > > > > regions for each guest. > > > > > > > > > > > > > 3. Hypervisor mapping, where page sizes are controlled by the hypervisor. > > > > > > > > > > > > > > The first case is not relevant here and is included for completeness. > > > > > > > > > > > > Agreed. > > > > > > > > > > > > > > > > > > > > The second and third cases (host and hypervisor) share the memory layout, > > > > > > > > > > > > Right. More specifically, they are both operating on the same set of physical > > > > > > memory pages, and hence "share" a set of what I've referred to as > > > > > > "system PFNs" (to distinguish from guest PFNs, or GFNs). > > > > > > > > > > > > > but it is up > > > > > > > to each entity to decide which page sizes to use. For example, the host might map the > > > > > > > proposed 4M region with only 4K pages, even if a 2M page is available in the middle. > > > > > > > > > > > > Agreed. > > > > > > > > > > > > > In this case, the host will map the memory as represented by 4K pages, but the hypervisor > > > > > > > can still discover the 2M page in the middle and adjust its page tables to use a 2M page. > > > > > > > > > > > > Yes, that's possible, but subject to significant requirements. A 2M page can be > > > > > > used only if the underlying physical memory is a physically contiguous 2M chunk. > > > > > > Furthermore, that contiguous 2M chunk must start on a physical 2M boundary, > > > > > > and the virtual address to which it is being mapped must be on a 2M boundary. > > > > > > In the case of the host, that virtual address is the user space address in the > > > > > > user space process. In the case of the hypervisor, that "virtual address" is the > > > > > > the location in guest physical address space; i.e., the guest PFN left-shifted 9 > > > > > > to be a guest physical address. 
> > > > > > > > > > > > These requirements are from the physical processor and its requirements on > > > > > > page table formats as specified by the hardware architecture. Whereas the > > > > > > page table entry for a 4K page contains the entire PFN, the page table entry > > > > > > for a 2M page omits the low order 9 bits of the PFN -- those bits must be zero, > > > > > > which is equivalent to requiring that the PFN be on a 2M boundary. These > > > > > > requirements apply to both host and hypervisor mappings. > > > > > > > > > > > > When MSHV code in the host creates a new pinned region via the ioctl, > > > > > > MSHV code first allocates memory for the region using pin_user_pages_fast(), > > > > > > which returns the system PFN for each page of physical memory that is > > > > > > allocated. If the host, at its discretion, allocates a 2M page, then a series > > > > > > of 512 sequential 4K PFNs is returned for that 2M page, and the first of > > > > > > the 512 sequential PFNs must have its low order 9 bits be zero. > > > > > > > > > > > > Then the MSHV ioctl makes the HVCALL_MAP_GPA_PAGES hypercall for > > > > > > the hypervisor to map the allocated memory into the guest physical > > > > > > address space at a particular guest PFN. If the allocated memory contains > > > > > > a 2M page, mshv_chunk_stride() will see a folio order of 9 for the 2M page, > > > > > > causing the HV_MAP_GPA_LARGE_PAGE flag to be set, which requests that > > > > > > the hypervisor do that mapping as a 2M large page. The hypercall does not > > > > > > have the option of dropping back to 4K page mappings in this case. If > > > > > > the 2M alignment of the system PFN is different from the 2M alignment > > > > > > of the target guest PFN, it's not possible to create the mapping and the > > > > > > hypercall fails. > > > > > > > > > > > > The core problem is that the same 2M of physical memory wants to be > > > > > > mapped by the host as a 2M page and by the hypervisor as a 2M page. 
> > > > > > That can't be done unless the host alignment (in the VMM virtual address > > > > > > space) and the guest physical address (i.e., the target guest PFN) alignment > > > > > > match and are both on 2M boundaries. > > > > > > > > > > > > > > > > But why is it a problem? If both the host and the hypervisor can map ap > > > > > huge page, but the guest can't, it's still a win, no? > > > > > In other words, if VMM passes a host huge page aligned region as a guest > > > > > unaligned, it's a VMM problem, not a hypervisor problem. And I' don't > > > > > understand why would we want to prevent such cases. > > > > > > > > > > > > > Fair enough -- mostly. If you want to allow the misaligned case and live > > > > with not getting the 2M mapping in the guest, that works except in the > > > > situation that I described above, where the HVCALL_MAP_GPA_PAGES > > > > hypercall fails when creating a pinned region. > > > > > > > > The failure is flakey in that if the Linux in the root partition does not > > > > map any of the region as a 2M page, the hypercall succeeds and the > > > > MSHV_GET_GUEST_MEMORY ioctl succeeds. But if the root partition > > > > happens to map any of the region as a 2M page, the hypercall will fail, > > > > and the MSHV_GET_GUEST_MEMORY ioctl will fail. Presumably such > > > > flakey behavior is bad for the VMM. > > > > > > > > One solution is that mshv_chunk_stride() must return a stride > 1 only > > > > if both the gfn (which it currently checks) AND the corresponding > > > > userspace_addr are 2M aligned. Then the HVCALL_MAP_GPA_PAGES > > > > hypercall will never have HV_MAP_GPA_LARGE_PAGE set for the > > > > misaligned case, and the failure won't occur. > > > > > > > > > > I think see your point, but I also think this issue doesn't exist, > > > because map_chunk_stride() returns huge page stride iff page if: > > > 1. the folio order is PMD_ORDER and > > > 2. GFN is huge page aligned and > > > 3. number of 4K pages is huge pages aligned. 
> > > > > > On other words, a host huge page won't be mapped as huge if the page > > > can't be mapped as huge in the guest. > > > > OK, I'm missing how what you say is true. For pinned regions, > > the memory is allocated and mapped into the host userspace address > > first, as done by mshv_prepare_pinned_region() calling mshv_region_pin(), > > which calls pin_user_pages_fast(). This is all done without considering > > the GFN or GFN alignment. So one or more 2M pages might be allocated > > and mapped in the host before any guest mapping is looked at. Agreed? > > > > Agreed. > > > Then mshv_prepare_pinned_region() calls mshv_region_map() to do the > > guest mapping. This eventually gets down to mshv_chunk_stride(). In > > mshv_chunk_stride() when your conditions #2 and #3 are met, the > > corresponding struct page argument to mshv_chunk_stride() may be a > > 4K page that is in the middle of a 2M page instead of at the beginning > > (if the region is mis-aligned). But the key point is that the 4K page in > > the middle is part of a folio that will return a folio order of PMD_ORDER. > > I.e., a folio order of PMD_ORDER is not sufficient to ensure that the > > struct page arg is at the *start* of a 2M-aligned physical memory range > > that can be mapped into the guest as a 2M page. > > > > I'm trying to undestand how this can even happen, so please bear with > me. > In other words (and AFAIU), what you are saying in the following: > > 1. VMM creates a mapping with a huge page(s) (this implies that virtual > address is huge page aligned, size is huge page aligned and physical > pages are consequtive). > 2. VMM tries to create a region via ioctl, but instead of passing the > start of the region, is passes an offset into one of the the region's > huge pages, and in the same time with the base GFN and the size huge > page aligned (to meet the #2 and #3 conditions). > 3. 
mshv_chunk_stride() sees a folio order of PMD_ORDER, and tries to map > the corresponding pages as huge, which will be rejected by the > hypervisor. > > Is this accurate? Yes, pretty much. In Step 1, the VMM may just allocate some virtual address space, and not do anything to populate it with physical pages. So populating with any 2M pages may not happen until Step 2 when the ioctl calls pin_user_pages_fast(). But *when* the virtual address space gets populated with physical pages doesn't really matter. We just know that it happens before the ioctl tries to map the memory into the guest -- i.e., mshv_prepare_pinned_region() calls mshv_region_map(). And yes, the problem is what you call out in Step 2: as input to the ioctl, the fields "userspace_addr" and "guest_pfn" in struct mshv_user_mem_region could have different alignments modulo 2M boundaries. When they are different, that's what I'm calling a "mis-aligned region" (referring to a struct mshv_mem_region that is created and set up by the ioctl). > A subsequent question: if it is accurate, why does the driver need to > support this case? It looks like a VMM bug to me. I don't know if the driver needs to support this case. That's a question for the VMM people to answer. I wouldn't necessarily assume that the VMM always allocates virtual address space with exactly the size and alignment that matches the regions it creates with the ioctl. The kernel ioctl doesn't care how the VMM allocates and manages its virtual address space, so the VMM is free to do whatever it wants in that regard, as long as it meets the requirements of the ioctl. So the requirements of the ioctl in this case are something to be negotiated with the VMM. > Also, how should it support it? By rejecting such requests in the ioctl? Rejecting requests to create a mis-aligned region is certainly one option if the VMM agrees that's OK.
The ioctl currently requires only that "userspace_addr" and "size" be page aligned, so those requirements could be tightened. The other approach is to fix mshv_chunk_stride() to handle the mis-aligned case. Doing so is even easier than I first envisioned. I think this works:

@@ -49,7 +49,8 @@ static int mshv_chunk_stride(struct page *page,
 	 */
 	if (page_order &&
 	    IS_ALIGNED(gfn, PTRS_PER_PMD) &&
-	    IS_ALIGNED(page_count, PTRS_PER_PMD))
+	    IS_ALIGNED(page_count, PTRS_PER_PMD) &&
+	    IS_ALIGNED(page_to_pfn(page), PTRS_PER_PMD))
 		return 1 << page_order;
 
 	return 1;

But as we discussed earlier, this fix means never getting 2M mappings in the guest for a region that is mis-aligned. Michael > > Thanks, > Stanislav > > > The problem does *not* happen with a movable region, but the reasoning > > is different. hmm_range_fault() is always called with a 2M range aligned > > to the GFN, which in a mis-aligned region means that the host userspace > > address is never 2M aligned. So hmm_range_fault() is never able to allocate > > and map a 2M page.
HV_MAP_GPA_LARGE_PAGE is never passed > > > > > > to the HVCALL_MAP_GPA_PAGES hypercall, so the hypervisor just does 4K > > > > > > mappings, which works even with the misalignment. > > > > > > > > > > > > > > > > > > > > This adjustment happens at runtime. Could this be the missing detail here? > > > > > > > > > > > > Adjustments at runtime are a different topic from the issue I'm raising, > > > > > > though eventually there's some relationship. My issue occurs in the > > > > > > creation of a new region, and the setting up of the initial hypervisor > > > > > > mapping. I haven't thought through the details of adjustments at runtime. > > > > > > > > > > > > My usual caveats apply -- this is all "thought experiment". If I had the > > > > > > means do some runtime testing to confirm, I would. It's possible the > > > > > > hypervisor is playing some trick I haven't envisioned, but I'm skeptical of > > > > > > that given the basics of how physical processors work with page tables. > > > > > > > > > > > > Michael ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] mshv: Align huge page stride with guest mapping 2026-01-03 1:16 ` Michael Kelley @ 2026-01-05 17:25 ` Stanislav Kinsburskii 2026-01-05 18:07 ` Michael Kelley 0 siblings, 1 reply; 18+ messages in thread From: Stanislav Kinsburskii @ 2026-01-05 17:25 UTC (permalink / raw) To: Michael Kelley Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com, linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org On Sat, Jan 03, 2026 at 01:16:51AM +0000, Michael Kelley wrote: > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 3:35 PM > > > > On Fri, Jan 02, 2026 at 09:13:31PM +0000, Michael Kelley wrote: > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 12:03 PM > > > > > > > > On Fri, Jan 02, 2026 at 06:04:56PM +0000, Michael Kelley wrote: > > > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 9:43 AM > > > > > > > > > > > > On Tue, Dec 23, 2025 at 07:17:23PM +0000, Michael Kelley wrote: > > > > > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Tuesday, December 23, 2025 8:26 AM > > > > > > > > > > > > > > > > On Tue, Dec 23, 2025 at 03:51:22PM +0000, Michael Kelley wrote: > > > > > > > > > From: Michael Kelley Sent: Monday, December 22, 2025 10:25 AM > > > > > > > > > > > > > > > > > > > [snip] > > > > > > > > > > > > > > > > > > > > Separately, in looking at this, I spotted another potential problem with > > > > > > > > > > 2 Meg mappings that somewhat depends on hypervisor behavior that I'm > > > > > > > > > > not clear on. To create a new region, the user space VMM issues the > > > > > > > > > > MSHV_GET_GUEST_MEMORY ioctl, specifying the userspace address, the > > > > > > > > > > size, and the guest PFN. The only requirement on these values is that the > > > > > > > > > > userspace address and size be page aligned. 
But suppose a 4 Meg region is > > > > > > > > > > specified where the userspace address and the guest PFN have different > > > > > > > > > > offsets modulo 2 Meg. The userspace address range gets populated first, > > > > > > > > > > and may contain a 2 Meg large page. Then when mshv_chunk_stride() > > > > > > > > > > detects a 2 Meg aligned guest PFN so HVCALL_MAP_GPA_PAGES can be told > > > > > > > > > > to create a 2 Meg mapping for the guest, the corresponding system PFN in > > > > > > > > > > the page array may not be 2 Meg aligned. What does the hypervisor do in > > > > > > > > > > this case? It can't create a 2 Meg mapping, right? So does it silently fall back > > > > > > > > > > to creating 4K mappings, or does it return an error? Returning an error would > > > > > > > > > > seem to be problematic for movable pages because the error wouldn't > > > > > > > > > > occur until the guest VM is running and takes a range fault on the region. > > > > > > > > > > Silently falling back to creating 4K mappings has performance implications, > > > > > > > > > > though I guess it would work. My question is whether the > > > > > > > > > > MSHV_GET_GUEST_MEMORY ioctl should detect this case and return an > > > > > > > > > > error immediately. > > > > > > > > > > > > > > > > > > > > > > > > > > > > In thinking about this more, I can answer my own question about the > > > > > > > > > hypervisor behavior. When HV_MAP_GPA_LARGE_PAGE is set, the full > > > > > > > > > list of 4K system PFNs is not provided as an input to the hypercall, so > > > > > > > > > the hypervisor cannot silently fall back to 4K mappings. Assuming > > > > > > > > > sequential PFNs would be wrong, so it must return an error if a > > > > > > > > > system PFN isn't on a 2 Meg boundary.
> > > > > > > > > > > > > > > > > > For a pinned region, this error happens in mshv_region_map() as > > > > > > > > > called from mshv_prepare_pinned_region(), so will propagate back > > > > > > > > > to the ioctl. But the error happens only if pin_user_pages_fast() > > > > > > > > > allocates one or more 2 Meg pages. So creating a pinned region > > > > > > > > > where the guest PFN and userspace address have different offsets > > > > > > > > > modulo 2 Meg might or might not succeed. > > > > > > > > > > > > > > > > > > For a movable region, the error probably can't occur. > > > > > > > > > mshv_region_handle_gfn_fault() builds an aligned 2 Meg chunk > > > > > > > > > around the faulting guest PFN. mshv_region_range_fault() then > > > > > > > > > determines the corresponding userspace addr, which won't be on > > > > > > > > > a 2 Meg boundary, so the allocated memory won't contain a 2 Meg > > > > > > > > > page. With no 2 Meg pages, mshv_region_remap_pages() will > > > > > > > > > always do 4K mappings and will succeed. The downside is that a > > > > > > > > > movable region with a guest PFN and userspace address with > > > > > > > > > different offsets never gets any 2 Meg pages or mappings. > > > > > > > > > > > > > > > > > > My conclusion is the same -- such misalignment should not be > > > > > > > > > allowed when creating a region that has the potential to use 2 Meg > > > > > > > > > pages. Regions less than 2 Meg in size could be excluded from such > > > > > > > > > a requirement if there is benefit in doing so. It's possible to have > > > > > > > > > regions up to (but not including) 4 Meg where the alignment prevents > > > > > > > > > having a 2 Meg page, and those could also be excluded from the > > > > > > > > > requirement. > > > > > > > > > > > > > > > > > > > > > > > > > I'm not sure I understand the problem. > > > > > > > > There are three cases to consider: > > > > > > > > 1. Guest mapping, where page sizes are controlled by the guest. 
> > > > > > > > 2. Host mapping, where page sizes are controlled by the host. > > > > > > > > > > > > > > And by "host", you mean specifically the Linux instance running in the > > > > > > > root partition. It hosts the VMM processes and creates the memory > > > > > > > regions for each guest. > > > > > > > > > > > > > > > 3. Hypervisor mapping, where page sizes are controlled by the hypervisor. > > > > > > > > > > > > > > > > The first case is not relevant here and is included for completeness. > > > > > > > > > > > > > > Agreed. > > > > > > > > > > > > > > > > > > > > > > > The second and third cases (host and hypervisor) share the memory layout, > > > > > > > > > > > > > > Right. More specifically, they are both operating on the same set of physical > > > > > > > memory pages, and hence "share" a set of what I've referred to as > > > > > > > "system PFNs" (to distinguish from guest PFNs, or GFNs). > > > > > > > > > > > > > > > but it is up > > > > > > > > to each entity to decide which page sizes to use. For example, the host might map the > > > > > > > > proposed 4M region with only 4K pages, even if a 2M page is available in the middle. > > > > > > > > > > > > > > Agreed. > > > > > > > > > > > > > > > In this case, the host will map the memory as represented by 4K pages, but the hypervisor > > > > > > > > can still discover the 2M page in the middle and adjust its page tables to use a 2M page. > > > > > > > > > > > > > > Yes, that's possible, but subject to significant requirements. A 2M page can be > > > > > > > used only if the underlying physical memory is a physically contiguous 2M chunk. > > > > > > > Furthermore, that contiguous 2M chunk must start on a physical 2M boundary, > > > > > > > and the virtual address to which it is being mapped must be on a 2M boundary. > > > > > > > In the case of the host, that virtual address is the user space address in the > > > > > > > user space process. 
In the case of the hypervisor, that "virtual address" is the > > > > > > > location in guest physical address space; i.e., the guest PFN left-shifted 12 > > > > > > > to be a guest physical address. > > > > > > > > > > > > > > These requirements are from the physical processor and its requirements on > > > > > > > page table formats as specified by the hardware architecture. Whereas the > > > > > > > page table entry for a 4K page contains the entire PFN, the page table entry > > > > > > > for a 2M page omits the low order 9 bits of the PFN -- those bits must be zero, > > > > > > > which is equivalent to requiring that the PFN be on a 2M boundary. These > > > > > > > requirements apply to both host and hypervisor mappings. > > > > > > > > > > > > > > When MSHV code in the host creates a new pinned region via the ioctl, > > > > > > > MSHV code first allocates memory for the region using pin_user_pages_fast(), > > > > > > > which returns the system PFN for each page of physical memory that is > > > > > > > allocated. If the host, at its discretion, allocates a 2M page, then a series > > > > > > > of 512 sequential 4K PFNs is returned for that 2M page, and the first of > > > > > > > the 512 sequential PFNs must have its low order 9 bits be zero. > > > > > > > > > > > > > > Then the MSHV ioctl makes the HVCALL_MAP_GPA_PAGES hypercall for > > > > > > > the hypervisor to map the allocated memory into the guest physical > > > > > > > address space at a particular guest PFN. If the allocated memory contains > > > > > > > a 2M page, mshv_chunk_stride() will see a folio order of 9 for the 2M page, > > > > > > > causing the HV_MAP_GPA_LARGE_PAGE flag to be set, which requests that > > > > > > > the hypervisor do that mapping as a 2M large page. The hypercall does not > > > > > > > have the option of dropping back to 4K page mappings in this case.
If > > > > > > > the 2M alignment of the system PFN is different from the 2M alignment > > > > > > > of the target guest PFN, it's not possible to create the mapping and the > > > > > > > hypercall fails. > > > > > > > > > > > > > > The core problem is that the same 2M of physical memory wants to be > > > > > > > mapped by the host as a 2M page and by the hypervisor as a 2M page. > > > > > > > That can't be done unless the host alignment (in the VMM virtual address > > > > > > > space) and the guest physical address (i.e., the target guest PFN) alignment > > > > > > > match and are both on 2M boundaries. > > > > > > > > > > > > > > > > > > > But why is it a problem? If both the host and the hypervisor can map a > > > > > > huge page, but the guest can't, it's still a win, no? > > > > > > In other words, if VMM passes a host huge page aligned region as a guest > > > > > > unaligned, it's a VMM problem, not a hypervisor problem. And I don't > > > > > > understand why we would want to prevent such cases. > > > > > > > > > > Fair enough -- mostly. If you want to allow the misaligned case and live > > > > > with not getting the 2M mapping in the guest, that works except in the > > > > > situation that I described above, where the HVCALL_MAP_GPA_PAGES > > > > > hypercall fails when creating a pinned region. > > > > > > > > > > The failure is flakey in that if the Linux in the root partition does not > > > > > map any of the region as a 2M page, the hypercall succeeds and the > > > > > MSHV_GET_GUEST_MEMORY ioctl succeeds. But if the root partition > > > > > happens to map any of the region as a 2M page, the hypercall will fail, > > > > > and the MSHV_GET_GUEST_MEMORY ioctl will fail. Presumably such > > > > > flakey behavior is bad for the VMM. > > > > > > > > > > One solution is that mshv_chunk_stride() must return a stride > 1 only > > > > > if both the gfn (which it currently checks) AND the corresponding > > > > > userspace_addr are 2M aligned.
Then the HVCALL_MAP_GPA_PAGES > > > > > hypercall will never have HV_MAP_GPA_LARGE_PAGE set for the > > > > > misaligned case, and the failure won't occur. > > > > > > > > I think I see your point, but I also think this issue doesn't exist, > > > > because mshv_chunk_stride() returns a huge page stride iff: > > > > 1. the folio order is PMD_ORDER and > > > > 2. GFN is huge page aligned and > > > > 3. number of 4K pages is huge page aligned. > > > > > > > > In other words, a host huge page won't be mapped as huge if the page > > > > can't be mapped as huge in the guest. > > > > > > OK, I'm not seeing how what you say is true. For pinned regions, > > > the memory is allocated and mapped into the host userspace address > > > first, as done by mshv_prepare_pinned_region() calling mshv_region_pin(), > > > which calls pin_user_pages_fast(). This is all done without considering > > > the GFN or GFN alignment. So one or more 2M pages might be allocated > > > and mapped in the host before any guest mapping is looked at. Agreed? > > > > > > > Agreed. > > > > > Then mshv_prepare_pinned_region() calls mshv_region_map() to do the > > > guest mapping. This eventually gets down to mshv_chunk_stride(). In > > > mshv_chunk_stride() when your conditions #2 and #3 are met, the > > > corresponding struct page argument to mshv_chunk_stride() may be a > > > 4K page that is in the middle of a 2M page instead of at the beginning > > > (if the region is mis-aligned). But the key point is that the 4K page in > > > the middle is part of a folio that will return a folio order of PMD_ORDER. > > > I.e., a folio order of PMD_ORDER is not sufficient to ensure that the > > > struct page arg is at the *start* of a 2M-aligned physical memory range > > > that can be mapped into the guest as a 2M page. > > > > > > > I'm trying to understand how this can even happen, so please bear with > > me. > > In other words (and AFAIU), what you are saying is the following: > > > > 1.
VMM creates a mapping with a huge page(s) (this implies that virtual > > address is huge page aligned, size is huge page aligned and physical > > pages are consecutive). > > 2. VMM tries to create a region via ioctl, but instead of passing the > > start of the region, it passes an offset into one of the region's > > huge pages, and at the same time with the base GFN and the size huge > > page aligned (to meet the #2 and #3 conditions). > > 3. mshv_chunk_stride() sees a folio order of PMD_ORDER, and tries to map > > the corresponding pages as huge, which will be rejected by the > > hypervisor. > > > > Is this accurate? > > Yes, pretty much. In Step 1, the VMM may just allocate some virtual > address space, and not do anything to populate it with physical pages. > So populating with any 2M pages may not happen until Step 2 when > the ioctl calls pin_user_pages_fast(). But *when* the virtual address > space gets populated with physical pages doesn't really matter. We > just know that it happens before the ioctl tries to map the memory > into the guest -- i.e., mshv_prepare_pinned_region() calls > mshv_region_map(). > > And yes, the problem is what you call out in Step 2: as input to the > ioctl, the fields "userspace_addr" and "guest_pfn" in struct > mshv_user_mem_region could have different alignments modulo 2M > boundaries. When they are different, that's what I'm calling a "mis-aligned > region" (referring to a struct mshv_mem_region that is created and > set up by the ioctl). > > > A subsequent question: if it is accurate, why does the driver need to > > support this case? It looks like a VMM bug to me. > > I don't know if the driver needs to support this case. That's a question > for the VMM people to answer. I wouldn't necessarily assume that the > VMM always allocates virtual address space with exactly the size and > alignment that matches the regions it creates with the ioctl.
The > kernel ioctl doesn't care how the VMM allocates and manages its > virtual address space, so the VMM is free to do whatever it wants > in that regard, as long as it meets the requirements of the ioctl. So > the requirements of the ioctl in this case are something to be > negotiated with the VMM. > > > Also, how should it support it? By rejecting such requests in the ioctl? > > Rejecting requests to create a mis-aligned region is certainly one option > if the VMM agrees that's OK. The ioctl currently requires only that > "userspace_addr" and "size" be page aligned, so those requirements > could be tightened. > > The other approach is to fix mshv_chunk_stride() to handle the > mis-aligned case. Doing so is even easier than I first envisioned. > I think this works:
>
> @@ -49,7 +49,8 @@ static int mshv_chunk_stride(struct page *page,
>  	 */
>  	if (page_order &&
>  	    IS_ALIGNED(gfn, PTRS_PER_PMD) &&
> -	    IS_ALIGNED(page_count, PTRS_PER_PMD))
> +	    IS_ALIGNED(page_count, PTRS_PER_PMD) &&
> +	    IS_ALIGNED(page_to_pfn(page), PTRS_PER_PMD))
>  		return 1 << page_order;
>
>  	return 1;
>
> But as we discussed earlier, this fix means never getting 2M mappings > in the guest for a region that is mis-aligned. > Although I understand the logic behind this fix, I’m hesitant to add it because it looks like a workaround for a VMM bug that could bite back. The approach you propose will silently map a huge page as a collection of 4K pages, impacting guest performance (this will be especially visible for a region containing a single huge page). This fix silently allows such behavior instead of reporting it as an error to user space. It’s worth noting that pinned-region population and mapping happen upon ioctl invocation, so the VMM will either get an error from the hypervisor (current behavior) or get a region mapped with 4K pages (proposed behavior).
The first case is an explicit error; the second — although it allows adding a region — will be less performant, significantly increase region mapping time and thus potentially guest spin-up (creation) time, and be less noticeable to customers, especially those who don’t really understand what’s happening under the hood and simply stumbled upon some VMM bug. What’s your take? Thanks, Stanislav > Michael > > > > > Thanks, > > Stanislav > > > > > The problem does *not* happen with a movable region, but the reasoning > > > is different. hmm_range_fault() is always called with a 2M range aligned > > > to the GFN, which in a mis-aligned region means that the host userspace > > > address is never 2M aligned. So hmm_range_fault() is never able to allocate > > > and map a 2M page. mshv_chunk_stride() will never get a folio order > 1, > > > and the hypercall is never asked to do a 2M mapping. Both host and guest > > > mappings will always be 4K and everything works. > > > > > > Michael > > > > > > > And this function is called for > > > > both movable and pinned regions, so the hypercall should never fail due to > > > > huge page alignment issue. > > > > > > > > What do I miss here? > > > > > > > > Thanks, > > > > Stanislav > > > > > > > > > > Michael > > > > > > > > > > > > > > > > > > Movable regions behave a bit differently because the memory for the > > > > > > > region is not allocated on the host "up front" when the region is created. > > > > > > > The memory is faulted in as the guest runs, and the vagaries of the current > > > > > > > MSHV in Linux code are such that 2M pages are never created on the host > > > > > > > if the alignments don't match. HV_MAP_GPA_LARGE_PAGE is never passed > > > > > > > to the HVCALL_MAP_GPA_PAGES hypercall, so the hypervisor just does 4K > > > > > > > mappings, which works even with the misalignment. > > > > > > > > > > > > > > > > > > > > > > This adjustment happens at runtime. Could this be the missing detail here?
> > > > > > > > > > > > > > Adjustments at runtime are a different topic from the issue I'm raising, > > > > > > > though eventually there's some relationship. My issue occurs in the > > > > > > > creation of a new region, and the setting up of the initial hypervisor > > > > > > > mapping. I haven't thought through the details of adjustments at runtime. > > > > > > > > > > > > > > My usual caveats apply -- this is all "thought experiment". If I had the > > > > > > > means do some runtime testing to confirm, I would. It's possible the > > > > > > > hypervisor is playing some trick I haven't envisioned, but I'm skeptical of > > > > > > > that given the basics of how physical processors work with page tables. > > > > > > > > > > > > > > Michael ^ permalink raw reply [flat|nested] 18+ messages in thread
* RE: [PATCH] mshv: Align huge page stride with guest mapping 2026-01-05 17:25 ` Stanislav Kinsburskii @ 2026-01-05 18:07 ` Michael Kelley 2026-01-05 19:47 ` Stanislav Kinsburskii 0 siblings, 1 reply; 18+ messages in thread From: Michael Kelley @ 2026-01-05 18:07 UTC (permalink / raw) To: Stanislav Kinsburskii Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com, linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Monday, January 5, 2026 9:25 AM > > On Sat, Jan 03, 2026 at 01:16:51AM +0000, Michael Kelley wrote: > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 3:35 PM > > > > > > On Fri, Jan 02, 2026 at 09:13:31PM +0000, Michael Kelley wrote: > > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 12:03 PM > > > > > [snip] > > > > > > > > > > I think see your point, but I also think this issue doesn't exist, > > > > > because map_chunk_stride() returns huge page stride iff page if: > > > > > 1. the folio order is PMD_ORDER and > > > > > 2. GFN is huge page aligned and > > > > > 3. number of 4K pages is huge pages aligned. > > > > > > > > > > On other words, a host huge page won't be mapped as huge if the page > > > > > can't be mapped as huge in the guest. > > > > > > > > OK, I'm missing how what you say is true. For pinned regions, > > > > the memory is allocated and mapped into the host userspace address > > > > first, as done by mshv_prepare_pinned_region() calling mshv_region_pin(), > > > > which calls pin_user_pages_fast(). This is all done without considering > > > > the GFN or GFN alignment. So one or more 2M pages might be allocated > > > > and mapped in the host before any guest mapping is looked at. Agreed? > > > > > > > > > > Agreed. 
> > > > > > > Then mshv_prepare_pinned_region() calls mshv_region_map() to do the > > > > guest mapping. This eventually gets down to mshv_chunk_stride(). In > > > > mshv_chunk_stride() when your conditions #2 and #3 are met, the > > > > corresponding struct page argument to mshv_chunk_stride() may be a > > > > 4K page that is in the middle of a 2M page instead of at the beginning > > > > (if the region is mis-aligned). But the key point is that the 4K page in > > > > the middle is part of a folio that will return a folio order of PMD_ORDER. > > > > I.e., a folio order of PMD_ORDER is not sufficient to ensure that the > > > > struct page arg is at the *start* of a 2M-aligned physical memory range > > > > that can be mapped into the guest as a 2M page. > > > > > > > > > > I'm trying to undestand how this can even happen, so please bear with > > > me. > > > In other words (and AFAIU), what you are saying in the following: > > > > > > 1. VMM creates a mapping with a huge page(s) (this implies that virtual > > > address is huge page aligned, size is huge page aligned and physical > > > pages are consequtive). > > > 2. VMM tries to create a region via ioctl, but instead of passing the > > > start of the region, is passes an offset into one of the the region's > > > huge pages, and in the same time with the base GFN and the size huge > > > page aligned (to meet the #2 and #3 conditions). > > > 3. mshv_chunk_stride() sees a folio order of PMD_ORDER, and tries to map > > > the corresponding pages as huge, which will be rejected by the > > > hypervisor. > > > > > > Is this accurate? > > > > Yes, pretty much. In Step 1, the VMM may just allocate some virtual > > address space, and not do anything to populate it with physical pages. > > So populating with any 2M pages may not happen until Step 2 when > > the ioctl calls pin_user_pages_fast(). But *when* the virtual address > > space gets populated with physical pages doesn't really matter. 
We > > just know that it happens before the ioctl tries to map the memory > > into the guest -- i.e., mshv_prepare_pinned_region() calls > > mshv_region_map(). > > > > And yes, the problem is what you call out in Step 2: as input to the > > ioctl, the fields "userspace_addr" and "guest_pfn" in struct > > mshv_user_mem_region could have different alignments modulo 2M > > boundaries. When they are different, that's what I'm calling a "mis-aligned > > region", (referring to a struct mshv_mem_region that is created and > > setup by the ioctl). > > > > > A subseqeunt question: if it is accurate, why the driver needs to > > > support this case? It looks like a VMM bug to me. > > > > I don't know if the driver needs to support this case. That's a question > > for the VMM people to answer. I wouldn't necessarily assume that the > > VMM always allocates virtual address space with exactly the size and > > alignment that matches the regions it creates with the ioctl. The > > kernel ioctl doesn't care how the VMM allocates and manages its > > virtual address space, so the VMM is free to do whatever it wants > > in that regard, as long as it meets the requirements of the ioctl. So > > the requirements of the ioctl in this case are something to be > > negotiated with the VMM. > > > > > Also, how should it support it? By rejecting such requests in the ioctl? > > > > Rejecting requests to create a mis-aligned region is certainly one option > > if the VMM agrees that's OK. The ioctl currently requires only that > > "userspace_addr" and "size" be page aligned, so those requirements > > could be tightened. > > > > The other approach is to fix mshv_chunk_stride() to handle the > > mis-aligned case. Doing so it even easier than I first envisioned. 
> > I think this works: > > > > @@ -49,7 +49,8 @@ static int mshv_chunk_stride(struct page *page, > > */ > > if (page_order && > > IS_ALIGNED(gfn, PTRS_PER_PMD) && > > - IS_ALIGNED(page_count, PTRS_PER_PMD)) > > + IS_ALIGNED(page_count, PTRS_PER_PMD) && > > + IS_ALIGNED(page_to_pfn(page), PTRS_PER_PMD)) > > return 1 << page_order; > > > > return 1; > > > > But as we discussed earlier, this fix means never getting 2M mappings > > in the guest for a region that is mis-aligned. > > > > Although I understand the logic behind this fix, I’m hesitant to add it > because it looks like a workaround for a VMM bug that could bite back. > The approach you propose will silently map a huge page as a collection > of 4K pages, impacting guest performance (this will be especially > visible for a region containing a single huge page). > > This fix silently allows such behavior instead of reporting it as an > error to user space. It’s worth noting that pinned-region population and > mapping happen upon ioctl invocation, so the VMM will either get an > error from the hypervisor (current behavior) or get a region mapped with > 4K pages (proposed behavior). > > The first case is an explicit error; the second — although it allows > adding a region — will be less performant, significantly increase region > mapping time and thus potentailly guest spin-up (creation) time, and be > less noticeable to customers, especially those who don’t really > understand what’s happening under the hood and simply stumbled upon some > VMM bug. > > What’s your take? > Yes, I agree with everything you say. Silently dropping into a mode where guest performance might be noticeably affected is usually not a good thing. So if the VMM code is OK with the restriction, then I'm fine with adding an explicit alignment check in the ioctl path code to disallow the mis-aligned case. An explicit check is needed because the code "as is" is somewhat flakey as I pointed out earlier. 
Mis-aligned pinned regions will succeed if the host doesn't allocate any 2M pages, but will fail if it does. And mis-aligned movable regions silently go into the mode of doing all 4K mappings. An explicit check in the ioctl path avoids the flakiness and makes pinned and movable regions have consistent requirements. On the flip side: The ioctl that creates a region is only used by the VMM, not by random end-user provided code like the system call API or general ioctls. As such, I could see the VMM wanting mis-aligned regions to work, with the understanding that there is potential perf impact. The VMM is sophisticated system software, and it may want to take the responsibility for making that tradeoff rather than have the kernel enforce a requirement. There may be cases where it makes sense to create small regions that are mis-aligned. I just don't know what the VMM needs or wants to do in creating regions. So it's hard for me to lean either way. I think the question must go to the VMM folks. Michael ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] mshv: Align huge page stride with guest mapping 2026-01-05 18:07 ` Michael Kelley @ 2026-01-05 19:47 ` Stanislav Kinsburskii 2026-01-07 18:39 ` Stanislav Kinsburskii 0 siblings, 1 reply; 18+ messages in thread From: Stanislav Kinsburskii @ 2026-01-05 19:47 UTC (permalink / raw) To: Michael Kelley Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com, linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org On Mon, Jan 05, 2026 at 06:07:00PM +0000, Michael Kelley wrote: > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Monday, January 5, 2026 9:25 AM > > > > On Sat, Jan 03, 2026 at 01:16:51AM +0000, Michael Kelley wrote: > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 3:35 PM > > > > > > > > On Fri, Jan 02, 2026 at 09:13:31PM +0000, Michael Kelley wrote: > > > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 12:03 PM > > > > > > > > [snip] > > > > > > > > > > > > > I think see your point, but I also think this issue doesn't exist, > > > > > > because map_chunk_stride() returns huge page stride iff page if: > > > > > > 1. the folio order is PMD_ORDER and > > > > > > 2. GFN is huge page aligned and > > > > > > 3. number of 4K pages is huge pages aligned. > > > > > > > > > > > > On other words, a host huge page won't be mapped as huge if the page > > > > > > can't be mapped as huge in the guest. > > > > > > > > > > OK, I'm missing how what you say is true. For pinned regions, > > > > > the memory is allocated and mapped into the host userspace address > > > > > first, as done by mshv_prepare_pinned_region() calling mshv_region_pin(), > > > > > which calls pin_user_pages_fast(). This is all done without considering > > > > > the GFN or GFN alignment. So one or more 2M pages might be allocated > > > > > and mapped in the host before any guest mapping is looked at. Agreed? 
> > > > > > > > > > > > > Agreed. > > > > > > > > > Then mshv_prepare_pinned_region() calls mshv_region_map() to do the > > > > > guest mapping. This eventually gets down to mshv_chunk_stride(). In > > > > > mshv_chunk_stride() when your conditions #2 and #3 are met, the > > > > > corresponding struct page argument to mshv_chunk_stride() may be a > > > > > 4K page that is in the middle of a 2M page instead of at the beginning > > > > > (if the region is mis-aligned). But the key point is that the 4K page in > > > > > the middle is part of a folio that will return a folio order of PMD_ORDER. > > > > > I.e., a folio order of PMD_ORDER is not sufficient to ensure that the > > > > > struct page arg is at the *start* of a 2M-aligned physical memory range > > > > > that can be mapped into the guest as a 2M page. > > > > > > > > > > > > > I'm trying to undestand how this can even happen, so please bear with > > > > me. > > > > In other words (and AFAIU), what you are saying in the following: > > > > > > > > 1. VMM creates a mapping with a huge page(s) (this implies that virtual > > > > address is huge page aligned, size is huge page aligned and physical > > > > pages are consequtive). > > > > 2. VMM tries to create a region via ioctl, but instead of passing the > > > > start of the region, is passes an offset into one of the the region's > > > > huge pages, and in the same time with the base GFN and the size huge > > > > page aligned (to meet the #2 and #3 conditions). > > > > 3. mshv_chunk_stride() sees a folio order of PMD_ORDER, and tries to map > > > > the corresponding pages as huge, which will be rejected by the > > > > hypervisor. > > > > > > > > Is this accurate? > > > > > > Yes, pretty much. In Step 1, the VMM may just allocate some virtual > > > address space, and not do anything to populate it with physical pages. > > > So populating with any 2M pages may not happen until Step 2 when > > > the ioctl calls pin_user_pages_fast(). 
But *when* the virtual address > > > space gets populated with physical pages doesn't really matter. We > > > just know that it happens before the ioctl tries to map the memory > > > into the guest -- i.e., mshv_prepare_pinned_region() calls > > > mshv_region_map(). > > > > > > And yes, the problem is what you call out in Step 2: as input to the > > > ioctl, the fields "userspace_addr" and "guest_pfn" in struct > > > mshv_user_mem_region could have different alignments modulo 2M > > > boundaries. When they are different, that's what I'm calling a "mis-aligned > > > region", (referring to a struct mshv_mem_region that is created and > > > setup by the ioctl). > > > > > > > A subsequent question: if it is accurate, why does the driver need to > > > > support this case? It looks like a VMM bug to me. > > > > > > I don't know if the driver needs to support this case. That's a question > > > for the VMM people to answer. I wouldn't necessarily assume that the > > > VMM always allocates virtual address space with exactly the size and > > > alignment that matches the regions it creates with the ioctl. The > > > kernel ioctl doesn't care how the VMM allocates and manages its > > > virtual address space, so the VMM is free to do whatever it wants > > > in that regard, as long as it meets the requirements of the ioctl. So > > > the requirements of the ioctl in this case are something to be > > > negotiated with the VMM. > > > > > > > Also, how should it support it? By rejecting such requests in the ioctl? > > > > > > Rejecting requests to create a mis-aligned region is certainly one option > > > if the VMM agrees that's OK. The ioctl currently requires only that > > > "userspace_addr" and "size" be page aligned, so those requirements > > > could be tightened. > > > > > > The other approach is to fix mshv_chunk_stride() to handle the > > > mis-aligned case. Doing so is even easier than I first envisioned.
> > > I think this works: > > > > > > @@ -49,7 +49,8 @@ static int mshv_chunk_stride(struct page *page, > > > */ > > > if (page_order && > > > IS_ALIGNED(gfn, PTRS_PER_PMD) && > > > - IS_ALIGNED(page_count, PTRS_PER_PMD)) > > > + IS_ALIGNED(page_count, PTRS_PER_PMD) && > > > + IS_ALIGNED(page_to_pfn(page), PTRS_PER_PMD)) > > > return 1 << page_order; > > > > > > return 1; > > > > > > But as we discussed earlier, this fix means never getting 2M mappings > > > in the guest for a region that is mis-aligned. > > > > > > > Although I understand the logic behind this fix, I’m hesitant to add it > > because it looks like a workaround for a VMM bug that could bite back. > > The approach you propose will silently map a huge page as a collection > > of 4K pages, impacting guest performance (this will be especially > > visible for a region containing a single huge page). > > > > This fix silently allows such behavior instead of reporting it as an > > error to user space. It’s worth noting that pinned-region population and > > mapping happen upon ioctl invocation, so the VMM will either get an > > error from the hypervisor (current behavior) or get a region mapped with > > 4K pages (proposed behavior). > > > > The first case is an explicit error; the second — although it allows > > adding a region — will be less performant, significantly increase region > > mapping time and thus potentially guest spin-up (creation) time, and be > > less noticeable to customers, especially those who don’t really > > understand what’s happening under the hood and simply stumbled upon some > > VMM bug. > > > > What’s your take? > > > > Yes, I agree with everything you say. Silently dropping into a mode where > guest performance might be noticeably affected is usually not a good > thing. So if the VMM code is OK with the restriction, then I'm fine with > adding an explicit alignment check in the ioctl path code to disallow the > mis-aligned case.
> But the explicit alignment check in the ioctl is already there. The only difference is that it's done in the hypervisor and not in the kernel. > An explicit check is needed because the code "as is" is somewhat flaky > as I pointed out earlier. Mis-aligned pinned regions will succeed if the > host doesn't allocate any 2M pages, but will fail if it does. And mis-aligned > movable regions silently go into the mode of doing all 4K mappings. An > explicit check in the ioctl path avoids the flakiness and makes pinned > and movable regions have consistent requirements. > > On the flip side: The ioctl that creates a region is only used by the VMM, > not by random end-user provided code like the system call API or general > ioctls. As such, I could see the VMM wanting mis-aligned regions to work, > with the understanding that there is potential perf impact. The VMM is > sophisticated system software, and it may want to take the responsibility > for making that tradeoff rather than have the kernel enforce a requirement. > There may be cases where it makes sense to create small regions that are > mis-aligned. I just don't know what the VMM needs or wants to do in > creating regions. > That's a fair point. Let me loop back with the VMM folks and see what they think. Thanks, Stanislav > So it's hard for me to lean either way. I think the question must go > to the VMM folks. > > Michael > > > > > > > > ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] mshv: Align huge page stride with guest mapping 2026-01-05 19:47 ` Stanislav Kinsburskii @ 2026-01-07 18:39 ` Stanislav Kinsburskii 0 siblings, 0 replies; 18+ messages in thread From: Stanislav Kinsburskii @ 2026-01-07 18:39 UTC (permalink / raw) To: Michael Kelley Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com, linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org On Mon, Jan 05, 2026 at 11:47:14AM -0800, Stanislav Kinsburskii wrote: > On Mon, Jan 05, 2026 at 06:07:00PM +0000, Michael Kelley wrote: > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Monday, January 5, 2026 9:25 AM > > > > > > On Sat, Jan 03, 2026 at 01:16:51AM +0000, Michael Kelley wrote: > > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 3:35 PM > > > > > > > > > > On Fri, Jan 02, 2026 at 09:13:31PM +0000, Michael Kelley wrote: > > > > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, January 2, 2026 12:03 PM > > > > > > > > > > > [snip] > > > > > > > > > > > > > > > > I think I see your point, but I also think this issue doesn't exist, > > > > > > > because map_chunk_stride() returns the huge page stride iff: > > > > > > > 1. the folio order is PMD_ORDER and > > > > > > > 2. GFN is huge page aligned and > > > > > > > 3. number of 4K pages is huge page aligned. > > > > > > > > > > > > > > In other words, a host huge page won't be mapped as huge if the page > > > > > > > can't be mapped as huge in the guest. > > > > > > > > > > > > OK, I'm missing how what you say is true. For pinned regions, > > > > > > the memory is allocated and mapped into the host userspace address > > > > > > first, as done by mshv_prepare_pinned_region() calling mshv_region_pin(), > > > > > > which calls pin_user_pages_fast(). This is all done without considering > > > > > > the GFN or GFN alignment.
So one or more 2M pages might be allocated > > > > > > and mapped in the host before any guest mapping is looked at. Agreed? > > > > > > > > > > > > > > > > Agreed. > > > > > > > > > > > Then mshv_prepare_pinned_region() calls mshv_region_map() to do the > > > > > > guest mapping. This eventually gets down to mshv_chunk_stride(). In > > > > > > mshv_chunk_stride() when your conditions #2 and #3 are met, the > > > > > > corresponding struct page argument to mshv_chunk_stride() may be a > > > > > > 4K page that is in the middle of a 2M page instead of at the beginning > > > > > > (if the region is mis-aligned). But the key point is that the 4K page in > > > > > > the middle is part of a folio that will return a folio order of PMD_ORDER. > > > > > > I.e., a folio order of PMD_ORDER is not sufficient to ensure that the > > > > > > struct page arg is at the *start* of a 2M-aligned physical memory range > > > > > > that can be mapped into the guest as a 2M page. > > > > > > > > > > > > > > > > I'm trying to understand how this can even happen, so please bear with > > > > > me. > > > > > In other words (and AFAIU), what you are saying is the following: > > > > > > > > > > 1. VMM creates a mapping with a huge page(s) (this implies that virtual > > > > > address is huge page aligned, size is huge page aligned and physical > > > > > pages are consecutive). > > > > > 2. VMM tries to create a region via ioctl, but instead of passing the > > > > > start of the region, it passes an offset into one of the region's > > > > > huge pages, and at the same time with the base GFN and the size huge > > > > > page aligned (to meet the #2 and #3 conditions). > > > > > 3. mshv_chunk_stride() sees a folio order of PMD_ORDER, and tries to map > > > > > the corresponding pages as huge, which will be rejected by the > > > > > hypervisor. > > > > > > > > > > Is this accurate? > > > > > > > > Yes, pretty much.
In Step 1, the VMM may just allocate some virtual > > > > address space, and not do anything to populate it with physical pages. > > > > So populating with any 2M pages may not happen until Step 2 when > > > > the ioctl calls pin_user_pages_fast(). But *when* the virtual address > > > > space gets populated with physical pages doesn't really matter. We > > > > just know that it happens before the ioctl tries to map the memory > > > > into the guest -- i.e., mshv_prepare_pinned_region() calls > > > > mshv_region_map(). > > > > > > > > And yes, the problem is what you call out in Step 2: as input to the > > > > ioctl, the fields "userspace_addr" and "guest_pfn" in struct > > > > mshv_user_mem_region could have different alignments modulo 2M > > > > boundaries. When they are different, that's what I'm calling a "mis-aligned > > > > region", (referring to a struct mshv_mem_region that is created and > > > > setup by the ioctl). > > > > > > > > > A subsequent question: if it is accurate, why does the driver need to > > > > > support this case? It looks like a VMM bug to me. > > > > > > > > I don't know if the driver needs to support this case. That's a question > > > > for the VMM people to answer. I wouldn't necessarily assume that the > > > > VMM always allocates virtual address space with exactly the size and > > > > alignment that matches the regions it creates with the ioctl. The > > > > kernel ioctl doesn't care how the VMM allocates and manages its > > > > virtual address space, so the VMM is free to do whatever it wants > > > > in that regard, as long as it meets the requirements of the ioctl. So > > > > the requirements of the ioctl in this case are something to be > > > > negotiated with the VMM. > > > > > > > > > Also, how should it support it? By rejecting such requests in the ioctl? > > > > > > > > Rejecting requests to create a mis-aligned region is certainly one option > > > > if the VMM agrees that's OK.
The ioctl currently requires only that > > > > "userspace_addr" and "size" be page aligned, so those requirements > > > > could be tightened. > > > > > > > > The other approach is to fix mshv_chunk_stride() to handle the > > > > mis-aligned case. Doing so is even easier than I first envisioned. > > > > I think this works: > > > > > > > > @@ -49,7 +49,8 @@ static int mshv_chunk_stride(struct page *page, > > > > */ > > > > if (page_order && > > > > IS_ALIGNED(gfn, PTRS_PER_PMD) && > > > > - IS_ALIGNED(page_count, PTRS_PER_PMD)) > > > > + IS_ALIGNED(page_count, PTRS_PER_PMD) && > > > > + IS_ALIGNED(page_to_pfn(page), PTRS_PER_PMD)) > > > > return 1 << page_order; > > > > > > > > return 1; > > > > > > > > But as we discussed earlier, this fix means never getting 2M mappings > > > > in the guest for a region that is mis-aligned. > > > > > > > > > > Although I understand the logic behind this fix, I’m hesitant to add it > > > because it looks like a workaround for a VMM bug that could bite back. > > > The approach you propose will silently map a huge page as a collection > > > of 4K pages, impacting guest performance (this will be especially > > > visible for a region containing a single huge page). > > > > > > This fix silently allows such behavior instead of reporting it as an > > > error to user space. It’s worth noting that pinned-region population and > > > mapping happen upon ioctl invocation, so the VMM will either get an > > > error from the hypervisor (current behavior) or get a region mapped with > > > 4K pages (proposed behavior). > > > > > > The first case is an explicit error; the second — although it allows > > > adding a region — will be less performant, significantly increase region > > > mapping time and thus potentially guest spin-up (creation) time, and be > > > less noticeable to customers, especially those who don’t really > > > understand what’s happening under the hood and simply stumbled upon some > > > VMM bug. > > > > > > What’s your take?
> > > > > > Yes, I agree with everything you say. Silently dropping into a mode where > > guest performance might be noticeably affected is usually not a good > > thing. So if the VMM code is OK with the restriction, then I'm fine with > > adding an explicit alignment check in the ioctl path code to disallow the > > mis-aligned case. > > > > But the explicit alignment check in the ioctl is already there. The only > difference is that it's done in the hypervisor and not in the kernel. > > > An explicit check is needed because the code "as is" is somewhat flaky > > as I pointed out earlier. Mis-aligned pinned regions will succeed if the > > host doesn't allocate any 2M pages, but will fail if it does. And mis-aligned > > movable regions silently go into the mode of doing all 4K mappings. An > > explicit check in the ioctl path avoids the flakiness and makes pinned > > and movable regions have consistent requirements. > > > > On the flip side: The ioctl that creates a region is only used by the VMM, > > not by random end-user provided code like the system call API or general > > ioctls. As such, I could see the VMM wanting mis-aligned regions to work, > > with the understanding that there is potential perf impact. The VMM is > > sophisticated system software, and it may want to take the responsibility > > for making that tradeoff rather than have the kernel enforce a requirement. > > There may be cases where it makes sense to create small regions that are > > mis-aligned. I just don't know what the VMM needs or wants to do in > > creating regions. > > > > That's a fair point. Let me loop back with the VMM folks and see what > they think. > After discussion, we decided to proceed with the implicit approach. I'll send an update soon. Thanks, Stanislav > Thanks, > Stanislav > > So it's hard for me to lean either way. I think the question must go > to the VMM folks.
> > > > Michael > > > > > > > > > > > > > > > > ^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] mshv: Align huge page stride with guest mapping 2025-12-22 18:25 ` Michael Kelley 2025-12-23 15:51 ` Michael Kelley @ 2025-12-23 16:27 ` Stanislav Kinsburskii 1 sibling, 0 replies; 18+ messages in thread From: Stanislav Kinsburskii @ 2025-12-23 16:27 UTC (permalink / raw) To: Michael Kelley Cc: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org, decui@microsoft.com, longli@microsoft.com, linux-hyperv@vger.kernel.org, linux-kernel@vger.kernel.org On Mon, Dec 22, 2025 at 06:25:02PM +0000, Michael Kelley wrote: > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Friday, December 19, 2025 2:54 PM > > > > On Thu, Dec 18, 2025 at 07:41:24PM +0000, Michael Kelley wrote: > > > From: Stanislav Kinsburskii <skinsburskii@linux.microsoft.com> Sent: Tuesday, > > December 16, 2025 4:41 PM > > > > > > > > Ensure that a stride larger than 1 (huge page) is only used when both > > > > the guest frame number (gfn) and the operation size (page_count) are > > > > aligned to the huge page size (PTRS_PER_PMD). This matches the > > > > hypervisor requirement that map/unmap operations for huge pages must be > > > > guest-aligned and cover a full huge page. > > > > > > > > Add mshv_chunk_stride() to encapsulate this alignment and page-order > > > > validation, and plumb a huge_page flag into the region chunk handlers. > > > > This prevents issuing large-page map/unmap/share operations that the > > > > hypervisor would reject due to misaligned guest mappings. > > > > > > This code looks good to me on the surface. But I can only make an educated > > > guess as to the hypervisor behavior in certain situations, and if my guess is > > > correct there's still a flaw in one case. > > > > > > Consider the madvise() DONTNEED experiment that I previously called out. [1] > > > I surmise that the intent of this patch is to make that case work correctly. 
> > > When the .invalidate callback is made for the 32 Kbyte range embedded in > > > a previously mapped 2 Meg page, this new code detects that case. It calls the > > > hypervisor to remap the 32 Kbyte range for no access, and clears the 8 > > > corresponding entries in the struct page array attached to the mshv region. The > > > call to the hypervisor is made *without* the HV_MAP_GPA_LARGE_PAGE flag. > > > Since the mapping was originally done *with* the HV_MAP_GPA_LARGE_PAGE > > > flag, my guess is that the hypervisor is smart enough to handle this case by > > > splitting the 2 Meg mapping it created, setting the 32 Kbyte range to no access, > > > and returning "success". If my guess is correct, there's no problem here. > > > > > > But then there's a second .invalidate callback for the entire 2 Meg page. Here's > > > the call stack: > > > > > > [ 194.259337] dump_stack+0x14/0x20 > > > [ 194.259339] mhktest_invalidate+0x2a/0x40 [my dummy invalidate callback] > > > [ 194.259342] __mmu_notifier_invalidate_range_start+0x1f4/0x250 > > > [ 194.259347] __split_huge_pmd+0x14f/0x170 > > > [ 194.259349] unmap_page_range+0x104d/0x1a00 > > > [ 194.259358] unmap_single_vma+0x7d/0xc0 > > > [ 194.259360] zap_page_range_single_batched+0xe0/0x1c0 > > > [ 194.259363] madvise_vma_behavior+0xb01/0xc00 > > > [ 194.259366] madvise_do_behavior.part.0+0x3cd/0x4a0 > > > [ 194.259375] do_madvise+0xc7/0x170 > > > [ 194.259380] __x64_sys_madvise+0x2f/0x40 > > > [ 194.259382] x64_sys_call+0x1d77/0x21b0 > > > [ 194.259385] do_syscall_64+0x56/0x640 > > > [ 194.259388] entry_SYSCALL_64_after_hwframe+0x76/0x7e > > > > > > In __split_huge_pmd(), the .invalidate callback is made *before* the 2 Meg > > > page is actually split by the root partition. So mshv_chunk_stride() returns "9" > > > for the stride, and the hypervisor is called with HV_MAP_GPA_LARGE_PAGE > > > set. My guess is that the hypervisor returns an error because it has already > > > split the mapping. 
The whole point of this patch set is to avoid passing > > > HV_MAP_GPA_LARGE_PAGE to the hypervisor when the hypervisor mapping > > > is not a large page mapping, but this looks like a case where it still happens. > > > > > > My concern is solely from looking at the code and thinking about the problem, > > > as I don't have an environment where I can test root partition interactions > > > with the hypervisor. So maybe I'm missing something. Lemme know what you > > > think ..... > > > > > > > Yeah, I see your point: according to this stack, once a part of the page > > is invalidated, the folio order remains the same until another invocation > > of the same callback - this time for the whole huge > > page - is made. Thus, the stride is still reported as the huge page size, > > even though a part of the page has already been unmapped. > > > > This indeed looks like a flaw in the current approach, but it's actually > > not. The reason is that upon the invalidation callback, the driver > > simply remaps the whole huge page with no access (in this case, the PFNs > > provided to the hypervisor are zero), and it's fine as the hypervisor > > simply drops all the pages from the previous mapping and marks this page > > as inaccessible. The only check the hypervisor makes in this case is > > that both the GFN and mapping size are huge page aligned (which they are > > in this case). > > > > I hope this clarifies the situation. Please let me know if you have any > > other questions. > > Thanks. Yes, this clarifies. My guess about the hypervisor behavior was wrong. > Based on what you've said about what the hypervisor does, and further studying > MSHV code, here's my recap of the HV_MAP_GPA_LARGE_PAGE flag: > > 1. The hypervisor uses the flag to determine the granularity (4K or 2M) of the > mapping HVCALL_MAP_GPA_PAGES or HVCALL_UNMAP_GPA_PAGES will > create/remove. As such, the hypercall "repcount" is in this granularity. 
GFNs, > such as the target_gpa_base input parameter and GFNs in the pfn_array, are > always 4K GFNs, but if the flag is set, a GFN is treated as the first 4K GFN in > a contiguous 2M range. If the flag is set, the target_gpa_base GFN must be > 2M aligned. > > 2. The hypervisor doesn't care whether any existing mapping is 4K or 2M. It > always removes an existing mapping, including splitting any 2M mappings if > necessary. Then if the operation is to create/re-create a mapping, it creates > an appropriate new mapping. > > My error was in thinking that the flag had to match any existing mapping. > But the behavior you've clarified is certainly better. It handles the vagaries > of the Linux "mm" subsystem, which in one case in my original experiment > (madvise) invalidates the small range, then the 2M range, but the other > case (mprotect) invalidates the 2M range, then the small range. > > Since there's no documentation for these root partition hypercalls, it sure > would be nice if this info could be captured in code comments for some > future developer to benefit from. If that's not something you want to > worry about, I could submit a patch later to add the code comments > (subject to your review, of course). > Please feel free to add the comments you see fit. I think you'll do it better as you have a better understanding of what needs to be documented. Thanks, Stanislav > Separately, in looking at this, I spotted another potential problem with > 2 Meg mappings that somewhat depends on hypervisor behavior that I'm > not clear on. To create a new region, the user space VMM issues the > MSHV_GET_GUEST_MEMORY ioctl, specifying the userspace address, the > size, and the guest PFN. The only requirement on these values is that the > userspace address and size be page aligned. But suppose a 4 Meg region is > specified where the userspace address and the guest PFN have different > offsets modulo 2 Meg.
The userspace address range gets populated first, > and may contain a 2 Meg large page. Then when mshv_chunk_stride() > detects a 2 Meg aligned guest PFN so HVCALL_MAP_GPA_PAGES can be told > to create a 2 Meg mapping for the guest, the corresponding system PFN in > the page array may not be 2 Meg aligned. What does the hypervisor do in > this case? It can't create a 2 Meg mapping, right? So does it silently fallback > to creating 4K mappings, or does it return an error? Returning an error would > seem to be problematic for movable pages because the error wouldn't > occur until the guest VM is running and takes a range fault on the region. > Silently falling back to creating 4K mappings has performance implications, > though I guess it would work. My question is whether the > MSHV_GET_GUEST_MEMORY ioctl should detect this case and return an > error immediately. > > Michael > > > > > > > [1] https://lore.kernel.org/linux-hyperv/SN6PR02MB4157978DFAA6C2584D0678E1D4A1A@SN6PR02MB4157.namprd02.prod.outlook.com/ ^ permalink raw reply [flat|nested] 18+ messages in thread
end of thread, other threads:[~2026-01-07 18:40 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-12-17  0:41 [PATCH] mshv: Align huge page stride with guest mapping Stanislav Kinsburskii
2025-12-18 19:41 ` Michael Kelley
2025-12-19 22:53 ` Stanislav Kinsburskii
2025-12-22 18:25 ` Michael Kelley
2025-12-23 15:51 ` Michael Kelley
2025-12-23 16:26 ` Stanislav Kinsburskii
2025-12-23 19:17 ` Michael Kelley
2026-01-02 17:42 ` Stanislav Kinsburskii
2026-01-02 18:04 ` Michael Kelley
2026-01-02 20:03 ` Stanislav Kinsburskii
2026-01-02 21:13 ` Michael Kelley
2026-01-02 23:35 ` Stanislav Kinsburskii
2026-01-03  1:16 ` Michael Kelley
2026-01-05 17:25 ` Stanislav Kinsburskii
2026-01-05 18:07 ` Michael Kelley
2026-01-05 19:47 ` Stanislav Kinsburskii
2026-01-07 18:39 ` Stanislav Kinsburskii
2025-12-23 16:27 ` Stanislav Kinsburskii