* [PATCH] i386/kvm: Prefault memory on page state change
@ 2025-03-28 20:30 Tom Lendacky
2025-06-02 13:17 ` Tom Lendacky
2025-06-03 7:41 ` Xiaoyao Li
From: Tom Lendacky @ 2025-03-28 20:30 UTC (permalink / raw)
To: qemu-devel, kvm; +Cc: Paolo Bonzini, Marcelo Tosatti, Michael Roth
A page state change is typically followed by an access of the page(s), which
results in another VMEXIT in order to map the page into the nested page
table. Depending on the size of the page state change request, this can
generate a number of additional VMEXITs. For example, under SNP, when
Linux is utilizing lazy memory acceptance, memory is typically accepted in
4M chunks. A page state change request is submitted to mark the pages as
private, followed by validation of the memory. Since guest_memfd
currently only supports 4K pages, each page validation results in a
VMEXIT to map the page, for 1024 additional exits.
When performing a page state change, invoke KVM_PRE_FAULT_MEMORY for the
size of the page state change in order to pre-map the pages and avoid the
additional VMEXITs. This helps speed up boot times.
Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
accel/kvm/kvm-all.c | 2 ++
include/system/kvm.h | 1 +
target/i386/kvm/kvm.c | 31 ++++++++++++++++++++++++++-----
3 files changed, 29 insertions(+), 5 deletions(-)
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index f89568bfa3..0cd487cea7 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -93,6 +93,7 @@ bool kvm_allowed;
bool kvm_readonly_mem_allowed;
bool kvm_vm_attributes_allowed;
bool kvm_msi_use_devid;
+bool kvm_pre_fault_memory_supported;
static bool kvm_has_guest_debug;
static int kvm_sstep_flags;
static bool kvm_immediate_exit;
@@ -2732,6 +2733,7 @@ static int kvm_init(MachineState *ms)
kvm_check_extension(s, KVM_CAP_GUEST_MEMFD) &&
kvm_check_extension(s, KVM_CAP_USER_MEMORY2) &&
(kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE);
+ kvm_pre_fault_memory_supported = kvm_vm_check_extension(s, KVM_CAP_PRE_FAULT_MEMORY);
if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
diff --git a/include/system/kvm.h b/include/system/kvm.h
index ab17c09a55..492ea8a383 100644
--- a/include/system/kvm.h
+++ b/include/system/kvm.h
@@ -42,6 +42,7 @@ extern bool kvm_gsi_routing_allowed;
extern bool kvm_gsi_direct_mapping;
extern bool kvm_readonly_mem_allowed;
extern bool kvm_msi_use_devid;
+extern bool kvm_pre_fault_memory_supported;
#define kvm_enabled() (kvm_allowed)
/**
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 6c749d4ee8..7c39d30c5f 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -5999,9 +5999,11 @@ static bool host_supports_vmx(void)
* because private/shared page tracking is already provided through other
* means, these 2 use-cases should be treated as being mutually-exclusive.
*/
-static int kvm_handle_hc_map_gpa_range(struct kvm_run *run)
+static int kvm_handle_hc_map_gpa_range(X86CPU *cpu, struct kvm_run *run)
{
+ struct kvm_pre_fault_memory mem;
uint64_t gpa, size, attributes;
+ int ret;
if (!machine_require_guest_memfd(current_machine))
return -EINVAL;
@@ -6012,13 +6014,32 @@ static int kvm_handle_hc_map_gpa_range(struct kvm_run *run)
trace_kvm_hc_map_gpa_range(gpa, size, attributes, run->hypercall.flags);
- return kvm_convert_memory(gpa, size, attributes & KVM_MAP_GPA_RANGE_ENCRYPTED);
+ ret = kvm_convert_memory(gpa, size, attributes & KVM_MAP_GPA_RANGE_ENCRYPTED);
+ if (ret || !kvm_pre_fault_memory_supported) {
+ return ret;
+ }
+
+ /*
+ * Opportunistically pre-fault memory in. Failures are ignored so that any
+ * errors in faulting in the memory will get captured in KVM page fault
+ * path when the guest first accesses the page.
+ */
+ memset(&mem, 0, sizeof(mem));
+ mem.gpa = gpa;
+ mem.size = size;
+ while (mem.size) {
+ if (kvm_vcpu_ioctl(CPU(cpu), KVM_PRE_FAULT_MEMORY, &mem)) {
+ break;
+ }
+ }
+
+ return 0;
}
-static int kvm_handle_hypercall(struct kvm_run *run)
+static int kvm_handle_hypercall(X86CPU *cpu, struct kvm_run *run)
{
if (run->hypercall.nr == KVM_HC_MAP_GPA_RANGE)
- return kvm_handle_hc_map_gpa_range(run);
+ return kvm_handle_hc_map_gpa_range(cpu, run);
return -EINVAL;
}
@@ -6118,7 +6139,7 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
break;
#endif
case KVM_EXIT_HYPERCALL:
- ret = kvm_handle_hypercall(run);
+ ret = kvm_handle_hypercall(cpu, run);
break;
default:
fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
base-commit: 0f15892acaf3f50ecc20c6dad4b3ebdd701aa93e
--
2.46.2
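
For reference, a minimal user-space sketch of the pre-fault loop above, assuming kernel headers that define KVM_PRE_FAULT_MEMORY and its documented semantics: on success or partial completion the kernel writes back an advanced mem.gpa and a shrunken mem.size, so the loop either finishes the range or stops at the first hard error. The prefault_range() name, the vcpu_fd parameter, and the bare ioctl() call are illustrative stand-ins for QEMU's kvm_vcpu_ioctl(), not part of the patch.

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Opportunistically pre-fault the just-converted range [gpa, gpa + size). */
    static void prefault_range(int vcpu_fd, __u64 gpa, __u64 size)
    {
        struct kvm_pre_fault_memory mem;

        memset(&mem, 0, sizeof(mem));
        mem.gpa = gpa;
        mem.size = size;

        while (mem.size) {
            /*
             * On success or partial completion, KVM updates mem.gpa and
             * mem.size to the remaining range, so retrying makes forward
             * progress. Any failure is simply ignored: the guest's first
             * access will go through the normal KVM page-fault path.
             */
            if (ioctl(vcpu_fd, KVM_PRE_FAULT_MEMORY, &mem)) {
                break;
            }
        }
    }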
* Re: [PATCH] i386/kvm: Prefault memory on page state change
2025-03-28 20:30 [PATCH] i386/kvm: Prefault memory on page state change Tom Lendacky
@ 2025-06-02 13:17 ` Tom Lendacky
2025-06-03 7:41 ` Xiaoyao Li
From: Tom Lendacky @ 2025-06-02 13:17 UTC (permalink / raw)
To: qemu-devel@nongnu.org, kvm@vger.kernel.org
Cc: Paolo Bonzini, Marcelo Tosatti, Roth, Michael
On 3/28/25 15:30, Lendacky, Thomas wrote:
> A page state change is typically followed by an access of the page(s) and
> results in another VMEXIT in order to map the page into the nested page
> table. Depending on the size of page state change request, this can
> generate a number of additional VMEXITs. For example, under SNP, when
> Linux is utilizing lazy memory acceptance, memory is typically accepted in
> 4M chunks. A page state change request is submitted to mark the pages as
> private, followed by validation of the memory. Since the guest_memfd
> currently only supports 4K pages, each page validation will result in
> VMEXIT to map the page, resulting in 1024 additional exits.
>
> When performing a page state change, invoke KVM_PRE_FAULT_MEMORY for the
> size of the page state change in order to pre-map the pages and avoid the
> additional VMEXITs. This helps speed up boot times.
Ping...
>
> Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
> ---
> accel/kvm/kvm-all.c | 2 ++
> include/system/kvm.h | 1 +
> target/i386/kvm/kvm.c | 31 ++++++++++++++++++++++++++-----
> 3 files changed, 29 insertions(+), 5 deletions(-)
>
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index f89568bfa3..0cd487cea7 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -93,6 +93,7 @@ bool kvm_allowed;
> bool kvm_readonly_mem_allowed;
> bool kvm_vm_attributes_allowed;
> bool kvm_msi_use_devid;
> +bool kvm_pre_fault_memory_supported;
> static bool kvm_has_guest_debug;
> static int kvm_sstep_flags;
> static bool kvm_immediate_exit;
> @@ -2732,6 +2733,7 @@ static int kvm_init(MachineState *ms)
> kvm_check_extension(s, KVM_CAP_GUEST_MEMFD) &&
> kvm_check_extension(s, KVM_CAP_USER_MEMORY2) &&
> (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE);
> + kvm_pre_fault_memory_supported = kvm_vm_check_extension(s, KVM_CAP_PRE_FAULT_MEMORY);
>
> if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
> s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
> diff --git a/include/system/kvm.h b/include/system/kvm.h
> index ab17c09a55..492ea8a383 100644
> --- a/include/system/kvm.h
> +++ b/include/system/kvm.h
> @@ -42,6 +42,7 @@ extern bool kvm_gsi_routing_allowed;
> extern bool kvm_gsi_direct_mapping;
> extern bool kvm_readonly_mem_allowed;
> extern bool kvm_msi_use_devid;
> +extern bool kvm_pre_fault_memory_supported;
>
> #define kvm_enabled() (kvm_allowed)
> /**
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index 6c749d4ee8..7c39d30c5f 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -5999,9 +5999,11 @@ static bool host_supports_vmx(void)
> * because private/shared page tracking is already provided through other
> * means, these 2 use-cases should be treated as being mutually-exclusive.
> */
> -static int kvm_handle_hc_map_gpa_range(struct kvm_run *run)
> +static int kvm_handle_hc_map_gpa_range(X86CPU *cpu, struct kvm_run *run)
> {
> + struct kvm_pre_fault_memory mem;
> uint64_t gpa, size, attributes;
> + int ret;
>
> if (!machine_require_guest_memfd(current_machine))
> return -EINVAL;
> @@ -6012,13 +6014,32 @@ static int kvm_handle_hc_map_gpa_range(struct kvm_run *run)
>
> trace_kvm_hc_map_gpa_range(gpa, size, attributes, run->hypercall.flags);
>
> - return kvm_convert_memory(gpa, size, attributes & KVM_MAP_GPA_RANGE_ENCRYPTED);
> + ret = kvm_convert_memory(gpa, size, attributes & KVM_MAP_GPA_RANGE_ENCRYPTED);
> + if (ret || !kvm_pre_fault_memory_supported) {
> + return ret;
> + }
> +
> + /*
> + * Opportunistically pre-fault memory in. Failures are ignored so that any
> + * errors in faulting in the memory will get captured in KVM page fault
> + * path when the guest first accesses the page.
> + */
> + memset(&mem, 0, sizeof(mem));
> + mem.gpa = gpa;
> + mem.size = size;
> + while (mem.size) {
> + if (kvm_vcpu_ioctl(CPU(cpu), KVM_PRE_FAULT_MEMORY, &mem)) {
> + break;
> + }
> + }
> +
> + return 0;
> }
>
> -static int kvm_handle_hypercall(struct kvm_run *run)
> +static int kvm_handle_hypercall(X86CPU *cpu, struct kvm_run *run)
> {
> if (run->hypercall.nr == KVM_HC_MAP_GPA_RANGE)
> - return kvm_handle_hc_map_gpa_range(run);
> + return kvm_handle_hc_map_gpa_range(cpu, run);
>
> return -EINVAL;
> }
> @@ -6118,7 +6139,7 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
> break;
> #endif
> case KVM_EXIT_HYPERCALL:
> - ret = kvm_handle_hypercall(run);
> + ret = kvm_handle_hypercall(cpu, run);
> break;
> default:
> fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
>
> base-commit: 0f15892acaf3f50ecc20c6dad4b3ebdd701aa93e
* Re: [PATCH] i386/kvm: Prefault memory on page state change
2025-03-28 20:30 [PATCH] i386/kvm: Prefault memory on page state change Tom Lendacky
2025-06-02 13:17 ` Tom Lendacky
@ 2025-06-03 7:41 ` Xiaoyao Li
2025-06-03 11:47 ` Xiaoyao Li
From: Xiaoyao Li @ 2025-06-03 7:41 UTC (permalink / raw)
To: Tom Lendacky, qemu-devel, kvm
Cc: Paolo Bonzini, Marcelo Tosatti, Michael Roth
On 3/29/2025 4:30 AM, Tom Lendacky wrote:
> A page state change is typically followed by an access of the page(s) and
> results in another VMEXIT in order to map the page into the nested page
> table. Depending on the size of page state change request, this can
> generate a number of additional VMEXITs. For example, under SNP, when
> Linux is utilizing lazy memory acceptance, memory is typically accepted in
> 4M chunks. A page state change request is submitted to mark the pages as
> private, followed by validation of the memory. Since the guest_memfd
> currently only supports 4K pages, each page validation will result in
> VMEXIT to map the page, resulting in 1024 additional exits.
>
> When performing a page state change, invoke KVM_PRE_FAULT_MEMORY for the
> size of the page state change in order to pre-map the pages and avoid the
> additional VMEXITs. This helps speed up boot times.
Unfortunately, it breaks TDX guests.
  kvm_hc_map_gpa_range gpa 0x80000000 size 0x200000 attributes 0x0 flags 0x1
A TDX guest uses MAPGPA to map the range [0x80000000, +0x200000] to shared.
The call to KVM_PRE_FAULT_MEMORY on such a range leads to the TD being
marked as bugged:
[353467.266761] WARNING: CPU: 109 PID: 295970 at
arch/x86/kvm/mmu/tdp_mmu.c:674
tdp_mmu_map_handle_target_level+0x301/0x460 [kvm]
[353472.621399] WARNING: CPU: 109 PID: 295970 at
arch/x86/kvm/../../../virt/kvm/kvm_main.c:4281
kvm_vcpu_pre_fault_memory+0x167/0x1a0 [kvm]
It seems the pre-map of the non-MR-backed range has an issue, but I'm
still debugging it to understand the root cause.
* Re: [PATCH] i386/kvm: Prefault memory on page state change
2025-06-03 7:41 ` Xiaoyao Li
@ 2025-06-03 11:47 ` Xiaoyao Li
2025-06-03 13:40 ` Tom Lendacky
2025-06-03 15:00 ` Paolo Bonzini
From: Xiaoyao Li @ 2025-06-03 11:47 UTC (permalink / raw)
To: Tom Lendacky, qemu-devel, kvm
Cc: Paolo Bonzini, Marcelo Tosatti, Michael Roth
On 6/3/2025 3:41 PM, Xiaoyao Li wrote:
> On 3/29/2025 4:30 AM, Tom Lendacky wrote:
>> A page state change is typically followed by an access of the page(s) and
>> results in another VMEXIT in order to map the page into the nested page
>> table. Depending on the size of page state change request, this can
>> generate a number of additional VMEXITs. For example, under SNP, when
>> Linux is utilizing lazy memory acceptance, memory is typically
>> accepted in
>> 4M chunks. A page state change request is submitted to mark the pages as
>> private, followed by validation of the memory. Since the guest_memfd
>> currently only supports 4K pages, each page validation will result in
>> VMEXIT to map the page, resulting in 1024 additional exits.
>>
>> When performing a page state change, invoke KVM_PRE_FAULT_MEMORY for the
>> size of the page state change in order to pre-map the pages and avoid the
>> additional VMEXITs. This helps speed up boot times.
>
> Unfortunately, it breaks TDX guest.
>
> kvm_hc_map_gpa_range gpa 0x80000000 size 0x200000 attributes 0x0
> flags 0x1
>
> For TDX guest, it uses MAPGPA to maps the range [0x8000 0000,
> +0x0x200000] to shared. The call of KVM_PRE_FAULT_MEMORY on such range
> leads to the TD being marked as bugged
>
> [353467.266761] WARNING: CPU: 109 PID: 295970 at arch/x86/kvm/mmu/
> tdp_mmu.c:674 tdp_mmu_map_handle_target_level+0x301/0x460 [kvm]
It turns out to be a KVM bug.
The gpa passed in via KVM_PRE_FAULT_MEMORY, i.e. range->gpa, carries no
indication of shared vs. private. KVM passes range->gpa directly to
kvm_tdp_map_page() in kvm_arch_vcpu_pre_fault_memory(), where it is then
assigned to fault.addr.
However, fault.addr is supposed to be the gpa of a real access by the TDX
guest, which means it needs to have the shared bit set if the mapping is
for shared access. tdp_mmu_get_root_for_fault() uses it to determine which
root to use.
In this case, the pre-fault is on shared memory, while fault.addr leads to
the mirror_root, which is for private memory. Thus it triggers the
KVM_BUG_ON().
> [353472.621399] WARNING: CPU: 109 PID: 295970 at arch/x86/kvm/../../../
> virt/kvm/kvm_main.c:4281 kvm_vcpu_pre_fault_memory+0x167/0x1a0 [kvm]
>
>
> It seems the pre map on the non MR back'ed range has issue. But I'm
> still debugging it to understand the root cause.
>
>
* Re: [PATCH] i386/kvm: Prefault memory on page state change
2025-06-03 11:47 ` Xiaoyao Li
@ 2025-06-03 13:40 ` Tom Lendacky
2025-06-03 15:00 ` Paolo Bonzini
From: Tom Lendacky @ 2025-06-03 13:40 UTC (permalink / raw)
To: Xiaoyao Li, qemu-devel, kvm; +Cc: Paolo Bonzini, Marcelo Tosatti, Michael Roth
On 6/3/25 06:47, Xiaoyao Li wrote:
> On 6/3/2025 3:41 PM, Xiaoyao Li wrote:
>> On 3/29/2025 4:30 AM, Tom Lendacky wrote:
>>> A page state change is typically followed by an access of the page(s) and
>>> results in another VMEXIT in order to map the page into the nested page
>>> table. Depending on the size of page state change request, this can
>>> generate a number of additional VMEXITs. For example, under SNP, when
>>> Linux is utilizing lazy memory acceptance, memory is typically accepted in
>>> 4M chunks. A page state change request is submitted to mark the pages as
>>> private, followed by validation of the memory. Since the guest_memfd
>>> currently only supports 4K pages, each page validation will result in
>>> VMEXIT to map the page, resulting in 1024 additional exits.
>>>
>>> When performing a page state change, invoke KVM_PRE_FAULT_MEMORY for the
>>> size of the page state change in order to pre-map the pages and avoid the
>>> additional VMEXITs. This helps speed up boot times.
>>
>> Unfortunately, it breaks TDX guest.
>>
>> kvm_hc_map_gpa_range gpa 0x80000000 size 0x200000 attributes 0x0
>> flags 0x1
>>
>> For TDX guest, it uses MAPGPA to maps the range [0x8000 0000,
>> +0x0x200000] to shared. The call of KVM_PRE_FAULT_MEMORY on such range
>> leads to the TD being marked as bugged
>>
>> [353467.266761] WARNING: CPU: 109 PID: 295970 at arch/x86/kvm/mmu/
>> tdp_mmu.c:674 tdp_mmu_map_handle_target_level+0x301/0x460 [kvm]
>
> It turns out to be a KVM bug.
>
> The gpa passed in in KVM_PRE_FAULT_MEMORY, i.e., range->gpa has no
> indication for share vs. private. KVM directly passes range->gpa to
> kvm_tdp_map_page() in kvm_arch_vcpu_pre_fault_memory(), which is then
> assigned to fault.addr
>
> However, fault.addr is supposed to be a gpa of real access in TDX guest,
> which means it needs to have shared bit set if the map is for shared
> access, for TDX case. tdp_mmu_get_root_for_fault() will use it to
> determine which root to be used.
>
> For this case, the pre fault is on the shared memory, while the fault.addr
> leads to mirror_root which is for private memory. Thus it triggers
> KVM_BUG_ON().
Is this something that can be fixed in KVM (determine if the range is
private or shared) or does the call to KVM_PRE_FAULT_MEMORY require
modification in some way that works for both TDX and SNP?
Thanks,
Tom
>
>
>> [353472.621399] WARNING: CPU: 109 PID: 295970 at arch/x86/kvm/../../../
>> virt/kvm/kvm_main.c:4281 kvm_vcpu_pre_fault_memory+0x167/0x1a0 [kvm]
>>
>>
>> It seems the pre map on the non MR back'ed range has issue. But I'm
>> still debugging it to understand the root cause.
>>
>>
>
* Re: [PATCH] i386/kvm: Prefault memory on page state change
2025-06-03 11:47 ` Xiaoyao Li
2025-06-03 13:40 ` Tom Lendacky
@ 2025-06-03 15:00 ` Paolo Bonzini
2025-06-03 15:31 ` Xiaoyao Li
2025-06-03 16:12 ` Xiaoyao Li
From: Paolo Bonzini @ 2025-06-03 15:00 UTC (permalink / raw)
To: Xiaoyao Li, Tom Lendacky, qemu-devel, kvm; +Cc: Marcelo Tosatti, Michael Roth
On 6/3/25 13:47, Xiaoyao Li wrote:
> On 6/3/2025 3:41 PM, Xiaoyao Li wrote:
>> On 3/29/2025 4:30 AM, Tom Lendacky wrote:
>>> A page state change is typically followed by an access of the page(s)
>>> and
>>> results in another VMEXIT in order to map the page into the nested page
>>> table. Depending on the size of page state change request, this can
>>> generate a number of additional VMEXITs. For example, under SNP, when
>>> Linux is utilizing lazy memory acceptance, memory is typically
>>> accepted in
>>> 4M chunks. A page state change request is submitted to mark the pages as
>>> private, followed by validation of the memory. Since the guest_memfd
>>> currently only supports 4K pages, each page validation will result in
>>> VMEXIT to map the page, resulting in 1024 additional exits.
>>>
>>> When performing a page state change, invoke KVM_PRE_FAULT_MEMORY for the
>>> size of the page state change in order to pre-map the pages and avoid
>>> the
>>> additional VMEXITs. This helps speed up boot times.
>>
>> Unfortunately, it breaks TDX guest.
>>
>> kvm_hc_map_gpa_range gpa 0x80000000 size 0x200000 attributes 0x0
>> flags 0x1
>>
>> For TDX guest, it uses MAPGPA to maps the range [0x8000 0000,
>> +0x0x200000] to shared. The call of KVM_PRE_FAULT_MEMORY on such range
>> leads to the TD being marked as bugged
>>
>> [353467.266761] WARNING: CPU: 109 PID: 295970 at arch/x86/kvm/mmu/
>> tdp_mmu.c:674 tdp_mmu_map_handle_target_level+0x301/0x460 [kvm]
>
> It turns out to be a KVM bug.
>
> The gpa passed in in KVM_PRE_FAULT_MEMORY, i.e., range->gpa has no
> indication for share vs. private. KVM directly passes range->gpa to
> kvm_tdp_map_page() in kvm_arch_vcpu_pre_fault_memory(), which is then
> assigned to fault.addr
>
> However, fault.addr is supposed to be a gpa of real access in TDX guest,
> which means it needs to have shared bit set if the map is for shared
> access, for TDX case. tdp_mmu_get_root_for_fault() will use it to
> determine which root to be used.
>
> For this case, the pre fault is on the shared memory, while the
> fault.addr leads to mirror_root which is for private memory. Thus it
> triggers KVM_BUG_ON().
So this would fix it?
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 7b3f1783ab3c..66f96476fade 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4895,6 +4895,7 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
{
u64 error_code = PFERR_GUEST_FINAL_MASK;
u8 level = PG_LEVEL_4K;
+ u64 direct_bits;
u64 end;
int r;
@@ -4909,15 +4910,18 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
if (r)
return r;
+ direct_bits = 0;
if (kvm_arch_has_private_mem(vcpu->kvm) &&
kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
error_code |= PFERR_PRIVATE_ACCESS;
+ else
+ direct_bits = kvm_gfn_direct_bits(vcpu->kvm);
/*
* Shadow paging uses GVA for kvm page fault, so restrict to
* two-dimensional paging.
*/
- r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
+ r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level);
if (r < 0)
return r;
I'm applying Tom's patch to get it out of his queue, but will delay sending
a pull request until the Linux-side fix is accepted.
Paolo
* Re: [PATCH] i386/kvm: Prefault memory on page state change
2025-06-03 15:00 ` Paolo Bonzini
@ 2025-06-03 15:31 ` Xiaoyao Li
2025-06-03 16:12 ` Xiaoyao Li
From: Xiaoyao Li @ 2025-06-03 15:31 UTC (permalink / raw)
To: Paolo Bonzini, Tom Lendacky, qemu-devel, kvm
Cc: Marcelo Tosatti, Michael Roth
On 6/3/2025 11:00 PM, Paolo Bonzini wrote:
> On 6/3/25 13:47, Xiaoyao Li wrote:
>> On 6/3/2025 3:41 PM, Xiaoyao Li wrote:
>>> On 3/29/2025 4:30 AM, Tom Lendacky wrote:
>>>> A page state change is typically followed by an access of the
>>>> page(s) and
>>>> results in another VMEXIT in order to map the page into the nested page
>>>> table. Depending on the size of page state change request, this can
>>>> generate a number of additional VMEXITs. For example, under SNP, when
>>>> Linux is utilizing lazy memory acceptance, memory is typically
>>>> accepted in
>>>> 4M chunks. A page state change request is submitted to mark the
>>>> pages as
>>>> private, followed by validation of the memory. Since the guest_memfd
>>>> currently only supports 4K pages, each page validation will result in
>>>> VMEXIT to map the page, resulting in 1024 additional exits.
>>>>
>>>> When performing a page state change, invoke KVM_PRE_FAULT_MEMORY for
>>>> the
>>>> size of the page state change in order to pre-map the pages and
>>>> avoid the
>>>> additional VMEXITs. This helps speed up boot times.
>>>
>>> Unfortunately, it breaks TDX guest.
>>>
>>> kvm_hc_map_gpa_range gpa 0x80000000 size 0x200000 attributes 0x0
>>> flags 0x1
>>>
>>> For TDX guest, it uses MAPGPA to maps the range [0x8000 0000,
>>> +0x0x200000] to shared. The call of KVM_PRE_FAULT_MEMORY on such
>>> range leads to the TD being marked as bugged
>>>
>>> [353467.266761] WARNING: CPU: 109 PID: 295970 at arch/x86/kvm/mmu/
>>> tdp_mmu.c:674 tdp_mmu_map_handle_target_level+0x301/0x460 [kvm]
>>
>> It turns out to be a KVM bug.
>>
>> The gpa passed in in KVM_PRE_FAULT_MEMORY, i.e., range->gpa has no
>> indication for share vs. private. KVM directly passes range->gpa to
>> kvm_tdp_map_page() in kvm_arch_vcpu_pre_fault_memory(), which is then
>> assigned to fault.addr
>>
>> However, fault.addr is supposed to be a gpa of real access in TDX
>> guest, which means it needs to have shared bit set if the map is for
>> shared access, for TDX case. tdp_mmu_get_root_for_fault() will use it
>> to determine which root to be used.
>>
>> For this case, the pre fault is on the shared memory, while the
>> fault.addr leads to mirror_root which is for private memory. Thus it
>> triggers KVM_BUG_ON().
> So this would fix it?
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 7b3f1783ab3c..66f96476fade 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -4895,6 +4895,7 @@ long kvm_arch_vcpu_pre_fault_memory(struct
> kvm_vcpu *vcpu,
> {
> u64 error_code = PFERR_GUEST_FINAL_MASK;
> u8 level = PG_LEVEL_4K;
> + u64 direct_bits;
> u64 end;
> int r;
>
> @@ -4909,15 +4910,18 @@ long kvm_arch_vcpu_pre_fault_memory(struct
> kvm_vcpu *vcpu,
> if (r)
> return r;
>
> + direct_bits = 0;
> if (kvm_arch_has_private_mem(vcpu->kvm) &&
> kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
> error_code |= PFERR_PRIVATE_ACCESS;
> + else
> + direct_bits = kvm_gfn_direct_bits(vcpu->kvm);
should be
direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
>
> /*
> * Shadow paging uses GVA for kvm page fault, so restrict to
> * two-dimensional paging.
> */
> - r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
> + r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code,
> &level);
> if (r < 0)
> return r;
>
>
>
> I'm applying Tom's patch to get it out of his queue, but will delay sending
> a pull request until the Linux-side fix is accepted.
With the above-mentioned change, it fixes the issue.
I synced with Yan offline, and our fix is:
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 52acf99d40a0..209103bf0f30 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -48,7 +48,7 @@ static inline enum kvm_tdp_mmu_root_types kvm_gfn_range_filter_to_root_types(str
 static inline struct kvm_mmu_page *tdp_mmu_get_root_for_fault(struct kvm_vcpu *vcpu,
                                                               struct kvm_page_fault *fault)
 {
-        if (unlikely(!kvm_is_addr_direct(vcpu->kvm, fault->addr)))
+        if (unlikely(fault->is_private))
                 return root_to_sp(vcpu->arch.mmu->mirror_root_hpa);
         return root_to_sp(vcpu->arch.mmu->root.hpa);
> Paolo
>
>
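
For reference, a small self-contained sketch of the distinction behind the gfn_to_gpa() correction above: kvm_gfn_direct_bits() returns a mask in gfn space, while range->gpa is a gpa, so the mask has to be shifted left by PAGE_SHIFT before it can be OR'ed into the address. The shared-bit position used below (GPA bit 51) is only a hypothetical value for illustration; the real position depends on the TD's configured GPA width.

    /* Minimal stand-ins for the kernel's gfn/gpa helpers. */
    typedef unsigned long long gfn_t;
    typedef unsigned long long gpa_t;
    #define PAGE_SHIFT 12

    static inline gpa_t gfn_to_gpa(gfn_t gfn)
    {
        return (gpa_t)gfn << PAGE_SHIFT;   /* same shift as include/linux/kvm_host.h */
    }

    int main(void)
    {
        /* Hypothetical TDX shared bit at GPA bit 51, i.e. gfn bit 39. */
        gfn_t direct_gfn_bits = 1ULL << (51 - PAGE_SHIFT);
        gpa_t gpa = 0x80000000ULL;                        /* range from the MAPGPA trace */

        gpa_t wrong = gpa | direct_gfn_bits;              /* ORs in gpa bit 39: bogus    */
        gpa_t right = gpa | gfn_to_gpa(direct_gfn_bits);  /* ORs in gpa bit 51: shared   */

        return wrong == right;   /* 0: they differ, which is the point of the fix */
    }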
* Re: [PATCH] i386/kvm: Prefault memory on page state change
2025-06-03 15:00 ` Paolo Bonzini
2025-06-03 15:31 ` Xiaoyao Li
@ 2025-06-03 16:12 ` Xiaoyao Li
From: Xiaoyao Li @ 2025-06-03 16:12 UTC (permalink / raw)
To: Paolo Bonzini, Tom Lendacky, qemu-devel, kvm
Cc: Marcelo Tosatti, Michael Roth
On 6/3/2025 11:00 PM, Paolo Bonzini wrote:
> I'm applying Tom's patch to get it out of his queue, but will delay sending
> a pull request until the Linux-side fix is accepted.
BTW, for the patch itself.
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Tested-by: Xiaoyao Li <xiaoyao.li@intel.com>