public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] KVM: x86: Add support for cmpxchg16b emulation
@ 2026-03-06 10:20 Sairaj Kodilkar
  2026-03-06 15:38 ` Sean Christopherson
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Sairaj Kodilkar @ 2026-03-06 10:20 UTC (permalink / raw)
  To: H. Peter Anvin, Borislav Petkov, Dave Hansen, Ingo Molnar,
	Paolo Bonzini, Sean Christopherson, Thomas Gleixner, kvm,
	linux-kernel, x86
  Cc: suravee.suthikulpanit, vasant.hegde, nikunj.dadhania,
	Manali.Shukla, Sairaj Kodilkar

AMD and Intel both provides support for 128 bit cmpxchg operands using
cmpxchg8b/cmpxchg16b instructions (opcode 0FC7). However, kvm does not
support emulating cmpxchg16b (i.e when destination memory is 128 bit and
REX.W = 1) which causes emulation failure when QEMU guest performs a
cmpxchg16b on a memory region setup as a IO.

Hence extend cmpxchg8b to perform cmpxchg16b when the destination memory
is 128 bit.

Signed-off-by: Sairaj Kodilkar <sarunkod@amd.com>
---
Background:

The AMD IOMMU driver writes 256-bit device table entries with two
128-bit cmpxchg operations. For guests using hardware-accelerated
vIOMMU (still in progress), QEMU traps device table accesses to set up
nested page tables. Without 128-bit cmpxchg emulation, KVM cannot
handle these traps and DTE access emulation fails.

QEMU implementation that traps DTE accesses:
https://github.com/AMDESE/qemu-iommu/blob/wip/for_iommufd_hw_queue-v8_amd_viommu_20260106/hw/i386/amd_viommu.c#L517
---
 arch/x86/kvm/emulate.c     | 48 +++++++++++++++++++++++++++++++-------
 arch/x86/kvm/kvm_emulate.h |  1 +
 2 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c8e292e9a24d..e1a08cd3274b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2188,17 +2188,18 @@ static int em_call_near_abs(struct x86_emulate_ctxt *ctxt)
 	return rc;
 }
 
-static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
+static int __handle_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
 {
-	u64 old = ctxt->dst.orig_val64;
+	u64 old64 = ctxt->dst.orig_val64;
 
-	if (ctxt->dst.bytes == 16)
+	/* Use of the REX.W prefix promotes operation to 128 bits */
+	if (ctxt->rex_bits & REX_W)
 		return X86EMUL_UNHANDLEABLE;
 
-	if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
-	    ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
-		*reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
-		*reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
+	if (((u32) (old64 >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
+	    ((u32) (old64 >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
+		*reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old64 >> 0);
+		*reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old64 >> 32);
 		ctxt->eflags &= ~X86_EFLAGS_ZF;
 	} else {
 		ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
@@ -2209,6 +2210,37 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int __handle_cmpxchg16b(struct x86_emulate_ctxt *ctxt)
+{
+	__uint128_t old128 = ctxt->dst.val128;
+
+	/* Use of the REX.W prefix promotes operation to 128 bits */
+	if (!(ctxt->rex_bits & REX_W))
+		return X86EMUL_UNHANDLEABLE;
+
+	if (((u64) (old128 >> 0) != (u64) reg_read(ctxt, VCPU_REGS_RAX)) ||
+	    ((u64) (old128 >> 64) != (u64) reg_read(ctxt, VCPU_REGS_RDX))) {
+		*reg_write(ctxt, VCPU_REGS_RAX) = (u64) (old128 >> 0);
+		*reg_write(ctxt, VCPU_REGS_RDX) = (u64) (old128 >> 64);
+		ctxt->eflags &= ~X86_EFLAGS_ZF;
+	} else {
+		ctxt->dst.val128 =
+			((__uint128_t) reg_read(ctxt, VCPU_REGS_RCX) << 64) |
+			(u64) reg_read(ctxt, VCPU_REGS_RBX);
+
+		ctxt->eflags |= X86_EFLAGS_ZF;
+	}
+	return X86EMUL_CONTINUE;
+}
+
+static int em_cmpxchgxb(struct x86_emulate_ctxt *ctxt)
+{
+	if (ctxt->dst.bytes == 16)
+		return __handle_cmpxchg16b(ctxt);
+
+	return __handle_cmpxchg8b(ctxt);
+}
+
 static int em_ret(struct x86_emulate_ctxt *ctxt)
 {
 	int rc;
@@ -4097,7 +4129,7 @@ static const struct gprefix pfx_0f_c7_7 = {
 
 
 static const struct group_dual group9 = { {
-	N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
+	N, I(DstMem64 | Lock | PageTable, em_cmpxchgxb), N, N, N, N, N, N,
 }, {
 	N, N, N, N, N, N, N,
 	GP(0, &pfx_0f_c7_7),
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index fb3dab4b5a53..a1e95c57d40e 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -268,6 +268,7 @@ struct operand {
 	union {
 		unsigned long val;
 		u64 val64;
+		__uint128_t val128;
 		char valptr[sizeof(avx256_t)];
 		sse128_t vec_val;
 		avx256_t vec_val2;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] KVM: x86: Add support for cmpxchg16b emulation
  2026-03-06 10:20 [PATCH] KVM: x86: Add support for cmpxchg16b emulation Sairaj Kodilkar
@ 2026-03-06 15:38 ` Sean Christopherson
  2026-03-11 11:18   ` Sairaj Kodilkar
  2026-03-12 11:46   ` Paolo Bonzini
  2026-03-07  2:39 ` kernel test robot
  2026-03-07  5:40 ` kernel test robot
  2 siblings, 2 replies; 6+ messages in thread
From: Sean Christopherson @ 2026-03-06 15:38 UTC (permalink / raw)
  To: Sairaj Kodilkar
  Cc: H. Peter Anvin, Borislav Petkov, Dave Hansen, Ingo Molnar,
	Paolo Bonzini, Thomas Gleixner, kvm, linux-kernel, x86,
	suravee.suthikulpanit, vasant.hegde, nikunj.dadhania,
	Manali.Shukla

On Fri, Mar 06, 2026, Sairaj Kodilkar wrote:
> AMD and Intel both provides support for 128 bit cmpxchg operands using
> cmpxchg8b/cmpxchg16b instructions (opcode 0FC7). However, kvm does not
> support emulating cmpxchg16b (i.e when destination memory is 128 bit and
> REX.W = 1) which causes emulation failure when QEMU guest performs a
> cmpxchg16b on a memory region setup as a IO.
> 
> Hence extend cmpxchg8b to perform cmpxchg16b when the destination memory
> is 128 bit.
> 
> Signed-off-by: Sairaj Kodilkar <sarunkod@amd.com>
> ---
> Background:
> 
> The AMD IOMMU driver writes 256-bit device table entries with two
> 128-bit cmpxchg operations. For guests using hardware-accelerated
> vIOMMU (still in progress), QEMU traps device table accesses to set up
> nested page tables. Without 128-bit cmpxchg emulation, KVM cannot
> handle these traps and DTE access emulation fails.

Please put this paragraph in the changelog proper, the "why" matters greatly here.

> QEMU implementation that traps DTE accesses:
> https://github.com/AMDESE/qemu-iommu/blob/wip/for_iommufd_hw_queue-v8_amd_viommu_20260106/hw/i386/amd_viommu.c#L517
> ---
>  arch/x86/kvm/emulate.c     | 48 +++++++++++++++++++++++++++++++-------
>  arch/x86/kvm/kvm_emulate.h |  1 +
>  2 files changed, 41 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index c8e292e9a24d..e1a08cd3274b 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -2188,17 +2188,18 @@ static int em_call_near_abs(struct x86_emulate_ctxt *ctxt)
>  	return rc;
>  }
>  
> -static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
> +static int __handle_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
>  {
> -	u64 old = ctxt->dst.orig_val64;
> +	u64 old64 = ctxt->dst.orig_val64;
>  
> -	if (ctxt->dst.bytes == 16)
> +	/* Use of the REX.W prefix promotes operation to 128 bits */
> +	if (ctxt->rex_bits & REX_W)

Uh, yeah, and the caller specifically pivoted on this size.  I'm a-ok with a
sanity check, but it should be exactly that, e.g.

	if (WARN_ON_ONCE(8 + (ctxt->rex_bits & REX_W) * 8 != ctxt->dst.bytes))
		return X86EMUL_UNHANDLEABLE;

>  		return X86EMUL_UNHANDLEABLE;
>  
> -	if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
> -	    ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
> -		*reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
> -		*reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
> +	if (((u32) (old64 >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
> +	    ((u32) (old64 >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
> +		*reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old64 >> 0);
> +		*reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old64 >> 32);
>  		ctxt->eflags &= ~X86_EFLAGS_ZF;
>  	} else {
>  		ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
> @@ -2209,6 +2210,37 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
>  	return X86EMUL_CONTINUE;
>  }
>  
> +static int __handle_cmpxchg16b(struct x86_emulate_ctxt *ctxt)
> +{
> +	__uint128_t old128 = ctxt->dst.val128;

There is zero chance you properly tested this patch.  Please write comprehensive
testcases in KUT's emulator64.c before posting the next version.

In writeback(), the OP_MEM case quite clearly operates on "unsigned long" values
*and* consumes orig_val in the locked case.

	case OP_MEM:
		if (ctxt->lock_prefix)
			return segmented_cmpxchg(ctxt,
						 op->addr.mem,
						 &op->orig_val,
						 &op->val,
						 op->bytes);
		else
			return segmented_write(ctxt,
					       op->addr.mem,
					       &op->val,
					       op->bytes);

And then emulator_cmpxchg_emulated() should be taught to do cmpxchg16b itself:

	/* guests cmpxchg8b have to be emulated atomically */
	if (bytes > 8 || (bytes & (bytes - 1)))
		goto emul_write;

That might be a big lift, e.g. to get a 16-byte uaccess version.  If it's
unreasonably difficult, we should reject emulation of LOCK CMPXCHG16B.

And this code would also need to be updated to copy the full 128-bit value.

	/* Copy full 64-bit value for CMPXCHG8B.  */
	ctxt->dst.orig_val64 = ctxt->dst.val64;

Maybe something like this?  Or maybe just copy 128 bits unconditionally (I'm not
sure if that could read uninitiatlized data or not).

	/* Copy the full 64/128-bit value for CMPXCHG8B/16B.  */
	if (IS_ENABLED(CONFIG_X86_64) && ctxt->dst.bytes == 16)
		ctxt->dst.orig_val128 = ctxt->dst.val128;
	else
		ctxt->dst.orig_val64 = ctxt->dst.val64;

> +
> +	/* Use of the REX.W prefix promotes operation to 128 bits */
> +	if (!(ctxt->rex_bits & REX_W))
> +		return X86EMUL_UNHANDLEABLE;
> +
> +	if (((u64) (old128 >> 0) != (u64) reg_read(ctxt, VCPU_REGS_RAX)) ||
> +	    ((u64) (old128 >> 64) != (u64) reg_read(ctxt, VCPU_REGS_RDX))) {
> +		*reg_write(ctxt, VCPU_REGS_RAX) = (u64) (old128 >> 0);
> +		*reg_write(ctxt, VCPU_REGS_RDX) = (u64) (old128 >> 64);
> +		ctxt->eflags &= ~X86_EFLAGS_ZF;
> +	} else {
> +		ctxt->dst.val128 =
> +			((__uint128_t) reg_read(ctxt, VCPU_REGS_RCX) << 64) |
> +			(u64) reg_read(ctxt, VCPU_REGS_RBX);
> +
> +		ctxt->eflags |= X86_EFLAGS_ZF;

IMO we should use a macro to handle 8b vs. 16b.  The code is going to be heinous
no matter what, and so to me, duplicating everything makes it twice as ugly,
whereas a macro makes it like 10% more ugly.

> +	}
> +	return X86EMUL_CONTINUE;
> +}
> +
> +static int em_cmpxchgxb(struct x86_emulate_ctxt *ctxt)
> +{
> +	if (ctxt->dst.bytes == 16)
> +		return __handle_cmpxchg16b(ctxt);
> +
> +	return __handle_cmpxchg8b(ctxt);
> +}
> +
>  static int em_ret(struct x86_emulate_ctxt *ctxt)
>  {
>  	int rc;
> @@ -4097,7 +4129,7 @@ static const struct gprefix pfx_0f_c7_7 = {
>  
>  
>  static const struct group_dual group9 = { {
> -	N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
> +	N, I(DstMem64 | Lock | PageTable, em_cmpxchgxb), N, N, N, N, N, N,

Eh, I vote to keep it em_cmpxchg8b.  _If_ we want to capture that its size is
variable, maybe em_cmpxchg8b_16b?  em_cmpxchgxb just looks like a typo.

Sans correct writeback() handling, something like so:

---
 arch/x86/kvm/emulate.c     | 44 ++++++++++++++++++++++++--------------
 arch/x86/kvm/kvm_emulate.h |  2 ++
 2 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 6145dac4a605..3ec4555571fa 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2201,24 +2201,33 @@ static int em_call_near_abs(struct x86_emulate_ctxt *ctxt)
 	return rc;
 }
 
+#define em_cmpxchg8b_16b(__c, rbits, mbits)						  \
+do {											  \
+	u##mbits old = __c->dst.orig_val##mbits;					  \
+											  \
+	BUILD_BUG_ON(rbits * 2 != mbits);						  \
+											  \
+	if (((u##rbits)(old >> 0)     != (u##rbits)reg_read(__c, VCPU_REGS_RAX)) ||	  \
+	    ((u##rbits)(old >> rbits) != (u##rbits)reg_read(__c, VCPU_REGS_RDX))) {	  \
+		*reg_write(__c, VCPU_REGS_RAX) = (u##rbits)(old >> 0);			  \
+		*reg_write(__c, VCPU_REGS_RDX) = (u##rbits)(old >> rbits);		  \
+		__c->eflags &= ~X86_EFLAGS_ZF;						  \
+	} else {									  \
+		__c->dst.val##mbits = ((u##mbits)reg_read(__c, VCPU_REGS_RCX) << rbits) | \
+				       (u##rbits)reg_read(__c, VCPU_REGS_RBX);		  \
+		__c->eflags |= X86_EFLAGS_ZF;						  \
+	}										  \
+} while (0)
+
 static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
 {
-	u64 old = ctxt->dst.orig_val64;
-
-	if (ctxt->dst.bytes == 16)
+	if (WARN_ON_ONCE(8 + (ctxt->rex_bits & REX_W) * 8 != ctxt->dst.bytes))
 		return X86EMUL_UNHANDLEABLE;
 
-	if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
-	    ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
-		*reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
-		*reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
-		ctxt->eflags &= ~X86_EFLAGS_ZF;
-	} else {
-		ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
-			(u32) reg_read(ctxt, VCPU_REGS_RBX);
-
-		ctxt->eflags |= X86_EFLAGS_ZF;
-	}
+	if (!(ctxt->rex_bits & REX_W))
+		em_cmpxchg8b_16b(ctxt, 32, 64);
+	else
+		em_cmpxchg8b_16b(ctxt, 64, 128);
 	return X86EMUL_CONTINUE;
 }
 
@@ -5418,8 +5427,11 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts)
 			goto done;
 		}
 	}
-	/* Copy full 64-bit value for CMPXCHG8B.  */
-	ctxt->dst.orig_val64 = ctxt->dst.val64;
+	/* Copy the full 64/128-bit value for CMPXCHG8B/16B.  */
+	if (IS_ENABLED(CONFIG_X86_64) && ctxt->dst.bytes == 16)
+		ctxt->dst.orig_val128 = ctxt->dst.val128;
+	else
+		ctxt->dst.orig_val64 = ctxt->dst.val64;
 
 special_insn:
 
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index fb3dab4b5a53..0e9968319343 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -255,6 +255,7 @@ struct operand {
 	union {
 		unsigned long orig_val;
 		u64 orig_val64;
+		u64 orig_val128;
 	};
 	union {
 		unsigned long *reg;
@@ -268,6 +269,7 @@ struct operand {
 	union {
 		unsigned long val;
 		u64 val64;
+		__uint128_t val128;
 		char valptr[sizeof(avx256_t)];
 		sse128_t vec_val;
 		avx256_t vec_val2;

base-commit: 5128b972fb2801ad9aca54d990a75611ab5283a9
--

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] KVM: x86: Add support for cmpxchg16b emulation
  2026-03-06 10:20 [PATCH] KVM: x86: Add support for cmpxchg16b emulation Sairaj Kodilkar
  2026-03-06 15:38 ` Sean Christopherson
@ 2026-03-07  2:39 ` kernel test robot
  2026-03-07  5:40 ` kernel test robot
  2 siblings, 0 replies; 6+ messages in thread
From: kernel test robot @ 2026-03-07  2:39 UTC (permalink / raw)
  To: Sairaj Kodilkar, H. Peter Anvin, Borislav Petkov, Dave Hansen,
	Ingo Molnar, Paolo Bonzini, Sean Christopherson, Thomas Gleixner,
	kvm, linux-kernel, x86
  Cc: oe-kbuild-all, suravee.suthikulpanit, vasant.hegde,
	nikunj.dadhania, Manali.Shukla, Sairaj Kodilkar

Hi Sairaj,

kernel test robot noticed the following build warnings:

[auto build test WARNING on kvm/queue]
[also build test WARNING on kvm/next tip/master linus/master v7.0-rc2 next-20260306]
[cannot apply to kvm/linux-next tip/auto-latest]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Sairaj-Kodilkar/KVM-x86-Add-support-for-cmpxchg16b-emulation/20260306-182915
base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
patch link:    https://lore.kernel.org/r/20260306102047.29760-1-sarunkod%40amd.com
patch subject: [PATCH] KVM: x86: Add support for cmpxchg16b emulation
config: i386-randconfig-005-20260307 (https://download.01.org/0day-ci/archive/20260307/202603071055.Wi43mREW-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.4.0-5) 12.4.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260307/202603071055.Wi43mREW-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202603071055.Wi43mREW-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from arch/x86/kvm/emulate.c:24:
   arch/x86/kvm/kvm_emulate.h:271:17: error: unknown type name '__uint128_t'
     271 |                 __uint128_t val128;
         |                 ^~~~~~~~~~~
   arch/x86/kvm/emulate.c: In function '__handle_cmpxchg16b':
   arch/x86/kvm/emulate.c:2215:9: error: unknown type name '__uint128_t'; did you mean '__int128__'?
    2215 |         __uint128_t old128 = ctxt->dst.val128;
         |         ^~~~~~~~~~~
         |         __int128__
>> arch/x86/kvm/emulate.c:2222:28: warning: right shift count >= width of type [-Wshift-count-overflow]
    2222 |             ((u64) (old128 >> 64) != (u64) reg_read(ctxt, VCPU_REGS_RDX))) {
         |                            ^~
   arch/x86/kvm/emulate.c:2224:65: warning: right shift count >= width of type [-Wshift-count-overflow]
    2224 |                 *reg_write(ctxt, VCPU_REGS_RDX) = (u64) (old128 >> 64);
         |                                                                 ^~
   arch/x86/kvm/emulate.c:2228:27: error: '__uint128_t' undeclared (first use in this function); did you mean '__int128__'?
    2228 |                         ((__uint128_t) reg_read(ctxt, VCPU_REGS_RCX) << 64) |
         |                           ^~~~~~~~~~~
         |                           __int128__
   arch/x86/kvm/emulate.c:2228:27: note: each undeclared identifier is reported only once for each function it appears in
   arch/x86/kvm/emulate.c:2228:39: error: expected ')' before 'reg_read'
    2228 |                         ((__uint128_t) reg_read(ctxt, VCPU_REGS_RCX) << 64) |
         |                         ~             ^~~~~~~~~
         |                                       )


vim +2222 arch/x86/kvm/emulate.c

  2212	
  2213	static int __handle_cmpxchg16b(struct x86_emulate_ctxt *ctxt)
  2214	{
  2215		__uint128_t old128 = ctxt->dst.val128;
  2216	
  2217		/* Use of the REX.W prefix promotes operation to 128 bits */
  2218		if (!(ctxt->rex_bits & REX_W))
  2219			return X86EMUL_UNHANDLEABLE;
  2220	
  2221		if (((u64) (old128 >> 0) != (u64) reg_read(ctxt, VCPU_REGS_RAX)) ||
> 2222		    ((u64) (old128 >> 64) != (u64) reg_read(ctxt, VCPU_REGS_RDX))) {
  2223			*reg_write(ctxt, VCPU_REGS_RAX) = (u64) (old128 >> 0);
  2224			*reg_write(ctxt, VCPU_REGS_RDX) = (u64) (old128 >> 64);
  2225			ctxt->eflags &= ~X86_EFLAGS_ZF;
  2226		} else {
  2227			ctxt->dst.val128 =
  2228				((__uint128_t) reg_read(ctxt, VCPU_REGS_RCX) << 64) |
  2229				(u64) reg_read(ctxt, VCPU_REGS_RBX);
  2230	
  2231			ctxt->eflags |= X86_EFLAGS_ZF;
  2232		}
  2233		return X86EMUL_CONTINUE;
  2234	}
  2235	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] KVM: x86: Add support for cmpxchg16b emulation
  2026-03-06 10:20 [PATCH] KVM: x86: Add support for cmpxchg16b emulation Sairaj Kodilkar
  2026-03-06 15:38 ` Sean Christopherson
  2026-03-07  2:39 ` kernel test robot
@ 2026-03-07  5:40 ` kernel test robot
  2 siblings, 0 replies; 6+ messages in thread
From: kernel test robot @ 2026-03-07  5:40 UTC (permalink / raw)
  To: Sairaj Kodilkar, H. Peter Anvin, Borislav Petkov, Dave Hansen,
	Ingo Molnar, Paolo Bonzini, Sean Christopherson, Thomas Gleixner,
	kvm, linux-kernel, x86
  Cc: llvm, oe-kbuild-all, suravee.suthikulpanit, vasant.hegde,
	nikunj.dadhania, Manali.Shukla, Sairaj Kodilkar

Hi Sairaj,

kernel test robot noticed the following build errors:

[auto build test ERROR on kvm/queue]
[also build test ERROR on kvm/next tip/master linus/master v7.0-rc2 next-20260306]
[cannot apply to kvm/linux-next tip/auto-latest bp/for-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Sairaj-Kodilkar/KVM-x86-Add-support-for-cmpxchg16b-emulation/20260306-182915
base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
patch link:    https://lore.kernel.org/r/20260306102047.29760-1-sarunkod%40amd.com
patch subject: [PATCH] KVM: x86: Add support for cmpxchg16b emulation
config: i386-randconfig-012-20260307 (https://download.01.org/0day-ci/archive/20260307/202603071358.8bRahq3i-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260307/202603071358.8bRahq3i-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202603071358.8bRahq3i-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from arch/x86/kvm/mmu/page_track.c:19:
   In file included from arch/x86/kvm/mmu.h:7:
   In file included from arch/x86/kvm/x86.h:10:
>> arch/x86/kvm/kvm_emulate.h:271:3: error: unknown type name '__uint128_t'
     271 |                 __uint128_t val128;
         |                 ^
   1 error generated.
--
   In file included from arch/x86/kvm/vmx/vmcs12.c:4:
   In file included from arch/x86/kvm/vmx/vmcs12.h:7:
   In file included from arch/x86/kvm/vmx/vmcs.h:12:
   In file included from arch/x86/kvm/vmx/capabilities.h:7:
   In file included from arch/x86/kvm/vmx/../lapic.h:11:
   In file included from arch/x86/kvm/vmx/../hyperv.h:25:
   In file included from arch/x86/kvm/vmx/../x86.h:10:
>> arch/x86/kvm/vmx/../kvm_emulate.h:271:3: error: unknown type name '__uint128_t'
     271 |                 __uint128_t val128;
         |                 ^
   1 error generated.
--
   In file included from arch/x86/kvm/emulate.c:24:
>> arch/x86/kvm/kvm_emulate.h:271:3: error: unknown type name '__uint128_t'
     271 |                 __uint128_t val128;
         |                 ^
>> arch/x86/kvm/emulate.c:393:12: error: no member named 'val' in 'struct operand'
     393 |         ctxt->dst.val = 0xFF * !!(ctxt->eflags & X86_EFLAGS_CF);
         |         ~~~~~~~~~ ^
>> arch/x86/kvm/emulate.c:423:27: error: no member named 'val64' in 'struct operand'
     423 |                 .src_val    = ctxt->src.val64,
         |                               ~~~~~~~~~ ^
   arch/x86/kvm/emulate.c:424:27: error: no member named 'val64' in 'struct operand'
     424 |                 .dst_val    = ctxt->dst.val64,
         |                               ~~~~~~~~~ ^
   arch/x86/kvm/emulate.c:955:1: error: no member named 'val' in 'struct operand'
     955 | EM_ASM_2(add);
         | ^~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:346:10: note: expanded from macro 'EM_ASM_2'
     346 |         case 1: __EM_ASM_2(op##b, al, dl); break; \
         |                 ^~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:305:3: note: expanded from macro '__EM_ASM_2'
     305 |                 __EM_ASM(#op " %%" #src ", %%" #dst " \n\t")
         |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:291:25: note: expanded from macro '__EM_ASM'
     291 |                     : "+a" (ctxt->dst.val), \
         |                             ~~~~~~~~~ ^
   arch/x86/kvm/emulate.c:955:1: error: no member named 'val' in 'struct operand'
     955 | EM_ASM_2(add);
         | ^~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:346:10: note: expanded from macro 'EM_ASM_2'
     346 |         case 1: __EM_ASM_2(op##b, al, dl); break; \
         |                 ^~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:305:3: note: expanded from macro '__EM_ASM_2'
     305 |                 __EM_ASM(#op " %%" #src ", %%" #dst " \n\t")
         |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:292:25: note: expanded from macro '__EM_ASM'
     292 |                       "+d" (ctxt->src.val), \
         |                             ~~~~~~~~~ ^
   arch/x86/kvm/emulate.c:955:1: error: no member named 'val' in 'struct operand'
     955 | EM_ASM_2(add);
         | ^~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:346:10: note: expanded from macro 'EM_ASM_2'
     346 |         case 1: __EM_ASM_2(op##b, al, dl); break; \
         |                 ^~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:305:3: note: expanded from macro '__EM_ASM_2'
     305 |                 __EM_ASM(#op " %%" #src ", %%" #dst " \n\t")
         |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:295:25: note: expanded from macro '__EM_ASM'
     295 |                     : "c" (ctxt->src2.val))
         |                            ~~~~~~~~~~ ^
   arch/x86/kvm/emulate.c:955:1: error: no member named 'val' in 'struct operand'
     955 | EM_ASM_2(add);
         | ^~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:347:10: note: expanded from macro 'EM_ASM_2'
     347 |         case 2: __EM_ASM_2(op##w, ax, dx); break; \
         |                 ^~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:305:3: note: expanded from macro '__EM_ASM_2'
     305 |                 __EM_ASM(#op " %%" #src ", %%" #dst " \n\t")
         |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:291:25: note: expanded from macro '__EM_ASM'
     291 |                     : "+a" (ctxt->dst.val), \
         |                             ~~~~~~~~~ ^
   arch/x86/kvm/emulate.c:955:1: error: no member named 'val' in 'struct operand'
     955 | EM_ASM_2(add);
         | ^~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:347:10: note: expanded from macro 'EM_ASM_2'
     347 |         case 2: __EM_ASM_2(op##w, ax, dx); break; \
         |                 ^~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:305:3: note: expanded from macro '__EM_ASM_2'
     305 |                 __EM_ASM(#op " %%" #src ", %%" #dst " \n\t")
         |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:292:25: note: expanded from macro '__EM_ASM'
     292 |                       "+d" (ctxt->src.val), \
         |                             ~~~~~~~~~ ^
   arch/x86/kvm/emulate.c:955:1: error: no member named 'val' in 'struct operand'
     955 | EM_ASM_2(add);
         | ^~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:347:10: note: expanded from macro 'EM_ASM_2'
     347 |         case 2: __EM_ASM_2(op##w, ax, dx); break; \
         |                 ^~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:305:3: note: expanded from macro '__EM_ASM_2'
     305 |                 __EM_ASM(#op " %%" #src ", %%" #dst " \n\t")
         |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:295:25: note: expanded from macro '__EM_ASM'
     295 |                     : "c" (ctxt->src2.val))
         |                            ~~~~~~~~~~ ^
   arch/x86/kvm/emulate.c:955:1: error: no member named 'val' in 'struct operand'
     955 | EM_ASM_2(add);
         | ^~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:348:10: note: expanded from macro 'EM_ASM_2'
     348 |         case 4: __EM_ASM_2(op##l, eax, edx); break; \
         |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:305:3: note: expanded from macro '__EM_ASM_2'
     305 |                 __EM_ASM(#op " %%" #src ", %%" #dst " \n\t")
         |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:291:25: note: expanded from macro '__EM_ASM'
     291 |                     : "+a" (ctxt->dst.val), \
         |                             ~~~~~~~~~ ^
   arch/x86/kvm/emulate.c:955:1: error: no member named 'val' in 'struct operand'
     955 | EM_ASM_2(add);
         | ^~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:348:10: note: expanded from macro 'EM_ASM_2'
     348 |         case 4: __EM_ASM_2(op##l, eax, edx); break; \
         |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:305:3: note: expanded from macro '__EM_ASM_2'
     305 |                 __EM_ASM(#op " %%" #src ", %%" #dst " \n\t")
         |                 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   arch/x86/kvm/emulate.c:292:25: note: expanded from macro '__EM_ASM'
     292 |                       "+d" (ctxt->src.val), \
--
   In file included from arch/x86/kvm/x86.c:21:
   In file included from arch/x86/kvm/irq.h:19:
   In file included from arch/x86/kvm/lapic.h:11:
   In file included from arch/x86/kvm/hyperv.h:25:
   In file included from arch/x86/kvm/x86.h:10:
>> arch/x86/kvm/kvm_emulate.h:271:3: error: unknown type name '__uint128_t'
     271 |                 __uint128_t val128;
         |                 ^
   In file included from arch/x86/kvm/x86.c:44:
   include/linux/mman.h:157:9: warning: division by zero is undefined [-Wdivision-by-zero]
     157 |                _calc_vm_trans(flags, MAP_SYNC,       VM_SYNC      ) |
         |                ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   include/linux/mman.h:135:21: note: expanded from macro '_calc_vm_trans'
     135 |    : ((x) & (bit1)) / ((bit1) / (bit2))))
         |                     ^ ~~~~~~~~~~~~~~~~~
>> arch/x86/kvm/x86.c:9381:30: error: no member named 'val' in 'struct operand'
    9381 |                 return vector == ctxt->src.val;
         |                                  ~~~~~~~~~ ^
   1 warning and 2 errors generated.


vim +/__uint128_t +271 arch/x86/kvm/kvm_emulate.h

   249	
   250	/* Type, address-of, and value of an instruction's operand. */
   251	struct operand {
   252		enum { OP_REG, OP_MEM, OP_MEM_STR, OP_IMM, OP_XMM, OP_YMM, OP_MM, OP_NONE } type;
   253		unsigned int bytes;
   254		unsigned int count;
   255		union {
   256			unsigned long orig_val;
   257			u64 orig_val64;
   258		};
   259		union {
   260			unsigned long *reg;
   261			struct segmented_address {
   262				ulong ea;
   263				unsigned seg;
   264			} mem;
   265			unsigned xmm;
   266			unsigned mm;
   267		} addr;
   268		union {
   269			unsigned long val;
   270			u64 val64;
 > 271			__uint128_t val128;
   272			char valptr[sizeof(avx256_t)];
   273			sse128_t vec_val;
   274			avx256_t vec_val2;
   275			u64 mm_val;
   276			void *data;
   277		} __aligned(32);
   278	};
   279	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] KVM: x86: Add support for cmpxchg16b emulation
  2026-03-06 15:38 ` Sean Christopherson
@ 2026-03-11 11:18   ` Sairaj Kodilkar
  2026-03-12 11:46   ` Paolo Bonzini
  1 sibling, 0 replies; 6+ messages in thread
From: Sairaj Kodilkar @ 2026-03-11 11:18 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: sarunkod, H. Peter Anvin, Borislav Petkov, Dave Hansen,
	Ingo Molnar, Paolo Bonzini, Thomas Gleixner, kvm, linux-kernel,
	x86, suravee.suthikulpanit, vasant.hegde, nikunj.dadhania,
	Manali.Shukla



On 3/6/2026 9:08 PM, Sean Christopherson wrote:
> On Fri, Mar 06, 2026, Sairaj Kodilkar wrote:
>> AMD and Intel both provides support for 128 bit cmpxchg operands using
>> cmpxchg8b/cmpxchg16b instructions (opcode 0FC7). However, kvm does not
>> support emulating cmpxchg16b (i.e when destination memory is 128 bit and
>> REX.W = 1) which causes emulation failure when QEMU guest performs a
>> cmpxchg16b on a memory region setup as a IO.
>>
>> Hence extend cmpxchg8b to perform cmpxchg16b when the destination memory
>> is 128 bit.
>>
>> Signed-off-by: Sairaj Kodilkar <sarunkod@amd.com>
>> ---
>> Background:
>>
>> The AMD IOMMU driver writes 256-bit device table entries with two
>> 128-bit cmpxchg operations. For guests using hardware-accelerated
>> vIOMMU (still in progress), QEMU traps device table accesses to set up
>> nested page tables. Without 128-bit cmpxchg emulation, KVM cannot
>> handle these traps and DTE access emulation fails.
> Please put this paragraph in the changelog proper, the "why" matters greatly here.

Sure.

>> QEMU implementation that traps DTE accesses:
>> https://github.com/AMDESE/qemu-iommu/blob/wip/for_iommufd_hw_queue-v8_amd_viommu_20260106/hw/i386/amd_viommu.c#L517
>> ---
>>   arch/x86/kvm/emulate.c     | 48 +++++++++++++++++++++++++++++++-------
>>   arch/x86/kvm/kvm_emulate.h |  1 +
>>   2 files changed, 41 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
>> index c8e292e9a24d..e1a08cd3274b 100644
>> --- a/arch/x86/kvm/emulate.c
>> +++ b/arch/x86/kvm/emulate.c
>> @@ -2188,17 +2188,18 @@ static int em_call_near_abs(struct x86_emulate_ctxt *ctxt)
>>   	return rc;
>>   }
>>   
>> -static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
>> +static int __handle_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
>>   {
>> -	u64 old = ctxt->dst.orig_val64;
>> +	u64 old64 = ctxt->dst.orig_val64;
>>   
>> -	if (ctxt->dst.bytes == 16)
>> +	/* Use of the REX.W prefix promotes operation to 128 bits */
>> +	if (ctxt->rex_bits & REX_W)
> Uh, yeah, and the caller specifically pivoted on this size.  I'm a-ok with a
> sanity check, but it should be exactly that, e.g.
>
> 	if (WARN_ON_ONCE(8 + (ctxt->rex_bits & REX_W) * 8 != ctxt->dst.bytes))
> 		return X86EMUL_UNHANDLEABLE;

Sure I'll update it.

>>   		return X86EMUL_UNHANDLEABLE;
>>   
>> -	if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
>> -	    ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
>> -		*reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
>> -		*reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
>> +	if (((u32) (old64 >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
>> +	    ((u32) (old64 >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
>> +		*reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old64 >> 0);
>> +		*reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old64 >> 32);
>>   		ctxt->eflags &= ~X86_EFLAGS_ZF;
>>   	} else {
>>   		ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
>> @@ -2209,6 +2210,37 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
>>   	return X86EMUL_CONTINUE;
>>   }
>>   
>> +static int __handle_cmpxchg16b(struct x86_emulate_ctxt *ctxt)
>> +{
>> +	__uint128_t old128 = ctxt->dst.val128;
> There is zero chance you properly tested this patch.  Please write comprehensive
> testcases in KUT's emulator64.c before posting the next version.
>
> In writeback(), the OP_MEM case quite clearly operates on "unsigned long" values
> *and* consumes orig_val in the locked case.
>
> 	case OP_MEM:
> 		if (ctxt->lock_prefix)
> 			return segmented_cmpxchg(ctxt,
> 						 op->addr.mem,
> 						 &op->orig_val,
> 						 &op->val,
> 						 op->bytes);
> 		else
> 			return segmented_write(ctxt,
> 					       op->addr.mem,
> 					       &op->val,
> 					       op->bytes);
>
> And then emulator_cmpxchg_emulated() should be taught to do cmpxchg16b itself:
>
> 	/* guests cmpxchg8b have to be emulated atomically */
> 	if (bytes > 8 || (bytes & (bytes - 1)))
> 		goto emul_write;
>
> That might be a big lift, e.g. to get a 16-byte uaccess version.  If it's
> unreasonably difficult, we should reject emulation of LOCK CMPXCHG16B.

I think its not a too much of the work, I have a prototype ready which
implements cmpxchg16b access in uaccess.h. Tested with a KUT as well
seems to be working fine. Will post it once I cleanup the series.

> And this code would also need to be updated to copy the full 128-bit value.
>
> 	/* Copy full 64-bit value for CMPXCHG8B.  */
> 	ctxt->dst.orig_val64 = ctxt->dst.val64;
>
> Maybe something like this?  Or maybe just copy 128 bits unconditionally (I'm not
> sure if that could read uninitiatlized data or not).
>
> 	/* Copy the full 64/128-bit value for CMPXCHG8B/16B.  */
> 	if (IS_ENABLED(CONFIG_X86_64) && ctxt->dst.bytes == 16)
> 		ctxt->dst.orig_val128 = ctxt->dst.val128;
> 	else
> 		ctxt->dst.orig_val64 = ctxt->dst.val64;

Yeah thanks, I completely missed that cmpxchg with lock prefix
consumes original value. I'll update the code.

>> +
>> +	/* Use of the REX.W prefix promotes operation to 128 bits */
>> +	if (!(ctxt->rex_bits & REX_W))
>> +		return X86EMUL_UNHANDLEABLE;
>> +
>> +	if (((u64) (old128 >> 0) != (u64) reg_read(ctxt, VCPU_REGS_RAX)) ||
>> +	    ((u64) (old128 >> 64) != (u64) reg_read(ctxt, VCPU_REGS_RDX))) {
>> +		*reg_write(ctxt, VCPU_REGS_RAX) = (u64) (old128 >> 0);
>> +		*reg_write(ctxt, VCPU_REGS_RDX) = (u64) (old128 >> 64);
>> +		ctxt->eflags &= ~X86_EFLAGS_ZF;
>> +	} else {
>> +		ctxt->dst.val128 =
>> +			((__uint128_t) reg_read(ctxt, VCPU_REGS_RCX) << 64) |
>> +			(u64) reg_read(ctxt, VCPU_REGS_RBX);
>> +
>> +		ctxt->eflags |= X86_EFLAGS_ZF;
> IMO we should use a macro to handle 8b vs. 16b.  The code is going to be heinous
> no matter what, and so to me, duplicating everything makes it twice as ugly,
> whereas a macro makes it like 10% more ugly.
>
>> +	}
>> +	return X86EMUL_CONTINUE;
>> +}
>> +
>> +static int em_cmpxchgxb(struct x86_emulate_ctxt *ctxt)
>> +{
>> +	if (ctxt->dst.bytes == 16)
>> +		return __handle_cmpxchg16b(ctxt);
>> +
>> +	return __handle_cmpxchg8b(ctxt);
>> +}
>> +
>>   static int em_ret(struct x86_emulate_ctxt *ctxt)
>>   {
>>   	int rc;
>> @@ -4097,7 +4129,7 @@ static const struct gprefix pfx_0f_c7_7 = {
>>   
>>   
>>   static const struct group_dual group9 = { {
>> -	N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N,
>> +	N, I(DstMem64 | Lock | PageTable, em_cmpxchgxb), N, N, N, N, N, N,
> Eh, I vote to keep it em_cmpxchg8b.  _If_ we want to capture that its size is
> variable, maybe em_cmpxchg8b_16b?  em_cmpxchgxb just looks like a typo.

Yeah ok, I was not sure how to represent variable size hence used the x.
Will rename it.

>
> Sans correct writeback() handling, something like so:
>
> ---
>   arch/x86/kvm/emulate.c     | 44 ++++++++++++++++++++++++--------------
>   arch/x86/kvm/kvm_emulate.h |  2 ++
>   2 files changed, 30 insertions(+), 16 deletions(-)
>
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index 6145dac4a605..3ec4555571fa 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -2201,24 +2201,33 @@ static int em_call_near_abs(struct x86_emulate_ctxt *ctxt)
>   	return rc;
>   }
>   
> +#define em_cmpxchg8b_16b(__c, rbits, mbits)						  \
> +do {											  \
> +	u##mbits old = __c->dst.orig_val##mbits;					  \
> +											  \
> +	BUILD_BUG_ON(rbits * 2 != mbits);						  \
> +											  \
> +	if (((u##rbits)(old >> 0)     != (u##rbits)reg_read(__c, VCPU_REGS_RAX)) ||	  \
> +	    ((u##rbits)(old >> rbits) != (u##rbits)reg_read(__c, VCPU_REGS_RDX))) {	  \
> +		*reg_write(__c, VCPU_REGS_RAX) = (u##rbits)(old >> 0);			  \
> +		*reg_write(__c, VCPU_REGS_RDX) = (u##rbits)(old >> rbits);		  \
> +		__c->eflags &= ~X86_EFLAGS_ZF;						  \
> +	} else {									  \
> +		__c->dst.val##mbits = ((u##mbits)reg_read(__c, VCPU_REGS_RCX) << rbits) | \
> +				       (u##rbits)reg_read(__c, VCPU_REGS_RBX);		  \
> +		__c->eflags |= X86_EFLAGS_ZF;						  \
> +	}										  \
> +} while (0)
> +
>   static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
>   {
> -	u64 old = ctxt->dst.orig_val64;
> -
> -	if (ctxt->dst.bytes == 16)
> +	if (WARN_ON_ONCE(8 + (ctxt->rex_bits & REX_W) * 8 != ctxt->dst.bytes))
>   		return X86EMUL_UNHANDLEABLE;
>   
> -	if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) ||
> -	    ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
> -		*reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
> -		*reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
> -		ctxt->eflags &= ~X86_EFLAGS_ZF;
> -	} else {
> -		ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
> -			(u32) reg_read(ctxt, VCPU_REGS_RBX);
> -
> -		ctxt->eflags |= X86_EFLAGS_ZF;
> -	}
> +	if (!(ctxt->rex_bits & REX_W))
> +		em_cmpxchg8b_16b(ctxt, 32, 64);
> +	else
> +		em_cmpxchg8b_16b(ctxt, 64, 128);
>   	return X86EMUL_CONTINUE;
>   }
>   
> @@ -5418,8 +5427,11 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, bool check_intercepts)
>   			goto done;
>   		}
>   	}
> -	/* Copy full 64-bit value for CMPXCHG8B.  */
> -	ctxt->dst.orig_val64 = ctxt->dst.val64;
> +	/* Copy the full 64/128-bit value for CMPXCHG8B/16B.  */
> +	if (IS_ENABLED(CONFIG_X86_64) && ctxt->dst.bytes == 16)
> +		ctxt->dst.orig_val128 = ctxt->dst.val128;
> +	else
> +		ctxt->dst.orig_val64 = ctxt->dst.val64;
>   
>   special_insn:
>   
> diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
> index fb3dab4b5a53..0e9968319343 100644
> --- a/arch/x86/kvm/kvm_emulate.h
> +++ b/arch/x86/kvm/kvm_emulate.h
> @@ -255,6 +255,7 @@ struct operand {
>   	union {
>   		unsigned long orig_val;
>   		u64 orig_val64;
> +		u64 orig_val128;
>   	};
>   	union {
>   		unsigned long *reg;
> @@ -268,6 +269,7 @@ struct operand {
>   	union {
>   		unsigned long val;
>   		u64 val64;
> +		__uint128_t val128;
>   		char valptr[sizeof(avx256_t)];
>   		sse128_t vec_val;
>   		avx256_t vec_val2;
>
> base-commit: 5128b972fb2801ad9aca54d990a75611ab5283a9
> --

Thanks
Sairaj

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] KVM: x86: Add support for cmpxchg16b emulation
  2026-03-06 15:38 ` Sean Christopherson
  2026-03-11 11:18   ` Sairaj Kodilkar
@ 2026-03-12 11:46   ` Paolo Bonzini
  1 sibling, 0 replies; 6+ messages in thread
From: Paolo Bonzini @ 2026-03-12 11:46 UTC (permalink / raw)
  To: Sean Christopherson, Sairaj Kodilkar
  Cc: H. Peter Anvin, Borislav Petkov, Dave Hansen, Ingo Molnar,
	Thomas Gleixner, kvm, linux-kernel, x86, suravee.suthikulpanit,
	vasant.hegde, nikunj.dadhania, Manali.Shukla

On 3/6/26 16:38, Sean Christopherson wrote:
>> -	if (ctxt->dst.bytes == 16)
>> +	/* Use of the REX.W prefix promotes operation to 128 bits */
>> +	if (ctxt->rex_bits & REX_W)
> 
> Uh, yeah, and the caller specifically pivoted on this size.  I'm a-ok with a
> sanity check, but it should be exactly that, e.g.
> 
> 	if (WARN_ON_ONCE(8 + (ctxt->rex_bits & REX_W) * 8 != ctxt->dst.bytes))
> 		return X86EMUL_UNHANDLEABLE;


> And then emulator_cmpxchg_emulated() should be taught to do cmpxchg16b itself:
> 
> 	/* guests cmpxchg8b have to be emulated atomically */
> 	if (bytes > 8 || (bytes & (bytes - 1)))
> 		goto emul_write;
> 
> That might be a big lift, e.g. to get a 16-byte uaccess version.  If it's
> unreasonably difficult, we should reject emulation of LOCK CMPXCHG16B.

It's actually pretty easy because a kind soul already wrote the
cmpxchg8b version of the macros.  Compile-tested only (and checked
that the asm does have the instruction):

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 3a7755c1a4410..f40e815263233 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -388,14 +388,13 @@ do {									\
  		*_old = __old;						\
  	likely(success);					})
  
-#ifdef CONFIG_X86_32
-#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label)	({	\
+#define __try_cmpxchg8b_16b_user_asm(_b, _ptr, _pold, _new, label)({	\
  	bool success;							\
  	__typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);		\
  	__typeof__(*(_ptr)) __old = *_old;				\
  	__typeof__(*(_ptr)) __new = (_new);				\
  	asm_goto_output("\n"						\
-		     "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n"		\
+		     "1: " LOCK_PREFIX "cmpxchg" #_b "b %[ptr]\n"	\
  		     _ASM_EXTABLE_UA(1b, %l[label])			\
  		     : CC_OUT(z) (success),				\
  		       "+A" (__old),					\
@@ -407,7 +406,6 @@ do {									\
  	if (unlikely(!success))						\
  		*_old = __old;						\
  	likely(success);					})
-#endif // CONFIG_X86_32
  #else  // !CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
  #define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label)	({ \
  	int __err = 0;							\
@@ -433,7 +431,6 @@ do {									\
  		*_old = __old;						\
  	likely(success);					})
  
-#ifdef CONFIG_X86_32
  /*
   * Unlike the normal CMPXCHG, use output GPR for both success/fail and error.
   * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are
@@ -441,13 +438,13 @@ do {									\
   * both ESI and EDI for the memory operand, compilation will fail if the error
   * is an input+output as there will be no register available for input.
   */
-#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label)	({	\
+#define __try_cmpxchg8b_16b_user_asm(_b, _ptr, _pold, _new, label)	({	\
  	int __result;							\
  	__typeof__(_ptr) _old = (__typeof__(_ptr))(_pold);		\
  	__typeof__(*(_ptr)) __old = *_old;				\
  	__typeof__(*(_ptr)) __new = (_new);				\
  	asm volatile("\n"						\
-		     "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n"		\
+		     "1: " LOCK_PREFIX "cmpxchg" #_b "b %[ptr]\n"	\
  		     "mov $0, %[result]\n\t"				\
  		     "setz %b[result]\n"				\
  		     "2:\n"						\
@@ -464,7 +461,6 @@ do {									\
  	if (unlikely(!__result))					\
  		*_old = __old;						\
  	likely(__result);					})
-#endif // CONFIG_X86_32
  #endif // CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
  
  /* FIXME: this hack is definitely wrong -AK */
@@ -552,9 +549,15 @@ do {										\
  
  extern void __try_cmpxchg_user_wrong_size(void);
  
-#ifndef CONFIG_X86_32
+#ifdef CONFIG_X86_32
+#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label)		\
+	__try_cmpxchg8b_16b_user_asm(8, _ptr, _pold, _new, label)
+#else
  #define __try_cmpxchg64_user_asm(_ptr, _oldp, _nval, _label)		\
  	__try_cmpxchg_user_asm("q", "r", (_ptr), (_oldp), (_nval), _label)
+
+#define __try_cmpxchg128_user_asm(_ptr, _pold, _new, label)		\
+	__try_cmpxchg8b_16b_user_asm(16, _ptr, _pold, _new, label)
  #endif
  
  /*
@@ -581,6 +584,9 @@ extern void __try_cmpxchg_user_wrong_size(void);
  	case 8:	__ret = __try_cmpxchg64_user_asm((__force u64 *)(_ptr), (_oldp),\
  						 (_nval), _label);		\
  		break;								\
+	case 16:__ret = __try_cmpxchg128_user_asm((__force u128 *)(_ptr), (_oldp),\
+						 (_nval), _label);		\
+		break;								\
  	default: __try_cmpxchg_user_wrong_size();				\
  	}									\
  	__ret;						})
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f2db04aa6a17f..3e1bd9cdd75e6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7973,7 +7973,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
  	int r;
  
  	/* guests cmpxchg8b have to be emulated atomically */
-	if (bytes > 8 || (bytes & (bytes - 1)))
+	if (bytes > 2 * sizeof(unsigned long) || (bytes & (bytes - 1)))
  		goto emul_write;
  
  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
@@ -8013,6 +8013,11 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
  	case 8:
  		r = emulator_try_cmpxchg_user(u64, hva, old, new);
  		break;
+#ifdef CONFIG_X86_64
+	case 16:
+		r = emulator_try_cmpxchg_user(u128, hva, old, new);
+		break;
+#endif
  	default:
  		BUG();
  	}


(with the uaccess.h part to be split in a separate patch, of course).

Paolo


^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2026-03-12 11:46 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-06 10:20 [PATCH] KVM: x86: Add support for cmpxchg16b emulation Sairaj Kodilkar
2026-03-06 15:38 ` Sean Christopherson
2026-03-11 11:18   ` Sairaj Kodilkar
2026-03-12 11:46   ` Paolo Bonzini
2026-03-07  2:39 ` kernel test robot
2026-03-07  5:40 ` kernel test robot

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox