Linux Confidential Computing Development

Linux Confidential Computing Development
 help / color / mirror / Atom feed

* [PATCH  v13 07/22] KVM: selftests: Introduce structures for TDX guest boot parameters
From: Lisa Wang @ 2026-05-21 23:16 UTC (permalink / raw)
  To: Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao, Chenyi Qiang,
	Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
	Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
	Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
	Sagi Shahar, Sean Christopherson, Shuah Khan, Oliver Upton
  Cc: Jeremiah McReynolds, kvm, linux-coco, linux-kernel, x86,
	Lisa Wang
In-Reply-To: <20260521-tdx-selftests-v13-v13-0-6983ae4c3a4d@google.com>

From: Sagi Shahar <sagis@google.com>

Introduce `td_boot_parameters` and `td_per_vcpu_parameters`, and export
their offsets to assembly via the kbuild infrastructure.

TDX guest registers are private and must be initialized by guest-side
assembly. These structures allow the assembly code to retrieve boot
parameters and index into per-vCPU data based on the vCPU ID, while
keeping host and guest definitions synchronized.

Use kbuild.h to expose the offsets into the structs from C code to
assembly code.

Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Sagi Shahar <sagis@google.com>
Co-developed-by: Lisa Wang <wyihan@google.com>
Signed-off-by: Lisa Wang <wyihan@google.com>
---
 tools/testing/selftests/kvm/.gitignore             |  3 +-
 tools/testing/selftests/kvm/Makefile.kvm           | 29 ++++++++-
 .../selftests/kvm/include/x86/tdx/td_boot.h        | 69 ++++++++++++++++++++++
 .../selftests/kvm/lib/x86/tdx/td_boot_offsets.c    | 21 +++++++
 4 files changed, 119 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
index 1d41a046a7bf..eef6055242b2 100644
--- a/tools/testing/selftests/kvm/.gitignore
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -9,4 +9,5 @@
 !config
 !settings
 !Makefile
-!Makefile.kvm
\ No newline at end of file
+!Makefile.kvm
+include/x86/**/*_offsets.h
\ No newline at end of file
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index e5769268936a..02fad7b35eac 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -19,6 +19,8 @@ LIBKVM += lib/userfaultfd_util.c
 
 LIBKVM_STRING += lib/string_override.c
 
+LIBKVM_ASM_DEFS += lib/x86/tdx/td_boot_offsets.c
+
 LIBKVM_x86 += lib/x86/apic.c
 LIBKVM_x86 += lib/x86/handlers.S
 LIBKVM_x86 += lib/x86/hyperv.c
@@ -260,6 +262,10 @@ OVERRIDE_TARGETS = 1
 include ../lib.mk
 include ../cgroup/lib/libcgroup.mk
 
+# Enable Kbuild tools.
+include $(top_srcdir)/scripts/Kbuild.include
+include $(top_srcdir)/scripts/Makefile.lib
+
 INSTALL_HDR_PATH = $(top_srcdir)/usr
 LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/
 LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include
@@ -272,15 +278,24 @@ CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \
 	-fno-stack-protector -fno-PIE -fno-strict-aliasing \
 	-I$(LINUX_TOOL_INCLUDE) -I$(LINUX_TOOL_ARCH_INCLUDE) \
 	-I$(LINUX_HDR_PATH) -Iinclude -I$(<D) -Iinclude/$(ARCH) \
-	-I ../rseq -I.. $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
+	-I ../rseq -I.. -I$(OUTPUT)/include/$(ARCH) $(EXTRA_CFLAGS) $(KHDR_INCLUDES)
 ifeq ($(ARCH),s390)
 	CFLAGS += -march=z10
 endif
+
 ifeq ($(ARCH),x86)
+
 ifeq ($(shell echo "void foo(void) { }" | $(CC) -march=x86-64-v2 -x c - -c -o /dev/null 2>/dev/null; echo "$$?"),0)
 	CFLAGS += -march=x86-64-v2
 endif
+
+KVM_GEN_HDRS := $(patsubst lib/x86/%.c, $(OUTPUT)/include/x86/%.h, $(filter lib/x86/%, $(LIBKVM_ASM_DEFS)))
+$(shell mkdir -p $(sort $(dir $(KVM_GEN_HDRS))))
+$(KVM_GEN_HDRS): GUARD = $(shell echo $(*F) | tr a-z A-Z | tr '.' '_')
+$(KVM_GEN_HDRS): $(OUTPUT)/include/x86/%.h: $(OUTPUT)/lib/x86/%.s FORCE
+	$(call filechk,offsets,__$(GUARD)_H__)
 endif
+
 ifeq ($(ARCH),arm64)
 tools_dir := $(top_srcdir)/tools
 arm64_tools_dir := $(tools_dir)/arch/arm64/tools/
@@ -313,6 +328,7 @@ LIBKVM_S := $(filter %.S,$(LIBKVM))
 LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C))
 LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S))
 LIBKVM_STRING_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_STRING))
+LIBKVM_ASM_DEFS_OBJ += $(patsubst %.c, $(OUTPUT)/%.s, $(LIBKVM_ASM_DEFS))
 LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ) $(LIBCGROUP_O)
 SPLIT_TEST_GEN_PROGS := $(patsubst %, $(OUTPUT)/%, $(SPLIT_TESTS))
 SPLIT_TEST_GEN_OBJ := $(patsubst %, $(OUTPUT)/$(ARCH)/%.o, $(SPLIT_TESTS))
@@ -338,7 +354,9 @@ $(SPLIT_TEST_GEN_OBJ): $(OUTPUT)/$(ARCH)/%.o: $(ARCH)/%.c
 	$(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
 
 EXTRA_CLEAN += $(GEN_HDRS) \
+	       $(KVM_GEN_HDRS) \
 	       $(LIBKVM_OBJS) \
+	       $(LIBKVM_ASM_DEFS_OBJ) \
 	       $(SPLIT_TEST_GEN_OBJ) \
 	       $(TEST_DEP_FILES) \
 	       $(TEST_GEN_OBJ) \
@@ -350,6 +368,9 @@ $(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c $(GEN_HDRS)
 $(LIBKVM_S_OBJ): $(OUTPUT)/%.o: %.S $(GEN_HDRS)
 	$(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
 
+$(LIBKVM_ASM_DEFS_OBJ): $(OUTPUT)/%.s: %.c FORCE
+	$(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -S $< -o $@
+
 # Compile the string overrides as freestanding to prevent the compiler from
 # generating self-referential code, e.g. without "freestanding" the compiler may
 # "optimize" memcmp() by invoking memcmp(), thus causing infinite recursion.
@@ -358,11 +379,15 @@ $(LIBKVM_STRING_OBJ): $(OUTPUT)/%.o: %.c
 
 $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS))))
 $(SPLIT_TEST_GEN_OBJ): $(GEN_HDRS)
+$(LIBKVM_OBJS): $(KVM_GEN_HDRS)
 $(TEST_GEN_PROGS): $(LIBKVM_OBJS)
 $(TEST_GEN_PROGS_EXTENDED): $(LIBKVM_OBJS)
 $(TEST_GEN_OBJ): $(GEN_HDRS)
 
-cscope: include_paths = $(LINUX_TOOL_INCLUDE) $(LINUX_HDR_PATH) include lib ..
+FORCE:
+
+cscope: include_paths = $(LINUX_TOOL_INCLUDE) $(LINUX_HDR_PATH) include lib .. \
+			$(wildcard $(sort $(dir $(KVM_GEN_HDRS))))
 cscope:
 	$(RM) cscope.*
 	(find $(include_paths) -name '*.h' \
diff --git a/tools/testing/selftests/kvm/include/x86/tdx/td_boot.h b/tools/testing/selftests/kvm/include/x86/tdx/td_boot.h
new file mode 100644
index 000000000000..af4474dee387
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86/tdx/td_boot.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTEST_TDX_TD_BOOT_H
+#define SELFTEST_TDX_TD_BOOT_H
+
+#include <stdint.h>
+
+#include <linux/compiler.h>
+#include <linux/sizes.h>
+
+/*
+ * Layout for boot section (not to scale)
+ *
+ *                                   GPA
+ * _________________________________ 0x1_0000_0000 (4GB)
+ * |   Boot code trampoline    |
+ * |___________________________|____ 0x0_ffff_fff0: Reset vector (16B below 4GB)
+ * |   Boot code               |
+ * |___________________________|____ td_boot will be copied here, so that the
+ * |                           |     jmp to td_boot is exactly at the reset vector
+ * |   Empty space             |
+ * |                           |
+ * |───────────────────────────|
+ * |                           |
+ * |                           |
+ * |   Boot parameters         |
+ * |                           |
+ * |                           |
+ * |___________________________|____ 0x0_ffff_0000: TD_BOOT_PARAMETERS_GPA
+ */
+#define FOUR_GIGABYTES_GPA (SZ_4G)
+
+/*
+ * The exact memory layout for LGDT or LIDT instructions.
+ */
+struct __packed td_boot_parameters_dtr {
+	u16 limit;
+	u32 base;
+};
+
+/*
+ * Allows each vCPU to be initialized with different rip and esp.
+ */
+struct td_per_vcpu_parameters {
+	u32 esp_gva;
+	u64 guest_code;
+};
+
+/*
+ * Boot parameters for the TD.
+ *
+ * Unlike a regular VM, KVM cannot set registers such as esp, eip, etc
+ * before boot, so to run selftests, these registers' values have to be
+ * initialized by the TD.
+ *
+ * This struct is loaded in TD private memory at TD_BOOT_PARAMETERS_GPA.
+ *
+ * The TD boot code will read off parameters from this struct and set up the
+ * vCPU for executing selftests.
+ */
+struct td_boot_parameters {
+	u32 cr0;
+	u32 cr3;
+	u32 cr4;
+	struct td_boot_parameters_dtr gdtr;
+	struct td_boot_parameters_dtr idtr;
+	struct td_per_vcpu_parameters per_vcpu[];
+};
+
+#endif /* SELFTEST_TDX_TD_BOOT_H */
diff --git a/tools/testing/selftests/kvm/lib/x86/tdx/td_boot_offsets.c b/tools/testing/selftests/kvm/lib/x86/tdx/td_boot_offsets.c
new file mode 100644
index 000000000000..7f76a3585b99
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86/tdx/td_boot_offsets.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+#define COMPILE_OFFSETS
+
+#include <linux/kbuild.h>
+
+#include "tdx/td_boot.h"
+
+static void __attribute__((used)) common(void)
+{
+	OFFSET(TD_BOOT_PARAMETERS_CR0, td_boot_parameters, cr0);
+	OFFSET(TD_BOOT_PARAMETERS_CR3, td_boot_parameters, cr3);
+	OFFSET(TD_BOOT_PARAMETERS_CR4, td_boot_parameters, cr4);
+	OFFSET(TD_BOOT_PARAMETERS_GDT, td_boot_parameters, gdtr);
+	OFFSET(TD_BOOT_PARAMETERS_IDT, td_boot_parameters, idtr);
+	OFFSET(TD_BOOT_PARAMETERS_PER_VCPU, td_boot_parameters, per_vcpu);
+	OFFSET(TD_PER_VCPU_PARAMETERS_ESP_GVA, td_per_vcpu_parameters, esp_gva);
+	OFFSET(TD_PER_VCPU_PARAMETERS_GUEST_CODE, td_per_vcpu_parameters,
+	       guest_code);
+	DEFINE(SIZEOF_TD_PER_VCPU_PARAMETERS,
+	       sizeof(struct td_per_vcpu_parameters));
+}

-- 
2.54.0.746.g67dd491aae-goog


^ permalink raw reply related

* [PATCH  v13 08/22] KVM: selftests: Add TDX boot code
From: Lisa Wang @ 2026-05-21 23:16 UTC (permalink / raw)
  To: Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao, Chenyi Qiang,
	Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
	Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
	Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
	Sagi Shahar, Sean Christopherson, Shuah Khan, Oliver Upton
  Cc: Jeremiah McReynolds, kvm, linux-coco, linux-kernel, x86,
	Lisa Wang
In-Reply-To: <20260521-tdx-selftests-v13-v13-0-6983ae4c3a4d@google.com>

From: Erdem Aktas <erdemaktas@google.com>

Add code to boot a TDX test VM. Since TDX registers are inaccessible to
KVM, the boot code loads the relevant values from memory into the
registers before jumping to the guest code.

Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
Signed-off-by: Erdem Aktas <erdemaktas@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Sagi Shahar <sagis@google.com>
Signed-off-by: Sagi Shahar <sagis@google.com>
Signed-off-by: Lisa Wang <wyihan@google.com>
---
 tools/testing/selftests/kvm/Makefile.kvm           |  1 +
 .../selftests/kvm/include/x86/tdx/td_boot.h        |  5 ++
 .../selftests/kvm/include/x86/tdx/td_boot_asm.h    | 16 ++++++
 tools/testing/selftests/kvm/lib/x86/tdx/td_boot.S  | 60 ++++++++++++++++++++++
 4 files changed, 82 insertions(+)

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 02fad7b35eac..929965ca4b75 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -31,6 +31,7 @@ LIBKVM_x86 += lib/x86/sev.c
 LIBKVM_x86 += lib/x86/svm.c
 LIBKVM_x86 += lib/x86/ucall.c
 LIBKVM_x86 += lib/x86/vmx.c
+LIBKVM_x86 += lib/x86/tdx/td_boot.S
 
 LIBKVM_arm64 += lib/arm64/gic.c
 LIBKVM_arm64 += lib/arm64/gic_v3.c
diff --git a/tools/testing/selftests/kvm/include/x86/tdx/td_boot.h b/tools/testing/selftests/kvm/include/x86/tdx/td_boot.h
index af4474dee387..e5d54a20ed72 100644
--- a/tools/testing/selftests/kvm/include/x86/tdx/td_boot.h
+++ b/tools/testing/selftests/kvm/include/x86/tdx/td_boot.h
@@ -66,4 +66,9 @@ struct td_boot_parameters {
 	struct td_per_vcpu_parameters per_vcpu[];
 };
 
+void td_boot(void);
+void td_boot_code_end(void);
+
+#define TD_BOOT_CODE_SIZE (td_boot_code_end - td_boot)
+
 #endif /* SELFTEST_TDX_TD_BOOT_H */
diff --git a/tools/testing/selftests/kvm/include/x86/tdx/td_boot_asm.h b/tools/testing/selftests/kvm/include/x86/tdx/td_boot_asm.h
new file mode 100644
index 000000000000..10b4b527595c
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86/tdx/td_boot_asm.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTEST_TDX_TD_BOOT_ASM_H
+#define SELFTEST_TDX_TD_BOOT_ASM_H
+
+/*
+ * GPA where TD boot parameters will be loaded.
+ *
+ * TD_BOOT_PARAMETERS_GPA is arbitrarily chosen to
+ *
+ * + be within the 4GB address space
+ * + provide enough contiguous memory for the struct td_boot_parameters such
+ *   that there is one struct td_per_vcpu_parameters for KVM_MAX_VCPUS
+ */
+#define TD_BOOT_PARAMETERS_GPA 0xffff0000
+
+#endif  // SELFTEST_TDX_TD_BOOT_ASM_H
diff --git a/tools/testing/selftests/kvm/lib/x86/tdx/td_boot.S b/tools/testing/selftests/kvm/lib/x86/tdx/td_boot.S
new file mode 100644
index 000000000000..7aa33caa9a78
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86/tdx/td_boot.S
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include "tdx/td_boot_asm.h"
+#include "tdx/td_boot_offsets.h"
+#include "processor_asm.h"
+
+.code32
+
+.globl td_boot
+td_boot:
+	/* In this procedure, edi is used as a temporary register. */
+	cli
+
+	/* Paging is off. */
+
+	movl $TD_BOOT_PARAMETERS_GPA, %ebx
+
+	/*
+	 * Find the address of struct td_per_vcpu_parameters for this
+	 * vCPU based on esi (TDX spec: initialized with vCPU id). Put
+	 * struct address into register for indirect addressing.
+	 */
+	movl $SIZEOF_TD_PER_VCPU_PARAMETERS, %eax
+	mul %esi
+	leal TD_BOOT_PARAMETERS_PER_VCPU(%ebx), %edi
+	addl %edi, %eax
+
+	/* Setup stack. */
+	movl TD_PER_VCPU_PARAMETERS_ESP_GVA(%eax), %esp
+
+	/* Setup GDT. */
+	leal TD_BOOT_PARAMETERS_GDT(%ebx), %edi
+	lgdt (%edi)
+
+	/* Setup IDT. */
+	leal TD_BOOT_PARAMETERS_IDT(%ebx), %edi
+	lidt (%edi)
+
+	/*
+	 * Set up control registers (There are no instructions to mov from
+	 * memory to control registers, hence use edi as a scratch register).
+	 */
+	movl TD_BOOT_PARAMETERS_CR4(%ebx), %edi
+	movl %edi, %cr4
+	movl TD_BOOT_PARAMETERS_CR3(%ebx), %edi
+	movl %edi, %cr3
+	movl TD_BOOT_PARAMETERS_CR0(%ebx), %edi
+	movl %edi, %cr0
+
+	/* Switch to 64-bit mode via ljmp, then jump to guest code. */
+	ljmp $(KERNEL_CS),$1f
+1:
+	jmp *TD_PER_VCPU_PARAMETERS_GUEST_CODE(%eax)
+
+/* Leave marker so size of td_boot code can be computed. */
+.globl td_boot_code_end
+td_boot_code_end:
+
+/* Disable executable stack. */
+.section .note.GNU-stack,"",%progbits

-- 
2.54.0.746.g67dd491aae-goog


^ permalink raw reply related

* [PATCH  v13 06/22] tools: include: Add kbuild.h for assembly structure offsets
From: Lisa Wang @ 2026-05-21 23:16 UTC (permalink / raw)
  To: Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao, Chenyi Qiang,
	Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
	Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
	Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
	Sagi Shahar, Sean Christopherson, Shuah Khan, Oliver Upton
  Cc: Jeremiah McReynolds, kvm, linux-coco, linux-kernel, x86,
	Lisa Wang
In-Reply-To: <20260521-tdx-selftests-v13-v13-0-6983ae4c3a4d@google.com>

From: Sagi Shahar <sagis@google.com>

Add the Kbuild macros needed to enable the filechk_offsets mechanism to
generate C header files containing structure member offset information.

Tools that depend on assembly code operating on structures have to
hardcode the offsets of structure members. The Kbuild infrastructure
can instead generate C header files with these offsets automatically,
allowing them to be included in assembly code as symbolic constants.

For example, the TDX guest boot code requires access to parameters
passed in the C structure (struct td_boot_parameters). This header
provides the macros needed to extract these offsets from C code and
expose them to assembly, ensuring the two remain synchronized.

Signed-off-by: Sagi Shahar <sagis@google.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Lisa Wang <wyihan@google.com>
---
 tools/include/linux/kbuild.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tools/include/linux/kbuild.h b/tools/include/linux/kbuild.h
new file mode 100644
index 000000000000..957fd55cd159
--- /dev/null
+++ b/tools/include/linux/kbuild.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TOOLS_LINUX_KBUILD_H
+#define __TOOLS_LINUX_KBUILD_H
+
+#define DEFINE(sym, val) \
+	asm volatile("\n.ascii \"->" #sym " %0 " #val "\"" : : "i" (val))
+
+#define OFFSET(sym, str, mem) \
+	DEFINE(sym, __builtin_offsetof(struct str, mem))
+
+#endif /* __TOOLS_LINUX_KBUILD_H */

-- 
2.54.0.746.g67dd491aae-goog

^ permalink raw reply related

* [PATCH  v13 05/22] KVM: selftests: Expose segment definitions to assembly files
From: Lisa Wang @ 2026-05-21 23:16 UTC (permalink / raw)
  To: Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao, Chenyi Qiang,
	Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
	Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
	Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
	Sagi Shahar, Sean Christopherson, Shuah Khan, Oliver Upton
  Cc: Jeremiah McReynolds, kvm, linux-coco, linux-kernel, x86,
	Lisa Wang
In-Reply-To: <20260521-tdx-selftests-v13-v13-0-6983ae4c3a4d@google.com>

From: Sagi Shahar <sagis@google.com>

Move kernel segment definitions to a separate file which can be included
from assembly files.

Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
Signed-off-by: Sagi Shahar <sagis@google.com>
Signed-off-by: Lisa Wang <wyihan@google.com>
---
 tools/testing/selftests/kvm/include/x86/processor_asm.h | 12 ++++++++++++
 tools/testing/selftests/kvm/lib/x86/processor.c         |  5 +----
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/x86/processor_asm.h b/tools/testing/selftests/kvm/include/x86/processor_asm.h
new file mode 100644
index 000000000000..713b6bc0aeb7
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86/processor_asm.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Used for storing defines used by both c and assembly code.
+ */
+#ifndef SELFTEST_KVM_PROCESSOR_ASM_H
+#define SELFTEST_KVM_PROCESSOR_ASM_H
+
+#define KERNEL_CS	0x8
+#define KERNEL_DS	0x10
+#define KERNEL_TSS	0x18
+
+#endif  /* SELFTEST_KVM_PROCESSOR_ASM_H */
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index 8d06e7186df1..62abfe27fe3a 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -8,6 +8,7 @@
 #include "kvm_util.h"
 #include "pmu.h"
 #include "processor.h"
+#include "processor_asm.h"
 #include "smm.h"
 #include "svm_util.h"
 #include "sev.h"
@@ -18,10 +19,6 @@
 #define NUM_INTERRUPTS 256
 #endif
 
-#define KERNEL_CS	0x8
-#define KERNEL_DS	0x10
-#define KERNEL_TSS	0x18
-
 gva_t exception_handlers;
 bool host_cpu_is_amd;
 bool host_cpu_is_intel;

-- 
2.54.0.746.g67dd491aae-goog


^ permalink raw reply related

* [PATCH  v13 04/22] KVM: selftests: TDX: Use KVM_TDX_CAPABILITIES to validate TDs' attribute configuration
From: Lisa Wang @ 2026-05-21 23:16 UTC (permalink / raw)
  To: Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao, Chenyi Qiang,
	Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
	Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
	Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
	Sagi Shahar, Sean Christopherson, Shuah Khan, Oliver Upton
  Cc: Jeremiah McReynolds, kvm, linux-coco, linux-kernel, x86,
	Lisa Wang
In-Reply-To: <20260521-tdx-selftests-v13-v13-0-6983ae4c3a4d@google.com>

From: Isaku Yamahata <isaku.yamahata@intel.com>

Make sure that all the attributes enabled by the test are reported as
supported by both the TDX module and KVM. KVM filters out any attributes
that it does not itself support.

This also exercises the KVM_TDX_CAPABILITIES ioctl.

Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Co-developed-by: Sagi Shahar <sagis@google.com>
Signed-off-by: Sagi Shahar <sagis@google.com>
Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Lisa Wang <wyihan@google.com>
---
 tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c b/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c
index 868ff62e22f2..e5c998874a0d 100644
--- a/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c
+++ b/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c
@@ -110,6 +110,18 @@ static void tdx_filter_cpuid(struct kvm_vm *vm,
 	free(tdx_cap);
 }
 
+static void tdx_check_attributes(struct kvm_vm *vm, u64 attributes)
+{
+	struct kvm_tdx_capabilities *tdx_cap;
+
+	tdx_cap = tdx_read_capabilities(vm);
+
+	/* Make sure all the attributes are reported as supported */
+	TEST_ASSERT_EQ(attributes & tdx_cap->supported_attrs, attributes);
+
+	free(tdx_cap);
+}
+
 void tdx_init_vm(struct kvm_vm *vm, u64 attributes)
 {
 	struct kvm_tdx_init_vm *init_vm;
@@ -129,6 +141,8 @@ void tdx_init_vm(struct kvm_vm *vm, u64 attributes)
 	memcpy(&init_vm->cpuid, cpuid, kvm_cpuid2_size(cpuid->nent));
 	free(cpuid);
 
+	tdx_check_attributes(vm, attributes);
+
 	init_vm->attributes = attributes;
 
 	tdx_vm_ioctl(vm, KVM_TDX_INIT_VM, 0, init_vm);

-- 
2.54.0.746.g67dd491aae-goog


^ permalink raw reply related

* [PATCH  v13 03/22] KVM: selftests: Initialize the TDX VM
From: Lisa Wang @ 2026-05-21 23:16 UTC (permalink / raw)
  To: Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao, Chenyi Qiang,
	Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
	Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
	Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
	Sagi Shahar, Sean Christopherson, Shuah Khan, Oliver Upton
  Cc: Jeremiah McReynolds, kvm, linux-coco, linux-kernel, x86,
	Lisa Wang
In-Reply-To: <20260521-tdx-selftests-v13-v13-0-6983ae4c3a4d@google.com>

From: Sagi Shahar <sagis@google.com>

Add tdx_init_vm() to handle the mandatory VM-level initialization
sequence required for Intel TDX.

For TDX, the guest's CPUID configuration must be "sealed" during
KVM_TDX_INIT_VM before any vCPUs are created. This is necessary because
the TDX hardware directly virtualizes CPUID and includes the
configuration in the guest's initial security measurement.

The helper calculates the required CPUID values by filtering the host-
supported bits (kvm_get_supported_cpuid) against the "directly
configurable" bits reported by KVM_TDX_CAPABILITIES, ensuring
compliance with the strict requirements of the TDH.MNG.INIT SEAMCALL.

Co-developed-by: Isaku Yamahata <isaku.yamahata@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Co-developed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Sagi Shahar <sagis@google.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Lisa Wang <wyihan@google.com>
---
 .../selftests/kvm/include/x86/tdx/tdx_util.h       |  30 +++++
 tools/testing/selftests/kvm/lib/x86/processor.c    |   3 +
 tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c | 137 +++++++++++++++++++++
 3 files changed, 170 insertions(+)

diff --git a/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h b/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h
index f647e6ca6b34..48d4bd36c35b 100644
--- a/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h
+++ b/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h
@@ -11,4 +11,34 @@ static inline bool is_tdx_vm(struct kvm_vm *vm)
 	return vm->type == KVM_X86_TDX_VM;
 }
 
+/*
+ * TDX ioctls
+ * Use underscores to avoid collisions with struct member names.
+ */
+#define __tdx_vm_ioctl(vm, cmd, _flags, arg)				\
+({									\
+	int r;								\
+									\
+	union {								\
+		struct kvm_tdx_cmd c;					\
+		unsigned long raw;					\
+	} tdx_cmd = { .c = {						\
+		.id = (cmd),						\
+		.flags = (u32)(_flags),				\
+		.data = (u64)(arg),				\
+	} };								\
+									\
+	r = __vm_ioctl(vm, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd.raw);	\
+	r ?: tdx_cmd.c.hw_error;					\
+})
+
+#define tdx_vm_ioctl(vm, cmd, flags, arg)				\
+({									\
+	int ret = __tdx_vm_ioctl(vm, cmd, flags, arg);			\
+									\
+	__TEST_ASSERT_VM_VCPU_IOCTL(!ret, #cmd,	ret, vm);		\
+})
+
+void tdx_init_vm(struct kvm_vm *vm, u64 attributes);
+
 #endif /* SELFTESTS_TDX_TDX_UTIL_H */
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index b68ad1dc7e02..8d06e7186df1 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -802,6 +802,9 @@ void kvm_arch_vm_post_create(struct kvm_vm *vm, unsigned int nr_vcpus)
 		vm_sev_ioctl(vm, KVM_SEV_INIT2, &init);
 	}
 
+	if (is_tdx_vm(vm))
+		tdx_init_vm(vm, 0);
+
 	r = __vm_ioctl(vm, KVM_GET_TSC_KHZ, NULL);
 	TEST_ASSERT(r > 0, "KVM_GET_TSC_KHZ did not provide a valid TSC frequency.");
 	guest_tsc_khz = r;
diff --git a/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c b/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c
new file mode 100644
index 000000000000..868ff62e22f2
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "tdx/tdx_util.h"
+
+static struct kvm_tdx_capabilities *tdx_read_capabilities(struct kvm_vm *vm)
+{
+	struct kvm_tdx_capabilities *tdx_cap = NULL;
+	int nr_cpuid_configs = 4;
+	int rc = -1;
+	int i;
+
+	do {
+		nr_cpuid_configs *= 2;
+
+		tdx_cap = realloc(tdx_cap, sizeof(*tdx_cap) +
+					   sizeof(tdx_cap->cpuid) +
+					   (sizeof(struct kvm_cpuid_entry2) * nr_cpuid_configs));
+		TEST_ASSERT(tdx_cap,
+			    "Could not allocate memory for tdx capability nr_cpuid_configs %d\n",
+			    nr_cpuid_configs);
+
+		tdx_cap->cpuid.nent = nr_cpuid_configs;
+		rc = __tdx_vm_ioctl(vm, KVM_TDX_CAPABILITIES, 0, tdx_cap);
+	} while (rc < 0 && errno == E2BIG);
+
+	TEST_ASSERT(rc == 0, "KVM_TDX_CAPABILITIES failed: %d %d",
+		    rc, errno);
+
+	pr_debug("tdx_cap: supported_attrs: 0x%016llx\n"
+		 "tdx_cap: supported_xfam 0x%016llx\n",
+		 tdx_cap->supported_attrs, tdx_cap->supported_xfam);
+
+	for (i = 0; i < tdx_cap->cpuid.nent; i++) {
+		const struct kvm_cpuid_entry2 *config = &tdx_cap->cpuid.entries[i];
+
+		pr_debug("cpuid config[%d]: leaf 0x%x sub_leaf 0x%x eax 0x%08x ebx 0x%08x ecx 0x%08x edx 0x%08x\n",
+			 i, config->function, config->index,
+			 config->eax, config->ebx, config->ecx, config->edx);
+	}
+
+	return tdx_cap;
+}
+
+static struct kvm_cpuid_entry2 *tdx_find_cpuid_config(struct kvm_tdx_capabilities *cap,
+						      u32 leaf, u32 sub_leaf)
+{
+	struct kvm_cpuid_entry2 *config;
+	u32 i;
+
+	for (i = 0; i < cap->cpuid.nent; i++) {
+		config = &cap->cpuid.entries[i];
+
+		if (config->function == leaf && config->index == sub_leaf)
+			return config;
+	}
+
+	return NULL;
+}
+
+/*
+ * Filter CPUID based on TDX supported capabilities
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   cpuid_data - CPUID fields to filter
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * For each CPUID leaf, filter out non-supported bits based on the capabilities reported
+ * by the TDX module
+ */
+static void tdx_filter_cpuid(struct kvm_vm *vm,
+			     struct kvm_cpuid2 *cpuid_data)
+{
+	struct kvm_tdx_capabilities *tdx_cap;
+	struct kvm_cpuid_entry2 *config;
+	struct kvm_cpuid_entry2 *e;
+	int i;
+
+	tdx_cap = tdx_read_capabilities(vm);
+
+	i = 0;
+	while (i < cpuid_data->nent) {
+		e = cpuid_data->entries + i;
+		config = tdx_find_cpuid_config(tdx_cap, e->function, e->index);
+
+		if (!config) {
+			int left = cpuid_data->nent - i - 1;
+
+			if (left > 0)
+				memmove(cpuid_data->entries + i,
+					cpuid_data->entries + i + 1,
+					sizeof(*cpuid_data->entries) * left);
+			cpuid_data->nent--;
+			continue;
+		}
+
+		e->eax &= config->eax;
+		e->ebx &= config->ebx;
+		e->ecx &= config->ecx;
+		e->edx &= config->edx;
+
+		i++;
+	}
+
+	free(tdx_cap);
+}
+
+void tdx_init_vm(struct kvm_vm *vm, u64 attributes)
+{
+	struct kvm_tdx_init_vm *init_vm;
+	const struct kvm_cpuid2 *tmp;
+	struct kvm_cpuid2 *cpuid;
+
+	tmp = kvm_get_supported_cpuid();
+
+	cpuid = allocate_kvm_cpuid2(tmp->nent);
+	memcpy(cpuid, tmp, kvm_cpuid2_size(tmp->nent));
+	tdx_filter_cpuid(vm, cpuid);
+
+	init_vm = calloc(1, sizeof(*init_vm) +
+			 sizeof(init_vm->cpuid.entries[0]) * cpuid->nent);
+	TEST_ASSERT(init_vm, "init_vm allocation failed");
+
+	memcpy(&init_vm->cpuid, cpuid, kvm_cpuid2_size(cpuid->nent));
+	free(cpuid);
+
+	init_vm->attributes = attributes;
+
+	tdx_vm_ioctl(vm, KVM_TDX_INIT_VM, 0, init_vm);
+
+	free(init_vm);
+}

-- 
2.54.0.746.g67dd491aae-goog


^ permalink raw reply related

* [PATCH  v13 02/22] KVM: selftests: Update kvm_init_vm_address_properties() for TDX
From: Lisa Wang @ 2026-05-21 23:16 UTC (permalink / raw)
  To: Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao, Chenyi Qiang,
	Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
	Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
	Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
	Sagi Shahar, Sean Christopherson, Shuah Khan, Oliver Upton
  Cc: Jeremiah McReynolds, kvm, linux-coco, linux-kernel, x86,
	Lisa Wang, Adrian Hunter
In-Reply-To: <20260521-tdx-selftests-v13-v13-0-6983ae4c3a4d@google.com>

From: Isaku Yamahata <isaku.yamahata@intel.com>

Initialize the TDX S-bit and the GPA tag mask in
kvm_init_vm_address_properties() for TDX VMs, similar to how the C-bit
is initialized for SEV VMs.

The TDX S-bit is used to distinguish between shared and private guest
physical addresses. Its position is determined by the guest physical
address width, which is either 48 or 52 bits for current TDX
implementations.

Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
Co-developed-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Co-developed-by: Sagi Shahar <sagis@google.com>
Signed-off-by: Sagi Shahar <sagis@google.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Lisa Wang <wyihan@google.com>
---
 tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h | 14 ++++++++++++++
 tools/testing/selftests/kvm/lib/x86/processor.c        | 12 ++++++++++--
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h b/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h
new file mode 100644
index 000000000000..f647e6ca6b34
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86/tdx/tdx_util.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef SELFTESTS_TDX_TDX_UTIL_H
+#define SELFTESTS_TDX_TDX_UTIL_H
+
+#include <stdbool.h>
+
+#include "kvm_util.h"
+
+static inline bool is_tdx_vm(struct kvm_vm *vm)
+{
+	return vm->type == KVM_X86_TDX_VM;
+}
+
+#endif /* SELFTESTS_TDX_TDX_UTIL_H */
diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c
index b51467d70f6e..b68ad1dc7e02 100644
--- a/tools/testing/selftests/kvm/lib/x86/processor.c
+++ b/tools/testing/selftests/kvm/lib/x86/processor.c
@@ -11,6 +11,7 @@
 #include "smm.h"
 #include "svm_util.h"
 #include "sev.h"
+#include "tdx/tdx_util.h"
 #include "vmx.h"
 
 #ifndef NUM_INTERRUPTS
@@ -1311,12 +1312,19 @@ void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
 
 void kvm_init_vm_address_properties(struct kvm_vm *vm)
 {
+	u32 gpa_bits = kvm_cpu_property(X86_PROPERTY_GUEST_MAX_PHY_ADDR);
+
+	vm->arch.sev_fd = -1;
+
 	if (is_sev_vm(vm)) {
 		vm->arch.sev_fd = open_sev_dev_path_or_exit();
 		vm->arch.c_bit = BIT_ULL(this_cpu_property(X86_PROPERTY_SEV_C_BIT));
 		vm->gpa_tag_mask = vm->arch.c_bit;
-	} else {
-		vm->arch.sev_fd = -1;
+	} else if (is_tdx_vm(vm)) {
+		TEST_ASSERT(gpa_bits == 48 || gpa_bits == 52,
+			    "TDX: bad X86_PROPERTY_GUEST_MAX_PHY_ADDR value: %u", gpa_bits);
+		vm->arch.s_bit = BIT_ULL(gpa_bits - 1);
+		vm->gpa_tag_mask = vm->arch.s_bit;
 	}
 }
 

-- 
2.54.0.746.g67dd491aae-goog


^ permalink raw reply related

* [PATCH  v13 01/22] KVM: selftests: Add macros to simplify creating VM shapes for non-default types
From: Lisa Wang @ 2026-05-21 23:16 UTC (permalink / raw)
  To: Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao, Chenyi Qiang,
	Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
	Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
	Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
	Sagi Shahar, Sean Christopherson, Shuah Khan, Oliver Upton
  Cc: Jeremiah McReynolds, kvm, linux-coco, linux-kernel, x86,
	Lisa Wang
In-Reply-To: <20260521-tdx-selftests-v13-v13-0-6983ae4c3a4d@google.com>

From: Sean Christopherson <seanjc@google.com>

Add VM_TYPE() and __VM_TYPE() macros to create a vm_shape structure given
a type (and mode), and use the macros to define VM_SHAPE_{SEV,SEV_ES,SNP}
shapes for x86's SEV family of VM shapes.  Providing common infrastructure
will avoid having to copy+paste vm_sev_create_with_one_vcpu() for TDX.

Use the new SEV+ shapes and drop vm_sev_create_with_one_vcpu().

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sagi Shahar <sagis@google.com>
Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Lisa Wang <wyihan@google.com>
---
 tools/testing/selftests/kvm/include/kvm_util.h     | 13 +++++++
 .../testing/selftests/kvm/include/x86/processor.h  |  4 +++
 tools/testing/selftests/kvm/include/x86/sev.h      |  2 --
 tools/testing/selftests/kvm/lib/x86/sev.c          | 16 ---------
 tools/testing/selftests/kvm/x86/sev_smoke_test.c   | 40 +++++++++++-----------
 5 files changed, 37 insertions(+), 38 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index dc70c6da63fa..041bdbfb93f7 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -233,6 +233,19 @@ kvm_static_assert(sizeof(struct vm_shape) == sizeof(u64));
 	shape;					\
 })
 
+#define __VM_TYPE(__mode, __type)		\
+({						\
+	struct vm_shape shape = {		\
+		.mode = (__mode),		\
+		.type = (__type)		\
+	};					\
+						\
+	shape;					\
+})
+
+#define VM_TYPE(__type)				\
+	__VM_TYPE(VM_MODE_DEFAULT, __type)
+
 extern enum vm_guest_mode vm_mode_default;
 
 #if defined(__aarch64__)
diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h
index 77f576ee7789..0aa6eecfcbde 100644
--- a/tools/testing/selftests/kvm/include/x86/processor.h
+++ b/tools/testing/selftests/kvm/include/x86/processor.h
@@ -365,6 +365,10 @@ static inline unsigned int x86_model(unsigned int eax)
 	return ((eax >> 12) & 0xf0) | ((eax >> 4) & 0x0f);
 }
 
+#define VM_SHAPE_SEV		VM_TYPE(KVM_X86_SEV_VM)
+#define VM_SHAPE_SEV_ES		VM_TYPE(KVM_X86_SEV_ES_VM)
+#define VM_SHAPE_SNP		VM_TYPE(KVM_X86_SNP_VM)
+
 #define PHYSICAL_PAGE_MASK      GENMASK_ULL(51, 12)
 
 #define PAGE_SHIFT		12
diff --git a/tools/testing/selftests/kvm/include/x86/sev.h b/tools/testing/selftests/kvm/include/x86/sev.h
index 1af44c151d60..944c59dbe510 100644
--- a/tools/testing/selftests/kvm/include/x86/sev.h
+++ b/tools/testing/selftests/kvm/include/x86/sev.h
@@ -53,8 +53,6 @@ void snp_vm_launch_start(struct kvm_vm *vm, u64 policy);
 void snp_vm_launch_update(struct kvm_vm *vm);
 void snp_vm_launch_finish(struct kvm_vm *vm);
 
-struct kvm_vm *vm_sev_create_with_one_vcpu(u32 type, void *guest_code,
-					   struct kvm_vcpu **cpu);
 void vm_sev_launch(struct kvm_vm *vm, u64 policy, u8 *measurement);
 
 kvm_static_assert(SEV_RET_SUCCESS == 0);
diff --git a/tools/testing/selftests/kvm/lib/x86/sev.c b/tools/testing/selftests/kvm/lib/x86/sev.c
index 93f916903461..95d8520eea34 100644
--- a/tools/testing/selftests/kvm/lib/x86/sev.c
+++ b/tools/testing/selftests/kvm/lib/x86/sev.c
@@ -158,22 +158,6 @@ void snp_vm_launch_finish(struct kvm_vm *vm)
 	vm_sev_ioctl(vm, KVM_SEV_SNP_LAUNCH_FINISH, &launch_finish);
 }
 
-struct kvm_vm *vm_sev_create_with_one_vcpu(u32 type, void *guest_code,
-					   struct kvm_vcpu **cpu)
-{
-	struct vm_shape shape = {
-		.mode = VM_MODE_DEFAULT,
-		.type = type,
-	};
-	struct kvm_vm *vm;
-	struct kvm_vcpu *cpus[1];
-
-	vm = __vm_create_with_vcpus(shape, 1, 0, guest_code, cpus);
-	*cpu = cpus[0];
-
-	return vm;
-}
-
 void vm_sev_launch(struct kvm_vm *vm, u64 policy, u8 *measurement)
 {
 	if (is_sev_snp_vm(vm)) {
diff --git a/tools/testing/selftests/kvm/x86/sev_smoke_test.c b/tools/testing/selftests/kvm/x86/sev_smoke_test.c
index 1a49ee391586..fe2c438882ae 100644
--- a/tools/testing/selftests/kvm/x86/sev_smoke_test.c
+++ b/tools/testing/selftests/kvm/x86/sev_smoke_test.c
@@ -104,7 +104,7 @@ static void compare_xsave(u8 *from_host, u8 *from_guest)
 		abort();
 }
 
-static void test_sync_vmsa(u32 type, u64 policy)
+static void test_sync_vmsa(struct vm_shape shape, u64 policy)
 {
 	struct kvm_vcpu *vcpu;
 	struct kvm_vm *vm;
@@ -114,7 +114,7 @@ static void test_sync_vmsa(u32 type, u64 policy)
 	double x87val = M_PI;
 	struct kvm_xsave __attribute__((aligned(64))) xsave = { 0 };
 
-	vm = vm_sev_create_with_one_vcpu(type, guest_code_xsave, &vcpu);
+	vm = vm_create_shape_with_one_vcpu(shape, &vcpu, guest_code_xsave);
 	gva = vm_alloc_shared(vm, PAGE_SIZE, KVM_UTIL_MIN_VADDR,
 			      MEM_REGION_TEST_DATA);
 	hva = addr_gva2hva(vm, gva);
@@ -150,13 +150,13 @@ static void test_sync_vmsa(u32 type, u64 policy)
 	kvm_vm_free(vm);
 }
 
-static void test_sev(void *guest_code, u32 type, u64 policy)
+static void test_sev(void *guest_code, struct vm_shape shape, u64 policy)
 {
 	struct kvm_vcpu *vcpu;
 	struct kvm_vm *vm;
 	struct ucall uc;
 
-	vm = vm_sev_create_with_one_vcpu(type, guest_code, &vcpu);
+	vm = vm_create_shape_with_one_vcpu(shape, &vcpu, guest_code);
 
 	/* TODO: Validate the measurement is as expected. */
 	vm_sev_launch(vm, policy, NULL);
@@ -201,12 +201,12 @@ static void guest_shutdown_code(void)
 	__asm__ __volatile__("ud2");
 }
 
-static void test_sev_shutdown(u32 type, u64 policy)
+static void test_sev_shutdown(struct vm_shape shape, u64 policy)
 {
 	struct kvm_vcpu *vcpu;
 	struct kvm_vm *vm;
 
-	vm = vm_sev_create_with_one_vcpu(type, guest_shutdown_code, &vcpu);
+	vm = vm_create_shape_with_one_vcpu(shape, &vcpu, guest_shutdown_code);
 
 	vm_sev_launch(vm, policy, NULL);
 
@@ -218,28 +218,28 @@ static void test_sev_shutdown(u32 type, u64 policy)
 	kvm_vm_free(vm);
 }
 
-static void test_sev_smoke(void *guest, u32 type, u64 policy)
+static void test_sev_smoke(void *guest, struct vm_shape shape, u64 policy)
 {
 	const u64 xf_mask = XFEATURE_MASK_X87_AVX;
 
-	if (type == KVM_X86_SNP_VM)
-		test_sev(guest, type, policy | SNP_POLICY_DBG);
+	if (shape.type == KVM_X86_SNP_VM)
+		test_sev(guest, shape, policy | SNP_POLICY_DBG);
 	else
-		test_sev(guest, type, policy | SEV_POLICY_NO_DBG);
-	test_sev(guest, type, policy);
+		test_sev(guest, shape, policy | SEV_POLICY_NO_DBG);
+	test_sev(guest, shape, policy);
 
-	if (type == KVM_X86_SEV_VM)
+	if (shape.type == KVM_X86_SEV_VM)
 		return;
 
-	test_sev_shutdown(type, policy);
+	test_sev_shutdown(shape, policy);
 
 	if (kvm_has_cap(KVM_CAP_XCRS) &&
 	    (xgetbv(0) & kvm_cpu_supported_xcr0() & xf_mask) == xf_mask) {
-		test_sync_vmsa(type, policy);
-		if (type == KVM_X86_SNP_VM)
-			test_sync_vmsa(type, policy | SNP_POLICY_DBG);
+		test_sync_vmsa(shape, policy);
+		if (shape.type == KVM_X86_SNP_VM)
+			test_sync_vmsa(shape, policy | SNP_POLICY_DBG);
 		else
-			test_sync_vmsa(type, policy | SEV_POLICY_NO_DBG);
+			test_sync_vmsa(shape, policy | SEV_POLICY_NO_DBG);
 	}
 }
 
@@ -247,13 +247,13 @@ int main(int argc, char *argv[])
 {
 	TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SEV));
 
-	test_sev_smoke(guest_sev_code, KVM_X86_SEV_VM, 0);
+	test_sev_smoke(guest_sev_code, VM_SHAPE_SEV, 0);
 
 	if (kvm_cpu_has(X86_FEATURE_SEV_ES))
-		test_sev_smoke(guest_sev_es_code, KVM_X86_SEV_ES_VM, SEV_POLICY_ES);
+		test_sev_smoke(guest_sev_es_code, VM_SHAPE_SEV_ES, SEV_POLICY_ES);
 
 	if (kvm_cpu_has(X86_FEATURE_SEV_SNP))
-		test_sev_smoke(guest_snp_code, KVM_X86_SNP_VM, snp_default_policy());
+		test_sev_smoke(guest_snp_code, VM_SHAPE_SNP, snp_default_policy());
 
 	return 0;
 }

-- 
2.54.0.746.g67dd491aae-goog


^ permalink raw reply related

* [PATCH v13 00/22] TDX KVM selftests
From: Lisa Wang @ 2026-05-21 23:16 UTC (permalink / raw)
  To: Andrew Jones, Ackerley Tng, Binbin Wu, Chao Gao, Chenyi Qiang,
	Dave Hansen, Erdem Aktas, Ira Weiny, Isaku Yamahata,
	Kiryl Shutsemau, linux-kselftest, Paolo Bonzini, Pratik R. Sampat,
	Reinette Chatre, Rick Edgecombe, Roger Wang, Ryan Afranji,
	Sagi Shahar, Sean Christopherson, Shuah Khan, Oliver Upton
  Cc: Jeremiah McReynolds, kvm, linux-coco, linux-kernel, x86,
	Lisa Wang, Adrian Hunter

This patch series focuses on setting up a TDX VM and adding all code
necessary to run a basic lifecycle test.

Standard KVM selftests can set up the VM through guest registers, but the
TDX module protects TDs' register state from the host. This feature of
TDX causes problems for VM boot state initialization and the ucall
implementation.

In standard KVM selftests, the host directly initializes the guest state
by manipulating Special Registers (SREGs) and General Purpose Registers
(GPRs) via IOCTLs (KVM_SET_SREGS, etc.) before the first KVM_RUN.

To bypass direct register initialization by the host, we utilize the
standard x86 reset vector as the default entry point.

The mechanism works as follows:
1. The host places register values into a specific memory region and
   inserts boot code at the VM's default starting point.
2. When the VM starts, it executes this boot code to "pull" values from
   memory and manually set up its own SREGs and GPRs.
3. Once the environment is ready, the boot code jumps to the guest code.

The standard x86 ucall() implementation uses PIO, but it does not
actually transmit data through the 4-byte PIO data. Instead, it relies
on the host reading the ucall address directly from the guest's RDI
register.

TDX selftests cannot utilize the standard x86 ucall implementation,
because the host is unable to access the guest's RDI register. Based on
this restriction, we considered these potential solutions for the TDX
ucall implementation.

1. TDCALL PIO with RCX-bits Passthrough
We first considered passing the RDI value through RCX bits to bypass the
hardware's register protection, which could be the closest approach to
the non-TDX implementation as per Sean's suggestion[1]. However, this
approach is blocked by the software-side implementation: KVM_GET_REGS
currently does not support TDX VMs and returns -EINVAL. To make this
work, the KVM ioctl would need a test-only hack.

2. TDCALL PIO with buffer indexing
To keep a PIO-based approach and unify the get_ucall implementation for
both TDX and non-TDX VMs, we considered TDCALL PIO with buffer indexing.
Since the ucall buffer is initialized prior to execution, the VM could
just pass a buffer index rather than an 8-byte ucall address to fit
within the 4-byte PIO data limit. The host, already knowing the ucall
buffer's base address, could then resolve the ucall content via this
index. We abandoned this solution because it would require changes to
the common ucall structure and impact other non-x86 architectures.

3. TDCALL MMIO (Selected solution)
We ultimately selected TDCALL with 8-byte MMIO data. This method only
requires initializing an MMIO GPA and adding a TDCALL MMIO implementation
for TDX under the original x86 ucall path. While this diverges from the
non-TDX PIO approach, it provides the cleanest implementation with minimal
disruption to the overall ucall architecture.

4. A note on #VE and x86 ucall simplification
It is worth noting that the use of a Virtualization Exception (#VE)
is orthogonal to the PIO vs. MMIO discussion; rather, it is a question
of how much we want to simplify the x86 ucall implementation. A #VE
handler is one option to allow VMs to use PIO/MMIO exactly as in the
non-TDX case. Alternatively, having an MMIO_WRITE wrapper macro, as Sean
suggested[2], is another option. Either way, discussion for this is
likely a premature optimization right now, since the PIO/MMIO call is
only used under ucall_arch_do_ucall(), and standard and TDX VMs use
different ones now. We should optimize this in the future, but for now,
invoking TDCALL directly is more robust and concise.

v13 revision for TDX KVM selftests based on kvm/next and guest_memfd:
In-place conversion support[3]. For ease of testing, this series is also
available at: https://github.com/googleprodkernel/linux-cc/commits/tdx-selftests-v13

Changes from v12[4]:
1. Fixed some bugs, including typos, commit order and commit messages.
2. Inlined the TDCALL to tdx.c file.
3. Refactored the Makefile to use pattern rules for generic source
   compilation while ensuring build artifacts are directed to the target
   output directory.

Series is organized by:
1. Patches 1 - 4: Initialize the TDX VM
2. Patches 5 - 8: Add the TDX boot code
3. Patches 9 - 13: Set up the boot region
4. Patches 14 - 17: Set up the vCPU
5. Patches 18 - 19: Finalize the TDX VM
6. Patches 20 - 22: Implement the ucall and run the TDX test

[1]: https://lore.kernel.org/kvm/aQTcDH9LRezI30dm@google.com/
[2]: https://lore.kernel.org/kvm/aQTSdk3JtFu1qOMj@google.com/
[3]: https://lore.kernel.org/all/20260507-gmem-inplace-conversion-v6-0-91ab5a8b19a4@google.com/T/
[4]: https://lore.kernel.org/kvm/20251028212052.200523-1-sagis@google.com/

Signed-off-by: Lisa Wang <wyihan@google.com>
---
Ackerley Tng (2):
      KVM: selftests: Add helpers to init TDX memory and finalize VM
      KVM: selftests: Add ucall support for TDX

Erdem Aktas (2):
      KVM: selftests: Add TDX boot code
      KVM: selftests: Implement MMIO WRITE for the TDX VM

Isaku Yamahata (2):
      KVM: selftests: Update kvm_init_vm_address_properties() for TDX
      KVM: selftests: TDX: Use KVM_TDX_CAPABILITIES to validate TDs' attribute configuration

Lisa Wang (2):
      KVM: selftests: Back the first memory region with guest_memfd for TDX
      KVM: selftests: Set first memory region as shared if guest_memfd

Sagi Shahar (13):
      KVM: selftests: Initialize the TDX VM
      KVM: selftests: Expose segment definitions to assembly files
      tools: include: Add kbuild.h for assembly structure offsets
      KVM: selftests: Introduce structures for TDX guest boot parameters
      KVM: selftests: Expose functions to get default sregs values
      KVM: selftests: Set up TDX boot code region
      KVM: selftests: Set up TDX boot parameters region
      KVM: selftests: Expose function to allocate vCPU stack
      KVM: selftests: Call KVM_TDX_INIT_VCPU when creating a new TDX vcpu
      KVM: selftests: Load per-vCPU guest stack in TDX boot parameters
      KVM: selftests: Set entry point for TDX guest code
      KVM: selftests: Finalize TD memory as part of kvm_arch_vm_finalize_vcpus
      KVM: selftests: Add TDX lifecycle test

Sean Christopherson (1):
      KVM: selftests: Add macros to simplify creating VM shapes for non-default types

 tools/include/linux/kbuild.h                       |  11 +
 tools/testing/selftests/kvm/.gitignore             |   3 +-
 tools/testing/selftests/kvm/Makefile.kvm           |  33 +-
 tools/testing/selftests/kvm/include/kvm_util.h     |  13 +
 .../testing/selftests/kvm/include/x86/processor.h  |  40 +++
 .../selftests/kvm/include/x86/processor_asm.h      |  12 +
 tools/testing/selftests/kvm/include/x86/sev.h      |   2 -
 .../selftests/kvm/include/x86/tdx/td_boot.h        |  74 +++++
 .../selftests/kvm/include/x86/tdx/td_boot_asm.h    |  16 +
 tools/testing/selftests/kvm/include/x86/tdx/tdx.h  |  16 +
 .../selftests/kvm/include/x86/tdx/tdx_util.h       |  80 +++++
 tools/testing/selftests/kvm/include/x86/ucall.h    |   6 -
 tools/testing/selftests/kvm/lib/kvm_util.c         |  18 +-
 tools/testing/selftests/kvm/lib/x86/processor.c    | 107 ++++---
 tools/testing/selftests/kvm/lib/x86/sev.c          |  16 -
 tools/testing/selftests/kvm/lib/x86/tdx/td_boot.S  |  60 ++++
 .../selftests/kvm/lib/x86/tdx/td_boot_offsets.c    |  21 ++
 tools/testing/selftests/kvm/lib/x86/tdx/tdx.c      |  30 ++
 tools/testing/selftests/kvm/lib/x86/tdx/tdx_util.c | 334 +++++++++++++++++++++
 tools/testing/selftests/kvm/lib/x86/ucall.c        |  30 ++
 tools/testing/selftests/kvm/x86/sev_smoke_test.c   |  40 +--
 tools/testing/selftests/kvm/x86/tdx_vm_test.c      |  33 ++
 22 files changed, 907 insertions(+), 88 deletions(-)
---
base-commit: cd1b71113e3f70f0a1a3d61550cf89f1eed379c4
change-id: 20260508-tdx-selftests-v13-bf00ad0cb8fe

Best regards,
-- 
Lisa Wang <wyihan@google.com>

^ permalink raw reply

* Re: [PATCH v14 07/44] arm64: RMI: Configure the RMM with the host's page size
From: Suzuki K Poulose @ 2026-05-21 22:36 UTC (permalink / raw)
  To: Gavin Shan, Steven Price, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Zenghui Yu, linux-arm-kernel, linux-kernel,
	Joey Gouly, Alexandru Elisei, Christoffer Dall, Fuad Tabba,
	linux-coco, Ganapatrao Kulkarni, Shanker Donthineni, Alper Gun,
	Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve, WeiLin.Chang,
	Lorenzo.Pieralisi2
In-Reply-To: <45a953a8-6edc-45c3-b5bd-17f14397ab89@redhat.com>

On 21/05/2026 01:51, Gavin Shan wrote:
> Hi Steven,
> 
> On 5/13/26 11:17 PM, Steven Price wrote:
>> RMM v2.0 brings the ability to set the RMM's granule size. Check the
>> feature registers and configure the RMM so that it matches the host's
>> page size. This means that operations can be done with a granularity
>> equal to PAGE_SIZE.
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> Changes since v13:
>>   * Moved out of KVM.
>> ---
>>   arch/arm64/kernel/rmi.c | 42 +++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 42 insertions(+)
>>
>> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
>> index 99c1ccc35c11..a14ead5dedda 100644
>> --- a/arch/arm64/kernel/rmi.c
>> +++ b/arch/arm64/kernel/rmi.c
>> @@ -49,6 +49,45 @@ static int rmi_check_version(void)
>>       return 0;
>>   }
>> +static int rmi_configure(void)
>> +{
>> +    struct rmm_config *config __free(free_page) = NULL;
>> +    unsigned long ret;
>> +
>> +    config = (struct rmm_config *)get_zeroed_page(GFP_KERNEL);
>> +    if (!config)
>> +        return -ENOMEM;
>> +
>> +    switch (PAGE_SIZE) {
>> +    case SZ_4K:
>> +        config->rmi_granule_size = RMI_GRANULE_SIZE_4KB;
>> +        break;
>> +    case SZ_16K:
>> +        config->rmi_granule_size = RMI_GRANULE_SIZE_16KB;
>> +        break;
>> +    case SZ_64K:
>> +        config->rmi_granule_size = RMI_GRANULE_SIZE_64KB;
>> +        break;
>> +    default:
>> +        pr_err("Unsupported PAGE_SIZE for RMM\n");
>> +        return -EINVAL;
>> +    }
>> +
>> +    ret = rmi_rmm_config_set(virt_to_phys(config));
>> +    if (ret) {
>> +        pr_err("RMM config set failed\n");
>> +        return -EINVAL;
>> +    }
>> +
> 
> Looking at branch 'topics/rmm-v2.0-poc_2' of RMM implementation, the 
> granule size
> is fixed to be 4KB at present. I'm not sure if I have looked into 
> correct RMM
> implementation, but 'topics/rmm-v2.0-poc_2' is recommended one in the cover
> letter.
> 

You are right. The tf-RMM only supports 4KB. The policy at the KVM host
is to set the Linux PAGE_SIZE for the GRANULE_SIZE (at least for now).
If the RMM doesn't support the PAGE_SIZE, we don't support the RMM.


> Besides, there has checks in the handler of the RMI command to make sure 
> that
> struct rmm_config::tracking_region_size to be 1GB, indicated by zero. It 
> maybe
> worthy to set it before call to rmi_rmm_config_set().
> 
>      config.tracking_region_size = 0; /* 1GB */

Thanks, this explicit initialisation is missing, though in effect the
value is 0'd. Also, we can't really say 1GB here, because the driver
should work for an RMM capable of 64K. So, instead, maybe we could:

	/* See the definition of RMM_GRANULE_TRACKING_SIZE */
	config.tracking_region_size = 0;

Suzuki


>      ret = rmi_rmm_config_set(virt_to_phys(config));
> 
> 
>> +    ret = rmi_rmm_activate();
>> +    if (ret) {
>> +        pr_err("RMM activate failed\n");
>> +        return -ENXIO;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>   static int __init arm64_init_rmi(void)
>>   {
>>       /* Continue without realm support if we can't agree on a version */
>> @@ -60,6 +99,9 @@ static int __init arm64_init_rmi(void)
>>       if (WARN_ON(rmi_features(1, &rmm_feat_reg1)))
>>           return 0;
>> +    if (rmi_configure())
>> +        return 0;
>> +
>>       return 0;
>>   }
>>   subsys_initcall(arm64_init_rmi);
> 
> Thanks,
> Gavin
> 


^ permalink raw reply

* Re: [PATCH v3 02/41] x86/tsc: Add helper to register CPU and TSC freq calibration routines
From: David Woodhouse @ 2026-05-21 21:37 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Kiryl Shutsemau, Paolo Bonzini, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Ajay Kaher, Alexey Makhalov,
	Jan Kiszka, Dave Hansen, Andy Lutomirski, Peter Zijlstra,
	Juergen Gross, Daniel Lezcano, Thomas Gleixner, John Stultz,
	Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
	linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
	Nikunj A Dadhania, Thomas Gleixner
In-Reply-To: <ag92Ze_FADmL1llo@google.com>

[-- Attachment #1: Type: text/plain, Size: 1648 bytes --]

On Thu, 2026-05-21 at 14:17 -0700, Sean Christopherson wrote:
>  
> > That seems reasonable. Where does the call to
> > native_calibrate_tsc()
> > happen; is that from determine_cpu_tsc_frequencies()? 
> 
> Yep.

Great, thanks.

> static bool __init determine_cpu_tsc_frequencies(bool early,
> 						 unsigned int
> known_cpu_khz,
> 						 unsigned int
> known_tsc_khz)
> {
> 	/* Make sure that cpu and tsc are not already calibrated */
> 	WARN_ON(cpu_khz || tsc_khz);
> 
> 	if (early) {
> 		/*
> 		 * Early CPU calibration can only use methods that
> are available
> 		 * early in boot (obviously).
> 		 */
> 		if (known_cpu_khz)
> 			cpu_khz = known_cpu_khz;
> 		else
> 			cpu_khz = native_calibrate_cpu_early();
> 		if (known_tsc_khz)
> 			tsc_khz = known_tsc_khz;
> 		else
> 			tsc_khz = native_calibrate_tsc();
> 	} else {
> 		cpu_khz = pit_hpet_ptimer_calibrate_cpu();
> 	}


If, after all that, we still end up in the case where we *do* have to
calibrate it against a legacy timer (which sadly IIRC is the case even
on some fairly modern AMD generations), could we round the answer?

We currently have *far* more precision than accuracy, leading to values
like 2399997kHz which change every boot (and end up being what gets
*advertised* to guests on such a host... and then unless we're careful
to avoid it, we end up trying to *scale* a different host's TSC down
from 2399998 to 2399997 for a guest which is migrated from the first
host...)

We should just fix them (e.g. to 2400000kHz) and let NTP sort them out.

Something like "round to the nearest MHz if that's within ±10PPM"?



[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5069 bytes --]

^ permalink raw reply

* Re: [PATCH v6 21/43] KVM: SEV: Make 'uaddr' parameter optional for KVM_SEV_SNP_LAUNCH_UPDATE
From: Ackerley Tng @ 2026-05-21 21:27 UTC (permalink / raw)
  To: Sean Christopherson, Fuad Tabba
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, willy, wyihan, yan.y.zhao, forkloop, pratyush,
	suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen, x86,
	H. Peter Anvin, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, Shuah Khan,
	Vishal Annapurve, Andrew Morton, Chris Li, Kairui Song,
	Kemeng Shi, Nhat Pham, Baoquan He, Barry Song, Axel Rasmussen,
	Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng, Shakeel Butt,
	Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka, kvm,
	linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <ag8G7Wq5PbEdKloG@google.com>

Sean Christopherson <seanjc@google.com> writes:

> On Thu, May 21, 2026, Fuad Tabba wrote:
>> Hi,
>>
>> On Thu, 7 May 2026 at 21:22, Ackerley Tng via B4 Relay
>> <devnull+ackerleytng.google.com@kernel.org> wrote:
>> >
>> > From: Michael Roth <michael.roth@amd.com>
>> >
>> > For vm_memory_attributes=1, in-place conversion/population is not
>> > supported, so the initial contents necessarily must need to come
>> > from a separate src address, which is enforced by the current
>> > implementation. However, for vm_memory_attributes=0, it is possible for
>> > guest memory to be initialized directly from userspace by mmap()'ing the
>> > guest_memfd and writing to it while the corresponding GPA ranges are in
>> > a 'shared' state before converting them to the 'private' state expected
>> > by KVM_SEV_SNP_LAUNCH_UPDATE.
>> >
>> > Update the handling/documentation for KVM_SEV_SNP_LAUNCH_UPDATE to allow
>> > for 'uaddr' to be set to NULL when vm_memory_attributes=0, which
>> > SNP_LAUNCH_UPDATE will then use to determine when it should/shouldn't
>> > copy in data from a separate memory location. Continue to enforce
>> > non-NULL for the original vm_memory_attributes=1 case.
>> >
>> > Signed-off-by: Michael Roth <michael.roth@amd.com>
>> > [Added src_page check in error handling path when the firmware command fails]
>> > [Dropped ifdef CONFIG_KVM_VM_MEMORY_ATTRIBUTES]
>> > Signed-off-by: Ackerley Tng <ackerleytng@google.com>
>>
>> I'm not very familiar with the SEV-SNP populate flows, but it looks
>> like Sashiko is on to something:
>> https://sashiko.dev/#/patchset/20260507-gmem-inplace-conversion-v6-0-91ab5a8b19a4%40google.com?part=21
>>
>> - a potential read-only page overwrite, because src_page is acquired
>> via get_user_pages_fast() without the FOLL_WRITE flag, but is then
>> overwritten via memcpy
>
> Oof, yeah, that's bad.  Adding FOLL_WRITE to kvm_gmem_populate() feels wrong, and
> could break uABI, but doing gup() in SNP code would reintroduce the AB-BA issue
> with filemap_invalidate_lock().
>
> Aha!  Not if we use get_user_page_fast_only().  Ugh, but then we'd have to plumb
> the userspace address into the post-populated callback.
>
> Hrm.  Given that no one has yelled about overwriting their CPUID page, and given
> that the CPUID page is likely dynamically created and thus is unlikely to be a
> read-only mapping (e.g. versus the initial image), maybe this?
>

Overwriting the CPUID page is by design, I think. IIUC if the SNP
firmware doesn't like something about the CPUID page, it can update
src_page and then return an error to userspace.

Userspace should then check if it agrees with the updated CPUID contents
and then retry if it agrees.

> diff --git arch/x86/kvm/svm/sev.c arch/x86/kvm/svm/sev.c
> index 37d4cfa5d980..c73c028d72c1 100644
> --- arch/x86/kvm/svm/sev.c
> +++ arch/x86/kvm/svm/sev.c
> @@ -2456,6 +2456,7 @@ static int snp_launch_update(struct kvm *kvm, struct kvm_sev_cmd *argp)
>         sev_populate_args.type = params.type;
>
>         count = kvm_gmem_populate(kvm, params.gfn_start, src, npages,
> +                                 params.type == KVM_SEV_SNP_PAGE_TYPE_CPUID,

I think this makes sense given that writing to src_page can only happen
when params.type == KVM_SEV_SNP_PAGE_TYPE_CPUID (this is explicitly one
of the guards in sev_gmem_post_populate()):

	/*
	 * If the firmware command failed handle the reclaim and cleanup of that
	 * PFN before reporting an error.
	 *
	 * Additionally, when invalid CPUID function entries are detected,
	 * firmware writes the expected values into the page and leaves it
	 * unencrypted so it can be used for debugging and error-reporting.
	 *
	 * Copy this page back into the source buffer so userspace can use this
	 * information to provide information on which CPUID leaves/fields
	 * failed CPUID validation.
	 */
	if (ret && !snp_page_reclaim(kvm, pfn) &&
	    sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
	    sev_populate_args->fw_error == SEV_RET_INVALID_PARAM && src_page) {
		void *src_vaddr = kmap_local_page(src_page);
		void *dst_vaddr = kmap_local_pfn(pfn);

		memcpy(src_vaddr, dst_vaddr, PAGE_SIZE);

		kunmap_local(src_vaddr);
		kunmap_local(dst_vaddr);
	}

>                                   sev_gmem_post_populate, &sev_populate_args);
>         if (count < 0) {
>                 argp->error = sev_populate_args.fw_error;
> diff --git arch/x86/kvm/vmx/tdx.c arch/x86/kvm/vmx/tdx.c
> index f97bcf580e6d..33f35be4455b 100644
> --- arch/x86/kvm/vmx/tdx.c
> +++ arch/x86/kvm/vmx/tdx.c
> @@ -3188,7 +3188,7 @@ static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *c
>                 };
>                 gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
>                                              u64_to_user_ptr(region.source_addr),
> -                                            1, tdx_gmem_post_populate, &arg);
> +                                            1, false, tdx_gmem_post_populate, &arg);

And TDX doesn't try to write src_page, so this is good too.

>                 if (gmem_ret < 0) {
>                         ret = gmem_ret;
>                         break;
> diff --git include/linux/kvm_host.h include/linux/kvm_host.h
> index 61a3430957f2..b83cda2870ba 100644
> --- include/linux/kvm_host.h
> +++ include/linux/kvm_host.h
> @@ -2596,7 +2596,8 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord
>  typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
>                                     struct page *page, void *opaque);
>
> -long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
> +long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src,
> +                      long npages, bool writable,

What do you think of need_writable_src instead of just writable for the
variable name?

>                        kvm_gmem_populate_cb post_populate, void *opaque);
>  #endif
>
> diff --git virt/kvm/guest_memfd.c virt/kvm/guest_memfd.c
> index a35a55571a2d..6553d4e032ce 100644
> --- virt/kvm/guest_memfd.c
> +++ virt/kvm/guest_memfd.c
> @@ -858,7 +858,8 @@ static long __kvm_gmem_populate(struct kvm *kvm, struct kvm_memory_slot *slot,
>         return ret;
>  }
>
> -long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
> +long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src,
> +                      long npages, bool writable,
>                        kvm_gmem_populate_cb post_populate, void *opaque)
>  {
>         struct kvm_memory_slot *slot;
> @@ -892,8 +893,9 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
>
>                 if (src) {
>                         unsigned long uaddr = (unsigned long)src + i * PAGE_SIZE;
> +                       unsigned int flags = writable ? FOLL_WRITE : 0;

How about using FOLL_WRITE | FOLL_NOFAULT so if it weren't writable to
start with, don't CoW, just error out?

Like you said above the CPUID page provided as src_page would have been
written to before, so it should have been mapped as writable.

>
> -                       ret = get_user_pages_fast(uaddr, 1, 0, &src_page);
> +                       ret = get_user_pages_fast(uaddr, 1, flags, &src_page);

If we stick with FOLL_WRITE, this also solves the case where a read-only
mapping or the global zero page is provided as src_page, since
get_user_pages_fast() will do a copy-on-write if those were the inputs,
making it writable before the write happens (on failure) in
sev_gmem_post_populate().

>                         if (ret < 0)
>                                 break;
>                         if (ret != 1) {
>
>> - an ordering violation with the kunmap_local() calls
>
> Yeesh, that's a new one for me.  Thankfully this is 64-bit only, so it's not an
> issue.
>
>> These predate this patch series and are just being touched by the
>> 'src_page' addition, but if Sashiko's right, these should probably be
>> fixed sooner rather than later.
>
> Yeah, ditto with the offset wrapping case.

^ permalink raw reply

* Re: [PATCH v3 02/41] x86/tsc: Add helper to register CPU and TSC freq calibration routines
From: Sean Christopherson @ 2026-05-21 21:17 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Kiryl Shutsemau, Paolo Bonzini, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Ajay Kaher, Alexey Makhalov,
	Jan Kiszka, Dave Hansen, Andy Lutomirski, Peter Zijlstra,
	Juergen Gross, Daniel Lezcano, Thomas Gleixner, John Stultz,
	Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
	linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
	Nikunj A Dadhania, Thomas Gleixner
In-Reply-To: <342098f6bfe1e4c7b233433df8f79713b4220614.camel@infradead.org>

On Thu, May 21, 2026, David Woodhouse wrote:
> On Thu, 2026-05-21 at 13:53 -0700, Sean Christopherson wrote:
> > 
> > E.g. this is what I've got for the early flow.  Testing now. 
> > 
> >   void __init tsc_early_init(void)
> >   {
> > 	unsigned int known_cpu_khz = 0, known_tsc_khz = 0;
> > 
> > 	if (!boot_cpu_has(X86_FEATURE_TSC))
> > 		return;
> > 	/* Don't change UV TSC multi-chassis synchronization */
> > 	if (is_early_uv_system())
> > 		return;
> > 
> > 	if (x86_init.hyper.get_cpu_khz)
> > 		known_cpu_khz = x86_init.hyper.get_cpu_khz();
> > 
> > 	if (tsc_early_khz)
> > 		known_tsc_khz = tsc_early_khz;
> > 	else if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
> > 		known_tsc_khz = snp_secure_tsc_init();
> > 	else if (boot_cpu_has(X86_FEATURE_TDX_GUEST))
> > 		known_tsc_khz = tdx_tsc_init();
> > 
> > 	/*
> > 	 * If the TSC frequency is still unknown, i.e. not provided by the user
> > 	 * or by trusted firmware, try to get it from the hypervisor (which is
> > 	 * untrusted when running as a CoCo guest).
> > 	 */
> > 	if (!known_tsc_khz && x86_init.hyper.get_tsc_khz)
> > 		known_tsc_khz = x86_init.hyper.get_tsc_khz();
> > 
> > 	if (known_tsc_khz)
> > 		setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
> > 
> > 	if (!determine_cpu_tsc_frequencies(true, known_cpu_khz, known_tsc_khz))
> > 		return;
> > 	tsc_enable_sched_clock();
> >   }
> 
> That seems reasonable. Where does the call to native_calibrate_tsc()
> happen; is that from determine_cpu_tsc_frequencies()? 

Yep.

static bool __init determine_cpu_tsc_frequencies(bool early,
						 unsigned int known_cpu_khz,
						 unsigned int known_tsc_khz)
{
	/* Make sure that cpu and tsc are not already calibrated */
	WARN_ON(cpu_khz || tsc_khz);

	if (early) {
		/*
		 * Early CPU calibration can only use methods that are available
		 * early in boot (obviously).
		 */
		if (known_cpu_khz)
			cpu_khz = known_cpu_khz;
		else
			cpu_khz = native_calibrate_cpu_early();
		if (known_tsc_khz)
			tsc_khz = known_tsc_khz;
		else
			tsc_khz = native_calibrate_tsc();
	} else {
		cpu_khz = pit_hpet_ptimer_calibrate_cpu();
	}

	...

^ permalink raw reply

* Re: [PATCH v3 02/41] x86/tsc: Add helper to register CPU and TSC freq calibration routines
From: David Woodhouse @ 2026-05-21 21:01 UTC (permalink / raw)
  To: Sean Christopherson
  Cc: Kiryl Shutsemau, Paolo Bonzini, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Ajay Kaher, Alexey Makhalov,
	Jan Kiszka, Dave Hansen, Andy Lutomirski, Peter Zijlstra,
	Juergen Gross, Daniel Lezcano, Thomas Gleixner, John Stultz,
	Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
	linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
	Nikunj A Dadhania, Thomas Gleixner
In-Reply-To: <ag9wz3RiJOtVZrK0@google.com>

[-- Attachment #1: Type: text/plain, Size: 1407 bytes --]

On Thu, 2026-05-21 at 13:53 -0700, Sean Christopherson wrote:
> 
> E.g. this is what I've got for the early flow.  Testing now. 
> 
>   void __init tsc_early_init(void)
>   {
> 	unsigned int known_cpu_khz = 0, known_tsc_khz = 0;
> 
> 	if (!boot_cpu_has(X86_FEATURE_TSC))
> 		return;
> 	/* Don't change UV TSC multi-chassis synchronization */
> 	if (is_early_uv_system())
> 		return;
> 
> 	if (x86_init.hyper.get_cpu_khz)
> 		known_cpu_khz = x86_init.hyper.get_cpu_khz();
> 
> 	if (tsc_early_khz)
> 		known_tsc_khz = tsc_early_khz;
> 	else if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
> 		known_tsc_khz = snp_secure_tsc_init();
> 	else if (boot_cpu_has(X86_FEATURE_TDX_GUEST))
> 		known_tsc_khz = tdx_tsc_init();
> 
> 	/*
> 	 * If the TSC frequency is still unknown, i.e. not provided by the user
> 	 * or by trusted firmware, try to get it from the hypervisor (which is
> 	 * untrusted when running as a CoCo guest).
> 	 */
> 	if (!known_tsc_khz && x86_init.hyper.get_tsc_khz)
> 		known_tsc_khz = x86_init.hyper.get_tsc_khz();
> 
> 	if (known_tsc_khz)
> 		setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
> 
> 	if (!determine_cpu_tsc_frequencies(true, known_cpu_khz, known_tsc_khz))
> 		return;
> 	tsc_enable_sched_clock();
>   }

That seems reasonable. Where does the call to native_calibrate_tsc()
happen; is that from determine_cpu_tsc_frequencies()? 

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5069 bytes --]

^ permalink raw reply

* Re: [PATCH v3 37/41] x86/kvmclock: Use TSC for sched_clock if it's constant and non-stop
From: Sean Christopherson @ 2026-05-21 21:01 UTC (permalink / raw)
  To: Dongli Zhang
  Cc: kvm, Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, x86, linux-coco, linux-hyperv, virtualization,
	linux-kernel, xen-devel, Kiryl Shutsemau, Paolo Bonzini,
	K. Y. Srinivasan, Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li,
	Ajay Kaher, Alexey Makhalov, Jan Kiszka, Dave Hansen,
	Andy Lutomirski, Peter Zijlstra, Juergen Gross, Daniel Lezcano,
	Thomas Gleixner, John Stultz, Michael Kelley, Tom Lendacky,
	Nikunj A Dadhania, Thomas Gleixner, David Woodhouse
In-Reply-To: <c54fd01b-fe22-4c9c-8d5f-5b317de07a40@oracle.com>

On Thu, May 21, 2026, Dongli Zhang wrote:
> On 2026-05-15 12:19 PM, Sean Christopherson wrote:
> > Prefer the TSC over kvmclock for sched_clock if the TSC is constant,
> > nonstop, and not marked unstable via command line.  I.e. use the same
> > criteria as tweaking the clocksource rating so that TSC is preferred over
> > kvmclock.  Per the below comment from native_sched_clock(), sched_clock
> > is more tolerant of slop than clocksource; using TSC for clocksource but
> > not sched_clock makes little to no sense, especially now that KVM CoCo
> > guests with a trusted TSC use TSC, not kvmclock.
> > 
> >         /*
> >          * Fall back to jiffies if there's no TSC available:
> >          * ( But note that we still use it if the TSC is marked
> >          *   unstable. We do this because unlike Time Of Day,
> >          *   the scheduler clock tolerates small errors and it's
> >          *   very important for it to be as fast as the platform
> >          *   can achieve it. )
> >          */
> > 
> > The only advantage of using kvmclock is that doing so allows for early
> > and common detection of PVCLOCK_GUEST_STOPPED, but that code has been
> > broken for over two years with nary a complaint, i.e. it can't be
> > _that_ valuable.  And as above, certain types of KVM guests are losing
> > the functionality regardless, i.e. acknowledging PVCLOCK_GUEST_STOPPED
> > needs to be decoupled from sched_clock() no matter what.
> 
> Has it been broken for two years because of pvclock_clocksource_read_nowd()?

Yep.  Because pvclock_clocksource_read_nowd() ignores PVCLOCK_GUEST_STOPPED, the
flag only ever gets recognized when the kernel reads WALL_CLOCK, which AFAICT
only happens at initial boot, and during suspend and resume.

^ permalink raw reply

* Re: [PATCH v3 02/41] x86/tsc: Add helper to register CPU and TSC freq calibration routines
From: Sean Christopherson @ 2026-05-21 20:53 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Kiryl Shutsemau, Paolo Bonzini, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Ajay Kaher, Alexey Makhalov,
	Jan Kiszka, Dave Hansen, Andy Lutomirski, Peter Zijlstra,
	Juergen Gross, Daniel Lezcano, Thomas Gleixner, John Stultz,
	Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
	linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
	Nikunj A Dadhania, Thomas Gleixner
In-Reply-To: <44e0d60548d317fd59895f18bd17220dfb2f834b.camel@infradead.org>

On Wed, May 20, 2026, David Woodhouse wrote:
> On Fri, 2026-05-15 at 12:19 -0700, Sean Christopherson wrote:
> > Add a helper to register non-native, i.e. PV and CoCo, CPU and TSC
> > frequency calibration routines.  This will allow consolidating handling
> > of common TSC properties that are forced by hypervisor (PV routines),
> > and will also allow adding sanity checks to guard against overriding a
> > TSC calibration routine with a routine that is less robust/trusted.
> > 
> > Make the CPU calibration routine optional, as Xen (very sanely) doesn't
> > assume the CPU runs at the same frequency as the TSC.
> > 
> > Wrap the helper in an #ifdef to document that the kernel overrides
> > the native routines when running as a VM, and to guard against unwanted
> > usage.  Add a TODO to call out that AMD_MEM_ENCRYPT is a mess and doesn't
> > depend on HYPERVISOR_GUEST because it gates both guest and host code.
> > 
> > No functional change intended.
> > 
> > Reviewed-by: Michael Kelley <mhklinux@outlook.com>
> > Tested-by: Michael Kelley <mhklinux@outlook.com>
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> 
> Mildly concerned that we might want to support multiple options — does
> it have CPUID 0x15? Does it have 0x40000x10? Does it have a pvclock?
> There are various permutations of those which are perhaps best handled
> by *trying* each one, in some order, and populating a struct with the
> answers?
> 
> But on the basis that perfect is the enemy of good,

This has been bothering me too.

Aha!  AHA!  Idea.

... 4 hours later ...

Mhahahaahah, victory is mine!!!!

TL;DR: Overriding x86_platform_ops hooks is dumb.

To your point about making an informed decision, that's essentially what this series
is already doing, just in a very roundabout way:

  1. x86_platform.calibrate_{cpu,tsc}() are initialized to "native" versions
  2. Hypervisor init code runs and conditionally overrides calibrate_{cpu,tsc}()
  3. CoCo init code runs and conditionally overrides calibrate_{cpu,tsc}()

So the ordering you want is already there, as is "trying" each source to some
extent, in the form of steps #2 and #3 overriding the hooks if and only if their
source of information is valid.  For all intents and purposes, the hardening I
was adding by formalizing the calibration overrides was to enforce the above ordering.

But that's obviously all but impossible to follow, _and_ it's pointless.

For every PV case, including TDX and SNP, "calibration" is simply information
retrieval, i.e. it never changes (barring broken hypervisors/firmware), and the
information is always available during early boot.

Contrast that with the pre-CPUID CPU frequency calibration, where the frequency
might change, the kernel is making a best guess based on other timekeeping sources,
and not all timekeeping sources are available during early boot.

And so overriding x86_platform.calibrate_{cpu,tsc}() for PV code is completely
unnecessary, because steps #2 and #3 already know the frequency when they override
the hooks, and "success" is guaranteed, i.e. the kernel won't have to switch to a
"late" calibration flow.

If we provide x86_hyper_init hooks:

	unsigned int (*get_tsc_khz)(void);
	unsigned int (*get_cpu_khz)(void);

then we can kill off x86_platform.calibrate_{cpu,tsc}() entirely, explicitly
define the preferred ordering (user-forced => CoCo => Hypervisor => native), and
dedup some of the hypervisor code.

E.g. this is what I've got for the early flow.  Testing now. 

  void __init tsc_early_init(void)
  {
	unsigned int known_cpu_khz = 0, known_tsc_khz = 0;

	if (!boot_cpu_has(X86_FEATURE_TSC))
		return;
	/* Don't change UV TSC multi-chassis synchronization */
	if (is_early_uv_system())
		return;

	if (x86_init.hyper.get_cpu_khz)
		known_cpu_khz = x86_init.hyper.get_cpu_khz();

	if (tsc_early_khz)
		known_tsc_khz = tsc_early_khz;
	else if (cc_platform_has(CC_ATTR_GUEST_SNP_SECURE_TSC))
		known_tsc_khz = snp_secure_tsc_init();
	else if (boot_cpu_has(X86_FEATURE_TDX_GUEST))
		known_tsc_khz = tdx_tsc_init();

	/*
	 * If the TSC frequency is still unknown, i.e. not provided by the user
	 * or by trusted firmware, try to get it from the hypervisor (which is
	 * untrusted when running as a CoCo guest).
	 */
	if (!known_tsc_khz && x86_init.hyper.get_tsc_khz)
		known_tsc_khz = x86_init.hyper.get_tsc_khz();

	if (known_tsc_khz)
		setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);

	if (!determine_cpu_tsc_frequencies(true, known_cpu_khz, known_tsc_khz))
		return;
	tsc_enable_sched_clock();
  }

^ permalink raw reply

* Re: [PATCH v3 29/41] x86/paravirt: Plumb a return code into __paravirt_set_sched_clock()
From: Sean Christopherson @ 2026-05-21 20:35 UTC (permalink / raw)
  To: David Woodhouse
  Cc: Kiryl Shutsemau, Paolo Bonzini, K. Y. Srinivasan, Haiyang Zhang,
	Wei Liu, Dexuan Cui, Long Li, Ajay Kaher, Alexey Makhalov,
	Jan Kiszka, Dave Hansen, Andy Lutomirski, Peter Zijlstra,
	Juergen Gross, Daniel Lezcano, Thomas Gleixner, John Stultz,
	Rick Edgecombe, Vitaly Kuznetsov,
	Broadcom internal kernel review list, Boris Ostrovsky,
	Stephen Boyd, x86, linux-coco, kvm, linux-hyperv, virtualization,
	linux-kernel, xen-devel, Michael Kelley, Tom Lendacky,
	Nikunj A Dadhania, Thomas Gleixner
In-Reply-To: <13d79ba1e0450068c9573ccd8deb3ec007aea8d6.camel@infradead.org>

On Thu, May 21, 2026, David Woodhouse wrote:
> On Fri, 2026-05-15 at 12:19 -0700, Sean Christopherson wrote:
> > Add a return code to __paravirt_set_sched_clock() so that the kernel can
> > reject attempts to use a PV sched_clock without breaking the caller.  E.g.
> > when running as a CoCo VM with a secure TSC, using a PV clock is generally
> > undesirable.
> > 
> > Note, kvmclock is the only PV clock that does anything "extra" beyond
> > simply registering itself as sched_clock, i.e. is the only caller that
> > needs to check the new return value.
> > 
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> 
> Oooh... can we use this to reject the kvmclock when we have a stable
> and reliable TSC even for non-CoCo guests?

Yes, but I would much rather "fix" kvmclock to not even attempt to register itself
as the sched_clock (which this series does).

^ permalink raw reply

* Re: [PATCH v3 36/41] x86/kvmclock: Get local APIC bus frequency from PV CPUID Timing Info
From: Sean Christopherson @ 2026-05-21 20:34 UTC (permalink / raw)
  To: David Woodhouse
  Cc: tglx@kernel.org, longli@microsoft.com, luto@kernel.org,
	alexey.makhalov@broadcom.com, jstultz@google.com,
	dave.hansen@linux.intel.com, ajay.kaher@broadcom.com,
	jan.kiszka@siemens.com, haiyangz@microsoft.com, kas@kernel.org,
	pbonzini@redhat.com, kys@microsoft.com, decui@microsoft.com,
	daniel.lezcano@kernel.org, wei.liu@kernel.org,
	peterz@infradead.org, jgross@suse.com, boris.ostrovsky@oracle.com,
	linux-coco@lists.linux.dev, kvm@vger.kernel.org,
	mhklinux@outlook.com, thomas.lendacky@amd.com,
	linux-kernel@vger.kernel.org,
	bcm-kernel-feedback-list@broadcom.com, tglx@linutronix.de,
	nikunj@amd.com, xen-devel@lists.xenproject.org,
	linux-hyperv@vger.kernel.org, vkuznets@redhat.com,
	rick.p.edgecombe@intel.com, virtualization@lists.linux.dev,
	sboyd@kernel.org, x86@kernel.org
In-Reply-To: <7489ff3cc1ff402bf0ade38272fc52dcbcc75fc1.camel@amazon.co.uk>

On Wed, May 20, 2026, David Woodhouse wrote:
> On Fri, 2026-05-15 at 12:19 -0700, Sean Christopherson wrote:
> > When running as a KVM guest with kvmclock support enabled, stuff the APIC
> > timer period/frequency with the local APIC bus frequency reported in
> > CPUID.0x40000010.EBX instead of trying to calibrate/guess the frequency.
> > 
> > See Documentation/virt/kvm/x86/cpuid.rst for details.
> > 
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> 
> I still don't much like the way this is done inside kvm_get_tsc_khz().

Yeah, I don't like it either (understatement).  Aha!  native_calibrate_tsc() is
the oddball, all of the PV flows stuff lapic_timer_period when parsing the initial
timing info.  I'll just do that.  Blindly writing a global is all kinds of fugly,
but that's a future problem to solve.

> We also probably ought to be looking for the timing leaf on other
> hypervisors including VMware 

VMware gets the frequency via hypercall.  Why, I have no idea.  I'll let the
VMware folks deal with that.

	eax = vmware_hypercall3(VMWARE_CMD_GETHZ, UINT_MAX, &ebx, &ecx);

> and probably Bhyve too.  Should it be done somewhere else?

I'm not opposed to that, though I don't know that it'd be a net positive. The
"hard" part of getting the info is finding the CPUID base and checking if the
leaf is available.  Unlike the native CPUID leaf, no math is necessary, and so
once the leaf is obtained, getting the frequency is trivial.

Regardless, I definitely don't want to take it on in this series. :-)


^ permalink raw reply

* Re: [PATCH v2 15/15] KVM: x86: Move the bulk of register specific code from x86.c to regs.c
From: Sean Christopherson @ 2026-05-21 18:47 UTC (permalink / raw)
  To: Kai Huang
  Cc: dwmw2@infradead.org, Rick P Edgecombe, x86@kernel.org,
	binbin.wu@linux.intel.com, kas@kernel.org,
	dave.hansen@linux.intel.com, vkuznets@redhat.com, paul@xen.org,
	yosry@kernel.org, pbonzini@redhat.com, kvm@vger.kernel.org,
	linux-coco@lists.linux.dev, linux-kernel@vger.kernel.org
In-Reply-To: <0ed747418eaef45a8c161ab5a9e28a12c604f9ea.camel@intel.com>

On Wed, May 20, 2026, Kai Huang wrote:
> On Wed, 2026-05-20 at 11:11 -0700, Sean Christopherson wrote:
> > On Wed, May 20, 2026, Kai Huang wrote:
> > > But if we want to hide KVM internal structures, I don't see any other options
> > > except virt/kvm/include/ is the place to go?
> > 
> > arch/$(ARCH)/kvm/kvm_arch.h is the obvious approach.  Code in virt/kvm can reach
> > arch/$(ARCH)/kvm, we just need to add it to the include path.  That's why I was
> > working on unifying the include definitions.
> 
> Yeah, for asm/kvm_host.h.
> 
> But if I am still following you we still need a place for linux/kvm_host.h, for
> which I thought virt/kvm/include/ would be the place at first glance.

Oh, yes, the KVM-internal pieces of linux/kvm_host.h would live in virt/kvm.

^ permalink raw reply

* Re: [PATCH v5 1/2] dma-mapping: introduce DMA_ATTR_CC_SHARED for shared memory
From: Jason Gunthorpe @ 2026-05-21 17:54 UTC (permalink / raw)
  To: Aneesh Kumar K.V
  Cc: Jiri Pirko, dri-devel, linaro-mm-sig, iommu, linux-media,
	sumit.semwal, benjamin.gaignard, Brian.Starkey, jstultz,
	tjmercier, christian.koenig, m.szyprowski, robin.murphy, leon,
	sean.anderson, ptesarik, catalin.marinas, suzuki.poulose,
	steven.price, thomas.lendacky, john.allen, ashish.kalra,
	suravee.suthikulpanit, linux-coco
In-Reply-To: <yq5azf1s6aic.fsf@kernel.org>

On Thu, May 21, 2026 at 09:05:39PM +0530, Aneesh Kumar K.V wrote:
> I am wondering whether this is better
> 
> static inline dma_addr_t dma_direct_map_phys(struct device *dev,
> 		phys_addr_t phys, size_t size, enum dma_data_direction dir,
> 		unsigned long attrs, bool flush)
> {
> 	dma_addr_t dma_addr;
> 
> 	/*
> 	 * For a device requiring unencrypted DMA, MMIO memory is treated
> 	 * as shared.
> 	 */
> 	if (force_dma_unencrypted(dev) && (attrs & DMA_ATTR_MMIO))
> 		attrs |= DMA_ATTR_CC_SHARED;

It is an option, I would be happier if we went and fixed the few
callers to properly pass the shared. CC did this with the
pgprot_decrypted() stuff, same reasoning:

diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index bfdb9ed7074116..e77f6404caa3db 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -90,7 +90,7 @@ static bool blk_dma_map_direct(struct request *req, struct device *dma_dev,
 	unsigned int attrs = 0;
 
 	if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
-		attrs |= DMA_ATTR_MMIO;
+		attrs |= iter->p2pdma.mem->dma_mapping_flags;
 
 	iter->addr = dma_map_phys(dma_dev, vec->paddr, vec->len,
 			rq_dma_dir(req), attrs);
@@ -115,7 +115,7 @@ static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev,
 	iter->len = dma_iova_size(state);
 
 	if (iter->p2pdma.map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
-		attrs |= DMA_ATTR_MMIO;
+		attrs |= iter->p2pdma.mem->dma_mapping_flags;
 
 	do {
 		error = dma_iova_link(dma_dev, state, vec->paddr, mapped,
diff --git a/drivers/dma-buf/dma-buf-mapping.c b/drivers/dma-buf/dma-buf-mapping.c
index 794acff2546a34..96022fadc48245 100644
--- a/drivers/dma-buf/dma-buf-mapping.c
+++ b/drivers/dma-buf/dma-buf-mapping.c
@@ -147,7 +147,7 @@ struct sg_table *dma_buf_phys_vec_to_sgt(struct dma_buf_attachment *attach,
 			ret = dma_iova_link(attach->dev, dma->state,
 					    phys_vec[i].paddr, 0,
 					    phys_vec[i].len, dir,
-					    DMA_ATTR_MMIO);
+					    provider->dma_mapping_flags);
 			if (ret)
 				goto err_unmap_dma;
 
@@ -155,7 +155,7 @@ struct sg_table *dma_buf_phys_vec_to_sgt(struct dma_buf_attachment *attach,
 		} else {
 			addr = dma_map_phys(attach->dev, phys_vec[i].paddr,
 					    phys_vec[i].len, dir,
-					    DMA_ATTR_MMIO);
+					    provider->dma_mapping_flags);
 			ret = dma_mapping_error(attach->dev, addr);
 			if (ret)
 				goto err_unmap_dma;
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 7c898542af8d5e..e4229b4d35c767 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -282,6 +282,8 @@ int pcim_p2pdma_init(struct pci_dev *pdev)
 			continue;
 
 		p2p->mem[i].owner = &pdev->dev;
+		p2p->mem[i].dma_mapping_flags =
+			DMA_ATTR_MMIO | DMA_ATTR_CC_SHARED;
 		p2p->mem[i].bus_offset =
 			pci_bus_address(pdev, i) - pci_resource_start(pdev, i);
 	}
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index 873de20a224759..402dc5e5d62b0a 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -21,10 +21,12 @@ struct scatterlist;
  *
  * A p2pdma provider is a range of MMIO address space available to the CPU.
  * @owner: Device to which this provider belongs.
+ * @dma_mapping_flags: DMA attributes to use for host bridge mappings.
  * @bus_offset: Bus offset for p2p communication.
  */
 struct p2pdma_provider {
 	struct device *owner;
+	unsigned long dma_mapping_flags;
 	u64 bus_offset;
 };
 
diff --git a/mm/hmm.c b/mm/hmm.c
index 5955f2f0c83db1..c3f445acddf873 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -811,7 +811,7 @@ dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
 	case PCI_P2PDMA_MAP_NONE:
 		break;
 	case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
-		attrs |= DMA_ATTR_MMIO;
+		attrs |= p2pdma_state->mem->dma_mapping_flags;
 		pfns[idx] |= HMM_PFN_P2PDMA;
 		break;
 	case PCI_P2PDMA_MAP_BUS_ADDR:

^ permalink raw reply related

* Re: [PATCH v4 04/13] dma: swiotlb: track pool encryption state and honor DMA_ATTR_CC_SHARED
From: Aneesh Kumar K.V @ 2026-05-21 17:20 UTC (permalink / raw)
  To: Mostafa Saleh
  Cc: iommu, linux-arm-kernel, linux-kernel, linux-coco, Robin Murphy,
	Marek Szyprowski, Will Deacon, Marc Zyngier, Steven Price,
	Suzuki K Poulose, Catalin Marinas, Jiri Pirko, Jason Gunthorpe,
	Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
	linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
	Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Sven Schnelle, x86
In-Reply-To: <CAFgf54o4ZnvnJ3369bHb10tvJJVP+5YWq=ec4Jh5K6S6U9uNEA@mail.gmail.com>

Mostafa Saleh <smostafa@google.com> writes:

> On Tue, May 12, 2026 at 10:05 AM Aneesh Kumar K.V (Arm)
> <aneesh.kumar@kernel.org> wrote:
>> @@ -1411,6 +1436,16 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
>>         if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
>>                 pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
>>
>> +       /*
>> +        * if we are trying to swiotlb map a decrypted paddr or the paddr is encrypted
>> +        * but the device is forcing decryption, use decrypted io_tlb_mem
>> +        */
>> +       if ((attrs & DMA_ATTR_CC_SHARED) || force_dma_unencrypted(dev))
>
> I don't think swiotlb needs to know about force_dma_unencrypted(), the
> dma/direct caller should have all the information to pass the
> appropriate flags.
>
> Thanks.
> Mostafa
>
>> +               require_decrypted = true;
>> +
>> +       if (require_decrypted != mem->unencrypted)
>> +               return (phys_addr_t)DMA_MAPPING_ERROR;
>> +

Based on other email threads, this is now updated to

@@ -1372,9 +1417,19 @@ static unsigned long mem_used(struct io_tlb_mem *mem)
  *			any pre- or post-padding for alignment
  * @alloc_align_mask:	Required start and end alignment of the allocated buffer
  * @dir:		DMA direction
- * @attrs:		Optional DMA attributes for the map operation
+ * @attrs:		Optional DMA attributes for the map operation, updated
+ *			to match the selected SWIOTLB pool
  *
  * Find and allocate a suitable sequence of IO TLB slots for the request.
+ * The device's SWIOTLB pool must match the device's current DMA encryption
+ * requirements. If the device requires decrypted DMA, bouncing is done through
+ * an unencrypted pool and the mapping is marked shared. If the device can DMA
+ * to encrypted memory, bouncing is done through an encrypted pool even when the
+ * original DMA address was unencrypted. Enabling encrypted DMA for a device is
+ * therefore expected to update its default io_tlb_mem to an encrypted pool, so
+ * later bounce mappings for both encrypted and decrypted original memory use
+ * that encrypted pool.
+ *
  * The allocated space starts at an alignment specified by alloc_align_mask,
  * and the size of the allocated space is rounded up so that the total amount
  * of allocated space is a multiple of (alloc_align_mask + 1). If
@@ -1411,6 +1466,16 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 	if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
 		pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
 
+	/* swiotlb pool is incorrect for this device */
+	if (unlikely(mem->unencrypted != force_dma_unencrypted(dev)))
+		return (phys_addr_t)DMA_MAPPING_ERROR;
+
+	/* Force attrs to match the kind of memory in the pool */
+	if (mem->unencrypted)
+		*attrs |= DMA_ATTR_CC_SHARED;
+	else
+		*attrs &= ~DMA_ATTR_CC_SHARED;
+
 	/*
 	 * The default swiotlb memory pool is allocated with PAGE_SIZE
 	 * alignment. If a mapping is requested with larger alignment,
@@ -1608,8 +1673,11 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
 	if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
 		return DMA_MAPPING_ERROR;
 
-	/* Ensure that the address returned is DMA'ble */
-	dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
+	if (attrs & DMA_ATTR_CC_SHARED)
+		dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
+	else
+		dma_addr = phys_to_dma_encrypted(dev, swiotlb_addr);
+
 	if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
 		__swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir,
 			attrs | DMA_ATTR_SKIP_CPU_SYNC,
@@ -1773,7 +1841,7 @@ static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,

^ permalink raw reply

* Re: [PATCH v4 04/13] dma: swiotlb: track pool encryption state and honor DMA_ATTR_CC_SHARED
From: Mostafa Saleh @ 2026-05-21 17:06 UTC (permalink / raw)
  To: Aneesh Kumar K.V (Arm)
  Cc: iommu, linux-arm-kernel, linux-kernel, linux-coco, Robin Murphy,
	Marek Szyprowski, Will Deacon, Marc Zyngier, Steven Price,
	Suzuki K Poulose, Catalin Marinas, Jiri Pirko, Jason Gunthorpe,
	Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
	linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
	Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, Sven Schnelle, x86
In-Reply-To: <20260512090408.794195-5-aneesh.kumar@kernel.org>

On Tue, May 12, 2026 at 10:05 AM Aneesh Kumar K.V (Arm)
<aneesh.kumar@kernel.org> wrote:
> @@ -1411,6 +1436,16 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
>         if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
>                 pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
>
> +       /*
> +        * if we are trying to swiotlb map a decrypted paddr or the paddr is encrypted
> +        * but the device is forcing decryption, use decrypted io_tlb_mem
> +        */
> +       if ((attrs & DMA_ATTR_CC_SHARED) || force_dma_unencrypted(dev))

I don't think swiotlb needs to know about force_dma_unencrypted(), the
dma/direct caller should have all the information to pass the
appropriate flags.

Thanks.
Mostafa

> +               require_decrypted = true;
> +
> +       if (require_decrypted != mem->unencrypted)
> +               return (phys_addr_t)DMA_MAPPING_ERROR;
> +

^ permalink raw reply

* Re: [PATCH v14 09/44] arm64: RMI: Provide functions to delegate/undelegate ranges of memory
From: Suzuki K Poulose @ 2026-05-21 16:01 UTC (permalink / raw)
  To: Marc Zyngier, Steven Price
  Cc: kvm, kvmarm, Catalin Marinas, Will Deacon, James Morse,
	Oliver Upton, Zenghui Yu, linux-arm-kernel, linux-kernel,
	Joey Gouly, Alexandru Elisei, Christoffer Dall, Fuad Tabba,
	linux-coco, Ganapatrao Kulkarni, Gavin Shan, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <867bowx3qx.wl-maz@kernel.org>

On 21/05/2026 14:59, Marc Zyngier wrote:
> On Wed, 13 May 2026 14:17:17 +0100,
> Steven Price <steven.price@arm.com> wrote:
>>
>> The RMM requires memory is 'delegated' to it so that it can be used
>> either for a realm guest or for various tracking purposes within the RMM
>> (e.g. for metadata or page tables). Memory that has been delegated
>> cannot be accessed by the host (it will result in a Granule Protection
>> Fault).
>>
>> Undelegation may fail if the memory is still in use by the RMM. This
>> shouldn't happen (Linux should ensure it has destroyed the RMM objects
>> before attempting to undelegate). In the event that it does happen this
>> points to a programming bug and the only reasonable approach is for the
>> physical pages to be leaked - it is up to the caller of
>> rmi_undelegate_range() to handle this.
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> v14:
>>   * Split into separate patch and moved out of KVM
>> ---
>>   arch/arm64/include/asm/rmi_cmds.h | 13 +++++++++++
>>   arch/arm64/kernel/rmi.c           | 36 +++++++++++++++++++++++++++++++
>>   2 files changed, 49 insertions(+)
>>
>> diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h
>> index 9078a2920a7c..eb213c8e6f26 100644
>> --- a/arch/arm64/include/asm/rmi_cmds.h
>> +++ b/arch/arm64/include/asm/rmi_cmds.h
>> @@ -33,6 +33,19 @@ struct rmi_sro_state {
>>   } while (RMI_RETURN_STATUS(res.a0) == RMI_BUSY ||			\
>>   	 RMI_RETURN_STATUS(res.a0) == RMI_BLOCKED)
>>   
>> +int rmi_delegate_range(phys_addr_t phys, unsigned long size);
>> +int rmi_undelegate_range(phys_addr_t phys, unsigned long size);
>> +
>> +static inline int rmi_delegate_page(phys_addr_t phys)
>> +{
>> +	return rmi_delegate_range(phys, PAGE_SIZE);
>> +}
>> +
>> +static inline int rmi_undelegate_page(phys_addr_t phys)
>> +{
>> +	return rmi_undelegate_range(phys, PAGE_SIZE);
>> +}
>> +
>>   bool rmi_is_available(void);
>>   
>>   unsigned long rmi_sro_execute(struct rmi_sro_state *sro, gfp_t gfp);
>> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
>> index 52a415e99500..08cef54acadb 100644
>> --- a/arch/arm64/kernel/rmi.c
>> +++ b/arch/arm64/kernel/rmi.c
>> @@ -12,6 +12,42 @@ static bool arm64_rmi_is_available;
>>   unsigned long rmm_feat_reg0;
>>   unsigned long rmm_feat_reg1;
>>   
>> +int rmi_delegate_range(phys_addr_t phys, unsigned long size)
>> +{
>> +	unsigned long ret = 0;
>> +	unsigned long top = phys + size;
>> +	unsigned long out_top;
>> +
>> +	while (phys < top) {
>> +		ret = rmi_granule_range_delegate(phys, top, &out_top);
>> +		if (ret == RMI_SUCCESS)
>> +			phys = out_top;
>> +		else if (ret != RMI_BUSY && ret != RMI_BLOCKED)
>> +			return ret;
>> +	}
>> +
>> +	return ret;
>> +}
>> +
>> +int rmi_undelegate_range(phys_addr_t phys, unsigned long size)
>> +{
>> +	unsigned long ret = 0;
>> +	unsigned long top = phys + size;
>> +	unsigned long out_top;
>> +
>> +	WARN_ON(size == 0);
> 
> I find it odd to warn on size = 0. After all, free(NULL) is not an
> error. But even then, you continue feeding this to the RMM.
> 
> You also don't seem to be bothered with that on the delegation side...
> 
>> +
>> +	while (phys < top) {
>> +		ret = rmi_granule_range_undelegate(phys, top, &out_top);
>> +		if (ret == RMI_SUCCESS)
>> +			phys = out_top;
> 
> and size==0 doesn't violate any of the failure conditions listed in
> B4.5.18.2 (beta2). Will you end-up looping around forever?

That is not true? It triggers the top_bound error condition for both.


pre: UInt(top) <= UInt(base)
post: result.status == RMI_ERROR_INPUT


Suzuki
> 
> Same questions for the delegation, obviously.
> 
> 	M.
> 


^ permalink raw reply

* Re: [PATCH v14 06/44] arm64: RMI: Check for RMI support at init
From: Steven Price @ 2026-05-21 15:49 UTC (permalink / raw)
  To: Gavin Shan, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <ee494ecd-8979-40f2-896e-82137abbf440@redhat.com>

On 21/05/2026 01:39, Gavin Shan wrote:
> Hi Steven,
> 
> On 5/13/26 11:17 PM, Steven Price wrote:
>> Query the RMI version number and check if it is a compatible version.
>> The first two feature registers are read and exposed for future code to
>> use.
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> v14:
>>   * This moves the basic RMI setup into the 'kernel' directory. This is
>>     because RMI will be used for some features outside of KVM so should
>>     be available even if KVM isn't compiled in.
>> ---
>>   arch/arm64/include/asm/rmi_cmds.h |  3 ++
>>   arch/arm64/kernel/Makefile        |  2 +-
>>   arch/arm64/kernel/cpufeature.c    |  1 +
>>   arch/arm64/kernel/rmi.c           | 65 +++++++++++++++++++++++++++++++
>>   4 files changed, 70 insertions(+), 1 deletion(-)
>>   create mode 100644 arch/arm64/kernel/rmi.c
>>
> 
> [...]
> 
>> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
>> new file mode 100644
>> index 000000000000..99c1ccc35c11
>> --- /dev/null
>> +++ b/arch/arm64/kernel/rmi.c
>> @@ -0,0 +1,65 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * Copyright (C) 2023-2025 ARM Ltd.
>> + */
>> +
>> +#include <linux/memblock.h>
>> +
>> +#include <asm/rmi_cmds.h>
>> +
>> +unsigned long rmm_feat_reg0;
>> +unsigned long rmm_feat_reg1;
>> +
>> +static int rmi_check_version(void)
>> +{
>> +    struct arm_smccc_res res;
>> +    unsigned short version_major, version_minor;
>> +    unsigned long host_version = RMI_ABI_VERSION(RMI_ABI_MAJOR_VERSION,
>> +                             RMI_ABI_MINOR_VERSION);
>> +    unsigned long aa64pfr0 =
>> read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
>> +
>> +    /* If RME isn't supported, then RMI can't be */
>> +    if (cpuid_feature_extract_unsigned_field(aa64pfr0,
>> ID_AA64PFR0_EL1_RME_SHIFT) == 0)
>> +        return -ENXIO;
>> +
>> +    arm_smccc_1_1_invoke(SMC_RMI_VERSION, host_version, &res);
>> +
>> +    if (res.a0 == SMCCC_RET_NOT_SUPPORTED)
>> +        return -ENXIO;
>> +
>> +    version_major = RMI_ABI_VERSION_GET_MAJOR(res.a1);
>> +    version_minor = RMI_ABI_VERSION_GET_MINOR(res.a1);
>> +
>> +    if (res.a0 != RMI_SUCCESS) {
>> +        unsigned short high_version_major, high_version_minor;
>> +
>> +        high_version_major = RMI_ABI_VERSION_GET_MAJOR(res.a2);
>> +        high_version_minor = RMI_ABI_VERSION_GET_MINOR(res.a2);
>> +
>> +        pr_err("Unsupported RMI ABI (v%d.%d - v%d.%d) we want v%d.%d\n",
>> +               version_major, version_minor,
>> +               high_version_major, high_version_minor,
>> +               RMI_ABI_MAJOR_VERSION,
>> +               RMI_ABI_MINOR_VERSION);
>> +        return -ENXIO;
>> +    }
>> +
>> +    pr_info("RMI ABI version %d.%d\n", version_major, version_minor);
>> +
>> +    return 0;
>> +}
>> +
>> +static int __init arm64_init_rmi(void)
>> +{
>> +    /* Continue without realm support if we can't agree on a version */
>> +    if (rmi_check_version())
>> +        return 0;
> 
> Is it still a valid point that we have to return zero on errors returned
> from rmi_check_version() or other function calls like rmi_features()?
> arm64_init_rmi() is triggered by subsys_initcall() where the return value
> needs to indicate success or failure. It's fine to return an error code from
> arm64_init_rmi() in that path.

Hmm, I guess now this is moved to arm64 code this indeed doesn't need
to. Within a module I believe an error return can fail the module loading.

I'm not sure it really makes much difference though - if this
initialisation fails then it's not really an error - it just means the
feature is unavailable.

Thanks,
Steve

>> +
>> +    if (WARN_ON(rmi_features(0, &rmm_feat_reg0)))
>> +        return 0;
>> +    if (WARN_ON(rmi_features(1, &rmm_feat_reg1)))
>> +        return 0;
>> +
>> +    return 0;
>> +}
>> +subsys_initcall(arm64_init_rmi);
> 
> Thanks,
> Gavin
> 


^ permalink raw reply

* Re: [PATCH v14 05/44] arm64: RMI: Add wrappers for RMI calls
From: Steven Price @ 2026-05-21 15:44 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: kvm, kvmarm, Catalin Marinas, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Gavin Shan,
	Shanker Donthineni, Alper Gun, Aneesh Kumar K . V, Emi Kisanuki,
	Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <86cxypvsfy.wl-maz@kernel.org>

On 21/05/2026 13:49, Marc Zyngier wrote:
> On Wed, 13 May 2026 14:17:13 +0100,
> Steven Price <steven.price@arm.com> wrote:
>>
>> The wrappers make the call sites easier to read and deal with the
>> boiler plate of handling the error codes from the RMM.
>>
>> Signed-off-by: Steven Price <steven.price@arm.com>
>> ---
>> Changes from v13:
>>  * Update to RMM v2.0-bet1 spec including some SRO support (there are still
>>    some FIXMEs where SRO support is incomplete).
>> Changes from v12:
>>  * Update to RMM v2.0 specification
>> Changes from v8:
>>  * Switch from arm_smccc_1_2_smc() to arm_smccc_1_2_invoke() in
>>    rmi_rtt_read_entry() for consistency.
>> Changes from v7:
>>  * Minor renaming of parameters and updated comments
>> Changes from v5:
>>  * Further improve comments
>> Changes from v4:
>>  * Improve comments
>> Changes from v2:
>>  * Make output arguments optional.
>>  * Mask RIPAS value rmi_rtt_read_entry()
>>  * Drop unused rmi_rtt_get_phys()
>> ---
>>  arch/arm64/include/asm/rmi_cmds.h | 661 ++++++++++++++++++++++++++++++
>>  1 file changed, 661 insertions(+)
>>  create mode 100644 arch/arm64/include/asm/rmi_cmds.h
>>
>> diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h
>> new file mode 100644
>> index 000000000000..04f7066894e9
>> --- /dev/null
>> +++ b/arch/arm64/include/asm/rmi_cmds.h
>> @@ -0,0 +1,661 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/*
>> + * Copyright (C) 2023 ARM Ltd.
>> + */
>> +
>> +#ifndef __ASM_RMI_CMDS_H
>> +#define __ASM_RMI_CMDS_H
>> +
>> +#include <linux/arm-smccc.h>
>> +
>> +#include <asm/rmi_smc.h>
>> +
>> +struct rtt_entry {
>> +	unsigned long walk_level;
>> +	unsigned long desc;
>> +	int state;
>> +	int ripas;
>> +};
>> +
>> +#define RMI_MAX_ADDR_LIST	256
>> +
>> +struct rmi_sro_state {
>> +	struct arm_smccc_1_2_regs regs;
>> +	unsigned long addr_count;
>> +	unsigned long addr_list[RMI_MAX_ADDR_LIST];
>> +};
>> +
>> +#define rmi_smccc(...) do {						\
>> +	arm_smccc_1_1_invoke(__VA_ARGS__);				\
>> +} while (RMI_RETURN_STATUS(res.a0) == RMI_BUSY ||			\
>> +	 RMI_RETURN_STATUS(res.a0) == RMI_BLOCKED)
>> +
>> +unsigned long rmi_sro_execute(struct rmi_sro_state *sro, gfp_t gfp);
>> +void rmi_sro_free(struct rmi_sro_state *sro);
>> +
>> +/**
>> + * rmi_rmm_config_set() - Configure the RMM
>> + * @cfg_ptr: PA of a struct rmm_config
>> + *
>> + * Sets configuration options on the RMM.
>> + *
>> + * Return: RMI return code
>> + */
>> +static inline int rmi_rmm_config_set(unsigned long cfg_ptr)
>> +{
>> +	struct arm_smccc_res res;
>> +
>> +	arm_smccc_1_1_invoke(SMC_RMI_RMM_CONFIG_SET, cfg_ptr, &res);
>> +
>> +	return res.a0;
>> +}
>> +
>> +/**
>> + * rmi_rmm_activate() - Activate the RMM
>> + *
>> + * Return: RMI return code
>> + */
>> +static inline int rmi_rmm_activate(void)
>> +{
>> +	struct arm_smccc_res res;
>> +
>> +	arm_smccc_1_1_invoke(SMC_RMI_RMM_ACTIVATE, &res);
>> +
>> +	return res.a0;
>> +}
>> +
>> +/**
>> + * rmi_granule_tracking_get() - Get configuration of a Granule tracking region
>> + * @start: Base PA of the tracking region
>> + * @end: End of the PA region
>> + * @out_category: Memory category
>> + * @out_state: Tracking region state
>> + * @out_top: Top of the memory region
>> + *
>> + * Return: RMI return code
>> + */
>> +static inline int rmi_granule_tracking_get(unsigned long start,
>> +					   unsigned long end,
>> +					   unsigned long *out_category,
>> +					   unsigned long *out_state,
>> +					   unsigned long *out_top)
>> +{
>> +	struct arm_smccc_res res;
>> +
>> +	arm_smccc_1_1_invoke(SMC_RMI_GRANULE_TRACKING_GET, start, end, &res);
>> +
>> +	if (out_category)
>> +		*out_category = res.a1;
>> +	if (out_state)
>> +		*out_state = res.a2;
>> +	if (out_top)
>> +		*out_top = res.a3;
>> +
>> +	return res.a0;
>> +}
>> +
>> +/**
>> + * rmi_gpt_l1_create() - Create a Level 1 GPT
>> + * @addr: Base of physical address region described by the L1GPT
>> + *
>> + * Return: RMI return code
>> + */
>> +static inline int rmi_gpt_l1_create(unsigned long addr)
>> +{
>> +	struct arm_smccc_res res;
>> +
>> +	arm_smccc_1_1_invoke(SMC_RMI_GPT_L1_CREATE, addr, &res);
>> +
>> +	if (RMI_RETURN_STATUS(res.a0) == RMI_INCOMPLETE) {
>> +		/* FIXME */
> 
> Is that part of the SRO stuff you're talking about in the notes?
> What is the ETA for fixing all these FIXMEs?

Yes, RMI_INCOMPLETE is the return for SRO. Fixing all this up is on the
plan for my next posting which I expect to be after 7.2-rc1 (so July).
There were some changes in the beta 2 spec and the RMM doesn't implement
most of this yet so I didn't want to rush out completely untested code
which might change.

Thanks,
Steve

> Thanks,
> 
> 	M.
> 


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox