Linux Confidential Computing Development
 help / color / mirror / Atom feed
* [PATCH v7 37/42] KVM: selftests: Provide common function to set memory attributes
From: Ackerley Tng via B4 Relay @ 2026-05-23  0:18 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>

From: Sean Christopherson <seanjc@google.com>

Introduce vm_mem_set_memory_attributes(), which handles setting of memory
attributes for a range of guest physical addresses, regardless of whether
the attributes should be set via guest_memfd or via the memory attributes
at the VM level.

Refactor existing vm_mem_set_{shared,private} functions to use the new
function. Opportunistically update the size parameter to use size_t instead
of u64.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 tools/testing/selftests/kvm/include/kvm_util.h | 46 +++++++++++++++++++-------
 1 file changed, 34 insertions(+), 12 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index e9b4ae9596e05..a86418cdf5f4f 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -454,18 +454,6 @@ static inline void vm_set_memory_attributes(struct kvm_vm *vm, gpa_t gpa,
 	vm_ioctl(vm, KVM_SET_MEMORY_ATTRIBUTES, &attr);
 }
 
-static inline void vm_mem_set_private(struct kvm_vm *vm, gpa_t gpa,
-				      u64 size)
-{
-	vm_set_memory_attributes(vm, gpa, size, KVM_MEMORY_ATTRIBUTE_PRIVATE);
-}
-
-static inline void vm_mem_set_shared(struct kvm_vm *vm, gpa_t gpa,
-				     u64 size)
-{
-	vm_set_memory_attributes(vm, gpa, size, 0);
-}
-
 static inline int __gmem_set_memory_attributes(int fd, u64 offset,
 					       size_t size, u64 attributes,
 					       u64 *error_offset)
@@ -532,6 +520,40 @@ static inline void gmem_set_shared(int fd, u64 offset, size_t size)
 	gmem_set_memory_attributes(fd, offset, size, 0);
 }
 
+static inline void vm_mem_set_memory_attributes(struct kvm_vm *vm, gpa_t gpa,
+						size_t size, u64 attrs)
+{
+	if (kvm_has_gmem_attributes) {
+		gpa_t end = gpa + size;
+		off_t fd_offset;
+		gpa_t addr;
+		size_t len;
+		int fd;
+
+		for (addr = gpa; addr < end; addr += len) {
+			fd = kvm_gpa_to_guest_memfd(vm, addr, &fd_offset, &len);
+			len = min(end - addr, len);
+
+			gmem_set_memory_attributes(fd, fd_offset, len, attrs);
+		}
+	} else {
+		vm_set_memory_attributes(vm, gpa, size, attrs);
+	}
+}
+
+static inline void vm_mem_set_private(struct kvm_vm *vm, gpa_t gpa,
+				      size_t size)
+{
+	vm_mem_set_memory_attributes(vm, gpa, size,
+				     KVM_MEMORY_ATTRIBUTE_PRIVATE);
+}
+
+static inline void vm_mem_set_shared(struct kvm_vm *vm, gpa_t gpa,
+				     size_t size)
+{
+	vm_mem_set_memory_attributes(vm, gpa, size, 0);
+}
+
 void vm_guest_mem_fallocate(struct kvm_vm *vm, gpa_t gpa, u64 size,
 			    bool punch_hole);
 

-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v7 36/42] KVM: selftests: Provide function to look up guest_memfd details from gpa
From: Ackerley Tng via B4 Relay @ 2026-05-23  0:18 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>

From: Ackerley Tng <ackerleytng@google.com>

Introduce a new helper, kvm_gpa_to_guest_memfd(), to find the
guest_memfd-related details of a memory region that contains a given guest
physical address (GPA).

The function returns the file descriptor for the memfd, the offset into
the file that corresponds to the GPA, and the number of bytes remaining
in the region from that GPA.

kvm_gpa_to_guest_memfd() was factored out from vm_guest_mem_fallocate();
refactor vm_guest_mem_fallocate() to use the new helper.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/include/kvm_util.h |  3 +++
 tools/testing/selftests/kvm/lib/kvm_util.c     | 37 ++++++++++++++++----------
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index d4c285c6fbe44..e9b4ae9596e05 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -428,6 +428,9 @@ static inline void vm_enable_cap(struct kvm_vm *vm, u32 cap, u64 arg0)
 	vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap);
 }
 
+int kvm_gpa_to_guest_memfd(struct kvm_vm *vm, gpa_t gpa, off_t *fd_offset,
+			   size_t *nr_bytes);
+
 /*
  * KVM_SET_MEMORY_ATTRIBUTES{,2} overwrites _all_ attributes.  These
  * flows need significant enhancements to support multiple attributes.
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index daa0c1e835a71..f8f0cd62f2f17 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1283,27 +1283,20 @@ void vm_guest_mem_fallocate(struct kvm_vm *vm, u64 base, u64 size,
 			    bool punch_hole)
 {
 	const int mode = FALLOC_FL_KEEP_SIZE | (punch_hole ? FALLOC_FL_PUNCH_HOLE : 0);
-	struct userspace_mem_region *region;
 	u64 end = base + size;
-	gpa_t gpa, len;
 	off_t fd_offset;
-	int ret;
+	int fd, ret;
+	size_t len;
+	gpa_t gpa;
 
 	for (gpa = base; gpa < end; gpa += len) {
-		u64 offset;
-
-		region = userspace_mem_region_find(vm, gpa, gpa);
-		TEST_ASSERT(region && region->region.flags & KVM_MEM_GUEST_MEMFD,
-			    "Private memory region not found for GPA 0x%lx", gpa);
+		fd = kvm_gpa_to_guest_memfd(vm, gpa, &fd_offset, &len);
+		len = min(end - gpa, len);
 
-		offset = gpa - region->region.guest_phys_addr;
-		fd_offset = region->region.guest_memfd_offset + offset;
-		len = min_t(u64, end - gpa, region->region.memory_size - offset);
-
-		ret = fallocate(region->region.guest_memfd, mode, fd_offset, len);
+		ret = fallocate(fd, mode, fd_offset, len);
 		TEST_ASSERT(!ret, "fallocate() failed to %s at %lx (len = %lu), fd = %d, mode = %x, offset = %lx",
 			    punch_hole ? "punch hole" : "allocate", gpa, len,
-			    region->region.guest_memfd, mode, fd_offset);
+			    fd, mode, fd_offset);
 	}
 }
 
@@ -1640,6 +1633,22 @@ void *addr_gpa2alias(struct kvm_vm *vm, gpa_t gpa)
 	return (void *) ((uintptr_t) region->host_alias + offset);
 }
 
+int kvm_gpa_to_guest_memfd(struct kvm_vm *vm, gpa_t gpa, off_t *fd_offset,
+			   size_t *nr_bytes)
+{
+	struct userspace_mem_region *region;
+	gpa_t gpa_offset;
+
+	region = userspace_mem_region_find(vm, gpa, gpa);
+	TEST_ASSERT(region && region->region.flags & KVM_MEM_GUEST_MEMFD,
+		    "guest_memfd memory region not found for GPA 0x%lx", gpa);
+
+	gpa_offset = gpa - region->region.guest_phys_addr;
+	*fd_offset = region->region.guest_memfd_offset + gpa_offset;
+	*nr_bytes = region->region.memory_size - gpa_offset;
+	return region->region.guest_memfd;
+}
+
 /* Create an interrupt controller chip for the specified VM. */
 void vm_create_irqchip(struct kvm_vm *vm)
 {

-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v7 38/42] KVM: selftests: Check fd/flags provided to mmap() when setting up memslot
From: Ackerley Tng via B4 Relay @ 2026-05-23  0:18 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>

From: Sean Christopherson <seanjc@google.com>

Check that a valid fd provided to mmap() must be accompanied by MAP_SHARED.

With an invalid fd (usually used for anonymous mappings), there are no
constraints on mmap() flags.

Add this check to make sure that when a guest_memfd is used as region->fd,
the flag provided to mmap() will include MAP_SHARED.

Signed-off-by: Sean Christopherson <seanjc@google.com>
[Rephrase assertion message.]
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 tools/testing/selftests/kvm/lib/kvm_util.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index f8f0cd62f2f17..21c7e52a2bdac 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -1088,6 +1088,9 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type,
 					     src_type == VM_MEM_SRC_SHARED_HUGETLB);
 	}
 
+	TEST_ASSERT(region->fd == -1 || backing_src_is_shared(src_type),
+		    "A valid fd provided to mmap() must be accompanied by MAP_SHARED.");
+
 	region->mmap_start = __kvm_mmap(region->mmap_size, PROT_READ | PROT_WRITE,
 					vm_mem_backing_src_alias(src_type)->flag,
 					region->fd, mmap_offset);

-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v7 41/42] KVM: selftests: Add script to exercise private_mem_conversions_test
From: Ackerley Tng via B4 Relay @ 2026-05-23  0:18 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>

From: Ackerley Tng <ackerleytng@google.com>

Add a wrapper script to simplify running the private_mem_conversions_test
with a variety of configurations. Manually invoking the test for all
supported memory backing source types is tedious.

The script automatically detects the availability of 2MB and 1GB hugepages
and builds a list of source types to test. It then iterates through the
list, running the test for each type with both a single memslot and
multiple memslots.

This makes it easier to get comprehensive test coverage across different
memory configurations.

Add and use a helper program in C to be able to read
KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES as defined in header files and then
issue the ioctl to read the KVM CAP.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 tools/testing/selftests/kvm/Makefile.kvm           |   4 +
 .../selftests/kvm/kvm_has_gmem_attributes.c        |  17 +++
 .../kvm/x86/private_mem_conversions_test.sh        | 128 +++++++++++++++++++++
 3 files changed, 149 insertions(+)

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 6232881be500a..e5769268936a7 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -54,6 +54,7 @@ LIBKVM_loongarch += lib/loongarch/exception.S
 
 # Non-compiled test targets
 TEST_PROGS_x86 += x86/nx_huge_pages_test.sh
+TEST_PROGS_x86 += x86/private_mem_conversions_test.sh
 
 # Compiled test targets valid on all architectures with libkvm support
 TEST_GEN_PROGS_COMMON = demand_paging_test
@@ -67,6 +68,8 @@ TEST_GEN_PROGS_COMMON += set_memory_region_test
 TEST_GEN_PROGS_COMMON += memslot_modification_stress_test
 TEST_GEN_PROGS_COMMON += memslot_perf_test
 
+TEST_GEN_PROGS_EXTENDED_COMMON += kvm_has_gmem_attributes
+
 # Compiled test targets
 TEST_GEN_PROGS_x86 = $(TEST_GEN_PROGS_COMMON)
 TEST_GEN_PROGS_x86 += x86/cpuid_test
@@ -245,6 +248,7 @@ SPLIT_TESTS += get-reg-list
 
 TEST_PROGS += $(TEST_PROGS_$(ARCH))
 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(ARCH))
+TEST_GEN_PROGS_EXTENDED += $(TEST_GEN_PROGS_EXTENDED_COMMON)
 TEST_GEN_PROGS_EXTENDED += $(TEST_GEN_PROGS_EXTENDED_$(ARCH))
 LIBKVM += $(LIBKVM_$(ARCH))
 
diff --git a/tools/testing/selftests/kvm/kvm_has_gmem_attributes.c b/tools/testing/selftests/kvm/kvm_has_gmem_attributes.c
new file mode 100644
index 0000000000000..4f361349412fb
--- /dev/null
+++ b/tools/testing/selftests/kvm/kvm_has_gmem_attributes.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Utility to check if KVM supports guest_memfd attributes.
+ *
+ * Copyright (C) 2025, Google LLC.
+ */
+
+#include <stdio.h>
+
+#include "kvm_util.h"
+
+int main(void)
+{
+	printf("%u\n", kvm_check_cap(KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES) > 0);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.sh b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.sh
new file mode 100755
index 0000000000000..7179a4fcdd498
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Wrapper script which runs different test setups of
+# private_mem_conversions_test.
+#
+# Copyright (C) 2025, Google LLC.
+
+NUM_VCPUS_TO_TEST=4
+NUM_MEMSLOTS_TO_TEST=$NUM_VCPUS_TO_TEST
+
+# Required pages are based on the test setup in the C code.
+REQUIRED_NUM_2M_HUGEPAGES=$((1024 * NUM_VCPUS_TO_TEST))
+REQUIRED_NUM_1G_HUGEPAGES=$((2 * NUM_VCPUS_TO_TEST))
+
+get_hugepage_count() {
+    local page_size_kb=$1
+    local path="/sys/kernel/mm/hugepages/hugepages-${page_size_kb}kB/nr_hugepages"
+    if [ -f "$path" ]; then
+        cat "$path"
+    else
+        echo 0
+    fi
+}
+
+get_default_hugepage_size_in_kb() {
+    local size=$(grep "Hugepagesize:" /proc/meminfo | awk '{print $2}')
+    echo "$size"
+}
+
+run_tests() {
+    local executable_path=$1
+    local src_type=$2
+    local num_memslots=$3
+    local num_vcpus=$4
+
+    echo "$executable_path -s $src_type -m $num_memslots -n $num_vcpus"
+    "$executable_path" -s "$src_type" -m "$num_memslots" -n "$num_vcpus"
+}
+
+script_dir=$(dirname "$(realpath "$0")")
+test_executable="${script_dir}/private_mem_conversions_test"
+kvm_has_gmem_attributes_tool="${script_dir}/../kvm_has_gmem_attributes"
+
+if [ ! -f "$test_executable" ]; then
+    echo "Error: Test executable not found at '$test_executable'" >&2
+    exit 1
+fi
+
+if [ ! -f "$kvm_has_gmem_attributes_tool" ]; then
+    echo "Error: kvm_has_gmem_attributes utility not found at '$kvm_has_gmem_attributes_tool'" >&2
+    exit 1
+fi
+
+kvm_has_gmem_attributes=$("$kvm_has_gmem_attributes_tool" | tail -n1)
+
+if [ "$kvm_has_gmem_attributes" -eq 1 ]; then
+    backing_src_types=("shmem")
+else
+    hugepage_2mb_count=$(get_hugepage_count 2048)
+    hugepage_2mb_enabled=$((hugepage_2mb_count >= REQUIRED_NUM_2M_HUGEPAGES))
+    hugepage_1gb_count=$(get_hugepage_count 1048576)
+    hugepage_1gb_enabled=$((hugepage_1gb_count >= REQUIRED_NUM_1G_HUGEPAGES))
+
+    default_hugepage_size_kb=$(get_default_hugepage_size_in_kb)
+    hugepage_default_enabled=0
+    if [ "$default_hugepage_size_kb" -eq 2048 ]; then
+        hugepage_default_enabled=$hugepage_2mb_enabled
+    elif [ "$default_hugepage_size_kb" -eq 1048576 ]; then
+        hugepage_default_enabled=$hugepage_1gb_enabled
+    fi
+
+    backing_src_types=("anonymous" "anonymous_thp")
+
+    if [ "$hugepage_default_enabled" -eq 1 ]; then
+        backing_src_types+=("anonymous_hugetlb")
+    else
+        echo "skipping anonymous_hugetlb backing source type"
+    fi
+
+    if [ "$hugepage_2mb_enabled" -eq 1 ]; then
+        backing_src_types+=("anonymous_hugetlb_2mb")
+    else
+        echo "skipping anonymous_hugetlb_2mb backing source type"
+    fi
+
+    if [ "$hugepage_1gb_enabled" -eq 1 ]; then
+        backing_src_types+=("anonymous_hugetlb_1gb")
+    else
+        echo "skipping anonymous_hugetlb_1gb backing source type"
+    fi
+
+    backing_src_types+=("shmem")
+
+    if [ "$hugepage_default_enabled" -eq 1 ]; then
+        backing_src_types+=("shared_hugetlb")
+    else
+        echo "skipping shared_hugetlb backing source type"
+    fi
+fi
+
+return_code=0
+for i in "${!backing_src_types[@]}"; do
+    src_type=${backing_src_types[$i]}
+    if [ "$i" -gt 0 ]; then
+        echo
+    fi
+
+    if ! run_tests "$test_executable" "$src_type" 1 1; then
+        return_code=$?
+        echo "Test failed for source type '$src_type'. Arguments: -s $src_type -m 1 -n 1" >&2
+        break
+    fi
+
+    if ! run_tests "$test_executable" "$src_type" 1 "$NUM_VCPUS_TO_TEST"; then
+        return_code=$?
+        echo "Test failed for source type '$src_type'. Arguments: -s $src_type -m 1 -n $NUM_VCPUS_TO_TEST" >&2
+        break
+    fi
+
+    if ! run_tests "$test_executable" "$src_type" "$NUM_MEMSLOTS_TO_TEST" "$NUM_VCPUS_TO_TEST"; then
+        return_code=$?
+        echo "Test failed for source type '$src_type'. Arguments: -s $src_type -m $NUM_MEMSLOTS_TO_TEST -n $NUM_VCPUS_TO_TEST" >&2
+        break
+    fi
+done
+
+exit "$return_code"

-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v7 40/42] KVM: selftests: Update private_mem_conversions_test to mmap() guest_memfd
From: Ackerley Tng via B4 Relay @ 2026-05-23  0:18 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>

From: Ackerley Tng <ackerleytng@google.com>

Update the private memory conversions selftest to also test conversions
that are done "in-place" via per-guest_memfd memory attributes. In-place
conversions require the host to be able to mmap() the guest_memfd so that
the host and guest can share the same backing physical memory.

This includes several updates, that are conditioned on the system
supporting per-guest_memfd attributes (kvm_has_gmem_attributes):

1. Set up guest_memfd requesting MMAP and INIT_SHARED.

2. With in-place conversions, the host's mapping points directly to the
   guest's memory. When the guest converts a region to private, host access
   to that region is blocked. Update the test to expect a SIGBUS when
   attempting to access the host virtual address (HVA) of private memory.

3. Use vm_mem_set_memory_attributes(), which chooses how to set memory
   attributes based on whether kvm_has_gmem_attributes.

Restrict the test to using VM_MEM_SRC_SHMEM because guest_memfd's required
mmap() flags and page sizes happens to align with those of
VM_MEM_SRC_SHMEM. As long as VM_MEM_SRC_SHMEM is used for src_type,
vm_mem_add() works as intended.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 .../kvm/x86/private_mem_conversions_test.c         | 44 ++++++++++++++++++----
 1 file changed, 36 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
index 289ad10063fca..4308c67952310 100644
--- a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
+++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c
@@ -306,9 +306,12 @@ static void handle_exit_hypercall(struct kvm_vcpu *vcpu)
 	if (do_fallocate)
 		vm_guest_mem_fallocate(vm, gpa, size, map_shared);
 
-	if (set_attributes)
-		vm_set_memory_attributes(vm, gpa, size,
-					 map_shared ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE);
+	if (set_attributes) {
+		u64 attrs = map_shared ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+		vm_mem_set_memory_attributes(vm, gpa, size, attrs);
+	}
+
 	run->hypercall.ret = 0;
 }
 
@@ -352,8 +355,20 @@ static void *__test_mem_conversions(void *__vcpu)
 				size_t nr_bytes = min_t(size_t, vm->page_size, size - i);
 				u8 *hva = addr_gpa2hva(vm, gpa + i);
 
-				/* In all cases, the host should observe the shared data. */
-				memcmp_h(hva, gpa + i, uc.args[3], nr_bytes);
+				/*
+				 * When using per-guest_memfd memory attributes,
+				 * i.e. in-place conversion, host accesses will
+				 * point at guest memory and should SIGBUS when
+				 * guest memory is private.  When using per-VM
+				 * attributes, i.e. separate backing for shared
+				 * vs. private, the host should always observe
+				 * the shared data.
+				 */
+				if (kvm_has_gmem_attributes &&
+				    uc.args[0] == SYNC_PRIVATE)
+					TEST_EXPECT_SIGBUS(READ_ONCE(*hva));
+				else
+					memcmp_h(hva, gpa + i, uc.args[3], nr_bytes);
 
 				/* For shared, write the new pattern to guest memory. */
 				if (uc.args[0] == SYNC_SHARED)
@@ -382,6 +397,7 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, u32 nr_v
 	const size_t slot_size = memfd_size / nr_memslots;
 	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
 	pthread_t threads[KVM_MAX_VCPUS];
+	u64 gmem_flags;
 	struct kvm_vm *vm;
 	int memfd, i;
 
@@ -397,12 +413,17 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, u32 nr_v
 
 	vm_enable_cap(vm, KVM_CAP_EXIT_HYPERCALL, (1 << KVM_HC_MAP_GPA_RANGE));
 
-	memfd = vm_create_guest_memfd(vm, memfd_size, 0);
+	if (kvm_has_gmem_attributes)
+		gmem_flags = GUEST_MEMFD_FLAG_MMAP | GUEST_MEMFD_FLAG_INIT_SHARED;
+	else
+		gmem_flags = 0;
+
+	memfd = vm_create_guest_memfd(vm, memfd_size, gmem_flags);
 
 	for (i = 0; i < nr_memslots; i++)
 		vm_mem_add(vm, src_type, BASE_DATA_GPA + slot_size * i,
 			   BASE_DATA_SLOT + i, slot_size / vm->page_size,
-			   KVM_MEM_GUEST_MEMFD, memfd, slot_size * i, 0);
+			   KVM_MEM_GUEST_MEMFD, memfd, slot_size * i, gmem_flags);
 
 	for (i = 0; i < nr_vcpus; i++) {
 		gpa_t gpa =  BASE_DATA_GPA + i * per_cpu_size;
@@ -452,17 +473,24 @@ static void usage(const char *cmd)
 
 int main(int argc, char *argv[])
 {
-	enum vm_mem_backing_src_type src_type = DEFAULT_VM_MEM_SRC;
+	enum vm_mem_backing_src_type src_type;
 	u32 nr_memslots = 1;
 	u32 nr_vcpus = 1;
 	int opt;
 
 	TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM));
 
+	src_type = kvm_has_gmem_attributes ? VM_MEM_SRC_SHMEM :
+					     DEFAULT_VM_MEM_SRC;
+
 	while ((opt = getopt(argc, argv, "hm:s:n:")) != -1) {
 		switch (opt) {
 		case 's':
 			src_type = parse_backing_src_type(optarg);
+			TEST_ASSERT(!kvm_has_gmem_attributes ||
+				    src_type == VM_MEM_SRC_SHMEM,
+				    "Testing in-place conversions, only %s mem_type supported\n",
+				    vm_mem_backing_src_alias(VM_MEM_SRC_SHMEM)->name);
 			break;
 		case 'n':
 			nr_vcpus = atoi_positive("nr_vcpus", optarg);

-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v7 39/42] KVM: selftests: Make TEST_EXPECT_SIGBUS thread-safe
From: Ackerley Tng via B4 Relay @ 2026-05-23  0:18 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>

From: Ackerley Tng <ackerleytng@google.com>

The TEST_EXPECT_SIGBUS macro is not thread-safe as it uses a global
sigjmp_buf and installs a global SIGBUS signal handler. If multiple threads
execute the macro concurrently, they will race on installing the signal
handler and stomp on other threads' jump buffers, leading to incorrect test
behavior.

Make TEST_EXPECT_SIGBUS thread-safe with the following changes:

Share the KVM tests' global signal handler. sigaction() applies to all
threads; without sharing a global signal handler, one thread may have
removed the signal handler that another thread added, hence leading to
unexpected signals.

The alternative of layering signal handlers was considered, but calling
sigaction() within TEST_EXPECT_SIGBUS() necessarily creates a race. To
avoid adding new setup and teardown routines to do sigaction() and keep
usage of TEST_EXPECT_SIGBUS() simple, share the KVM tests' global signal
handler.

Opportunistically rename report_unexpected_signal to
catchall_signal_handler.

To continue to only expect SIGBUS within specific regions of code, use a
thread-specific variable, expecting_sigbus, to replace installing and
removing signal handlers.

Make the execution environment for the thread, sigjmp_buf, a
thread-specific variable.

As part of TEST_EXPECT_SIGBUS(), assert the prerequisite for this setup,
that the current signal handler is the catchall_signal_handler.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 tools/testing/selftests/kvm/include/test_util.h | 32 +++++++++++++------------
 tools/testing/selftests/kvm/lib/kvm_util.c      | 18 ++++++++++----
 tools/testing/selftests/kvm/lib/test_util.c     |  7 ------
 3 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
index c280c3233f502..c9ba4e010f0b8 100644
--- a/tools/testing/selftests/kvm/include/test_util.h
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -82,21 +82,23 @@ do {									\
 	__builtin_unreachable(); \
 } while (0)
 
-extern sigjmp_buf expect_sigbus_jmpbuf;
-void expect_sigbus_handler(int signum);
-
-#define TEST_EXPECT_SIGBUS(action)						\
-do {										\
-	struct sigaction sa_old, sa_new = {					\
-		.sa_handler = expect_sigbus_handler,				\
-	};									\
-										\
-	sigaction(SIGBUS, &sa_new, &sa_old);					\
-	if (sigsetjmp(expect_sigbus_jmpbuf, 1) == 0) {				\
-		action;								\
-		TEST_FAIL("'%s' should have triggered SIGBUS", #action);	\
-	}									\
-	sigaction(SIGBUS, &sa_old, NULL);					\
+extern __thread sigjmp_buf expect_sigbus_jmpbuf;
+extern __thread volatile sig_atomic_t expecting_sigbus;
+extern void catchall_signal_handler(int signum);
+
+#define TEST_EXPECT_SIGBUS(action)					\
+do {									\
+	struct sigaction __sa = {};					\
+									\
+	TEST_ASSERT_EQ(sigaction(SIGBUS, NULL, &__sa), 0);		\
+	TEST_ASSERT_EQ(__sa.sa_handler, &catchall_signal_handler);	\
+									\
+	expecting_sigbus = true;					\
+	if (sigsetjmp(expect_sigbus_jmpbuf, 1) == 0) {			\
+		action;							\
+		TEST_FAIL("'%s' should have triggered SIGBUS", #action);\
+	}								\
+	expecting_sigbus = false;					\
 } while (0)
 
 size_t parse_size(const char *size);
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 21c7e52a2bdac..a7725fff58b46 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -2270,13 +2270,20 @@ __weak void kvm_selftest_arch_init(void)
 {
 }
 
-static void report_unexpected_signal(int signum)
+__thread sigjmp_buf expect_sigbus_jmpbuf;
+__thread volatile sig_atomic_t expecting_sigbus;
+
+void catchall_signal_handler(int signum)
 {
+	switch (signum) {
+	case SIGBUS: {
+		if (expecting_sigbus)
+			siglongjmp(expect_sigbus_jmpbuf, 1);
+
+		TEST_FAIL("Unexpected SIGBUS (%d)\n", signum);
+	}
 #define KVM_CASE_SIGNUM(sig)					\
 	case sig: TEST_FAIL("Unexpected " #sig " (%d)\n", signum)
-
-	switch (signum) {
-	KVM_CASE_SIGNUM(SIGBUS);
 	KVM_CASE_SIGNUM(SIGSEGV);
 	KVM_CASE_SIGNUM(SIGILL);
 	KVM_CASE_SIGNUM(SIGFPE);
@@ -2288,12 +2295,13 @@ static void report_unexpected_signal(int signum)
 void __attribute((constructor)) kvm_selftest_init(void)
 {
 	struct sigaction sig_sa = {
-		.sa_handler = report_unexpected_signal,
+		.sa_handler = catchall_signal_handler,
 	};
 
 	/* Tell stdout not to buffer its content. */
 	setbuf(stdout, NULL);
 
+	expecting_sigbus = false;
 	sigaction(SIGBUS, &sig_sa, NULL);
 	sigaction(SIGSEGV, &sig_sa, NULL);
 	sigaction(SIGILL, &sig_sa, NULL);
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
index bab1bd2b775b6..30eb701e4becd 100644
--- a/tools/testing/selftests/kvm/lib/test_util.c
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -18,13 +18,6 @@
 
 #include "test_util.h"
 
-sigjmp_buf expect_sigbus_jmpbuf;
-
-void __attribute__((used)) expect_sigbus_handler(int signum)
-{
-	siglongjmp(expect_sigbus_jmpbuf, 1);
-}
-
 /*
  * Random number generator that is usable from guest code. This is the
  * Park-Miller LCG using standard constants.

-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* [PATCH v7 42/42] KVM: selftests: Update private memory exits test to work with per-gmem attributes
From: Ackerley Tng via B4 Relay @ 2026-05-23  0:18 UTC (permalink / raw)
  To: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, liam, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka
  Cc: kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco, Ackerley Tng
In-Reply-To: <20260522-gmem-inplace-conversion-v7-0-2f0fae496530@google.com>

From: Sean Christopherson <seanjc@google.com>

Skip setting memory to private in the private memory exits test when using
per-gmem memory attributes, as memory is initialized to private by default
for guest_memfd, and using vm_mem_set_private() on a guest_memfd instance
requires creating guest_memfd with GUEST_MEMFD_FLAG_MMAP (which is totally
doable, but would need to be conditional and is ultimately unnecessary).

Expect an emulated MMIO instead of a memory fault exit when attributes are
per-gmem, as deleting the memslot effectively drops the private status,
i.e. the GPA becomes shared and thus supports emulated MMIO.

Skip the "memslot not private" test entirely, as private vs. shared state
for x86 software-protected VMs comes from the memory attributes themselves,
and so when doing in-place conversions there can never be a disconnect
between the expected and actual states.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 .../selftests/kvm/x86/private_mem_kvm_exits_test.c | 36 ++++++++++++++++++----
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c b/tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c
index 10db9fe6d9063..70ed16066c63e 100644
--- a/tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c
+++ b/tools/testing/selftests/kvm/x86/private_mem_kvm_exits_test.c
@@ -62,8 +62,9 @@ static void test_private_access_memslot_deleted(void)
 
 	virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES);
 
-	/* Request to access page privately */
-	vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE);
+	/* Request to access page privately. */
+	if (!kvm_has_gmem_attributes)
+		vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE);
 
 	pthread_create(&vm_thread, NULL,
 		       (void *(*)(void *))run_vcpu_get_exit_reason,
@@ -74,10 +75,26 @@ static void test_private_access_memslot_deleted(void)
 	pthread_join(vm_thread, &thread_return);
 	exit_reason = (u32)(u64)thread_return;
 
-	TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT);
-	TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE);
-	TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA);
-	TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE);
+	/*
+	 * If attributes are tracked per-gmem, deleting the memslot that points
+	 * at the gmem instance effectively makes the memory shared, and so the
+	 * read should trigger emulated MMIO.
+	 *
+	 * If attributes are tracked per-VM, deleting the memslot shouldn't
+	 * affect the private attribute, and so KVM should generate a memory
+	 * fault exit (emulated MMIO on private GPAs is disallowed).
+	 */
+	if (kvm_has_gmem_attributes) {
+		TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MMIO);
+		TEST_ASSERT_EQ(vcpu->run->mmio.phys_addr, EXITS_TEST_GPA);
+		TEST_ASSERT_EQ(vcpu->run->mmio.len, sizeof(u64));
+		TEST_ASSERT_EQ(vcpu->run->mmio.is_write, false);
+	} else {
+		TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT);
+		TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE);
+		TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA);
+		TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE);
+	}
 
 	kvm_vm_free(vm);
 }
@@ -88,6 +105,13 @@ static void test_private_access_memslot_not_private(void)
 	struct kvm_vcpu *vcpu;
 	u32 exit_reason;
 
+	/*
+	 * Accessing non-private memory as private with a software-protected VM
+	 * isn't possible when doing in-place conversions.
+	 */
+	if (kvm_has_gmem_attributes)
+		return;
+
 	vm = vm_create_shape_with_one_vcpu(protected_vm_shape, &vcpu,
 					   guest_repeatedly_read);
 

-- 
2.54.0.794.g4f17f83d09-goog



^ permalink raw reply related

* Re: [PATCH] x86/tdx: Fix zero-extension for CPUID emulation
From: Christian Ludloff @ 2026-05-23  0:40 UTC (permalink / raw)
  To: Kiryl Shutsemau
  Cc: Dave Hansen, Edgecombe, Rick P, linux-coco@lists.linux.dev,
	clopez@suse.de, x86@kernel.org, ak@linux.intel.com, bp@alien8.de,
	dave.hansen@linux.intel.com, hpa@zytor.com, mingo@redhat.com,
	linux-kernel@vger.kernel.org, Luck, Tony, tglx@kernel.org,
	stable@vger.kernel.org, kvm@vger.kernel.org

On Tue, May 12, 2026 at 03:14:54PM -0700, Dave Hansen wrote:
> CPUID (the instruction) is defined to fill in eax/ebx/ecx/edx.

In the original x64 spec CPUID inherited 32-bit op size from
the pre-x64 days, and although established leaves might all
have followed that definition, the ISA per se doesn't prohibit
an implementation that allows, or defaults to, 64-bit op size.

Having made that statement... the same does go for MSRs.

> Those are 32-bit registers so the normal register rules apply:
> "32-bit operands generate a 32-bit result, zero-extended to a
> 64-bit result in the destination general-purpose register."

...in PM64 ...while outside PM64 and across mode switches
the upper 32 bits are explicitly undefined. Needless to say...
SMM and then VMX and SVM had to violate that to function.

> So a properly-behaving CPUID implementation will always end
> up with the top 32 bits empty on the four CPUID registers after
> a CPUID is executed.

True for a "32-bit op size" implementation. Maybe insert that.

--
C.

^ permalink raw reply

* Re: [RFC PATCH 15/15] x86/virt/tdx: Enable TDX Quoting extension
From: Tony Lindgren @ 2026-05-25  5:17 UTC (permalink / raw)
  To: Xu Yilun
  Cc: kas, djbw, rick.p.edgecombe, x86, peter.fang, linux-coco,
	linux-kernel, kvm, sohil.mehta, yilun.xu, baolu.lu,
	zhenzhong.duan, xiaoyao.li
In-Reply-To: <20260522034128.3144354-16-yilun.xu@linux.intel.com>

On Fri, May 22, 2026 at 11:41:28AM +0800, Xu Yilun wrote:
> From: Peter Fang <peter.fang@intel.com>
> 
> TDX Module updates global metadata when add-on features are enabled.
> Host should update the cached tdx_sysinfo to reflect these changes.

This should be made clearer IMO. How about mention that get_tdx_sys_info()
needs to get called again to reload the TDX module global metadata?
 
> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -1049,6 +1049,7 @@ static __init int construct_tdmrs(struct list_head *tmb_list,
>  static __init int config_tdx_module(struct tdmr_info_list *tdmr_list,
>  				    u64 global_keyid)
>  {
> +	u64 seamcall_fn = TDH_SYS_CONFIG_V0;
>  	struct tdx_module_args args = {};
>  	u64 *tdmr_pa_array;
>  	size_t array_sz;
> @@ -1074,8 +1075,22 @@ static __init int config_tdx_module(struct tdmr_info_list *tdmr_list,
>  	args.rcx = __pa(tdmr_pa_array);
>  	args.rdx = tdmr_list->nr_consumed_tdmrs;
>  	args.r8 = global_keyid;
> -	ret = seamcall_prerr(TDH_SYS_CONFIG, &args);
>  
> +	if (tdx_sysinfo.features.tdx_features0 & TDX_FEATURES0_QUOTE) {
> +		args.r9 |= TDX_FEATURES0_QUOTE;
> +		/* These parameters require version >= 1 */
> +		seamcall_fn = TDH_SYS_CONFIG;
> +	}
> +
> +	ret = seamcall_prerr(seamcall_fn, &args);
> +	if (ret)
> +		goto free_tdmr;
> +
> +	/* enabling TDX Quoting may change tdx_sysinfo, update it */
> +	if (tdx_sysinfo.features.tdx_features0 & TDX_FEATURES0_QUOTE)
> +		ret = get_tdx_sys_info(&tdx_sysinfo);

The comment above helps, but the change in the handling will be easy to
miss.

> +free_tdmr:
>  	/* Free the array as it is not required anymore. */
>  	kfree(tdmr_pa_array);
>  

So I think it would be good to also add a comment to get_tdx_sys_info()
to make it easier for folks to follow that it may get called multiple
times.

Regards,

Tony

^ permalink raw reply

* Re: [PATCH 04/15] x86/virt/tdx: Enable the Extensions right after basic TDX Module init
From: Tony Lindgren @ 2026-05-25  6:00 UTC (permalink / raw)
  To: Xu Yilun
  Cc: kas, djbw, rick.p.edgecombe, x86, peter.fang, linux-coco,
	linux-kernel, kvm, sohil.mehta, yilun.xu, baolu.lu,
	zhenzhong.duan, xiaoyao.li
In-Reply-To: <20260522034128.3144354-5-yilun.xu@linux.intel.com>

On Fri, May 22, 2026 at 11:41:17AM +0800, Xu Yilun wrote:
> The detailed initialization flow for TDX Module Extensions has been
> fully implemented. Enable the flow after basic TDX Module
> initialization.
> 
> Theoretically, the Extensions doesn't need to be enabled right after
> basic TDX initialization. It could be enabled right before the first
> Extension SEAMCALL is issued. That would save or postpone memory usage.
> But it isn't worth the complexity, the needs for the Extensions are vast
> but the savings are little for a typical TDX capable system (about
> 0.001% of memory). So the Linux decision is to just enable it along with
> the basic TDX.
> 
> Note that the Extensions initialization flow will still not start if no
> add-on features require Extensions. The enabling of add-on features will
> be in later patches. Until then, the system hasn't consumed extra memory.

Looking at patch 15/15, we need to reload the TDX module metadata at least
for the attestation. We need to do that early, so to me it seems that
everything can be just tagged __init from the start.

So you can just call init_tdx_ext() in patch 3/15, and this patch is not
needed at all?

Regards,

Tony

^ permalink raw reply

* Re: [PATCH 01/15] x86/virt/tdx: Read global metadata for TDX Module Extensions
From: Xiaoyao Li @ 2026-05-25  6:24 UTC (permalink / raw)
  To: Xu Yilun, kas, djbw, rick.p.edgecombe, x86, peter.fang
  Cc: linux-coco, linux-kernel, kvm, sohil.mehta, yilun.xu, baolu.lu,
	zhenzhong.duan
In-Reply-To: <20260522034128.3144354-2-yilun.xu@linux.intel.com>

On 5/22/2026 11:41 AM, Xu Yilun wrote:
> Add reading of the global metadata for TDX Module Extensions.
> 
> TDX Module Extensions is an add-on feature enumerated by TDX_FEATURES0.

> But for the Module's integrity, Linux requires that all features that a
> Module advertises must have a complete, valid set of metadata, 

I doubt on this.

1. Is it a must that any new feature introduces new metadata field?

2. Linux only cares the integrity for the features it uses, not for all 
the features.

> and the
> validation must succeed at core TDX initialization time.
> 
> Check TDX_FEATURES0 before reading these metadata. If a feature is
> advertised, a failure in reading associated metadata causes the entire
> TDX initialization to fail, otherwise skip.

I'm not sure why we need to explain the behavior when the reading fails. 
It's not different to other existing fields.

Instead, I think you can explain why we need to check TDX_FEATURES0_EXT 
at first.

Anyway, I don't read it as a good changelog. It event doesn't tell what 
the added fields are and why we need them.

^ permalink raw reply

* Re: [PATCH 01/15] x86/virt/tdx: Read global metadata for TDX Module Extensions
From: Xiaoyao Li @ 2026-05-25  6:54 UTC (permalink / raw)
  To: Xu Yilun, kas, djbw, rick.p.edgecombe, x86, peter.fang
  Cc: linux-coco, linux-kernel, kvm, sohil.mehta, yilun.xu, baolu.lu,
	zhenzhong.duan
In-Reply-To: <20260522034128.3144354-2-yilun.xu@linux.intel.com>

On 5/22/2026 11:41 AM, Xu Yilun wrote:
...
> +static __init int get_tdx_sys_info_ext(struct tdx_sys_info_ext *sysinfo_ext)
> +{
> +	int ret = 0;
> +	u64 val;
> +
> +	if (!ret && !(ret = read_sys_metadata_field(0x3100000100000000, &val)))
> +		sysinfo_ext->memory_pool_required_pages = val;
> +	if (!ret && !(ret = read_sys_metadata_field(0x3100000000000001, &val)))
> +		sysinfo_ext->ext_required = val;
> +
> +	return ret;
> +}
> +
>   static __init int get_tdx_sys_info(struct tdx_sys_info *sysinfo)
>   {
>   	int ret = 0;
> @@ -116,5 +129,8 @@ static __init int get_tdx_sys_info(struct tdx_sys_info *sysinfo)
>   	ret = ret ?: get_tdx_sys_info_td_ctrl(&sysinfo->td_ctrl);
>   	ret = ret ?: get_tdx_sys_info_td_conf(&sysinfo->td_conf);
>   
> +	if (sysinfo->features.tdx_features0 & TDX_FEATURES0_EXT)
> +		ret = ret ?: get_tdx_sys_info_ext(&sysinfo->ext);

Is it correct to read "memory_pool_required_pages" and "ext_required" so 
early in get_tdx_sys_info()? get_tdx_sys_info() is called before 
config_tdx_module() which calls TDH.SYS.CONFIG.

If I read the TDX module base spec correctly, the amount of memory for 
extensions and EXT_REQUIRED field depends on the enabled features, which 
is determined by TDH.SYS.CONFIG/TDH.SYS.UPDATE ?

>   	return ret;
>   }


^ permalink raw reply

* Re: [PATCH v14 06/44] arm64: RMI: Check for RMI support at init
From: Gavin Shan @ 2026-05-25  6:58 UTC (permalink / raw)
  To: Steven Price, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <78425c0d-86c5-457f-b171-a4c8dd3acb7d@arm.com>

Hi Steve,

On 5/22/26 1:49 AM, Steven Price wrote:
> On 21/05/2026 01:39, Gavin Shan wrote:
>> On 5/13/26 11:17 PM, Steven Price wrote:
>>> Query the RMI version number and check if it is a compatible version.
>>> The first two feature registers are read and exposed for future code to
>>> use.
>>>
>>> Signed-off-by: Steven Price <steven.price@arm.com>
>>> ---
>>> v14:
>>>    * This moves the basic RMI setup into the 'kernel' directory. This is
>>>      because RMI will be used for some features outside of KVM so should
>>>      be available even if KVM isn't compiled in.
>>> ---
>>>    arch/arm64/include/asm/rmi_cmds.h |  3 ++
>>>    arch/arm64/kernel/Makefile        |  2 +-
>>>    arch/arm64/kernel/cpufeature.c    |  1 +
>>>    arch/arm64/kernel/rmi.c           | 65 +++++++++++++++++++++++++++++++
>>>    4 files changed, 70 insertions(+), 1 deletion(-)
>>>    create mode 100644 arch/arm64/kernel/rmi.c
>>>
>>
>> [...]
>>
>>> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
>>> new file mode 100644
>>> index 000000000000..99c1ccc35c11
>>> --- /dev/null
>>> +++ b/arch/arm64/kernel/rmi.c
>>> @@ -0,0 +1,65 @@
>>> +// SPDX-License-Identifier: GPL-2.0
>>> +/*
>>> + * Copyright (C) 2023-2025 ARM Ltd.
>>> + */
>>> +
>>> +#include <linux/memblock.h>
>>> +
>>> +#include <asm/rmi_cmds.h>
>>> +
>>> +unsigned long rmm_feat_reg0;
>>> +unsigned long rmm_feat_reg1;
>>> +
>>> +static int rmi_check_version(void)
>>> +{
>>> +    struct arm_smccc_res res;
>>> +    unsigned short version_major, version_minor;
>>> +    unsigned long host_version = RMI_ABI_VERSION(RMI_ABI_MAJOR_VERSION,
>>> +                             RMI_ABI_MINOR_VERSION);
>>> +    unsigned long aa64pfr0 =
>>> read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
>>> +
>>> +    /* If RME isn't supported, then RMI can't be */
>>> +    if (cpuid_feature_extract_unsigned_field(aa64pfr0,
>>> ID_AA64PFR0_EL1_RME_SHIFT) == 0)
>>> +        return -ENXIO;
>>> +
>>> +    arm_smccc_1_1_invoke(SMC_RMI_VERSION, host_version, &res);
>>> +
>>> +    if (res.a0 == SMCCC_RET_NOT_SUPPORTED)
>>> +        return -ENXIO;
>>> +
>>> +    version_major = RMI_ABI_VERSION_GET_MAJOR(res.a1);
>>> +    version_minor = RMI_ABI_VERSION_GET_MINOR(res.a1);
>>> +
>>> +    if (res.a0 != RMI_SUCCESS) {
>>> +        unsigned short high_version_major, high_version_minor;
>>> +
>>> +        high_version_major = RMI_ABI_VERSION_GET_MAJOR(res.a2);
>>> +        high_version_minor = RMI_ABI_VERSION_GET_MINOR(res.a2);
>>> +
>>> +        pr_err("Unsupported RMI ABI (v%d.%d - v%d.%d) we want v%d.%d\n",
>>> +               version_major, version_minor,
>>> +               high_version_major, high_version_minor,
>>> +               RMI_ABI_MAJOR_VERSION,
>>> +               RMI_ABI_MINOR_VERSION);
>>> +        return -ENXIO;
>>> +    }
>>> +
>>> +    pr_info("RMI ABI version %d.%d\n", version_major, version_minor);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int __init arm64_init_rmi(void)
>>> +{
>>> +    /* Continue without realm support if we can't agree on a version */
>>> +    if (rmi_check_version())
>>> +        return 0;
>>
>> Is this still a valid point that we have to return zero on errors returned
>> from rmi_check_version() or other other function calls like rmi_features()?
>> arm64_init_rmi() is triggered by subsys_initcall() where the return value
>> needs to indicate success or failure. It's fine to return error code from
>> arm64_init_rmi() in the path.
> 
> Hmm, I guess now this is moved to arm64 code this indeed doesn't need
> to. Within a module I believe an error return can fail the module loading.
> 
> I'm not sure it really makes much difference though - if this
> initialisation fails then it's not really an error - it just means the
> feature is unavailable.
> 

I think the return value would be consistent to the value of 'arm64_rmi_is_available'.
'arm64_rmi_is_available' is true when zero is returned, otherwise, 'arm64_rmi_is_available'
is false.

With the consistency between the return value and 'arm64_rmi_is_available', users are
able to know the value of 'arm64_rmi_is_available' through kernel parameter 'initcall_debug'.
With the kernel parameter, the initcalls including arm64_init_rmi() are traced and its
return value is outputted in the traced messages, seeing do_trace_initcall_start().

> Thanks,
> Steve
> 
>>> +
>>> +    if (WARN_ON(rmi_features(0, &rmm_feat_reg0)))
>>> +        return 0;
>>> +    if (WARN_ON(rmi_features(1, &rmm_feat_reg1)))
>>> +        return 0;
>>> +
>>> +    return 0;
>>> +}
>>> +subsys_initcall(arm64_init_rmi);
>>

Thanks,
Gavin


^ permalink raw reply

* Re: [PATCH 04/15] x86/virt/tdx: Enable the Extensions right after basic TDX Module init
From: Xiaoyao Li @ 2026-05-25  8:05 UTC (permalink / raw)
  To: Xu Yilun, kas, djbw, rick.p.edgecombe, x86, peter.fang
  Cc: linux-coco, linux-kernel, kvm, sohil.mehta, yilun.xu, baolu.lu,
	zhenzhong.duan
In-Reply-To: <20260522034128.3144354-5-yilun.xu@linux.intel.com>

On 5/22/2026 11:41 AM, Xu Yilun wrote:
> The detailed initialization flow for TDX Module Extensions has been
> fully implemented. Enable the flow after basic TDX Module
> initialization.
> 
> Theoretically, the Extensions doesn't need to be enabled right after
> basic TDX initialization. It could be enabled right before the first
> Extension SEAMCALL is issued. That would save or postpone memory usage.
> But it isn't worth the complexity, the needs for the Extensions are vast
> but the savings are little for a typical TDX capable system (about
> 0.001% of memory). So the Linux decision is to just enable it along with
> the basic TDX.



> Note that the Extensions initialization flow will still not start if no
> add-on features require Extensions. The enabling of add-on features will
> be in later patches. Until then, the system hasn't consumed extra memory.

based on the above, how about putting this patch before patch 02 and 03? 
so that we can eliminate the churn of add "__init" and the 
"__maybe_unused " in patch 02.

To be more safer, we can even make the code as

static bool tdx_supports_extension(void)
{
	/* To be enabled when kernel is ready. */
	return false;
}

static __init int init_tdx_ext(void)
{
	if (!tdx_supports_extension())
		return 0;

	/* No feature requires TDX Module Extensions. */
	if (!tdx_sysinfo.ext.ext_required)
		return 0;
}

and after all the pieces implemented, we can change 
tdx_supports_extension() to

static bool tdx_supports_extension(void)
{
	/* To be enabled when kernel is ready. */
	return !!(tdx_sysinfo.features.tdx_features0 & TDX_FEATURES0_EXT);
}

> Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
> ---
>   arch/x86/virt/vmx/tdx/tdx.c | 16 ++++++++++------
>   1 file changed, 10 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> index ff2b96c20d2b..dad5ec642723 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -1180,7 +1180,7 @@ static __init int init_tdmrs(struct tdmr_info_list *tdmr_list)
>   	return 0;
>   }
>   
> -static void tdx_clflush_hpa_list(struct page *root, unsigned int nr_pages)
> +static __init void tdx_clflush_hpa_list(struct page *root, unsigned int nr_pages)
>   {
>   	u64 *entries = page_to_virt(root);
>   	int i;
> @@ -1193,7 +1193,7 @@ static void tdx_clflush_hpa_list(struct page *root, unsigned int nr_pages)
>   #define HPA_LIST_INFO_PFN		GENMASK_U64(51, 12)
>   #define HPA_LIST_INFO_LAST_ENTRY	GENMASK_U64(63, 55)
>   
> -static u64 to_hpa_list_info(struct page *root, unsigned int nr_pages)
> +static __init u64 to_hpa_list_info(struct page *root, unsigned int nr_pages)
>   {
>   	return FIELD_PREP(HPA_LIST_INFO_FIRST_ENTRY, 0) |
>   	       FIELD_PREP(HPA_LIST_INFO_PFN, page_to_pfn(root)) |
> @@ -1201,7 +1201,7 @@ static u64 to_hpa_list_info(struct page *root, unsigned int nr_pages)
>   }
>   
>   /* Initialize the TDX Module Extensions then Extension-SEAMCALLs can be used */
> -static int tdx_ext_init(void)
> +static __init int tdx_ext_init(void)
>   {
>   	struct tdx_module_args args = {};
>   	u64 r;
> @@ -1216,7 +1216,7 @@ static int tdx_ext_init(void)
>   	return 0;
>   }
>   
> -static int tdx_ext_mem_add(struct page *root, unsigned int nr_pages)
> +static __init int tdx_ext_mem_add(struct page *root, unsigned int nr_pages)
>   {
>   	struct tdx_module_args args = {
>   		.rcx = to_hpa_list_info(root, nr_pages),
> @@ -1240,7 +1240,7 @@ static int tdx_ext_mem_add(struct page *root, unsigned int nr_pages)
>   	return 0;
>   }
>   
> -static int tdx_ext_mem_setup(void)
> +static __init int tdx_ext_mem_setup(void)
>   {
>   	unsigned int nr_pages;
>   	struct page *page;
> @@ -1301,7 +1301,7 @@ static int tdx_ext_mem_setup(void)
>   	return ret;
>   }
>   
> -static int __maybe_unused init_tdx_ext(void)
> +static __init int init_tdx_ext(void)
>   {
>   	int ret;
>   
> @@ -1373,6 +1373,10 @@ static __init int init_tdx_module(void)
>   	if (ret)
>   		goto err_reset_pamts;
>   
> +	ret = init_tdx_ext();
> +	if (ret)
> +		goto err_reset_pamts;
> +
>   	pr_info("%lu KB allocated for PAMT\n", tdmrs_count_pamt_kb(&tdx_tdmr_list));
>   
>   out_put_tdxmem:


^ permalink raw reply

* Re: [PATCH 02/15] x86/virt/tdx: Add extra memory to TDX Module for Extensions
From: Xiaoyao Li @ 2026-05-25  8:56 UTC (permalink / raw)
  To: Xu Yilun, kas, djbw, rick.p.edgecombe, x86, peter.fang
  Cc: linux-coco, linux-kernel, kvm, sohil.mehta, yilun.xu, baolu.lu,
	zhenzhong.duan
In-Reply-To: <20260522034128.3144354-3-yilun.xu@linux.intel.com>

On 5/22/2026 11:41 AM, Xu Yilun wrote:
> TDX Module introduces a new concept called "TDX Module Extensions" to
> support long running / hard-irq preemptible flows inside. This makes TDX
> Module capable of handling complex tasks through "Extension SEAMCALLs".
> Adding more memory to TDX Module is the first step to enable Extensions.
> 
> Currently, TDX Module memory use is relatively static. But, the
> Extensions need to use memory more dynamically. While 'static' here
> means the kernel provides necessary amount of memory to TDX Module for
> its basic functionalities, 'dynamic' means extra memory is needed only
> if new add-on features are to be enabled. So add a new memory feeding
> process backed by a new SEAMCALL TDH.EXT.MEM.ADD.
> 
> The process is mostly the same as adding PAMT. The kernel queries TDX
> Module how much memory needed, allocates it, hands it over, and never
> gets it back.
> 
> TDH.EXT.MEM.ADD uses a new parameter type HPA_LIST_INFO to provide
> control (private) pages to TDX Module. This type represents a list of
> pages for TDX Module to access. It needs a 'root page' which contains
> the list of HPAs of the pages. It collapses the HPA of the root page
> and the number of valid HPAs into a 64 bit raw value for SEAMCALL
> parameters. The root page is always a medium, TDX Module never keeps
> the root page.
> 
> Introduce a tdx_clflush_hpa_list() helper to flush shared cache before
> SEAMCALL, to avoid shared cache writeback damaging these private pages.
> 
> For now, TDX Module Extensions consumes relatively large amount of
> memory (~50MB). Use contiguous page allocation to avoid permanently
> fragment too much memory. Print the allocation amount on TDX Module
> Extensions initialization for visibility.
> 
> Co-developed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
> ---
>   arch/x86/virt/vmx/tdx/tdx.h |   1 +
>   arch/x86/virt/vmx/tdx/tdx.c | 118 ++++++++++++++++++++++++++++++++++++
>   2 files changed, 119 insertions(+)
> 
> diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
> index a5eec8e3cc71..2335f88bbb10 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.h
> +++ b/arch/x86/virt/vmx/tdx/tdx.h
> @@ -46,6 +46,7 @@
>   #define TDH_PHYMEM_PAGE_WBINVD		41
>   #define TDH_VP_WR			43
>   #define TDH_SYS_CONFIG			45
> +#define TDH_EXT_MEM_ADD			61
>   #define TDH_SYS_DISABLE			69
>   
>   /*
> diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> index c0c6281b08a5..622399d8da68 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -31,6 +31,7 @@
>   #include <linux/syscore_ops.h>
>   #include <linux/idr.h>
>   #include <linux/kvm_types.h>
> +#include <linux/bitfield.h>
>   #include <asm/page.h>
>   #include <asm/special_insns.h>
>   #include <asm/msr-index.h>
> @@ -1179,6 +1180,123 @@ static __init int init_tdmrs(struct tdmr_info_list *tdmr_list)
>   	return 0;
>   }
>   
> +static void tdx_clflush_hpa_list(struct page *root, unsigned int nr_pages)
> +{
> +	u64 *entries = page_to_virt(root);
> +	int i;
> +
> +	for (i = 0; i < nr_pages; i++)
> +		clflush_cache_range(__va(entries[i]), PAGE_SIZE);

Is the page flush only needed when CLFLUSH_BEFORE_ALLOC is true?

If so, it inherits the same decision to always flush as what 
tdx_clflush_page() did. Then, any chance we can use tdx_clflush_page() 
here so that we have a single central place of the comment to explain 
the kernel design decision.

> +}
> +
> +#define HPA_LIST_INFO_FIRST_ENTRY	GENMASK_U64(11, 3)
> +#define HPA_LIST_INFO_PFN		GENMASK_U64(51, 12)
> +#define HPA_LIST_INFO_LAST_ENTRY	GENMASK_U64(63, 55)
> +
> +static u64 to_hpa_list_info(struct page *root, unsigned int nr_pages)
> +{
> +	return FIELD_PREP(HPA_LIST_INFO_FIRST_ENTRY, 0) |
> +	       FIELD_PREP(HPA_LIST_INFO_PFN, page_to_pfn(root)) |
> +	       FIELD_PREP(HPA_LIST_INFO_LAST_ENTRY, nr_pages - 1);
> +}
> +
> +static int tdx_ext_mem_add(struct page *root, unsigned int nr_pages)
> +{
> +	struct tdx_module_args args = {
> +		.rcx = to_hpa_list_info(root, nr_pages),
> +	};
> +	u64 r;
> +
> +	tdx_clflush_hpa_list(root, nr_pages);
> +
> +	do {
> +		/*
> +		 * TDH_EXT_MEM_ADD is designed to use output parameter RCX to
> +		 * override/update input parameter RCX, so the caller doesn't
> +		 * have to do manual parameter update on retry call.
> +		 */
> +		r = seamcall_ret(TDH_EXT_MEM_ADD, &args);
> +	} while (r == TDX_INTERRUPTED_RESUMABLE);
> +
> +	if (r != TDX_SUCCESS)
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +static int tdx_ext_mem_setup(void)
> +{
> +	unsigned int nr_pages;
> +	struct page *page;
> +	u64 *root;
> +	unsigned int i;
> +	int ret;
> +
> +	nr_pages = tdx_sysinfo.ext.memory_pool_required_pages;
> +	/*
> +	 * memory_pool_required_pages == 0 means no need to add pages,
> +	 * skip the memory setup.
> +	 */
> +	if (!nr_pages)
> +		return 0;
> +
> +	root = kzalloc(PAGE_SIZE, GFP_KERNEL);
> +	if (!root)
> +		return -ENOMEM;
> +
> +	page = alloc_contig_pages(nr_pages, GFP_KERNEL, numa_mem_id(),
> +				  &node_online_map);
> +	if (!page) {
> +		ret = -ENOMEM;
> +		goto out_free_root;
> +	}
> +
> +	for (i = 0; i < nr_pages;) {
> +		unsigned int nents = min(nr_pages - i,
> +					 PAGE_SIZE / sizeof(*root));
> +		int j;
> +
> +		for (j = 0; j < nents; j++)
> +			root[j] = page_to_phys(page + i + j);
> +
> +		ret = tdx_ext_mem_add(virt_to_page(root), nents);
> +		/*
> +		 * No SEAMCALLs to reclaim the added pages. For simple error
> +		 * handling, leak all pages.
> +		 */
> +		WARN_ON_ONCE(ret);
> +		if (ret)
> +			break;
> +
> +		i += nents;
> +	}
> +
> +	/*
> +	 * Extensions memory can't be reclaimed once added, print out the
> +	 * amount, stop tracking it and free the root page, no matter success
> +	 * or failure.
> +	 */
> +	pr_info("%lu KB allocated for TDX Module Extensions\n",
> +		nr_pages * PAGE_SIZE / 1024);
> +
> +out_free_root:
> +	kfree(root);
> +
> +	return ret;
> +}
> +
> +static int __maybe_unused init_tdx_ext(void)
> +{
> +	if (!(tdx_sysinfo.features.tdx_features0 & TDX_FEATURES0_EXT))
> +		return 0;
> +
> +	/* No feature requires TDX Module Extensions. */
> +	if (!tdx_sysinfo.ext.ext_required)
> +		return 0;
> +
> +	return tdx_ext_mem_setup();
> +}
> +
>   static __init int init_tdx_module(void)
>   {
>   	int ret;


^ permalink raw reply

* Re: [PATCH 03/15] x86/virt/tdx: Make TDX Module initialize Extensions
From: Xiaoyao Li @ 2026-05-25  8:58 UTC (permalink / raw)
  To: Xu Yilun, kas, djbw, rick.p.edgecombe, x86, peter.fang
  Cc: linux-coco, linux-kernel, kvm, sohil.mehta, yilun.xu, baolu.lu,
	zhenzhong.duan
In-Reply-To: <20260522034128.3144354-4-yilun.xu@linux.intel.com>

On 5/22/2026 11:41 AM, Xu Yilun wrote:
> After providing all required memory to TDX Module, initialize TDX
> Module Extensions via TDH.EXT.INIT, so Extension-SEAMCALLs can be used.
> 
> Co-developed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
> Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>

Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>

> ---
>   arch/x86/virt/vmx/tdx/tdx.h |  1 +
>   arch/x86/virt/vmx/tdx/tdx.c | 24 +++++++++++++++++++++++-
>   2 files changed, 24 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
> index 2335f88bbb10..c5bffd118145 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.h
> +++ b/arch/x86/virt/vmx/tdx/tdx.h
> @@ -46,6 +46,7 @@
>   #define TDH_PHYMEM_PAGE_WBINVD		41
>   #define TDH_VP_WR			43
>   #define TDH_SYS_CONFIG			45
> +#define TDH_EXT_INIT			60
>   #define TDH_EXT_MEM_ADD			61
>   #define TDH_SYS_DISABLE			69
>   
> diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> index 622399d8da68..ff2b96c20d2b 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -1200,6 +1200,22 @@ static u64 to_hpa_list_info(struct page *root, unsigned int nr_pages)
>   	       FIELD_PREP(HPA_LIST_INFO_LAST_ENTRY, nr_pages - 1);
>   }
>   
> +/* Initialize the TDX Module Extensions then Extension-SEAMCALLs can be used */
> +static int tdx_ext_init(void)
> +{
> +	struct tdx_module_args args = {};
> +	u64 r;
> +
> +	do {
> +		r = seamcall(TDH_EXT_INIT, &args);
> +	} while (r == TDX_INTERRUPTED_RESUMABLE);
> +
> +	if (r != TDX_SUCCESS)
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
>   static int tdx_ext_mem_add(struct page *root, unsigned int nr_pages)
>   {
>   	struct tdx_module_args args = {
> @@ -1287,6 +1303,8 @@ static int tdx_ext_mem_setup(void)
>   
>   static int __maybe_unused init_tdx_ext(void)
>   {
> +	int ret;
> +
>   	if (!(tdx_sysinfo.features.tdx_features0 & TDX_FEATURES0_EXT))
>   		return 0;
>   
> @@ -1294,7 +1312,11 @@ static int __maybe_unused init_tdx_ext(void)
>   	if (!tdx_sysinfo.ext.ext_required)
>   		return 0;
>   
> -	return tdx_ext_mem_setup();
> +	ret = tdx_ext_mem_setup();
> +	if (ret)
> +		return ret;
> +
> +	return tdx_ext_init();
>   }
>   
>   static __init int init_tdx_module(void)


^ permalink raw reply

* Re: [RFC PATCH 14/15] x86/virt/tdx: Embed version info in SEAMCALL leaf function definitions
From: Xiaoyao Li @ 2026-05-25  9:00 UTC (permalink / raw)
  To: Xu Yilun, kas, djbw, rick.p.edgecombe, x86, peter.fang
  Cc: linux-coco, linux-kernel, kvm, sohil.mehta, yilun.xu, baolu.lu,
	zhenzhong.duan
In-Reply-To: <20260522034128.3144354-15-yilun.xu@linux.intel.com>

On 5/22/2026 11:41 AM, Xu Yilun wrote:
> Embed version information in SEAMCALL leaf function definitions rather
> than let the caller open code them. For now, only TDH.VP.INIT is
> involved.
> 
> Don't bother the caller to choose the SEAMCALL version if unnecessary.
> New version SEAMCALLs are guaranteed to be backward compatible, so
> ideally kernel doesn't need to keep version history and only uses the
> latest version SEAMCALLs.
> 
> The concern is some old TDX Modules don't recognize new version
> SEAMCALLs. Multiple SEAMCALL versions co-exist when kernel should
> support these old Modules. As time goes by, the old Modules deprecate
> and old version SEAMCALL definitions should disappear.
> 
> The old TDX Modules that only support TDH.VP.INIT v0 are all deprecated,
> so only provide the latest (v1) definition.
> 
> Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
> ---
>   arch/x86/virt/vmx/tdx/tdx.h | 23 ++++++++++++++---------
>   arch/x86/virt/vmx/tdx/tdx.c |  4 ++--
>   2 files changed, 16 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
> index 01a7d7d8ada9..10aff23cd01f 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.h
> +++ b/arch/x86/virt/vmx/tdx/tdx.h
> @@ -2,6 +2,7 @@
>   #ifndef _X86_VIRT_TDX_H
>   #define _X86_VIRT_TDX_H
>   
> +#include <linux/bitfield.h>
>   #include <linux/bits.h>
>   
>   /*
> @@ -11,6 +12,18 @@
>    * architectural definitions come first.
>    */
>   
> +/*
> + * SEAMCALL leaf:
> + *
> + * Bit 15:0	Leaf number
> + * Bit 23:16	Version number
> + */
> +#define SEAMCALL_LEAF			GENMASK(15, 0)
> +#define SEAMCALL_VER			GENMASK(23, 16)
> +
> +#define SEAMCALL_LEAF_VER(l, v)		(FIELD_PREP(SEAMCALL_LEAF, l) | \
> +					 FIELD_PREP(SEAMCALL_VER, v))
> +
>   /*
>    * TDX module SEAMCALL leaf functions
>    */
> @@ -31,7 +44,7 @@
>   #define TDH_VP_CREATE			10
>   #define TDH_MNG_KEY_FREEID		20
>   #define TDH_MNG_INIT			21
> -#define TDH_VP_INIT			22
> +#define TDH_VP_INIT			SEAMCALL_LEAF_VER(22, 1)

how about

#define TDH_VP_INIT			22
#define TDH_VP_INIT_V1			SEAMCALL_LEAF_VER(TDH_VP_INIT, 1)

and use TDH_VP_INIT_V1 below?

>   #define TDH_PHYMEM_PAGE_RDMD		24
>   #define TDH_VP_RD			26
>   #define TDH_PHYMEM_PAGE_RECLAIM		28
> @@ -52,14 +65,6 @@
>   #define TDH_QUOTE_GET			98
>   #define TDH_QUOTE_INIT			100
>   
> -/*
> - * SEAMCALL leaf:
> - *
> - * Bit 15:0	Leaf number
> - * Bit 23:16	Version number
> - */
> -#define TDX_VERSION_SHIFT		16
> -
>   /* TDX page types */
>   #define	PT_NDA		0x0
>   #define	PT_RSVD		0x1
> diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
> index 821f677e9a86..f7600f930c6e 100644
> --- a/arch/x86/virt/vmx/tdx/tdx.c
> +++ b/arch/x86/virt/vmx/tdx/tdx.c
> @@ -2217,8 +2217,8 @@ u64 tdh_vp_init(struct tdx_vp *vp, u64 initial_rcx, u32 x2apicid)
>   		.r8 = x2apicid,
>   	};
>   
> -	/* apicid requires version == 1. */
> -	return seamcall(TDH_VP_INIT | (1ULL << TDX_VERSION_SHIFT), &args);
> +	/* apicid requires version == 1. See TDH_VP_INIT definition.*/
> +	return seamcall(TDH_VP_INIT, &args);
>   }
>   EXPORT_SYMBOL_FOR_KVM(tdh_vp_init);
>   


^ permalink raw reply

* Re: [RFC PATCH 15/15] x86/virt/tdx: Enable TDX Quoting extension
From: Xiaoyao Li @ 2026-05-25 10:51 UTC (permalink / raw)
  To: Tony Lindgren, Xu Yilun
  Cc: kas, djbw, rick.p.edgecombe, x86, peter.fang, linux-coco,
	linux-kernel, kvm, sohil.mehta, yilun.xu, baolu.lu,
	zhenzhong.duan
In-Reply-To: <ahPbb1Ws9hBruJ2d@tlindgre-MOBL1>

On 5/25/2026 1:17 PM, Tony Lindgren wrote:
> On Fri, May 22, 2026 at 11:41:28AM +0800, Xu Yilun wrote:
>> From: Peter Fang <peter.fang@intel.com>
>>
>> TDX Module updates global metadata when add-on features are enabled.
>> Host should update the cached tdx_sysinfo to reflect these changes.
> 
> This should be made clearer IMO. How about mention that get_tdx_sys_info()
> needs to get called again to reload the TDX module global metadata?

Ah ha! This patch answers my comment to patch 1:
https://lore.kernel.org/all/956fa1e6-2920-4b2e-8037-d4b9d812ae53@intel.com/

sysinfo_ext->memory_pool_required_pages and sysinfo_ext->ext_required 
will be updated after extensions are enabled by TDH.SYS.CONFIG.

Patch 06 in this series already reads the tdx_sys_info_quote out of 
get_tdx_sys_info(), which mean get_tdx_sys_info() doesn't ensure all the 
global metadata will be update again.

So how about move the read of memory_pool_required_pages and 
ext_required out of get_tdx_sys_info() and put them after 
TDH.SYS.CONFIG, so that we don't need call get_tdx_sys_info() again?

>> --- a/arch/x86/virt/vmx/tdx/tdx.c
>> +++ b/arch/x86/virt/vmx/tdx/tdx.c
>> @@ -1049,6 +1049,7 @@ static __init int construct_tdmrs(struct list_head *tmb_list,
>>   static __init int config_tdx_module(struct tdmr_info_list *tdmr_list,
>>   				    u64 global_keyid)
>>   {
>> +	u64 seamcall_fn = TDH_SYS_CONFIG_V0;
>>   	struct tdx_module_args args = {};
>>   	u64 *tdmr_pa_array;
>>   	size_t array_sz;
>> @@ -1074,8 +1075,22 @@ static __init int config_tdx_module(struct tdmr_info_list *tdmr_list,
>>   	args.rcx = __pa(tdmr_pa_array);
>>   	args.rdx = tdmr_list->nr_consumed_tdmrs;
>>   	args.r8 = global_keyid;
>> -	ret = seamcall_prerr(TDH_SYS_CONFIG, &args);
>>   
>> +	if (tdx_sysinfo.features.tdx_features0 & TDX_FEATURES0_QUOTE) {
>> +		args.r9 |= TDX_FEATURES0_QUOTE;
>> +		/* These parameters require version >= 1 */
>> +		seamcall_fn = TDH_SYS_CONFIG;
>> +	}
>> +
>> +	ret = seamcall_prerr(seamcall_fn, &args);
>> +	if (ret)
>> +		goto free_tdmr;
>> +
>> +	/* enabling TDX Quoting may change tdx_sysinfo, update it */
>> +	if (tdx_sysinfo.features.tdx_features0 & TDX_FEATURES0_QUOTE)
>> +		ret = get_tdx_sys_info(&tdx_sysinfo);
> 
> The comment above helps, but the change in the handling will be easy to
> miss.
> 
>> +free_tdmr:
>>   	/* Free the array as it is not required anymore. */
>>   	kfree(tdmr_pa_array);
>>   
> 
> So I think it would be good to also add a comment to get_tdx_sys_info()
> to make it easier for folks to follow that it may get called multiple
> times.
> 
> Regards,
> 
> Tony


^ permalink raw reply

* [PATCH v5 0/5] Add iommufd ioctls to support TSM operations
From: Aneesh Kumar K.V (Arm) @ 2026-05-25 15:48 UTC (permalink / raw)
  To: linux-coco, iommu, linux-kernel, kvm
  Cc: Aneesh Kumar K.V (Arm), Alexey Kardashevskiy, Bjorn Helgaas,
	Dan Williams, Jason Gunthorpe, Joerg Roedel, Jonathan Cameron,
	Kevin Tian, Nicolin Chen, Samuel Ortiz, Steven Price,
	Suzuki K Poulose, Will Deacon, Xu Yilun, Shameer Kolothum,
	Paolo Bonzini, Tony Krowiak, Halil Pasic, Jason Herne,
	Harald Freudenberger, Holger Dengler, Heiko Carstens,
	Vasily Gorbik, Alexander Gordeev, Christian Borntraeger,
	Sven Schnelle, Alex Williamson, Matthew Rosato, Farhan Ali,
	Eric Farman, linux-s390

This patch series adds iommufd ioctl support for TSM-related operations.
These ioctls allow VMMs to perform TSM management tasks such as bind and
unbind operations, and to handle guest requests.

Changes from v4:
https://lore.kernel.org/all/20260427061005.901854-1-aneesh.kumar@kernel.org
* Switch VFIO/iommufd to use struct file *kvm_file instead of relying on
  kvm->users_count references.
* Define TSM request scope values globally in iommufd.
* Rename the ioctl to IOMMU_VDEVICE_TSM_REQ.
* Address other review feedback.

Changes from v2:
https://lore.kernel.org/all/20260309111704.2330479-1-aneesh.kumar@kernel.org
* Bump the series revision to v4 to keep it in sync with the dependent CCA DA
  patchsets. There was no v3 posting.
* Drop [PATCH v2 1/3] iommufd/viommu: Allow associating a KVM VM fd with a
  vIOMMU
* Add two new patches to associate a struct kvm * with iommufd objects:
  iommufd/device: Associate a kvm pointer to iommufd_device
  iommufd/viommu: Associate a kvm pointer to iommufd_viommu
* Address review feedback

Changes from v1:
https://lore.kernel.org/all/20250728135216.48084-8-aneesh.kumar@kernel.org
* Rebase onto the latest kernel
* Address review feedback
* Drop the TSM map ioctl; the KVM prefault patch will be used instead to
  ensure that private memory is preallocated

Cc: Alexey Kardashevskiy <aik@amd.com>
Cc: Bjorn Helgaas <helgaas@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Jonathan Cameron <jic23@kernel.org>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Nicolin Chen <nicolinc@nvidia.com>
Cc: Samuel Ortiz <sameo@rivosinc.com>
Cc: Steven Price <steven.price@arm.com>
Cc: Suzuki K Poulose <Suzuki.Poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Xu Yilun <yilun.xu@linux.intel.com>
Cc: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Tony Krowiak <akrowiak@linux.ibm.com>
Cc: Halil Pasic <pasic@linux.ibm.com>
Cc: Jason Herne <jjherne@linux.ibm.com>
Cc: Harald Freudenberger <freude@linux.ibm.com>
Cc: Holger Dengler <dengler@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Alex Williamson <alex@shazbot.org>
Cc: Matthew Rosato <mjrosato@linux.ibm.com>
Cc: Farhan Ali <alifm@linux.ibm.com>
Cc: Eric Farman <farman@linux.ibm.com>
Cc: linux-s390@vger.kernel.org

Aneesh Kumar K.V (Arm) (3):
  vfio: cache KVM VM file references instead of raw struct kvm pointers
  iommufd/tsm: add vdevice TSM bind/unbind ioctl
  iommufd/vdevice: add TSM request ioctl

Nicolin Chen (1):
  iommufd/viommu: Keep a reference to the KVM file

Shameer Kolothum (1):
  iommufd/device: Associate KVM file pointer with iommufd_device

 drivers/iommu/iommufd/Makefile          |   2 +
 drivers/iommu/iommufd/device.c          |   7 +-
 drivers/iommu/iommufd/iommufd_private.h |  16 +++
 drivers/iommu/iommufd/main.c            |   6 ++
 drivers/iommu/iommufd/selftest.c        |   2 +-
 drivers/iommu/iommufd/tsm.c             | 130 ++++++++++++++++++++++++
 drivers/iommu/iommufd/viommu.c          |   9 ++
 drivers/s390/crypto/vfio_ap_ops.c       |   5 +-
 drivers/vfio/device_cdev.c              |  10 +-
 drivers/vfio/group.c                    |  14 ++-
 drivers/vfio/iommufd.c                  |   3 +-
 drivers/vfio/pci/vfio_pci_zdev.c        |   7 +-
 drivers/vfio/vfio.h                     |  16 ++-
 drivers/vfio/vfio_main.c                |  81 ++++++++-------
 drivers/virt/coco/tsm-core.c            |  58 +++++++++++
 include/linux/iommufd.h                 |   5 +-
 include/linux/kvm_host.h                |   3 +
 include/linux/pci-tsm.h                 |   9 +-
 include/linux/tsm.h                     |  42 ++++++++
 include/linux/vfio.h                    |  17 +++-
 include/uapi/linux/iommufd.h            | 106 +++++++++++++++++++
 virt/kvm/kvm_main.c                     |   2 +
 22 files changed, 478 insertions(+), 72 deletions(-)
 create mode 100644 drivers/iommu/iommufd/tsm.c


base-commit: 50897c955902c93ae71c38698abb910525ebdc89
-- 
2.43.0


^ permalink raw reply

* [PATCH v5 1/5] vfio: cache KVM VM file references instead of raw struct kvm pointers
From: Aneesh Kumar K.V (Arm) @ 2026-05-25 15:48 UTC (permalink / raw)
  To: linux-coco, iommu, linux-kernel, kvm
  Cc: Aneesh Kumar K.V (Arm), Alexey Kardashevskiy, Bjorn Helgaas,
	Dan Williams, Jason Gunthorpe, Joerg Roedel, Jonathan Cameron,
	Kevin Tian, Nicolin Chen, Samuel Ortiz, Steven Price,
	Suzuki K Poulose, Will Deacon, Xu Yilun, Shameer Kolothum,
	Paolo Bonzini, Tony Krowiak, Halil Pasic, Jason Herne,
	Harald Freudenberger, Holger Dengler, Heiko Carstens,
	Vasily Gorbik, Alexander Gordeev, Christian Borntraeger,
	Sven Schnelle, Alex Williamson, Matthew Rosato, Farhan Ali,
	Eric Farman, linux-s390
In-Reply-To: <20260525154816.1029642-1-aneesh.kumar@kernel.org>

VFIO currently records struct kvm pointers on vfio_group, vfio_device_file
and the opened vfio_device. Switch VFIO to track the VM's struct file
instead, so VFIO and iommufd can use normal file references for VM lifetime
instead of depending on KVM's internal struct kvm refcounting.

KVM_CREATE_DEVICE binds the KVM VM lifetime to the KVM device fd lifetime.
For KVM_DEV_TYPE_VFIO, the KVM VFIO device fd also takes references to each
VFIO file added through KVM_DEV_VFIO_FILE_ADD. The KVM VFIO device fd
therefore owns both the internal KVM reference and the VFIO file references
in kvf->file.

KVM_DEV_VFIO_FILE_ADD further installs the VM file association into the
VFIO file. VFIO converts the struct kvm pointer to a VM file reference with
get_file_active(&kvm->_file), because the KVM device fd can keep struct kvm
alive after the original VM fd is already in final release.

The association intentionally pins the VM file until KVM_DEV_VFIO_FILE_DEL
or until the KVM VFIO device fd is released. This gives VFIO/iommufd a
stable VM file reference source without taking a dependency on KVM's struct
kvm lifetime. The KVM VFIO device release path clears the VFIO-side
association before dropping its VFIO file references.

When a VFIO device is opened or bound, VFIO takes an additional reference
from the associated VM file and stores it in vfio_device::kvm_file for
driver and iommufd use. That open-time reference is released from
vfio_device_put_kvm() when the VFIO device is closed or unbound.

This gives the ownership model:

  - KVM device fd pins struct kvm through kvm->users_count
  - KVM VFIO device fd pins VFIO files through kvf->file
  - VFIO group/device-file state pins the VM file while associated with KVM
  - vfio_device::kvm_file pins the VM file during active VFIO device use

Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
 drivers/s390/crypto/vfio_ap_ops.c |  5 +-
 drivers/vfio/device_cdev.c        | 10 ++--
 drivers/vfio/group.c              | 14 +++---
 drivers/vfio/pci/vfio_pci_zdev.c  |  7 +--
 drivers/vfio/vfio.h               | 16 ++++--
 drivers/vfio/vfio_main.c          | 81 ++++++++++++++++---------------
 include/linux/kvm_host.h          |  3 ++
 include/linux/vfio.h              | 17 ++++++-
 virt/kvm/kvm_main.c               |  2 +
 9 files changed, 91 insertions(+), 64 deletions(-)

diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index 44b3a1dcc1b3..05996a8fd860 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -2054,11 +2054,12 @@ static int vfio_ap_mdev_open_device(struct vfio_device *vdev)
 {
 	struct ap_matrix_mdev *matrix_mdev =
 		container_of(vdev, struct ap_matrix_mdev, vdev);
+	struct kvm *kvm = vfio_device_get_kvm(vdev);
 
-	if (!vdev->kvm)
+	if (!kvm)
 		return -EINVAL;
 
-	return vfio_ap_mdev_set_kvm(matrix_mdev, vdev->kvm);
+	return vfio_ap_mdev_set_kvm(matrix_mdev, kvm);
 }
 
 static void vfio_ap_mdev_close_device(struct vfio_device *vdev)
diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
index 54abf312cf04..ca75ab8eb7bd 100644
--- a/drivers/vfio/device_cdev.c
+++ b/drivers/vfio/device_cdev.c
@@ -56,7 +56,7 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep)
 static void vfio_df_get_kvm_safe(struct vfio_device_file *df)
 {
 	spin_lock(&df->kvm_ref_lock);
-	vfio_device_get_kvm_safe(df->device, df->kvm);
+	vfio_device_get_kvm_safe(df->device, df->kvm_file);
 	spin_unlock(&df->kvm_ref_lock);
 }
 
@@ -133,10 +133,10 @@ long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df,
 	}
 
 	/*
-	 * Before the device open, get the KVM pointer currently
-	 * associated with the device file (if there is) and obtain
-	 * a reference.  This reference is held until device closed.
-	 * Save the pointer in the device for use by drivers.
+	 * Before the device open, get the VM struct file currently
+	 * associated with the device file (if there is one) and obtain a
+	 * reference. This reference is held until the device is closed.
+	 * Save the file in the device for use by drivers.
 	 */
 	vfio_df_get_kvm_safe(df);
 
diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
index b2299e5bc6df..8950cfb9405d 100644
--- a/drivers/vfio/group.c
+++ b/drivers/vfio/group.c
@@ -163,7 +163,7 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group,
 static void vfio_device_group_get_kvm_safe(struct vfio_device *device)
 {
 	spin_lock(&device->group->kvm_ref_lock);
-	vfio_device_get_kvm_safe(device, device->group->kvm);
+	vfio_device_get_kvm_safe(device, device->group->kvm_file);
 	spin_unlock(&device->group->kvm_ref_lock);
 }
 
@@ -181,10 +181,10 @@ static int vfio_df_group_open(struct vfio_device_file *df)
 	mutex_lock(&device->dev_set->lock);
 
 	/*
-	 * Before the first device open, get the KVM pointer currently
-	 * associated with the group (if there is one) and obtain a reference
-	 * now that will be held until the open_count reaches 0 again.  Save
-	 * the pointer in the device for use by drivers.
+	 * Before the first device open, get the VM struct file currently
+	 * associated with the group (if there is one) and obtain a
+	 * reference now that will be held until the open_count reaches 0
+	 * again. Save the file in the device for use by drivers.
 	 */
 	if (device->open_count == 0)
 		vfio_device_group_get_kvm_safe(device);
@@ -862,9 +862,7 @@ bool vfio_group_enforced_coherent(struct vfio_group *group)
 
 void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
 {
-	spin_lock(&group->kvm_ref_lock);
-	group->kvm = kvm;
-	spin_unlock(&group->kvm_ref_lock);
+	vfio_kvm_file_replace(&group->kvm_file, &group->kvm_ref_lock, kvm);
 }
 
 /**
diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c
index 0990fdb146b7..a9d8e6aa3839 100644
--- a/drivers/vfio/pci/vfio_pci_zdev.c
+++ b/drivers/vfio/pci/vfio_pci_zdev.c
@@ -144,15 +144,16 @@ int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev,
 int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev)
 {
 	struct zpci_dev *zdev = to_zpci(vdev->pdev);
+	struct kvm *kvm = vfio_device_get_kvm(&vdev->vdev);
 
 	if (!zdev)
 		return -ENODEV;
 
-	if (!vdev->vdev.kvm)
+	if (!kvm)
 		return 0;
 
 	if (zpci_kvm_hook.kvm_register)
-		return zpci_kvm_hook.kvm_register(zdev, vdev->vdev.kvm);
+		return zpci_kvm_hook.kvm_register(zdev, kvm);
 
 	return -ENOENT;
 }
@@ -161,7 +162,7 @@ void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev)
 {
 	struct zpci_dev *zdev = to_zpci(vdev->pdev);
 
-	if (!zdev || !vdev->vdev.kvm)
+	if (!zdev || !vfio_device_get_kvm(&vdev->vdev))
 		return;
 
 	if (zpci_kvm_hook.kvm_unregister)
diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h
index e4b72e79b7e3..41032104eb36 100644
--- a/drivers/vfio/vfio.h
+++ b/drivers/vfio/vfio.h
@@ -22,8 +22,8 @@ struct vfio_device_file {
 
 	u8 access_granted;
 	u32 devid; /* only valid when iommufd is valid */
-	spinlock_t kvm_ref_lock; /* protect kvm field */
-	struct kvm *kvm;
+	spinlock_t kvm_ref_lock; /* protect kvm_file */
+	struct file *kvm_file;
 	struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */
 };
 
@@ -88,7 +88,7 @@ struct vfio_group {
 #endif
 	enum vfio_group_type		type;
 	struct mutex			group_lock;
-	struct kvm			*kvm;
+	struct file			*kvm_file;
 	struct file			*opened_file;
 	struct iommufd_ctx		*iommufd;
 	spinlock_t			kvm_ref_lock;
@@ -434,11 +434,17 @@ static inline void vfio_virqfd_exit(void)
 #endif
 
 #if IS_ENABLED(CONFIG_KVM)
-void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm);
+void vfio_kvm_file_replace(struct file **dst, spinlock_t *lock, struct kvm *kvm);
+void vfio_device_get_kvm_safe(struct vfio_device *device, struct file *kvm_file);
 void vfio_device_put_kvm(struct vfio_device *device);
 #else
+static inline void vfio_kvm_file_replace(struct file **dst,
+		spinlock_t *lock, struct kvm *kvm)
+{
+}
+
 static inline void vfio_device_get_kvm_safe(struct vfio_device *device,
-					    struct kvm *kvm)
+					    struct file *kvm_file)
 {
 }
 
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index 6222376ab6ab..88c85a7b98c0 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -442,55 +442,61 @@ void vfio_unregister_group_dev(struct vfio_device *device)
 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
 
 #if IS_ENABLED(CONFIG_KVM)
-void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
+void vfio_kvm_file_replace(struct file **dst, spinlock_t *lock, struct kvm *kvm)
 {
-	void (*pfn)(struct kvm *kvm);
-	bool (*fn)(struct kvm *kvm);
-	bool ret;
+	struct file *old_kvm_file, *new_kvm_file = NULL;
 
-	lockdep_assert_held(&device->dev_set->lock);
+	/*
+	 * @kvm can outlive the VM fd and its final __fput(). Only take a
+	 * new reference if the VM file is still active.
+	 */
+	if (kvm)
+		new_kvm_file = get_file_active(&kvm->_file);
 
-	if (!kvm)
-		return;
+	spin_lock(lock);
+	old_kvm_file = *dst;
+	*dst = new_kvm_file;
+	spin_unlock(lock);
 
-	pfn = symbol_get(kvm_put_kvm);
-	if (WARN_ON(!pfn))
-		return;
+	if (old_kvm_file)
+		fput(old_kvm_file);
+}
 
-	fn = symbol_get(kvm_get_kvm_safe);
-	if (WARN_ON(!fn)) {
-		symbol_put(kvm_put_kvm);
-		return;
-	}
+void vfio_device_get_kvm_safe(struct vfio_device *device, struct file *kvm_file)
+{
+	lockdep_assert_held(&device->dev_set->lock);
 
-	ret = fn(kvm);
-	symbol_put(kvm_get_kvm_safe);
-	if (!ret) {
-		symbol_put(kvm_put_kvm);
-		return;
-	}
+	/*
+	 * Take a VM file reference if the KVM fd is still active.
+	 */
+	if (kvm_file)
+		kvm_file = get_file(kvm_file);
 
-	device->put_kvm = pfn;
-	device->kvm = kvm;
+	device->kvm_file = kvm_file;
 }
 
 void vfio_device_put_kvm(struct vfio_device *device)
 {
+	struct file *kvm_file;
+
 	lockdep_assert_held(&device->dev_set->lock);
 
-	if (!device->kvm)
+	kvm_file = device->kvm_file;
+	if (!kvm_file)
 		return;
 
-	if (WARN_ON(!device->put_kvm))
-		goto clear;
+	device->kvm_file = NULL;
+	fput(kvm_file);
+}
 
-	device->put_kvm(device->kvm);
-	device->put_kvm = NULL;
-	symbol_put(kvm_put_kvm);
+struct kvm *vfio_device_get_kvm(struct vfio_device *device)
+{
+	if (!device->kvm_file)
+		return NULL;
 
-clear:
-	device->kvm = NULL;
+	return device->kvm_file->private_data;
 }
+EXPORT_SYMBOL_GPL(vfio_device_get_kvm);
 #endif
 
 /* true if the vfio_device has open_device() called but not close_device() */
@@ -1518,13 +1524,10 @@ static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
 	struct vfio_device_file *df = file->private_data;
 
 	/*
-	 * The kvm is first recorded in the vfio_device_file, and will
-	 * be propagated to vfio_device::kvm when the file is bound to
-	 * iommufd successfully in the vfio device cdev path.
+	 * Cache the VM file reference associated with this VFIO file so it
+	 * can be pinned into vfio_device while the device is open.
 	 */
-	spin_lock(&df->kvm_ref_lock);
-	df->kvm = kvm;
-	spin_unlock(&df->kvm_ref_lock);
+	vfio_kvm_file_replace(&df->kvm_file, &df->kvm_ref_lock, kvm);
 }
 
 /**
@@ -1532,8 +1535,8 @@ static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
  * @file: VFIO group file or VFIO device file
  * @kvm: KVM to link
  *
- * When a VFIO device is first opened the KVM will be available in
- * device->kvm if one was associated with the file.
+ * When a VFIO device is first opened, VFIO caches a VM file reference if
+ * one was associated with the file.
  */
 void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
 {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4c14aee1fb06..31afac5fb0ea 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -45,6 +45,8 @@
 #include <asm/kvm_host.h>
 #include <linux/kvm_dirty_ring.h>
 
+struct file;
+
 #ifndef KVM_MAX_VCPU_IDS
 #define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS
 #endif
@@ -861,6 +863,7 @@ struct kvm {
 	struct srcu_struct srcu;
 	struct srcu_struct irq_srcu;
 	pid_t userspace_pid;
+	struct file __rcu *_file;
 	bool override_halt_poll_ns;
 	unsigned int max_halt_poll_ns;
 	u32 dirty_ring_size;
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 31b826efba00..bca1d00f7845 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -22,8 +22,22 @@ struct kvm;
 struct iommufd_ctx;
 struct iommufd_device;
 struct iommufd_access;
+struct vfio_device;
 struct vfio_info_cap;
 
+#if IS_ENABLED(CONFIG_KVM)
+/*
+ * Return the KVM associated with @vdev's kvm_file. The returned pointer
+ * is valid only while VFIO device open holds the kvm_file reference.
+ */
+struct kvm *vfio_device_get_kvm(struct vfio_device *vdev);
+#else
+static inline struct kvm *vfio_device_get_kvm(struct vfio_device *vdev)
+{
+	return NULL;
+}
+#endif
+
 /*
  * VFIO devices can be placed in a set, this allows all devices to share this
  * structure and the VFIO core will provide a lock that is held around
@@ -54,7 +68,7 @@ struct vfio_device {
 	struct list_head dev_set_list;
 	unsigned int migration_flags;
 	u8 precopy_info_v2;
-	struct kvm *kvm;
+	struct file *kvm_file;
 
 	/* Members below here are private, not for driver use */
 	unsigned int index;
@@ -66,7 +80,6 @@ struct vfio_device {
 	unsigned int open_count;
 	struct completion comp;
 	struct iommufd_access *iommufd_access;
-	void (*put_kvm)(struct kvm *kvm);
 	struct inode *inode;
 #if IS_ENABLED(CONFIG_IOMMUFD)
 	struct iommufd_device *iommufd_device;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 89489996fbc1..011819c5c47c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1351,6 +1351,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
 
 	kvm_irqfd_release(kvm);
 
+	RCU_INIT_POINTER(kvm->_file, NULL);
 	kvm_put_kvm(kvm);
 	return 0;
 }
@@ -5500,6 +5501,7 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
 		r = PTR_ERR(file);
 		goto put_kvm;
 	}
+	rcu_assign_pointer(kvm->_file, file);
 
 	/*
 	 * Don't call kvm_put_kvm anymore at this point; file->f_op is
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 2/5] iommufd/device: Associate KVM file pointer with iommufd_device
From: Aneesh Kumar K.V (Arm) @ 2026-05-25 15:48 UTC (permalink / raw)
  To: linux-coco, iommu, linux-kernel, kvm
  Cc: Aneesh Kumar K.V (Arm), Alexey Kardashevskiy, Bjorn Helgaas,
	Dan Williams, Jason Gunthorpe, Joerg Roedel, Jonathan Cameron,
	Kevin Tian, Nicolin Chen, Samuel Ortiz, Steven Price,
	Suzuki K Poulose, Will Deacon, Xu Yilun, Shameer Kolothum,
	Paolo Bonzini, Tony Krowiak, Halil Pasic, Jason Herne,
	Harald Freudenberger, Holger Dengler, Heiko Carstens,
	Vasily Gorbik, Alexander Gordeev, Christian Borntraeger,
	Sven Schnelle, Alex Williamson, Matthew Rosato, Farhan Ali,
	Eric Farman, linux-s390, Jason Gunthorpe
In-Reply-To: <20260525154816.1029642-1-aneesh.kumar@kernel.org>

From: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>

TSM vDevice support needs access to the KVM associated with a VFIO device
after the device has been bound to iommufd.

Extend iommufd_device_bind() to accept the device's KVM file and store it
in the iommufd_device. The KVM file reference is owned by VFIO and is
already held for the duration of the device open path.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
[nicolinc: fix build error in iommufd_test_mock_domain()]
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
[aneesh.kumar: Switch to use kvm_file]
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
 drivers/iommu/iommufd/device.c          | 7 ++++++-
 drivers/iommu/iommufd/iommufd_private.h | 2 ++
 drivers/iommu/iommufd/selftest.c        | 2 +-
 drivers/vfio/iommufd.c                  | 3 ++-
 include/linux/iommufd.h                 | 4 +++-
 5 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index 170a7005f0bc..718abdc0e627 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -203,6 +203,7 @@ void iommufd_device_destroy(struct iommufd_object *obj)
  * iommufd_device_bind - Bind a physical device to an iommu fd
  * @ictx: iommufd file descriptor
  * @dev: Pointer to a physical device struct
+ * @kvm_file: VM file if device belongs to a KVM VM
  * @id: Output ID number to return to userspace for this device
  *
  * A successful bind establishes an ownership over the device and returns
@@ -216,7 +217,9 @@ void iommufd_device_destroy(struct iommufd_object *obj)
  * The caller must undo this with iommufd_device_unbind()
  */
 struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
-					   struct device *dev, u32 *id)
+					   struct device *dev,
+					   struct file *kvm_file,
+					   u32 *id)
 {
 	struct iommufd_device *idev;
 	struct iommufd_group *igroup;
@@ -266,6 +269,8 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
 	if (!iommufd_selftest_is_mock_dev(dev))
 		iommufd_ctx_get(ictx);
 	idev->dev = dev;
+	/* reference is already taken in vfio_df_ioctl_bind_iommufd() */
+	idev->kvm_file = kvm_file;
 	idev->enforce_cache_coherency =
 		device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
 	/* The calling driver is a user until iommufd_device_unbind() */
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 6ac1965199e9..44eb026c206d 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -488,6 +488,8 @@ struct iommufd_device {
 	struct list_head group_item;
 	/* always the physical device */
 	struct device *dev;
+	/* ..and the VM file if available */
+	struct file *kvm_file;
 	bool enforce_cache_coherency;
 	struct iommufd_vdevice *vdev;
 	bool destroying;
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index af07c642a526..a193390f9d07 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -1069,7 +1069,7 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd,
 		goto out_sobj;
 	}
 
-	idev = iommufd_device_bind(ucmd->ictx, &sobj->idev.mock_dev->dev,
+	idev = iommufd_device_bind(ucmd->ictx, &sobj->idev.mock_dev->dev, NULL,
 				   &idev_id);
 	if (IS_ERR(idev)) {
 		rc = PTR_ERR(idev);
diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
index a38d262c6028..d2d0bd9382a1 100644
--- a/drivers/vfio/iommufd.c
+++ b/drivers/vfio/iommufd.c
@@ -119,7 +119,8 @@ int vfio_iommufd_physical_bind(struct vfio_device *vdev,
 {
 	struct iommufd_device *idev;
 
-	idev = iommufd_device_bind(ictx, vdev->dev, out_device_id);
+	idev = iommufd_device_bind(ictx, vdev->dev, vdev->kvm_file,
+				   out_device_id);
 	if (IS_ERR(idev))
 		return PTR_ERR(idev);
 	vdev->iommufd_device = idev;
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 6e7efe83bc5d..0a0bb4abfbd2 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -59,7 +59,9 @@ struct iommufd_object {
 };
 
 struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
-					   struct device *dev, u32 *id);
+					   struct device *dev,
+					   struct file *kvm_file,
+					   u32 *id);
 void iommufd_device_unbind(struct iommufd_device *idev);
 
 int iommufd_device_attach(struct iommufd_device *idev, ioasid_t pasid,
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 3/5] iommufd/viommu: Keep a reference to the KVM file
From: Aneesh Kumar K.V (Arm) @ 2026-05-25 15:48 UTC (permalink / raw)
  To: linux-coco, iommu, linux-kernel, kvm
  Cc: Aneesh Kumar K.V (Arm), Alexey Kardashevskiy, Bjorn Helgaas,
	Dan Williams, Jason Gunthorpe, Joerg Roedel, Jonathan Cameron,
	Kevin Tian, Nicolin Chen, Samuel Ortiz, Steven Price,
	Suzuki K Poulose, Will Deacon, Xu Yilun, Shameer Kolothum,
	Paolo Bonzini, Tony Krowiak, Halil Pasic, Jason Herne,
	Harald Freudenberger, Holger Dengler, Heiko Carstens,
	Vasily Gorbik, Alexander Gordeev, Christian Borntraeger,
	Sven Schnelle, Alex Williamson, Matthew Rosato, Farhan Ali,
	Eric Farman, linux-s390
In-Reply-To: <20260525154816.1029642-1-aneesh.kumar@kernel.org>

From: Nicolin Chen <nicolinc@nvidia.com>

The TSM vDevice operations need access to the KVM associated with the
device's vIOMMU. Save the device's KVM file in the iommufd_viommu when the
vIOMMU is allocated, and take a file reference so it remains valid for the
lifetime of the vIOMMU.

Release the reference when the vIOMMU is destroyed.

Based on an original patch by Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>

[nicolinc: hold kvm's users_count]
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
[aneesh.kumar: Switch to use kvm_file]
Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
 drivers/iommu/iommufd/viommu.c | 5 +++++
 include/linux/iommufd.h        | 1 +
 2 files changed, 6 insertions(+)

diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 4081deda9b33..bf5d58d55939 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
  */
+#include <linux/file.h>
 #include "iommufd_private.h"
 
 void iommufd_viommu_destroy(struct iommufd_object *obj)
@@ -11,6 +12,8 @@ void iommufd_viommu_destroy(struct iommufd_object *obj)
 	if (viommu->ops && viommu->ops->destroy)
 		viommu->ops->destroy(viommu);
 	refcount_dec(&viommu->hwpt->common.obj.users);
+	if (viommu->kvm_file)
+		fput(viommu->kvm_file);
 	xa_destroy(&viommu->vdevs);
 }
 
@@ -76,6 +79,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	}
 
 	xa_init(&viommu->vdevs);
+	if (idev->kvm_file)
+		viommu->kvm_file = get_file(idev->kvm_file);
 	viommu->type = cmd->type;
 	viommu->ictx = ucmd->ictx;
 	viommu->hwpt = hwpt_paging;
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 0a0bb4abfbd2..3267717f676d 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -103,6 +103,7 @@ struct iommufd_viommu {
 	struct iommufd_ctx *ictx;
 	struct iommu_device *iommu_dev;
 	struct iommufd_hwpt_paging *hwpt;
+	struct file *kvm_file;
 
 	const struct iommufd_viommu_ops *ops;
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 4/5] iommufd/tsm: add vdevice TSM bind/unbind ioctl
From: Aneesh Kumar K.V (Arm) @ 2026-05-25 15:48 UTC (permalink / raw)
  To: linux-coco, iommu, linux-kernel, kvm
  Cc: Aneesh Kumar K.V (Arm), Alexey Kardashevskiy, Bjorn Helgaas,
	Dan Williams, Jason Gunthorpe, Joerg Roedel, Jonathan Cameron,
	Kevin Tian, Nicolin Chen, Samuel Ortiz, Steven Price,
	Suzuki K Poulose, Will Deacon, Xu Yilun, Shameer Kolothum,
	Paolo Bonzini, Tony Krowiak, Halil Pasic, Jason Herne,
	Harald Freudenberger, Holger Dengler, Heiko Carstens,
	Vasily Gorbik, Alexander Gordeev, Christian Borntraeger,
	Sven Schnelle, Alex Williamson, Matthew Rosato, Farhan Ali,
	Eric Farman, linux-s390
In-Reply-To: <20260525154816.1029642-1-aneesh.kumar@kernel.org>

Introduce IOMMU_VDEVICE_TSM_OP to allow userspace to issue TSM bind/unbind
operations for an iommufd vdevice.

The new ioctl:
- looks up the vdevice object from vdevice_id
- resolves the associated KVM VM from the vIOMMU KVM file reference
- dispatches bind/unbind via tsm_bind()/tsm_unbind()

Also add common TSM helpers in tsm-core and wire vdevice teardown to unbind
the device from TSM state.

This provides iommufd plumbing to bind a TDI to a confidential guest through
the TSM layer.

Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
 drivers/iommu/iommufd/Makefile          |  2 +
 drivers/iommu/iommufd/iommufd_private.h |  8 ++++
 drivers/iommu/iommufd/main.c            |  3 ++
 drivers/iommu/iommufd/tsm.c             | 62 +++++++++++++++++++++++++
 drivers/iommu/iommufd/viommu.c          |  4 ++
 drivers/virt/coco/tsm-core.c            | 19 ++++++++
 include/linux/tsm.h                     | 17 +++++++
 include/uapi/linux/iommufd.h            | 26 +++++++++++
 8 files changed, 141 insertions(+)
 create mode 100644 drivers/iommu/iommufd/tsm.c

diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 71d692c9a8f4..431089089ee9 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -10,6 +10,8 @@ iommufd-y := \
 	vfio_compat.o \
 	viommu.o
 
+iommufd-$(CONFIG_TSM) += tsm.o
+
 iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
 
 obj-$(CONFIG_IOMMUFD) += iommufd.o
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 44eb026c206d..8eea0c2c332b 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -699,6 +699,14 @@ void iommufd_vdevice_destroy(struct iommufd_object *obj);
 void iommufd_vdevice_abort(struct iommufd_object *obj);
 int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_hw_queue_destroy(struct iommufd_object *obj);
+#ifdef CONFIG_TSM
+int iommufd_vdevice_tsm_op_ioctl(struct iommufd_ucmd *ucmd);
+#else
+static inline int iommufd_vdevice_tsm_op_ioctl(struct iommufd_ucmd *ucmd)
+{
+	return -EOPNOTSUPP;
+}
+#endif
 
 static inline struct iommufd_vdevice *
 iommufd_get_vdevice(struct iommufd_ctx *ictx, u32 id)
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 8c6d43601afb..d73e6b391c6f 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -432,6 +432,7 @@ union ucmd_buffer {
 	struct iommu_veventq_alloc veventq;
 	struct iommu_vfio_ioas vfio_ioas;
 	struct iommu_viommu_alloc viommu;
+	struct iommu_vdevice_tsm_op tsm_op;
 #ifdef CONFIG_IOMMUFD_TEST
 	struct iommu_test_cmd test;
 #endif
@@ -493,6 +494,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 __reserved),
 	IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl,
 		 struct iommu_viommu_alloc, out_viommu_id),
+	IOCTL_OP(IOMMU_VDEVICE_TSM_OP, iommufd_vdevice_tsm_op_ioctl,
+		 struct iommu_vdevice_tsm_op, vdevice_id),
 #ifdef CONFIG_IOMMUFD_TEST
 	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
 #endif
diff --git a/drivers/iommu/iommufd/tsm.c b/drivers/iommu/iommufd/tsm.c
new file mode 100644
index 000000000000..09ee668dbed9
--- /dev/null
+++ b/drivers/iommu/iommufd/tsm.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2026 ARM Ltd.
+ */
+
+#include <linux/tsm.h>
+#include "iommufd_private.h"
+
+/**
+ * iommufd_vdevice_tsm_op_ioctl - Handle vdevice TSM operations
+ * @ucmd: user command data for IOMMU_VDEVICE_TSM_OP
+ *
+ * Currently only supports TSM bind/unbind operations
+ * Resolve @iommu_vdevice_tsm_op::vdevice_id to a vdevice and dispatch the
+ * requested bind/unbind operation through the TSM core.
+ *
+ * Return: 0 on success, or a negative error code on failure.
+ */
+int iommufd_vdevice_tsm_op_ioctl(struct iommufd_ucmd *ucmd)
+{
+	int rc;
+	struct kvm *kvm = NULL;
+	struct iommufd_vdevice *vdev;
+	struct iommu_vdevice_tsm_op *cmd = ucmd->cmd;
+
+	if (cmd->flags)
+		return -EOPNOTSUPP;
+
+	vdev = iommufd_get_vdevice(ucmd->ictx, cmd->vdevice_id);
+	if (IS_ERR(vdev))
+		return PTR_ERR(vdev);
+
+	if (vdev->viommu->kvm_file)
+		kvm = vdev->viommu->kvm_file->private_data;
+
+	if (!kvm) {
+		rc = -ENODEV;
+		goto out_put_vdev;
+	}
+
+	/* tsm layer will take care of parallel calls to tsm_bind/unbind */
+	switch (cmd->type) {
+	case IOMMU_VDEVICE_TSM_BIND:
+		rc = tsm_bind(vdev->idev->dev, kvm, vdev->virt_id);
+		break;
+	case IOMMU_VDEVICE_TSM_UNBIND:
+		rc = tsm_unbind(vdev->idev->dev);
+		break;
+	default:
+		rc = -EINVAL;
+		goto out_put_vdev;
+	}
+
+	if (rc)
+		goto out_put_vdev;
+
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+
+out_put_vdev:
+	iommufd_put_object(ucmd->ictx, &vdev->obj);
+	return rc;
+}
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index bf5d58d55939..1b9379fcba84 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -3,6 +3,8 @@
  */
 #include <linux/file.h>
 #include "iommufd_private.h"
+#include <linux/cleanup.h>
+#include <linux/tsm.h>
 
 void iommufd_viommu_destroy(struct iommufd_object *obj)
 {
@@ -124,6 +126,8 @@ void iommufd_vdevice_abort(struct iommufd_object *obj)
 
 	lockdep_assert_held(&idev->igroup->lock);
 
+	tsm_unbind(idev->dev);
+
 	if (vdev->destroy)
 		vdev->destroy(vdev);
 	/* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */
diff --git a/drivers/virt/coco/tsm-core.c b/drivers/virt/coco/tsm-core.c
index e784993353d8..3870d08ffe0d 100644
--- a/drivers/virt/coco/tsm-core.c
+++ b/drivers/virt/coco/tsm-core.c
@@ -108,6 +108,25 @@ void tsm_unregister(struct tsm_dev *tsm_dev)
 }
 EXPORT_SYMBOL_GPL(tsm_unregister);
 
+int tsm_bind(struct device *dev, struct kvm *kvm, u64 tdi_id)
+{
+	if (!dev_is_pci(dev))
+		return -EINVAL;
+
+	return pci_tsm_bind(to_pci_dev(dev), kvm, tdi_id);
+}
+EXPORT_SYMBOL_GPL(tsm_bind);
+
+int tsm_unbind(struct device *dev)
+{
+	if (!dev_is_pci(dev))
+		return -EINVAL;
+
+	pci_tsm_unbind(to_pci_dev(dev));
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tsm_unbind);
+
 static void tsm_release(struct device *dev)
 {
 	struct tsm_dev *tsm_dev = container_of(dev, typeof(*tsm_dev), dev);
diff --git a/include/linux/tsm.h b/include/linux/tsm.h
index 381c53244c83..7b6df827321b 100644
--- a/include/linux/tsm.h
+++ b/include/linux/tsm.h
@@ -123,4 +123,21 @@ int tsm_report_unregister(const struct tsm_report_ops *ops);
 struct tsm_dev *tsm_register(struct device *parent, struct pci_tsm_ops *ops);
 void tsm_unregister(struct tsm_dev *tsm_dev);
 struct tsm_dev *find_tsm_dev(int id);
+
+struct kvm;
+#ifdef CONFIG_TSM
+int tsm_bind(struct device *dev, struct kvm *kvm, u64 tdi_id);
+int tsm_unbind(struct device *dev);
+#else
+static inline int tsm_bind(struct device *dev, struct kvm *kvm, u64 tdi_id)
+{
+	return -EINVAL;
+}
+
+static inline int tsm_unbind(struct device *dev)
+{
+	return 0;
+}
+#endif
+
 #endif /* __TSM_H */
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index e998dfbd6960..66398efa31d1 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -57,6 +57,7 @@ enum {
 	IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92,
 	IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93,
 	IOMMUFD_CMD_HW_QUEUE_ALLOC = 0x94,
+	IOMMUFD_CMD_VDEVICE_TSM_OP = 0x95,
 };
 
 /**
@@ -1143,6 +1144,31 @@ struct iommu_vdevice_alloc {
 };
 #define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC)
 
+/**
+ * enum iommu_vdevice_tsm_op_type - operation type for struct iommu_vdevice_tsm_op
+ * @IOMMU_VDEVICE_TSM_BIND: Bind a vDevice to TSM
+ * @IOMMU_VDEVICE_TSM_UNBIND: Unbind a vDevice from TSM
+ */
+enum iommu_vdevice_tsm_op_type {
+	IOMMU_VDEVICE_TSM_BIND = 0x1,
+	IOMMU_VDEVICE_TSM_UNBIND,
+};
+
+/**
+ * struct iommu_vdevice_tsm_op - ioctl(IOMMU_VDEVICE_TSM_OP)
+ * @size: sizeof(struct iommu_vdevice_tsm_op)
+ * @type: Type of TSM operation. Must be defined in enum iommu_vdevice_tsm_op_type
+ * @flags: Must be 0
+ * @vdevice_id: Object handle for the vDevice. Returned from IOMMU_VDEVICE_ALLOC
+ */
+struct iommu_vdevice_tsm_op {
+	__u32 size;
+	__u32 type;
+	__u32 flags;
+	__u32 vdevice_id;
+};
+#define IOMMU_VDEVICE_TSM_OP _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_TSM_OP)
+
 /**
  * struct iommu_ioas_change_process - ioctl(VFIO_IOAS_CHANGE_PROCESS)
  * @size: sizeof(struct iommu_ioas_change_process)
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 5/5] iommufd/vdevice: add TSM request ioctl
From: Aneesh Kumar K.V (Arm) @ 2026-05-25 15:48 UTC (permalink / raw)
  To: linux-coco, iommu, linux-kernel, kvm
  Cc: Aneesh Kumar K.V (Arm), Alexey Kardashevskiy, Bjorn Helgaas,
	Dan Williams, Jason Gunthorpe, Joerg Roedel, Jonathan Cameron,
	Kevin Tian, Nicolin Chen, Samuel Ortiz, Steven Price,
	Suzuki K Poulose, Will Deacon, Xu Yilun, Shameer Kolothum,
	Paolo Bonzini, Tony Krowiak, Halil Pasic, Jason Herne,
	Harald Freudenberger, Holger Dengler, Heiko Carstens,
	Vasily Gorbik, Alexander Gordeev, Christian Borntraeger,
	Sven Schnelle, Alex Williamson, Matthew Rosato, Farhan Ali,
	Eric Farman, linux-s390
In-Reply-To: <20260525154816.1029642-1-aneesh.kumar@kernel.org>

Add IOMMU_VDEVICE_TSM_REQUEST for issuing TSM guest request/response
transactions against an iommufd vdevice.

The ioctl takes a vdevice_id plus request/response user buffers and length
fields, and forwards the request through tsm_guest_req() to the PCI TSM
backend. This provides the host-side passthrough path used by CoCo guests
for TSM device attestation and acceptance flows after the device has been
bound to TSM.

Also add the supporting tsm_guest_req() helper and associated TSM core
interface definitions.

Based on changes from: Alexey Kardashevskiy <aik@amd.com>

Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
 drivers/iommu/iommufd/iommufd_private.h |  6 ++
 drivers/iommu/iommufd/main.c            |  3 +
 drivers/iommu/iommufd/tsm.c             | 68 +++++++++++++++++++++
 drivers/virt/coco/tsm-core.c            | 39 ++++++++++++
 include/linux/pci-tsm.h                 |  9 +--
 include/linux/tsm.h                     | 25 ++++++++
 include/uapi/linux/iommufd.h            | 80 +++++++++++++++++++++++++
 7 files changed, 226 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 8eea0c2c332b..0080895e9e92 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -701,11 +701,17 @@ int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_hw_queue_destroy(struct iommufd_object *obj);
 #ifdef CONFIG_TSM
 int iommufd_vdevice_tsm_op_ioctl(struct iommufd_ucmd *ucmd);
+int iommufd_vdevice_tsm_req_ioctl(struct iommufd_ucmd *ucmd);
 #else
 static inline int iommufd_vdevice_tsm_op_ioctl(struct iommufd_ucmd *ucmd)
 {
 	return -EOPNOTSUPP;
 }
+
+static inline int iommufd_vdevice_tsm_req_ioctl(struct iommufd_ucmd *ucmd)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 static inline struct iommufd_vdevice *
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index d73e6b391c6f..5f49b546ec92 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -433,6 +433,7 @@ union ucmd_buffer {
 	struct iommu_vfio_ioas vfio_ioas;
 	struct iommu_viommu_alloc viommu;
 	struct iommu_vdevice_tsm_op tsm_op;
+	struct iommu_vdevice_tsm_req tsm_req;
 #ifdef CONFIG_IOMMUFD_TEST
 	struct iommu_test_cmd test;
 #endif
@@ -496,6 +497,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 struct iommu_viommu_alloc, out_viommu_id),
 	IOCTL_OP(IOMMU_VDEVICE_TSM_OP, iommufd_vdevice_tsm_op_ioctl,
 		 struct iommu_vdevice_tsm_op, vdevice_id),
+	IOCTL_OP(IOMMU_VDEVICE_TSM_REQ, iommufd_vdevice_tsm_req_ioctl,
+		 struct iommu_vdevice_tsm_req, resp_uptr),
 #ifdef CONFIG_IOMMUFD_TEST
 	IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last),
 #endif
diff --git a/drivers/iommu/iommufd/tsm.c b/drivers/iommu/iommufd/tsm.c
index 09ee668dbed9..342fbdb6a6b9 100644
--- a/drivers/iommu/iommufd/tsm.c
+++ b/drivers/iommu/iommufd/tsm.c
@@ -60,3 +60,71 @@ int iommufd_vdevice_tsm_op_ioctl(struct iommufd_ucmd *ucmd)
 	iommufd_put_object(ucmd->ictx, &vdev->obj);
 	return rc;
 }
+
+static bool iommufd_vdevice_tsm_req_scope_valid(u32 scope)
+{
+	if (scope > IOMMU_VDEVICE_TSM_REQ_SCOPE_PCI_LAST)
+		return false;
+
+	switch (scope) {
+	case IOMMU_VDEVICE_TSM_REQ_PCI_INFO:
+	case IOMMU_VDEVICE_TSM_REQ_PCI_STATE_CHANGE:
+	case IOMMU_VDEVICE_TSM_REQ_PCI_DEBUG_READ:
+	case IOMMU_VDEVICE_TSM_REQ_PCI_DEBUG_WRITE:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/**
+ * iommufd_vdevice_tsm_req_ioctl - Forward TSM requests
+ * @ucmd: user command data for IOMMU_VDEVICE_TSM_REQ
+ *
+ * Resolve @iommu_vdevice_tsm_req::vdevice_id to a vdevice and pass the
+ * request/response buffers to the TSM core.
+ *
+ * Return:
+ *  -errno on error.
+ *  positive residue if response/request bytes were left unconsumed.
+ *    if response buffer is provided, residue indicates the number of bytes
+ *    not used in response buffer
+ *    if there is no response buffer, residue indicates the number of bytes
+ *    not consumed in req buffer
+ *  0 otherwise.
+ */
+int iommufd_vdevice_tsm_req_ioctl(struct iommufd_ucmd *ucmd)
+{
+	int rc;
+	struct iommufd_vdevice *vdev;
+	struct iommu_vdevice_tsm_req *cmd = ucmd->cmd;
+	struct tsm_guest_req_info info = {
+		.scope = cmd->scope,
+		.req   = {
+			.user = u64_to_user_ptr(cmd->req_uptr),
+			.is_kernel = false,
+		},
+		.req_len = cmd->req_len,
+		.resp    =  {
+			.user = u64_to_user_ptr(cmd->resp_uptr),
+			.is_kernel = false,
+		},
+		.resp_len = cmd->resp_len,
+	};
+
+	if (cmd->__reserved)
+		return -EOPNOTSUPP;
+
+	if (!iommufd_vdevice_tsm_req_scope_valid(cmd->scope))
+		return -EINVAL;
+
+	vdev = iommufd_get_vdevice(ucmd->ictx, cmd->vdevice_id);
+	if (IS_ERR(vdev))
+		return PTR_ERR(vdev);
+
+	rc = tsm_guest_req(vdev->idev->dev, &info);
+
+	/* No inline response, hence we don't need to copy the response */
+	iommufd_put_object(ucmd->ictx, &vdev->obj);
+	return rc;
+}
diff --git a/drivers/virt/coco/tsm-core.c b/drivers/virt/coco/tsm-core.c
index 3870d08ffe0d..c24886851f9e 100644
--- a/drivers/virt/coco/tsm-core.c
+++ b/drivers/virt/coco/tsm-core.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/cleanup.h>
 #include <linux/pci-tsm.h>
+#include <uapi/linux/iommufd.h>
 
 static void tsm_release(struct device *);
 static const struct class tsm_class = {
@@ -127,6 +128,44 @@ int tsm_unbind(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(tsm_unbind);
 
+static int tsm_pci_req_scope(u32 scope, enum pci_tsm_req_scope *pci_scope)
+{
+	switch (scope) {
+	case IOMMU_VDEVICE_TSM_REQ_PCI_INFO:
+		*pci_scope = PCI_TSM_REQ_INFO;
+		return 0;
+	case IOMMU_VDEVICE_TSM_REQ_PCI_STATE_CHANGE:
+		*pci_scope = PCI_TSM_REQ_STATE_CHANGE;
+		return 0;
+	case IOMMU_VDEVICE_TSM_REQ_PCI_DEBUG_READ:
+		*pci_scope = PCI_TSM_REQ_DEBUG_READ;
+		return 0;
+	case IOMMU_VDEVICE_TSM_REQ_PCI_DEBUG_WRITE:
+		*pci_scope = PCI_TSM_REQ_DEBUG_WRITE;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+ssize_t tsm_guest_req(struct device *dev, struct tsm_guest_req_info *info)
+{
+	int ret;
+	enum pci_tsm_req_scope pci_scope;
+
+	if (!dev_is_pci(dev))
+		return -EINVAL;
+
+	ret = tsm_pci_req_scope(info->scope, &pci_scope);
+	if (ret)
+		return ret;
+
+	return pci_tsm_guest_req(to_pci_dev(dev), pci_scope, info->req,
+				 info->req_len, info->resp, info->resp_len,
+				 NULL);
+}
+EXPORT_SYMBOL_GPL(tsm_guest_req);
+
 static void tsm_release(struct device *dev)
 {
 	struct tsm_dev *tsm_dev = container_of(dev, typeof(*tsm_dev), dev);
diff --git a/include/linux/pci-tsm.h b/include/linux/pci-tsm.h
index a6435aba03f9..ec2236a7a279 100644
--- a/include/linux/pci-tsm.h
+++ b/include/linux/pci-tsm.h
@@ -4,6 +4,7 @@
 #include <linux/mutex.h>
 #include <linux/pci.h>
 #include <linux/sockptr.h>
+#include <uapi/linux/iommufd.h>
 
 struct pci_tsm;
 struct tsm_dev;
@@ -173,7 +174,7 @@ enum pci_tsm_req_scope {
 	 * typical TDISP collateral information like Device Interface Reports.
 	 * No device secrets are permitted, and no device state is changed.
 	 */
-	PCI_TSM_REQ_INFO = 0,
+	PCI_TSM_REQ_INFO = IOMMU_VDEVICE_TSM_REQ_PCI_INFO,
 	/**
 	 * @PCI_TSM_REQ_STATE_CHANGE: Request to change the TDISP state from
 	 * UNLOCKED->LOCKED, LOCKED->RUN, or other architecture specific state
@@ -181,14 +182,14 @@ enum pci_tsm_req_scope {
 	 * to TDISP) device / host state, configuration, or data change is
 	 * permitted.
 	 */
-	PCI_TSM_REQ_STATE_CHANGE = 1,
+	PCI_TSM_REQ_STATE_CHANGE = IOMMU_VDEVICE_TSM_REQ_PCI_STATE_CHANGE,
 	/**
 	 * @PCI_TSM_REQ_DEBUG_READ: Read-only request for debug information
 	 *
 	 * A method to facilitate TVM information retrieval outside of typical
 	 * TDISP operational requirements. No device secrets are permitted.
 	 */
-	PCI_TSM_REQ_DEBUG_READ = 2,
+	PCI_TSM_REQ_DEBUG_READ = IOMMU_VDEVICE_TSM_REQ_PCI_DEBUG_READ,
 	/**
 	 * @PCI_TSM_REQ_DEBUG_WRITE: Device state changes for debug purposes
 	 *
@@ -196,7 +197,7 @@ enum pci_tsm_req_scope {
 	 * the TDISP operational model. If allowed, requires CAP_SYS_RAW_IO, and
 	 * will taint the kernel.
 	 */
-	PCI_TSM_REQ_DEBUG_WRITE = 3,
+	PCI_TSM_REQ_DEBUG_WRITE = IOMMU_VDEVICE_TSM_REQ_PCI_DEBUG_WRITE,
 };
 
 #ifdef CONFIG_PCI_TSM
diff --git a/include/linux/tsm.h b/include/linux/tsm.h
index 7b6df827321b..6101a2a1db61 100644
--- a/include/linux/tsm.h
+++ b/include/linux/tsm.h
@@ -6,6 +6,7 @@
 #include <linux/types.h>
 #include <linux/uuid.h>
 #include <linux/device.h>
+#include <linux/sockptr.h>
 
 #define TSM_REPORT_INBLOB_MAX 64
 #define TSM_REPORT_OUTBLOB_MAX SZ_16M
@@ -128,6 +129,23 @@ struct kvm;
 #ifdef CONFIG_TSM
 int tsm_bind(struct device *dev, struct kvm *kvm, u64 tdi_id);
 int tsm_unbind(struct device *dev);
+
+/**
+ * struct tsm_guest_req_info - parameter for tsm_guest_req()
+ * @scope: iommufd allocated scope for tsm guest request
+ * @req: request data buffer filled by guest
+ * @req_len: the size of @req filled by guest
+ * @resp: response data buffer filled by host
+ * @resp_len: the size of @resp buffer filled by guest
+ */
+struct tsm_guest_req_info {
+	u32 scope;
+	sockptr_t req;
+	size_t req_len;
+	sockptr_t resp;
+	size_t resp_len;
+};
+ssize_t tsm_guest_req(struct device *dev, struct tsm_guest_req_info *info);
 #else
 static inline int tsm_bind(struct device *dev, struct kvm *kvm, u64 tdi_id)
 {
@@ -138,6 +156,13 @@ static inline int tsm_unbind(struct device *dev)
 {
 	return 0;
 }
+
+struct tsm_guest_req_info;
+static inline ssize_t tsm_guest_req(struct device *dev,
+		struct tsm_guest_req_info *info)
+{
+	return -EINVAL;
+}
 #endif
 
 #endif /* __TSM_H */
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 66398efa31d1..7953e99a9671 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -58,6 +58,7 @@ enum {
 	IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93,
 	IOMMUFD_CMD_HW_QUEUE_ALLOC = 0x94,
 	IOMMUFD_CMD_VDEVICE_TSM_OP = 0x95,
+	IOMMUFD_CMD_VDEVICE_TSM_REQ = 0x96,
 };
 
 /**
@@ -1373,4 +1374,83 @@ struct iommu_hw_queue_alloc {
 	__aligned_u64 length;
 };
 #define IOMMU_HW_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HW_QUEUE_ALLOC)
+
+/*
+ * TSM request scope values are allocated by iommufd. Each device-bus transport
+ * gets a range from this number space.
+ */
+#define IOMMU_VDEVICE_TSM_REQ_SCOPE_PCI_BASE	0
+
+enum iommu_vdevice_tsm_req_scope {
+	/*
+	 * Read-only, without side effects, request for typical TDISP
+	 * collateral information like Device Interface Reports. No device
+	 * secrets are permitted, and no device state is changed.
+	 */
+	IOMMU_VDEVICE_TSM_REQ_PCI_INFO =
+		IOMMU_VDEVICE_TSM_REQ_SCOPE_PCI_BASE,
+	/*
+	 * Request to change the TDISP state from UNLOCKED->LOCKED,
+	 * LOCKED->RUN, or other architecture specific state changes to
+	 * support those transitions for a TDI. No other device or host state,
+	 * configuration, or data change is permitted.
+	 */
+	IOMMU_VDEVICE_TSM_REQ_PCI_STATE_CHANGE =
+		IOMMU_VDEVICE_TSM_REQ_SCOPE_PCI_BASE + 1,
+	/*
+	 * Read-only request for debug information outside of typical TDISP
+	 * operational requirements. No device secrets are permitted.
+	 */
+	IOMMU_VDEVICE_TSM_REQ_PCI_DEBUG_READ =
+		IOMMU_VDEVICE_TSM_REQ_SCOPE_PCI_BASE + 2,
+	/*
+	 * Device state changes for debug purposes. The request may affect the
+	 * operational state of the device outside of the TDISP operational
+	 * model. If allowed, this requires CAP_SYS_RAW_IO and taints the
+	 * kernel.
+	 */
+	IOMMU_VDEVICE_TSM_REQ_PCI_DEBUG_WRITE =
+		IOMMU_VDEVICE_TSM_REQ_SCOPE_PCI_BASE + 3,
+	IOMMU_VDEVICE_TSM_REQ_SCOPE_PCI_LAST =
+		IOMMU_VDEVICE_TSM_REQ_PCI_DEBUG_WRITE,
+};
+
+/**
+ * struct iommu_vdevice_tsm_req - ioctl(IOMMU_VDEVICE_TSM_REQ)
+ * @size: sizeof(struct iommu_vdevice_tsm_req)
+ * @vdevice_id: vDevice ID the guest request is for
+ * @scope: One of enum iommu_vdevice_tsm_req_scope
+ * @req_len: Size in bytes of the input payload at @req_uptr
+ * @resp_len: Size in bytes of the output buffer at @resp_uptr
+ * @__reserved: Must be 0
+ * @req_uptr: Userspace pointer to the guest-provided request payload
+ * @resp_uptr: Userspace pointer to the guest response buffer
+ *
+ * Forward a TSM request to the TSM bound vDevice. This is intended for
+ * guest TSM/TDISP message transport where the host kernel only marshals
+ * bytes between userspace and the TSM implementation.
+ *
+ * Requests outside the iommufd allocated scope values are rejected. Lower
+ * layers may reject scope values that are valid in the global iommufd
+ * namespace, but not permitted for a specific bus.
+ *
+ * The request payload is read from @req_uptr/@req_len. If a response is
+ * expected, userspace provides @resp_uptr/@resp_len as writable storage for
+ * response bytes returned by the TSM path.
+ *
+ * The ioctl is only suitable for commands and results that the host kernel
+ * has no use, the host is only facilitating guest to TSM communication.
+ */
+struct iommu_vdevice_tsm_req {
+	__u32 size;
+	__u32 vdevice_id;
+	__u32 scope;
+	__u32 req_len;
+	__u32 resp_len;
+	__u32 __reserved;
+	__aligned_u64 req_uptr;
+	__aligned_u64 resp_uptr;
+};
+
+#define IOMMU_VDEVICE_TSM_REQ _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_TSM_REQ)
 #endif
-- 
2.43.0


^ permalink raw reply related

* [PATCH v6 02/11] x86/virt/tdx: Allocate page bitmap for Dynamic PAMT
From: Rick Edgecombe @ 2026-05-26  2:35 UTC (permalink / raw)
  To: bp, dave.hansen, hpa, kas, kvm, linux-coco, linux-doc,
	linux-kernel, mingo, nik.borisov, pbonzini, seanjc, tglx,
	vannapurve, x86, chao.gao, yan.y.zhao, kai.huang
  Cc: rick.p.edgecombe, Kirill A. Shutemov, Binbin Wu
In-Reply-To: <20260526023515.288829-1-rick.p.edgecombe@intel.com>

From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>

The TDX Physical Address Metadata Table (PAMT) holds data about the
physical memory used by TDX, and must be allocated by the kernel during
TDX module initialization.

The exact size of the required PAMT memory is determined by the TDX module
and may vary between TDX module versions. Currently it is approximately
0.4% of the system memory. This is a significant commitment, especially if
it is not known upfront whether the machine will run any TDX guests.

Each memory region that the TDX module might use needs three separate PAMT
allocations. One for each supported page size (1GB, 2MB, 4KB). The
TDX module supports a new feature designed to reduce PAMT overhead called
Dynamic PAMT. At a high level, Dynamic PAMT still has the 1GB and 2MB
levels allocated on TDX module initialization, but the 4KB level is
allocated dynamically during runtime.

However, in the details, Dynamic PAMT still needs some smaller per 4KB
page scoped data (currently it is 1 bit per page). The TDX module exposes
the number of bits as a separate piece of metadata than the 4KB static
allocation for regular PAMT. Although the size is enumerated differently,
it is handed to the TDX module in the same way the 4KB page size PAMT
allocation is for regular, non-dynamic PAMT.

Begin to implement Dynamic PAMT in the kernel by reading the bits-per-page
needed for Dynamic PAMT. Calculate the size needed for the bitmap,
and use it instead of the 4KB size determined for normal PAMT, in the case
of Dynamic PAMT.

Unlike the existing metadata reading code, this code is not generated by a
script. So adjust the comment to be more generic. Also, start to adopt a
more normal kernel code style without the tenary statements and if
conditionals assignments that the auto generated code has.

Assisted-by: Sashiko:claude-opus-4-6
Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Co-developed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---
v6:
 - Improve comment (Binbin)
 - Log tweaks
 - Mark tdmr_get_pamt_bitmap_sz() __init in response to upstream
   changes
 - Switch to more normal kernel code style, even though it differs from
   the existing auto generated code.
---
 arch/x86/include/asm/tdx.h                  |  5 +++++
 arch/x86/include/asm/tdx_global_metadata.h  |  3 +++
 arch/x86/virt/vmx/tdx/tdx.c                 | 19 ++++++++++++++++++-
 arch/x86/virt/vmx/tdx/tdx_global_metadata.c | 21 ++++++++++++++++++++-
 4 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 503f9a3f46d61..82dc27aecf297 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -149,6 +149,11 @@ static __always_inline u64 sc_retry(sc_func_t func, u64 fn,
 const char *tdx_dump_mce_info(struct mce *m);
 const struct tdx_sys_info *tdx_get_sysinfo(void);
 
+static inline bool tdx_supports_dynamic_pamt(const struct tdx_sys_info *sysinfo)
+{
+	return false; /* To be enabled when kernel is ready */
+}
+
 int tdx_guest_keyid_alloc(void);
 u32 tdx_get_nr_guest_keyids(void);
 void tdx_guest_keyid_free(unsigned int keyid);
diff --git a/arch/x86/include/asm/tdx_global_metadata.h b/arch/x86/include/asm/tdx_global_metadata.h
index 40689c8dc67eb..88040ddb51af4 100644
--- a/arch/x86/include/asm/tdx_global_metadata.h
+++ b/arch/x86/include/asm/tdx_global_metadata.h
@@ -21,6 +21,9 @@ struct tdx_sys_info_tdmr {
 	u16 pamt_4k_entry_size;
 	u16 pamt_2m_entry_size;
 	u16 pamt_1g_entry_size;
+
+	/* Optional metadata, if Dynamic PAMT is supported */
+	u8  pamt_page_bitmap_entry_bits;
 };
 
 struct tdx_sys_info_td_ctrl {
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 487f389f52f4b..9ebd192cb5c17 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -512,6 +512,18 @@ static __init int fill_out_tdmrs(struct list_head *tmb_list,
 	return 0;
 }
 
+static __init unsigned long tdmr_get_pamt_bitmap_sz(struct tdmr_info *tdmr)
+{
+	unsigned long pamt_sz, nr_pamt_entries;
+	int bits_per_entry;
+
+	bits_per_entry = tdx_sysinfo.tdmr.pamt_page_bitmap_entry_bits;
+	nr_pamt_entries = tdmr->size >> PAGE_SHIFT;
+	pamt_sz = DIV_ROUND_UP(nr_pamt_entries * bits_per_entry, BITS_PER_BYTE);
+
+	return PAGE_ALIGN(pamt_sz);
+}
+
 /*
  * Calculate PAMT size given a TDMR and a page size.  The returned
  * PAMT size is always aligned up to 4K page boundary.
@@ -579,7 +591,12 @@ static __init int tdmr_set_up_pamt(struct tdmr_info *tdmr,
 	 * Calculate the PAMT size for each TDX supported page size
 	 * and the total PAMT size.
 	 */
-	tdmr->pamt_4k_size = tdmr_get_pamt_sz(tdmr, TDX_PS_4K);
+	if (tdx_supports_dynamic_pamt(&tdx_sysinfo)) {
+		/* With Dynamic PAMT, PAMT_4K is replaced with a bitmap */
+		tdmr->pamt_4k_size = tdmr_get_pamt_bitmap_sz(tdmr);
+	} else {
+		tdmr->pamt_4k_size = tdmr_get_pamt_sz(tdmr, TDX_PS_4K);
+	}
 	tdmr->pamt_2m_size = tdmr_get_pamt_sz(tdmr, TDX_PS_2M);
 	tdmr->pamt_1g_size = tdmr_get_pamt_sz(tdmr, TDX_PS_1G);
 	tdmr_pamt_size = tdmr->pamt_4k_size + tdmr->pamt_2m_size + tdmr->pamt_1g_size;
diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
index c7db393a9cfb1..7e8e913463be1 100644
--- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
+++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Automatically generated functions to read TDX global metadata.
+ * Functions to read TDX global metadata.
  *
  * This file doesn't compile on its own as it lacks of inclusion
  * of SEAMCALL wrapper primitive which reads global metadata.
@@ -33,6 +33,18 @@ static __init int get_tdx_sys_info_features(struct tdx_sys_info_features *sysinf
 	return ret;
 }
 
+static __init int get_tdx_sys_info_tdmr_dpamt(struct tdx_sys_info_tdmr *sysinfo_tdmr)
+{
+	int ret;
+	u64 val;
+
+	ret = read_sys_metadata_field(0x9100000100000013, &val);
+	if (!ret)
+		sysinfo_tdmr->pamt_page_bitmap_entry_bits = val;
+
+	return ret;
+}
+
 static __init int get_tdx_sys_info_tdmr(struct tdx_sys_info_tdmr *sysinfo_tdmr)
 {
 	int ret = 0;
@@ -116,5 +128,12 @@ static __init int get_tdx_sys_info(struct tdx_sys_info *sysinfo)
 	ret = ret ?: get_tdx_sys_info_td_ctrl(&sysinfo->td_ctrl);
 	ret = ret ?: get_tdx_sys_info_td_conf(&sysinfo->td_conf);
 
+	/*
+	 * Don't treat a module that doesn't support Dynamic PAMT
+	 * as a failure. Only read the metadata optionally.
+	 */
+	if (!ret && tdx_supports_dynamic_pamt(sysinfo))
+		ret = get_tdx_sys_info_tdmr_dpamt(&sysinfo->tdmr);
+
 	return ret;
 }
-- 
2.54.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox