Linux Confidential Computing Development

Linux Confidential Computing Development
 help / color / mirror / Atom feed

* [PATCH v14 11/44] arm64: RMI: Check for RMI support at KVM init
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
  To: kvm, kvmarm
  Cc: Steven Price, Catalin Marinas, Marc Zyngier, Will Deacon,
	James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
	linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
	Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
	Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
	Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>

Check if the RMI support is sufficient for use in KVM. Specifically we
currently only support KVM in VHE mode when creating realm VMs.

Signed-off-by: Steven Price <steven.price@arm.com>
---
Changes since v13:
 * Most of the init has been moved out of the 'kvm' directory so this is
   much more basic now.
Changes since v12:
 * Drop check for 4k page size.
Changes since v11:
 * Reword slightly the comments on the realm states.
Changes since v10:
 * kvm_is_realm() no longer has a NULL check.
 * Rename from "rme" to "rmi" when referring to the RMM interface.
 * Check for RME (hardware) support before probing for RMI support.
Changes since v8:
 * No need to guard kvm_init_rme() behind 'in_hyp_mode'.
Changes since v6:
 * Improved message for an unsupported RMI ABI version.
Changes since v5:
 * Reword "unsupported" message from "host supports" to "we want" to
   clarify that 'we' are the 'host'.
Changes since v2:
 * Drop return value from kvm_init_rme(), it was always 0.
 * Rely on the RMM return value to identify whether the RSI ABI is
   compatible.
---
 arch/arm64/include/asm/kvm_host.h |  4 ++++
 arch/arm64/include/asm/kvm_rmi.h  | 17 +++++++++++++++++
 arch/arm64/include/asm/virt.h     |  1 +
 arch/arm64/kvm/Makefile           |  2 +-
 arch/arm64/kvm/arm.c              |  5 +++++
 arch/arm64/kvm/rmi.c              | 24 ++++++++++++++++++++++++
 6 files changed, 52 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/include/asm/kvm_rmi.h
 create mode 100644 arch/arm64/kvm/rmi.c

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 851f6171751c..3512696ed506 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -27,6 +27,7 @@
 #include <asm/fpsimd.h>
 #include <asm/kvm.h>
 #include <asm/kvm_asm.h>
+#include <asm/kvm_rmi.h>
 #include <asm/vncr_mapping.h>
 
 #define __KVM_HAVE_ARCH_INTC_INITIALIZED
@@ -424,6 +425,9 @@ struct kvm_arch {
 	/* Nested virtualization info */
 	struct dentry *debugfs_nv_dentry;
 #endif
+
+	bool is_realm;
+	struct realm realm;
 };
 
 struct kvm_vcpu_fault_info {
diff --git a/arch/arm64/include/asm/kvm_rmi.h b/arch/arm64/include/asm/kvm_rmi.h
new file mode 100644
index 000000000000..4936007947fd
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_rmi.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023-2025 ARM Ltd.
+ */
+
+#ifndef __ASM_KVM_RMI_H
+#define __ASM_KVM_RMI_H
+
+/**
+ * struct realm - Additional per VM data for a Realm
+ */
+struct realm {
+};
+
+void kvm_init_rmi(void);
+
+#endif /* __ASM_KVM_RMI_H */
diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h
index b546703c3ab9..92cec42952f4 100644
--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -87,6 +87,7 @@ void __hyp_reset_vectors(void);
 bool is_kvm_arm_initialised(void);
 
 DECLARE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
+DECLARE_STATIC_KEY_FALSE(kvm_rmi_is_available);
 
 static inline bool is_pkvm_initialized(void)
 {
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 59612d2f277c..ed3cf30eb06e 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -16,7 +16,7 @@ CFLAGS_handle_exit.o += -Wno-override-init
 kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
 	 inject_fault.o va_layout.o handle_exit.o config.o \
 	 guest.o debug.o reset.o sys_regs.o stacktrace.o \
-	 vgic-sys-reg-v3.o fpsimd.o pkvm.o \
+	 vgic-sys-reg-v3.o fpsimd.o pkvm.o rmi.o \
 	 arch_timer.o trng.o vmid.o emulate-nested.o nested.o at.o \
 	 vgic/vgic.o vgic/vgic-init.o \
 	 vgic/vgic-irqfd.o vgic/vgic-v2.o \
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 176cbe8baad3..247e03b33035 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -41,6 +41,7 @@
 #include <asm/kvm_nested.h>
 #include <asm/kvm_pkvm.h>
 #include <asm/kvm_ptrauth.h>
+#include <asm/kvm_rmi.h>
 #include <asm/sections.h>
 #include <asm/stacktrace/nvhe.h>
 
@@ -109,6 +110,8 @@ long kvm_get_cap_for_kvm_ioctl(unsigned int ioctl, long *ext)
 	return -EINVAL;
 }
 
+DEFINE_STATIC_KEY_FALSE(kvm_rmi_is_available);
+
 DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
 
 DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_base);
@@ -2975,6 +2978,8 @@ static __init int kvm_arm_init(void)
 
 	in_hyp_mode = is_kernel_in_hyp_mode();
 
+	kvm_init_rmi();
+
 	if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
 	    cpus_have_final_cap(ARM64_WORKAROUND_1508412))
 		kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \
diff --git a/arch/arm64/kvm/rmi.c b/arch/arm64/kvm/rmi.c
new file mode 100644
index 000000000000..1acc972a4b92
--- /dev/null
+++ b/arch/arm64/kvm/rmi.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023-2025 ARM Ltd.
+ */
+
+#include <linux/kvm_host.h>
+
+#include <asm/rmi_cmds.h>
+#include <asm/virt.h>
+
+void kvm_init_rmi(void)
+{
+	/*
+	 * TODO: Support Realm guests in nVHE mode, this will require adding
+	 * EL2 stub(s) for REC entry and possibly other things.
+	 */
+	if (!is_kernel_in_hyp_mode())
+		return;
+
+	if (!rmi_is_available())
+		return;
+
+	/* Future patch will enable static branch kvm_rmi_is_available */
+}
-- 
2.43.0


^ permalink raw reply related

* [PATCH v14 10/44] arm64: RMI: Add support for SRO
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
  To: kvm, kvmarm
  Cc: Steven Price, Catalin Marinas, Marc Zyngier, Will Deacon,
	James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
	linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
	Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
	Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
	Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>

RMM v2.0 introduces the concept of "Stateful RMI Operations" (SRO). This
means that an SMC can return with an operation still in progress. The
host is expected to continue the operation until it reaches a conclusion
(either success or failure). During this process the RMM can request
additional memory ('donate') or hand memory back to the host
('reclaim'). The host can request that an in-progress operation is
cancelled, but must still continue the operation until it has completed
(otherwise the incomplete operation may cause future RMM operations to
fail).

The SRO is tracked using a struct rmi_sro_state object which keeps track
of any memory which has been allocated but not yet consumed by the RMM
or reclaimed from the RMM. This allows the memory to be reused in a
future request within the same operation. It will also permit an
operation to be done in a context where memory allocation may be
difficult (e.g. atomic context) with the option to abort the operation
and retry the memory allocation outside of the atomic context. The
memory stored in the struct rmi_sro_state object can then be reused on
the subsequent attempt.

Signed-off-by: Steven Price <steven.price@arm.com>
---
v14:
 * SRO support has improved although is still not fully complete. The
   infrastructure has been moved out of KVM.
---
 arch/arm64/include/asm/rmi_cmds.h |   1 +
 arch/arm64/kernel/rmi.c           | 359 ++++++++++++++++++++++++++++++
 2 files changed, 360 insertions(+)

diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h
index eb213c8e6f26..1a7b0c8f1e38 100644
--- a/arch/arm64/include/asm/rmi_cmds.h
+++ b/arch/arm64/include/asm/rmi_cmds.h
@@ -35,6 +35,7 @@ struct rmi_sro_state {
 
 int rmi_delegate_range(phys_addr_t phys, unsigned long size);
 int rmi_undelegate_range(phys_addr_t phys, unsigned long size);
+int free_delegated_page(phys_addr_t phys);
 
 static inline int rmi_delegate_page(phys_addr_t phys)
 {
diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
index 08cef54acadb..a8107ca9bb6d 100644
--- a/arch/arm64/kernel/rmi.c
+++ b/arch/arm64/kernel/rmi.c
@@ -48,6 +48,365 @@ int rmi_undelegate_range(phys_addr_t phys, unsigned long size)
 	return ret;
 }
 
+static unsigned long donate_req_to_size(unsigned long donatereq)
+{
+	unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
+
+	switch (unit_size) {
+	case 0:
+		return PAGE_SIZE;
+	case 1:
+		return PMD_SIZE;
+	case 2:
+		return PUD_SIZE;
+	case 3:
+		return P4D_SIZE;
+	}
+	unreachable();
+}
+
+static void rmi_smccc_invoke(struct arm_smccc_1_2_regs *regs_in,
+			     struct arm_smccc_1_2_regs *regs_out)
+{
+	struct arm_smccc_1_2_regs regs = *regs_in;
+	unsigned long status;
+
+	do {
+		arm_smccc_1_2_invoke(&regs, regs_out);
+		status = RMI_RETURN_STATUS(regs_out->a0);
+	} while (status == RMI_BUSY || status == RMI_BLOCKED);
+}
+
+int free_delegated_page(phys_addr_t phys)
+{
+	if (WARN_ON(rmi_undelegate_page(phys))) {
+		/* Undelegate failed: leak the page */
+		return -EBUSY;
+	}
+
+	free_page((unsigned long)phys_to_virt(phys));
+
+	return 0;
+}
+
+static int rmi_sro_ensure_capacity(struct rmi_sro_state *sro,
+				   unsigned long count)
+{
+	if (WARN_ON_ONCE(sro->addr_count > RMI_MAX_ADDR_LIST))
+		return -EOVERFLOW;
+
+	if (count > RMI_MAX_ADDR_LIST - sro->addr_count)
+		return -ENOSPC;
+
+	return 0;
+}
+
+static int rmi_sro_donate_contig(struct rmi_sro_state *sro,
+				 unsigned long sro_handle,
+				 unsigned long donatereq,
+				 struct arm_smccc_1_2_regs *out_regs,
+				 gfp_t gfp)
+{
+	unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
+	unsigned long unit_size_bytes = donate_req_to_size(donatereq);
+	unsigned long count = RMI_DONATE_COUNT(donatereq);
+	unsigned long state = RMI_DONATE_STATE(donatereq);
+	unsigned long size = unit_size_bytes * count;
+	unsigned long addr_range;
+	int ret;
+	void *virt;
+	phys_addr_t phys;
+	struct arm_smccc_1_2_regs regs = {
+		SMC_RMI_OP_MEM_DONATE,
+		sro_handle
+	};
+
+	for (int i = 0; i < sro->addr_count; i++) {
+		unsigned long entry = sro->addr_list[i];
+
+		if (RMI_ADDR_RANGE_SIZE(entry) == unit_size &&
+		    RMI_ADDR_RANGE_COUNT(entry) == count &&
+		    RMI_ADDR_RANGE_STATE(entry) == state) {
+			sro->addr_count--;
+			swap(sro->addr_list[sro->addr_count],
+			     sro->addr_list[i]);
+
+			goto out;
+		}
+	}
+
+	ret = rmi_sro_ensure_capacity(sro, 1);
+	if (ret)
+		return ret;
+
+	virt = alloc_pages_exact(size, gfp);
+	if (!virt)
+		return -ENOMEM;
+	phys = virt_to_phys(virt);
+
+	if (state == RMI_OP_MEM_DELEGATED) {
+		if (rmi_delegate_range(phys, size)) {
+			free_pages_exact(virt, size);
+			return -ENXIO;
+		}
+	}
+
+	addr_range = phys & RMI_ADDR_RANGE_ADDR_MASK;
+	FIELD_MODIFY(RMI_ADDR_RANGE_SIZE_MASK, &addr_range, unit_size);
+	FIELD_MODIFY(RMI_ADDR_RANGE_COUNT_MASK, &addr_range, count);
+	FIELD_MODIFY(RMI_ADDR_RANGE_STATE_MASK, &addr_range, state);
+
+	sro->addr_list[sro->addr_count] = addr_range;
+
+out:
+	regs.a2 = virt_to_phys(&sro->addr_list[sro->addr_count]);
+	regs.a3 = 1;
+	rmi_smccc_invoke(&regs, out_regs);
+
+	unsigned long donated_granules = out_regs->a1;
+	unsigned long donated_size = donated_granules << PAGE_SHIFT;
+
+	if (donated_granules == 0) {
+		/* No pages used by the RMM */
+		sro->addr_count++;
+	} else if (donated_size < size) {
+		phys = sro->addr_list[sro->addr_count] & RMI_ADDR_RANGE_ADDR_MASK;
+
+		/* Not all granules used by the RMM, free the remaining pages */
+		for (long i = donated_size; i < size; i += PAGE_SIZE) {
+			if (state == RMI_OP_MEM_DELEGATED)
+				free_delegated_page(phys + i);
+			else
+				__free_page(phys_to_page(phys + i));
+		}
+	}
+
+	return 0;
+}
+
+static int rmi_sro_donate_noncontig(struct rmi_sro_state *sro,
+				    unsigned long sro_handle,
+				    unsigned long donatereq,
+				    struct arm_smccc_1_2_regs *out_regs,
+				    gfp_t gfp)
+{
+	unsigned long unit_size = RMI_DONATE_SIZE(donatereq);
+	unsigned long unit_size_bytes = donate_req_to_size(donatereq);
+	unsigned long count = RMI_DONATE_COUNT(donatereq);
+	unsigned long state = RMI_DONATE_STATE(donatereq);
+	unsigned long found = 0;
+	unsigned long addr_list_start = sro->addr_count;
+	int ret;
+	struct arm_smccc_1_2_regs regs = {
+		SMC_RMI_OP_MEM_DONATE,
+		sro_handle
+	};
+
+	for (int i = 0; i < addr_list_start && found < count; i++) {
+		unsigned long entry = sro->addr_list[i];
+
+		if (RMI_ADDR_RANGE_SIZE(entry) == unit_size &&
+		    RMI_ADDR_RANGE_COUNT(entry) == 1 &&
+		    RMI_ADDR_RANGE_STATE(entry) == state) {
+			addr_list_start--;
+			swap(sro->addr_list[addr_list_start],
+			     sro->addr_list[i]);
+			found++;
+			i--;
+		}
+	}
+
+	ret = rmi_sro_ensure_capacity(sro, count - found);
+	if (ret)
+		return ret;
+
+	while (found < count) {
+		unsigned long addr_range;
+		void *virt = alloc_pages_exact(unit_size_bytes, gfp);
+		phys_addr_t phys;
+
+		if (!virt)
+			return -ENOMEM;
+
+		phys = virt_to_phys(virt);
+
+		if (state == RMI_OP_MEM_DELEGATED) {
+			if (rmi_delegate_range(phys, unit_size_bytes)) {
+				free_pages_exact(virt, unit_size_bytes);
+				return -ENXIO;
+			}
+		}
+
+		addr_range = phys & RMI_ADDR_RANGE_ADDR_MASK;
+		FIELD_MODIFY(RMI_ADDR_RANGE_SIZE_MASK, &addr_range, unit_size);
+		FIELD_MODIFY(RMI_ADDR_RANGE_COUNT_MASK, &addr_range, 1);
+		FIELD_MODIFY(RMI_ADDR_RANGE_STATE_MASK, &addr_range, state);
+
+		sro->addr_list[sro->addr_count++] = addr_range;
+		found++;
+	}
+
+	regs.a2 = virt_to_phys(&sro->addr_list[addr_list_start]);
+	regs.a3 = found;
+	rmi_smccc_invoke(&regs, out_regs);
+
+	unsigned long donated_granules = out_regs->a1;
+
+	if (WARN_ON(donated_granules & ((unit_size_bytes >> PAGE_SHIFT) - 1))) {
+		/*
+		 * FIXME: RMM has only consumed part of a huge page, this leaks
+		 * the rest of the huge page
+		 */
+		donated_granules = ALIGN(donated_granules,
+					 (unit_size_bytes >> PAGE_SHIFT));
+	}
+	unsigned long donated_blocks = donated_granules / (unit_size_bytes >> PAGE_SHIFT);
+
+	if (WARN_ON(donated_blocks > found))
+		donated_blocks = found;
+
+	unsigned long undonated_blocks = found - donated_blocks;
+
+	while (donated_blocks && undonated_blocks) {
+		sro->addr_count--;
+		swap(sro->addr_list[addr_list_start],
+		     sro->addr_list[sro->addr_count]);
+		addr_list_start++;
+
+		donated_blocks--;
+		undonated_blocks--;
+	}
+	sro->addr_count -= donated_blocks;
+
+	return 0;
+}
+
+static int rmi_sro_donate(struct rmi_sro_state *sro,
+			  unsigned long sro_handle,
+			  unsigned long donatereq,
+			  struct arm_smccc_1_2_regs *regs,
+			  gfp_t gfp)
+{
+	unsigned long count = RMI_DONATE_COUNT(donatereq);
+
+	if (WARN_ON(!count))
+		return 0;
+
+	if (RMI_DONATE_CONTIG(donatereq)) {
+		return rmi_sro_donate_contig(sro, sro_handle, donatereq,
+					     regs, gfp);
+	} else {
+		return rmi_sro_donate_noncontig(sro, sro_handle, donatereq,
+						regs, gfp);
+	}
+}
+
+static int rmi_sro_reclaim(struct rmi_sro_state *sro,
+			   unsigned long sro_handle,
+			   struct arm_smccc_1_2_regs *out_regs)
+{
+	unsigned long capacity;
+	struct arm_smccc_1_2_regs regs;
+	int ret;
+
+	ret = rmi_sro_ensure_capacity(sro, 1);
+	if (ret)
+		rmi_sro_free(sro);
+
+	capacity = RMI_MAX_ADDR_LIST - sro->addr_count;
+
+	regs = (struct arm_smccc_1_2_regs){
+		SMC_RMI_OP_MEM_RECLAIM,
+		sro_handle,
+		virt_to_phys(&sro->addr_list[sro->addr_count]),
+		capacity
+	};
+	rmi_smccc_invoke(&regs, out_regs);
+
+	if (WARN_ON_ONCE(out_regs->a1 > capacity))
+		out_regs->a1 = capacity;
+
+	sro->addr_count += out_regs->a1;
+
+	return 0;
+}
+
+void rmi_sro_free(struct rmi_sro_state *sro)
+{
+	for (int i = 0; i < sro->addr_count; i++) {
+		unsigned long entry = sro->addr_list[i];
+		unsigned long addr = RMI_ADDR_RANGE_ADDR(entry);
+		unsigned long unit_size = RMI_ADDR_RANGE_SIZE(entry);
+		unsigned long count = RMI_ADDR_RANGE_COUNT(entry);
+		unsigned long state = RMI_ADDR_RANGE_STATE(entry);
+		unsigned long size = donate_req_to_size(unit_size) * count;
+
+		if (state == RMI_OP_MEM_DELEGATED) {
+			if (WARN_ON(rmi_undelegate_range(addr, size))) {
+				/* Leak the pages */
+				continue;
+			}
+		}
+		free_pages_exact(phys_to_virt(addr), size);
+	}
+
+	sro->addr_count = 0;
+}
+
+unsigned long rmi_sro_execute(struct rmi_sro_state *sro, gfp_t gfp)
+{
+	unsigned long sro_handle;
+	struct arm_smccc_1_2_regs regs;
+	struct arm_smccc_1_2_regs *regs_in = &sro->regs;
+
+	rmi_smccc_invoke(regs_in, &regs);
+
+	sro_handle = regs.a1;
+
+	while (RMI_RETURN_STATUS(regs.a0) == RMI_INCOMPLETE) {
+		bool can_cancel = RMI_RETURN_CAN_CANCEL(regs.a0);
+		int ret = 0;	/* REQ_NONE never assigns ret; must not be tested uninitialised */
+
+		switch (RMI_RETURN_MEMREQ(regs.a0)) {
+		case RMI_OP_MEM_REQ_NONE:
+			regs = (struct arm_smccc_1_2_regs){
+				SMC_RMI_OP_CONTINUE, sro_handle, 0
+			};
+			rmi_smccc_invoke(&regs, &regs);
+			break;
+		case RMI_OP_MEM_REQ_DONATE:
+			ret = rmi_sro_donate(sro, sro_handle, regs.a2, &regs,
+					     gfp);
+			break;
+		case RMI_OP_MEM_REQ_RECLAIM:
+			ret = rmi_sro_reclaim(sro, sro_handle, &regs);
+			break;
+		default:
+			ret = WARN_ON(1);
+			break;
+		}
+
+		if (ret) {
+			if (can_cancel) {
+				/*
+				 * FIXME: Handle cancelling properly!
+				 *
+				 * If the operation has failed due to memory
+				 * allocation failure then the information on
+				 * the memory allocation should be saved, so
+				 * that the allocation can be repeated outside
+				 * of any context which prevented the
+				 * allocation.
+				 */
+			}
+			if (WARN_ON(ret))
+				return ret;
+		}
+	}
+
+	return regs.a0;
+}
+
 static int rmi_check_version(void)
 {
 	struct arm_smccc_res res;
-- 
2.43.0


^ permalink raw reply related

* [PATCH v14 09/44] arm64: RMI: Provide functions to delegate/undelegate ranges of memory
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
  To: kvm, kvmarm
  Cc: Steven Price, Catalin Marinas, Marc Zyngier, Will Deacon,
	James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
	linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
	Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
	Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
	Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>

The RMM requires memory is 'delegated' to it so that it can be used
either for a realm guest or for various tracking purposes within the RMM
(e.g. for metadata or page tables). Memory that has been delegated
cannot be accessed by the host (it will result in a Granule Protection
Fault).

Undelegation may fail if the memory is still in use by the RMM. This
shouldn't happen (Linux should ensure it has destroyed the RMM objects
before attempting to undelegate). In the event that it does happen this
points to a programming bug and the only reasonable approach is for the
physical pages to be leaked - it is up to the caller of
rmi_undelegate_range() to handle this.

Signed-off-by: Steven Price <steven.price@arm.com>
---
v14:
 * Split into separate patch and moved out of KVM
---
 arch/arm64/include/asm/rmi_cmds.h | 13 +++++++++++
 arch/arm64/kernel/rmi.c           | 36 +++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)

diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h
index 9078a2920a7c..eb213c8e6f26 100644
--- a/arch/arm64/include/asm/rmi_cmds.h
+++ b/arch/arm64/include/asm/rmi_cmds.h
@@ -33,6 +33,19 @@ struct rmi_sro_state {
 } while (RMI_RETURN_STATUS(res.a0) == RMI_BUSY ||			\
 	 RMI_RETURN_STATUS(res.a0) == RMI_BLOCKED)
 
+int rmi_delegate_range(phys_addr_t phys, unsigned long size);
+int rmi_undelegate_range(phys_addr_t phys, unsigned long size);
+
+static inline int rmi_delegate_page(phys_addr_t phys)
+{
+	return rmi_delegate_range(phys, PAGE_SIZE);
+}
+
+static inline int rmi_undelegate_page(phys_addr_t phys)
+{
+	return rmi_undelegate_range(phys, PAGE_SIZE);
+}
+
 bool rmi_is_available(void);
 
 unsigned long rmi_sro_execute(struct rmi_sro_state *sro, gfp_t gfp);
diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
index 52a415e99500..08cef54acadb 100644
--- a/arch/arm64/kernel/rmi.c
+++ b/arch/arm64/kernel/rmi.c
@@ -12,6 +12,42 @@ static bool arm64_rmi_is_available;
 unsigned long rmm_feat_reg0;
 unsigned long rmm_feat_reg1;
 
+int rmi_delegate_range(phys_addr_t phys, unsigned long size)
+{
+	unsigned long ret = 0;
+	unsigned long top = phys + size;
+	unsigned long out_top;
+
+	while (phys < top) {
+		ret = rmi_granule_range_delegate(phys, top, &out_top);
+		if (ret == RMI_SUCCESS)
+			phys = out_top;
+		else if (ret != RMI_BUSY && ret != RMI_BLOCKED)
+			return ret;
+	}
+
+	return ret;
+}
+
+int rmi_undelegate_range(phys_addr_t phys, unsigned long size)
+{
+	unsigned long ret = 0;
+	unsigned long top = phys + size;
+	unsigned long out_top;
+
+	WARN_ON(size == 0);
+
+	while (phys < top) {
+		ret = rmi_granule_range_undelegate(phys, top, &out_top);
+		if (ret == RMI_SUCCESS)
+			phys = out_top;
+		else if (ret != RMI_BUSY && ret != RMI_BLOCKED)
+			return ret;
+	}
+
+	return ret;
+}
+
 static int rmi_check_version(void)
 {
 	struct arm_smccc_res res;
-- 
2.43.0


^ permalink raw reply related

* [PATCH v14 08/44] arm64: RMI: Ensure that the RMM has GPT entries for memory
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
  To: kvm, kvmarm
  Cc: Steven Price, Catalin Marinas, Marc Zyngier, Will Deacon,
	James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
	linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
	Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
	Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
	Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>

The RMM maintains the state of all the granules in the system to make
sure that the host is abiding by the rules. This state can be maintained
at different granularity, per page (TRACKING_FINE) or per region
(TRACKING_COARSE). The region size depends on the underlying
"RMI_GRANULE_SIZE". For a "coarse" region all pages in the region must
be of the same state; this implies we need to have "fine" tracking for
DRAM, so that we can delegate individual pages.

For now we only support a statically carved out memory for tracking
granules for the "fine" regions. This can be extended in the future to
allow modifying the tracking granularity and remove the need for a
static allocation.

Similarly, the firmware may create L0 GPT entries describing the total
address space. But if we change the "PAS" (Physical Address Space) of a
granule then the firmware may need to create L1 tables to track the PAS
at a finer granularity.

Note: support is currently missing for SROs which means that if the RMM
needs memory donating this will fail (and render CCA unusable in Linux).
This effectively means that the L1 GPT tables must be created before
Linux starts.

Signed-off-by: Steven Price <steven.price@arm.com>
---
Changes since v13:
 * Moved out of KVM
---
 arch/arm64/include/asm/rmi_cmds.h |   2 +
 arch/arm64/kernel/rmi.c           | 103 ++++++++++++++++++++++++++++++
 2 files changed, 105 insertions(+)

diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h
index 9179934925c5..9078a2920a7c 100644
--- a/arch/arm64/include/asm/rmi_cmds.h
+++ b/arch/arm64/include/asm/rmi_cmds.h
@@ -33,6 +33,8 @@ struct rmi_sro_state {
 } while (RMI_RETURN_STATUS(res.a0) == RMI_BUSY ||			\
 	 RMI_RETURN_STATUS(res.a0) == RMI_BLOCKED)
 
+bool rmi_is_available(void);
+
 unsigned long rmi_sro_execute(struct rmi_sro_state *sro, gfp_t gfp);
 void rmi_sro_free(struct rmi_sro_state *sro);
 
diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
index a14ead5dedda..52a415e99500 100644
--- a/arch/arm64/kernel/rmi.c
+++ b/arch/arm64/kernel/rmi.c
@@ -7,6 +7,8 @@
 
 #include <asm/rmi_cmds.h>
 
+static bool arm64_rmi_is_available;
+
 unsigned long rmm_feat_reg0;
 unsigned long rmm_feat_reg1;
 
@@ -88,6 +90,102 @@ static int rmi_configure(void)
 	return 0;
 }
 
+/*
+ * For now we set the tracking_region_size to 0 for RMI_RMM_CONFIG_SET().
+ * TODO: Support other tracking sizes (via Kconfig option).
+ */
+#ifdef CONFIG_PAGE_SIZE_4KB
+#define RMM_GRANULE_TRACKING_SIZE	SZ_1G
+#elif defined(CONFIG_PAGE_SIZE_16KB)
+#define RMM_GRANULE_TRACKING_SIZE	SZ_32M
+#elif defined(CONFIG_PAGE_SIZE_64KB)
+#define RMM_GRANULE_TRACKING_SIZE	SZ_512M
+#endif
+
+/*
+ * Make sure the area is tracked by RMM at FINE granularity.
+ * We do not support changing the tracking yet.
+ */
+static int rmi_verify_memory_tracking(phys_addr_t start, phys_addr_t end)
+{
+	while (start < end) {
+		unsigned long ret, category, state, next;
+
+		ret = rmi_granule_tracking_get(start, end, &category, &state, &next);
+		if (ret != RMI_SUCCESS ||
+		    state != RMI_TRACKING_FINE ||
+		    category != RMI_MEM_CATEGORY_CONVENTIONAL) {
+			/* TODO: Set granule tracking in this case */
+			pr_err("Granule tracking for region isn't fine/conventional: %llx\n",
+			       start);
+			return -ENODEV;
+		}
+		start = next;
+	}
+
+	return 0;
+}
+
+static unsigned long rmi_l0gpt_size(void)
+{
+	return 1UL << (30 + FIELD_GET(RMI_FEATURE_REGISTER_1_L0GPTSZ,
+				      rmm_feat_reg1));
+}
+
+static int rmi_create_gpts(phys_addr_t start, phys_addr_t end)
+{
+	unsigned long l0gpt_sz = rmi_l0gpt_size();
+
+	start = ALIGN_DOWN(start, l0gpt_sz);
+	end = ALIGN(end, l0gpt_sz);
+
+	while (start < end) {
+		int ret = rmi_gpt_l1_create(start);
+
+		/*
+		 * Make sure the L1 GPT tables are created for the region.
+		 * RMI_ERROR_GPT indicates the L1 table already exists.
+		 */
+		if (ret && ret != RMI_ERROR_GPT) {
+			/*
+			 * FIXME: Handle SRO so that memory can be donated for
+			 * the tables.
+			 */
+			pr_err("GPT Level1 table missing for %llx\n", start);
+			return -ENOMEM;
+		}
+		start += l0gpt_sz;
+	}
+
+	return 0;
+}
+
+static int rmi_init_metadata(void)
+{
+	phys_addr_t start, end;
+	const struct memblock_region *r;
+
+	for_each_mem_region(r) {
+		int ret;
+
+		start = memblock_region_memory_base_pfn(r) << PAGE_SHIFT;
+		end = memblock_region_memory_end_pfn(r) << PAGE_SHIFT;
+		ret = rmi_verify_memory_tracking(start, end);
+		if (ret)
+			return ret;
+		ret = rmi_create_gpts(start, end);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+bool rmi_is_available(void)
+{
+	return arm64_rmi_is_available;
+}
+
 static int __init arm64_init_rmi(void)
 {
 	/* Continue without realm support if we can't agree on a version */
@@ -101,6 +199,11 @@ static int __init arm64_init_rmi(void)
 
 	if (rmi_configure())
 		return 0;
+	if (rmi_init_metadata())
+		return 0;
+
+	arm64_rmi_is_available = true;
+	pr_info("RMI configured");
 
 	return 0;
 }
-- 
2.43.0


^ permalink raw reply related

* [PATCH v14 07/44] arm64: RMI: Configure the RMM with the host's page size
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
  To: kvm, kvmarm
  Cc: Steven Price, Catalin Marinas, Marc Zyngier, Will Deacon,
	James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
	linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
	Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
	Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
	Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>

RMM v2.0 brings the ability to set the RMM's granule size. Check the
feature registers and configure the RMM so that it matches the host's
page size. This means that operations can be done with a granularity
equal to PAGE_SIZE.

Signed-off-by: Steven Price <steven.price@arm.com>
---
Changes since v13:
 * Moved out of KVM.
---
 arch/arm64/kernel/rmi.c | 42 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
index 99c1ccc35c11..a14ead5dedda 100644
--- a/arch/arm64/kernel/rmi.c
+++ b/arch/arm64/kernel/rmi.c
@@ -49,6 +49,45 @@ static int rmi_check_version(void)
 	return 0;
 }
 
+static int rmi_configure(void)
+{
+	struct rmm_config *config __free(free_page) = NULL;
+	unsigned long ret;
+
+	config = (struct rmm_config *)get_zeroed_page(GFP_KERNEL);
+	if (!config)
+		return -ENOMEM;
+
+	switch (PAGE_SIZE) {
+	case SZ_4K:
+		config->rmi_granule_size = RMI_GRANULE_SIZE_4KB;
+		break;
+	case SZ_16K:
+		config->rmi_granule_size = RMI_GRANULE_SIZE_16KB;
+		break;
+	case SZ_64K:
+		config->rmi_granule_size = RMI_GRANULE_SIZE_64KB;
+		break;
+	default:
+		pr_err("Unsupported PAGE_SIZE for RMM\n");
+		return -EINVAL;
+	}
+
+	ret = rmi_rmm_config_set(virt_to_phys(config));
+	if (ret) {
+		pr_err("RMM config set failed\n");
+		return -EINVAL;
+	}
+
+	ret = rmi_rmm_activate();
+	if (ret) {
+		pr_err("RMM activate failed\n");
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
 static int __init arm64_init_rmi(void)
 {
 	/* Continue without realm support if we can't agree on a version */
@@ -60,6 +99,9 @@ static int __init arm64_init_rmi(void)
 	if (WARN_ON(rmi_features(1, &rmm_feat_reg1)))
 		return 0;
 
+	if (rmi_configure())
+		return 0;
+
 	return 0;
 }
 subsys_initcall(arm64_init_rmi);
-- 
2.43.0


^ permalink raw reply related

* [PATCH v14 06/44] arm64: RMI: Check for RMI support at init
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
  To: kvm, kvmarm
  Cc: Steven Price, Catalin Marinas, Marc Zyngier, Will Deacon,
	James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
	linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
	Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
	Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
	Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>

Query the RMI version number and check if it is a compatible version.
The first two feature registers are read and exposed for future code to
use.

Signed-off-by: Steven Price <steven.price@arm.com>
---
v14:
 * This moves the basic RMI setup into the 'kernel' directory. This is
   because RMI will be used for some features outside of KVM so should
   be available even if KVM isn't compiled in.
---
 arch/arm64/include/asm/rmi_cmds.h |  3 ++
 arch/arm64/kernel/Makefile        |  2 +-
 arch/arm64/kernel/cpufeature.c    |  1 +
 arch/arm64/kernel/rmi.c           | 65 +++++++++++++++++++++++++++++++
 4 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/kernel/rmi.c

diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h
index 04f7066894e9..9179934925c5 100644
--- a/arch/arm64/include/asm/rmi_cmds.h
+++ b/arch/arm64/include/asm/rmi_cmds.h
@@ -10,6 +10,9 @@
 
 #include <asm/rmi_smc.h>
 
+extern unsigned long rmm_feat_reg0;
+extern unsigned long rmm_feat_reg1;
+
 struct rtt_entry {
 	unsigned long walk_level;
 	unsigned long desc;
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 74b76bb70452..d68f351aae75 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -34,7 +34,7 @@ obj-y			:= debug-monitors.o entry.o irq.o fpsimd.o		\
 			   cpufeature.o alternative.o cacheinfo.o		\
 			   smp.o smp_spin_table.o topology.o smccc-call.o	\
 			   syscall.o proton-pack.o idle.o patching.o pi/	\
-			   rsi.o jump_label.o
+			   rsi.o jump_label.o rmi.o
 
 obj-$(CONFIG_COMPAT)			+= sys32.o signal32.o			\
 					   sys_compat.o
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 6d53bb15cf7b..8bdd95a8c2de 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -292,6 +292,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar3[] = {
 static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV3_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV2_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_RME_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_DIT_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_AMU_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_MPAM_SHIFT, 4, 0),
diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
new file mode 100644
index 000000000000..99c1ccc35c11
--- /dev/null
+++ b/arch/arm64/kernel/rmi.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023-2025 ARM Ltd.
+ */
+
+#include <linux/memblock.h>
+
+#include <asm/rmi_cmds.h>
+
+unsigned long rmm_feat_reg0;
+unsigned long rmm_feat_reg1;
+
+static int rmi_check_version(void)
+{
+	struct arm_smccc_res res;
+	unsigned short version_major, version_minor;
+	unsigned long host_version = RMI_ABI_VERSION(RMI_ABI_MAJOR_VERSION,
+						     RMI_ABI_MINOR_VERSION);
+	unsigned long aa64pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
+
+	/* If RME isn't supported, then RMI can't be */
+	if (cpuid_feature_extract_unsigned_field(aa64pfr0, ID_AA64PFR0_EL1_RME_SHIFT) == 0)
+		return -ENXIO;
+
+	arm_smccc_1_1_invoke(SMC_RMI_VERSION, host_version, &res);
+
+	if (res.a0 == SMCCC_RET_NOT_SUPPORTED)
+		return -ENXIO;
+
+	version_major = RMI_ABI_VERSION_GET_MAJOR(res.a1);
+	version_minor = RMI_ABI_VERSION_GET_MINOR(res.a1);
+
+	if (res.a0 != RMI_SUCCESS) {
+		unsigned short high_version_major, high_version_minor;
+
+		high_version_major = RMI_ABI_VERSION_GET_MAJOR(res.a2);
+		high_version_minor = RMI_ABI_VERSION_GET_MINOR(res.a2);
+
+		pr_err("Unsupported RMI ABI (v%d.%d - v%d.%d) we want v%d.%d\n",
+		       version_major, version_minor,
+		       high_version_major, high_version_minor,
+		       RMI_ABI_MAJOR_VERSION,
+		       RMI_ABI_MINOR_VERSION);
+		return -ENXIO;
+	}
+
+	pr_info("RMI ABI version %d.%d\n", version_major, version_minor);
+
+	return 0;
+}
+
+static int __init arm64_init_rmi(void)
+{
+	/* Continue without realm support if we can't agree on a version */
+	if (rmi_check_version())
+		return 0;
+
+	if (WARN_ON(rmi_features(0, &rmm_feat_reg0)))
+		return 0;
+	if (WARN_ON(rmi_features(1, &rmm_feat_reg1)))
+		return 0;
+
+	return 0;
+}
+subsys_initcall(arm64_init_rmi);
-- 
2.43.0


^ permalink raw reply related

* [PATCH v14 05/44] arm64: RMI: Add wrappers for RMI calls
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
  To: kvm, kvmarm
  Cc: Steven Price, Catalin Marinas, Marc Zyngier, Will Deacon,
	James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
	linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
	Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
	Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
	Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>

The wrappers make the call sites easier to read and deal with the
boilerplate of handling the error codes from the RMM.

Signed-off-by: Steven Price <steven.price@arm.com>
---
Changes from v13:
 * Update to RMM v2.0-bet1 spec including some SRO support (there are
   still some FIXMEs where SRO support is incomplete).
Changes from v12:
 * Update to RMM v2.0 specification
Changes from v8:
 * Switch from arm_smccc_1_2_smc() to arm_smccc_1_2_invoke() in
   rmi_rtt_read_entry() for consistency.
Changes from v7:
 * Minor renaming of parameters and updated comments
Changes from v5:
 * Further improve comments
Changes from v4:
 * Improve comments
Changes from v2:
 * Make output arguments optional.
 * Mask RIPAS value in rmi_rtt_read_entry()
 * Drop unused rmi_rtt_get_phys()
---
 arch/arm64/include/asm/rmi_cmds.h | 661 ++++++++++++++++++++++++++++++
 1 file changed, 661 insertions(+)
 create mode 100644 arch/arm64/include/asm/rmi_cmds.h

diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h
new file mode 100644
index 000000000000..04f7066894e9
--- /dev/null
+++ b/arch/arm64/include/asm/rmi_cmds.h
@@ -0,0 +1,661 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023 ARM Ltd.
+ */
+
+#ifndef __ASM_RMI_CMDS_H
+#define __ASM_RMI_CMDS_H
+
+#include <linux/arm-smccc.h>
+
+#include <asm/rmi_smc.h>
+
+struct rtt_entry {
+	unsigned long walk_level;
+	unsigned long desc;
+	int state;
+	int ripas;
+};
+
+#define RMI_MAX_ADDR_LIST	256
+
+struct rmi_sro_state {
+	struct arm_smccc_1_2_regs regs;
+	unsigned long addr_count;
+	unsigned long addr_list[RMI_MAX_ADDR_LIST];
+};
+
+#define rmi_smccc(...) do {						\
+	arm_smccc_1_1_invoke(__VA_ARGS__);				\
+} while (RMI_RETURN_STATUS(res.a0) == RMI_BUSY ||			\
+	 RMI_RETURN_STATUS(res.a0) == RMI_BLOCKED)
+
+unsigned long rmi_sro_execute(struct rmi_sro_state *sro, gfp_t gfp);
+void rmi_sro_free(struct rmi_sro_state *sro);
+
+/**
+ * rmi_rmm_config_set() - Configure the RMM
+ * @cfg_ptr: PA of a struct rmm_config
+ *
+ * Sets configuration options on the RMM.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_rmm_config_set(unsigned long cfg_ptr)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_RMM_CONFIG_SET, cfg_ptr, &res);
+
+	return res.a0;
+}
+
+/**
+ * rmi_rmm_activate() - Activate the RMM
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_rmm_activate(void)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_RMM_ACTIVATE, &res);
+
+	return res.a0;
+}
+
+/**
+ * rmi_granule_tracking_get() - Get configuration of a Granule tracking region
+ * @start: Base PA of the tracking region
+ * @end: End of the PA region
+ * @out_category: Memory category
+ * @out_state: Tracking region state
+ * @out_top: Top of the memory region
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_granule_tracking_get(unsigned long start,
+					   unsigned long end,
+					   unsigned long *out_category,
+					   unsigned long *out_state,
+					   unsigned long *out_top)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_GRANULE_TRACKING_GET, start, end, &res);
+
+	if (out_category)
+		*out_category = res.a1;
+	if (out_state)
+		*out_state = res.a2;
+	if (out_top)
+		*out_top = res.a3;
+
+	return res.a0;
+}
+
+/**
+ * rmi_gpt_l1_create() - Create a Level 1 GPT
+ * @addr: Base of physical address region described by the L1GPT
+ *
+ * RMI_INCOMPLETE (SRO, not handled yet) triggers a warning and is returned.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_gpt_l1_create(unsigned long addr)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_GPT_L1_CREATE, addr, &res);
+
+	/* FIXME: handle SRO instead of just warning */
+	WARN_ON(RMI_RETURN_STATUS(res.a0) == RMI_INCOMPLETE);
+
+	return res.a0;
+}
+
+/**
+ * rmi_rtt_data_map_init() - Create a protected mapping with data contents
+ * @rd: PA of the RD
+ * @data: PA of the target granule
+ * @ipa: IPA at which the granule will be mapped in the guest
+ * @src: PA of the source granule
+ * @flags: RMI_MEASURE_CONTENT if the contents should be measured
+ *
+ * Create a mapping from Protected IPA space to conventional memory, copying
+ * contents from a Non-secure Granule provided by the caller.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_rtt_data_map_init(unsigned long rd, unsigned long data,
+					unsigned long ipa, unsigned long src,
+					unsigned long flags)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_RTT_DATA_MAP_INIT, rd, data, ipa, src,
+			     flags, &res);
+
+	return res.a0;
+}
+
+/**
+ * rmi_rtt_data_map() - Create mappings in protected IPA with unknown contents
+ * @rd: PA of the RD
+ * @base: Base of the target IPA range
+ * @top: Top of the target IPA range
+ * @flags: Flags
+ * @oaddr: Output address set descriptor
+ * @out_top: Top address of range which was processed.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_rtt_data_map(unsigned long rd,
+				   unsigned long base,
+				   unsigned long top,
+				   unsigned long flags,
+				   unsigned long oaddr,
+				   unsigned long *out_top)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_RTT_DATA_MAP, rd, base, top, flags, oaddr,
+			     &res);
+
+	if (WARN_ON(RMI_RETURN_STATUS(res.a0) == RMI_INCOMPLETE)) {
+		/* FIXME: handle SRO; @out_top is left untouched here */
+		return res.a0;
+	}
+
+	if (out_top)
+		*out_top = res.a1;
+
+	return res.a0;
+}
+
+/**
+ * rmi_rtt_data_unmap() - Remove mappings to conventional memory
+ * @rd: PA of the RD for the target Realm
+ * @base: Base of the target IPA range
+ * @top: Top of the target IPA range
+ * @flags: Flags
+ * @oaddr: Output address set descriptor
+ * @out_top: Returns top IPA of range which has been unmapped
+ * @out_range: Output address range
+ * @out_count: Number of entries in output address list
+ *
+ * Removes mappings to conventional memory within a target Protected IPA range.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_rtt_data_unmap(unsigned long rd,
+				     unsigned long base,
+				     unsigned long top,
+				     unsigned long flags,
+				     unsigned long oaddr,
+				     unsigned long *out_top,
+				     unsigned long *out_range,
+				     unsigned long *out_count)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_RTT_DATA_UNMAP, rd, base, top, flags,
+			     oaddr, &res);
+
+	/* FIXME: Handle SRO */
+
+	if (out_top)
+		*out_top = res.a1;
+	if (out_range)
+		*out_range = res.a2;
+	if (out_count)
+		*out_count = res.a3;
+
+	return res.a0;
+}
+
+/**
+ * rmi_features() - Read feature register
+ * @index: Feature register index
+ * @out: Feature register value is written to this pointer
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_features(unsigned long index, unsigned long *out)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_FEATURES, index, &res);
+
+	if (out)
+		*out = res.a1;
+	return res.a0;
+}
+
+/**
+ * rmi_granule_range_delegate() - Delegate granules
+ * @base: PA of the first granule of the range
+ * @top: PA of the first granule after the range
+ * @out_top: PA of the first granule not delegated
+ *
+ * Delegate a range of granule for use by the realm world. If the entire range
+ * was delegated then @out_top == @top, otherwise the function should be called
+ * again with @base == @out_top.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_granule_range_delegate(unsigned long base,
+					     unsigned long top,
+					     unsigned long *out_top)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_GRANULE_RANGE_DELEGATE, base, top, &res);
+
+	if (WARN_ON(RMI_RETURN_STATUS(res.a0) == RMI_INCOMPLETE)) {
+		/* FIXME: handle SRO; @out_top is left untouched here */
+		return res.a0;
+	}
+
+	if (out_top)
+		*out_top = res.a1;
+
+	return res.a0;
+}
+
+/**
+ * rmi_granule_range_undelegate() - Undelegate a range of granules
+ * @base: Base PA of the target range
+ * @top: Top PA of the target range
+ * @out_top: Returns the top PA of range whose state is undelegated
+ *
+ * Undelegate a range of granules to allow use by the normal world. Will fail if
+ * the granules are in use.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_granule_range_undelegate(unsigned long base,
+					       unsigned long top,
+					       unsigned long *out_top)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_GRANULE_RANGE_UNDELEGATE, base, top, &res);
+
+	if (WARN_ON(RMI_RETURN_STATUS(res.a0) == RMI_INCOMPLETE)) {
+		/* FIXME: handle SRO; @out_top is left untouched here */
+		return res.a0;
+	}
+
+	if (out_top)
+		*out_top = res.a1;
+
+	return res.a0;
+}
+
+/**
+ * rmi_psci_complete() - Complete pending PSCI command
+ * @calling_rec: PA of the calling REC
+ * @target_rec: PA of the target REC
+ * @status: Status of the PSCI request
+ *
+ * Completes a pending PSCI command which was called with an MPIDR argument, by
+ * providing the corresponding REC.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_psci_complete(unsigned long calling_rec,
+				    unsigned long target_rec,
+				    unsigned long status)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_PSCI_COMPLETE, calling_rec, target_rec,
+			     status, &res);
+
+	return res.a0;
+}
+
+/**
+ * rmi_realm_activate() - Activate a realm
+ * @rd: PA of the RD
+ *
+ * Mark a realm as Active signalling that creation is complete and allowing
+ * execution of the realm.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_realm_activate(unsigned long rd)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_REALM_ACTIVATE, rd, &res);
+
+	return res.a0;
+}
+
+/**
+ * rmi_realm_create() - Create a realm
+ * @rd: PA of the RD
+ * @params: PA of realm parameters
+ *
+ * Create a new realm using the given parameters.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_realm_create(unsigned long rd, unsigned long params)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_REALM_CREATE, rd, params, &res);
+
+	return res.a0;
+}
+
+/**
+ * rmi_realm_terminate() - Terminate a realm
+ * @rd: PA of the RD
+ *
+ * Terminates a realm, moving it into a ZOMBIE state
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_realm_terminate(unsigned long rd)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_REALM_TERMINATE, rd, &res);
+
+	return res.a0;
+}
+
+/**
+ * rmi_realm_destroy() - Destroy a realm
+ * @rd: PA of the RD
+ *
+ * Destroys a realm, all objects belonging to the realm must be destroyed first.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_realm_destroy(unsigned long rd)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_REALM_DESTROY, rd, &res);
+
+	return res.a0;
+}
+
+/**
+ * rmi_rec_create() - Create a REC
+ * @rd: PA of the RD
+ * @rec: PA of the target REC
+ * @params: PA of REC parameters
+ * @sro: Allocated SRO context to be used
+ *
+ * Create a REC using the parameters specified in the struct rec_params pointed
+ * to by @params.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_rec_create(unsigned long rd,
+				 unsigned long rec,
+				 unsigned long params,
+				 struct rmi_sro_state *sro)
+{
+	int ret;
+
+	*sro = (struct rmi_sro_state){.regs = {
+		SMC_RMI_REC_CREATE, rd, rec, params
+	}};
+	ret = rmi_sro_execute(sro, GFP_KERNEL);
+	rmi_sro_free(sro);
+
+	return ret;
+}
+
+/**
+ * rmi_rec_destroy() - Destroy a REC
+ * @rec: PA of the target REC
+ * @sro: Allocated SRO context to be used
+ *
+ * Destroys a REC. The REC must not be running.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_rec_destroy(unsigned long rec,
+				  struct rmi_sro_state *sro)
+{
+	int ret;
+
+	*sro = (struct rmi_sro_state){.regs = {
+		SMC_RMI_REC_DESTROY, rec
+	}};
+	ret = rmi_sro_execute(sro, GFP_KERNEL);
+	rmi_sro_free(sro);
+
+	return ret;
+}
+
+/**
+ * rmi_rec_enter() - Enter a REC
+ * @rec: PA of the target REC
+ * @run_ptr: PA of RecRun structure
+ *
+ * Starts (or continues) execution within a REC.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_rec_enter(unsigned long rec, unsigned long run_ptr)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_REC_ENTER, rec, run_ptr, &res);
+
+	return res.a0;
+}
+
+/**
+ * rmi_rtt_create() - Creates an RTT
+ * @rd: PA of the RD
+ * @rtt: PA of the target RTT
+ * @ipa: Base of the IPA range described by the RTT
+ * @level: Depth of the RTT within the tree
+ *
+ * Creates an RTT (Realm Translation Table) at the specified level for the
+ * translation of the specified address within the realm.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_rtt_create(unsigned long rd, unsigned long rtt,
+				 unsigned long ipa, long level)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_RTT_CREATE, rd, rtt, ipa, level, &res);
+
+	return res.a0;
+}
+
+/**
+ * rmi_rtt_destroy() - Destroy an RTT
+ * @rd: PA of the RD
+ * @ipa: Base of the IPA range described by the RTT
+ * @level: Depth of the RTT within the tree
+ * @out_rtt: Pointer to write the PA of the RTT which was destroyed
+ * @out_top: Pointer to write the top IPA of non-live RTT entries
+ *
+ * Destroys an RTT. The RTT must be non-live, i.e. none of the entries in the
+ * table are in ASSIGNED or TABLE state.
+ *
+ * Return: RMI return code.
+ */
+static inline int rmi_rtt_destroy(unsigned long rd,
+				  unsigned long ipa,
+				  long level,
+				  unsigned long *out_rtt,
+				  unsigned long *out_top)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_RTT_DESTROY, rd, ipa, level, &res);
+
+	if (out_rtt)
+		*out_rtt = res.a1;
+	if (out_top)
+		*out_top = res.a2;
+
+	return res.a0;
+}
+
+/**
+ * rmi_rtt_fold() - Fold an RTT
+ * @rd: PA of the RD
+ * @ipa: Base of the IPA range described by the RTT
+ * @level: Depth of the RTT within the tree
+ * @out_rtt: Pointer to write the PA of the RTT which was destroyed
+ *
+ * Folds an RTT. If all entries within the RTT are 'homogeneous' the RTT can
+ * be folded into the parent and the RTT destroyed.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_rtt_fold(unsigned long rd, unsigned long ipa,
+			       long level, unsigned long *out_rtt)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_RTT_FOLD, rd, ipa, level, &res);
+
+	if (out_rtt)
+		*out_rtt = res.a1;
+
+	return res.a0;
+}
+
+/**
+ * rmi_rtt_init_ripas() - Set RIPAS for new realm
+ * @rd: PA of the RD
+ * @base: Base of target IPA region
+ * @top: Top of target IPA region
+ * @out_top: Top IPA of range whose RIPAS was modified
+ *
+ * Sets the RIPAS of a target IPA range to RAM, for a realm in the NEW state.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_rtt_init_ripas(unsigned long rd, unsigned long base,
+				     unsigned long top, unsigned long *out_top)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_RTT_INIT_RIPAS, rd, base, top, &res);
+
+	if (out_top)
+		*out_top = res.a1;
+
+	return res.a0;
+}
+
+/**
+ * rmi_rtt_unprot_map() - Map unprotected granules into a realm
+ * @rd: PA of the RD
+ * @base: Base IPA of the mapping
+ * @top: Top of the target IPA range
+ * @flags: Flags
+ * @oaddr: Output address set descriptor
+ * @out_top: Top IPA of range which has been mapped
+ *
+ * Create mappings to memory within a target unprotected IPA range.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_rtt_unprot_map(unsigned long rd,
+				     unsigned long base,
+				     unsigned long top,
+				     unsigned long flags,
+				     unsigned long oaddr,
+				     unsigned long *out_top)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_RTT_UNPROT_MAP, rd, base, top, flags,
+			     oaddr, &res);
+
+	/* FIXME: Handle SRO */
+
+	if (out_top)
+		*out_top = res.a1;
+
+	return res.a0;
+}
+
+/**
+ * rmi_rtt_set_ripas() - Set RIPAS for a running realm
+ * @rd: PA of the RD
+ * @rec: PA of the REC making the request
+ * @base: Base of target IPA region
+ * @top: Top of target IPA region
+ * @out_top: Pointer to write top IPA of range whose RIPAS was modified
+ *
+ * Completes a request made by the realm to change the RIPAS of a target IPA
+ * range.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_rtt_set_ripas(unsigned long rd, unsigned long rec,
+				    unsigned long base, unsigned long top,
+				    unsigned long *out_top)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_RTT_SET_RIPAS, rd, rec, base, top, &res);
+
+	if (out_top)
+		*out_top = res.a1;
+
+	return res.a0;
+}
+
+/**
+ * rmi_rtt_unprot_unmap() - Remove mappings within an unprotected IPA range
+ * @rd: PA of the RD
+ * @base: Base IPA of the mapping
+ * @top: Top of the target IPA range
+ * @flags: Flags
+ * @oaddr: Output address set descriptor
+ * @out_top: Top IPA which has been unmapped
+ * @out_range: Output address range
+ * @out_count: Number of entries in output address list
+ *
+ * Removes mappings to memory within a target unprotected IPA range.
+ *
+ * Return: RMI return code
+ */
+static inline int rmi_rtt_unprot_unmap(unsigned long rd,
+				       unsigned long base,
+				       unsigned long top,
+				       unsigned long flags,
+				       unsigned long oaddr,
+				       unsigned long *out_top,
+				       unsigned long *out_range,
+				       unsigned long *out_count)
+{
+	struct arm_smccc_res res;
+
+	arm_smccc_1_1_invoke(SMC_RMI_RTT_UNPROT_UNMAP, rd, base, top,
+			     flags, oaddr, &res);
+
+	/* FIXME: Handle SRO */
+
+	if (out_top)
+		*out_top = res.a1;
+	if (out_range)
+		*out_range = res.a2;
+	if (out_count)
+		*out_count = res.a3;
+
+	return res.a0;
+}
+
+#endif /* __ASM_RMI_CMDS_H */
-- 
2.43.0


^ permalink raw reply related

* [PATCH v14 04/44] arm64: RMI: Add SMC definitions for calling the RMM
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
  To: kvm, kvmarm
  Cc: Steven Price, Catalin Marinas, Marc Zyngier, Will Deacon,
	James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
	linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
	Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
	Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
	Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>

The RMM (Realm Management Monitor) provides functionality that can be
accessed by SMC calls from the host.

The SMC definitions are based on DEN0137[1] version 2.0-bet1

[1] https://developer.arm.com/documentation/den0137/2-0bet1/

Signed-off-by: Steven Price <steven.price@arm.com>
---
Changes since v13:
 * Updated to RMM spec v2.0-bet1
Changes since v12:
 * Updated to RMM spec v2.0-bet0
Changes since v9:
 * Corrected size of 'ripas_value' in struct rec_exit. The spec states
   this is an 8-bit type with padding afterwards (rather than a u64).
Changes since v8:
 * Added RMI_PERMITTED_GICV3_HCR_BITS to define which bits the RMM
   permits to be modified.
Changes since v6:
 * Renamed REC_ENTER_xxx defines to include 'FLAG' to make it obvious
   these are flag values.
Changes since v5:
 * Sorted the SMC #defines by value.
 * Renamed SMC_RxI_CALL to SMC_RMI_CALL since the macro is only used for
   RMI calls.
 * Renamed REC_GIC_NUM_LRS to REC_MAX_GIC_NUM_LRS since the actual
   number of available list registers could be lower.
 * Provided a define for the reserved fields of FeatureRegister0.
 * Fix inconsistent names for padding fields.
Changes since v4:
 * Update to point to final released RMM spec.
 * Minor rearrangements.
Changes since v3:
 * Update to match RMM spec v1.0-rel0-rc1.
Changes since v2:
 * Fix specification link.
 * Rename rec_entry->rec_enter to match spec.
 * Fix size of pmu_ovf_status to match spec.
---
 arch/arm64/include/asm/rmi_smc.h | 448 +++++++++++++++++++++++++++++++
 1 file changed, 448 insertions(+)
 create mode 100644 arch/arm64/include/asm/rmi_smc.h

diff --git a/arch/arm64/include/asm/rmi_smc.h b/arch/arm64/include/asm/rmi_smc.h
new file mode 100644
index 000000000000..a09b7a631fef
--- /dev/null
+++ b/arch/arm64/include/asm/rmi_smc.h
@@ -0,0 +1,448 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023-2026 ARM Ltd.
+ *
+ * The values and structures in this file are from the Realm Management Monitor
+ * specification (DEN0137) version 2.0-bet1:
+ * https://developer.arm.com/documentation/den0137/2-0bet1/
+ */
+
+#ifndef __ASM_RMI_SMC_H
+#define __ASM_RMI_SMC_H
+
+#include <linux/arm-smccc.h>
+
+#define SMC_RMI_CALL(func)				\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,		\
+			   ARM_SMCCC_SMC_64,		\
+			   ARM_SMCCC_OWNER_STANDARD,	\
+			   (func))
+
+#define SMC_RMI_VERSION				SMC_RMI_CALL(0x0150)
+
+#define SMC_RMI_RTT_DATA_MAP_INIT		SMC_RMI_CALL(0x0153)
+
+#define SMC_RMI_REALM_ACTIVATE			SMC_RMI_CALL(0x0157)
+#define SMC_RMI_REALM_CREATE			SMC_RMI_CALL(0x0158)
+#define SMC_RMI_REALM_DESTROY			SMC_RMI_CALL(0x0159)
+#define SMC_RMI_REC_CREATE			SMC_RMI_CALL(0x015a)
+#define SMC_RMI_REC_DESTROY			SMC_RMI_CALL(0x015b)
+#define SMC_RMI_REC_ENTER			SMC_RMI_CALL(0x015c)
+#define SMC_RMI_RTT_CREATE			SMC_RMI_CALL(0x015d)
+#define SMC_RMI_RTT_DESTROY			SMC_RMI_CALL(0x015e)
+
+#define SMC_RMI_RTT_READ_ENTRY			SMC_RMI_CALL(0x0161)
+
+#define SMC_RMI_RTT_DEV_VALIDATE		SMC_RMI_CALL(0x0163)
+#define SMC_RMI_PSCI_COMPLETE			SMC_RMI_CALL(0x0164)
+#define SMC_RMI_FEATURES			SMC_RMI_CALL(0x0165)
+#define SMC_RMI_RTT_FOLD			SMC_RMI_CALL(0x0166)
+
+#define SMC_RMI_RTT_INIT_RIPAS			SMC_RMI_CALL(0x0168)
+#define SMC_RMI_RTT_SET_RIPAS			SMC_RMI_CALL(0x0169)
+#define SMC_RMI_VSMMU_CREATE			SMC_RMI_CALL(0x016a)
+#define SMC_RMI_VSMMU_DESTROY			SMC_RMI_CALL(0x016b)
+#define SMC_RMI_RMM_CONFIG_SET			SMC_RMI_CALL(0x016e)
+#define SMC_RMI_PSMMU_IRQ_NOTIFY		SMC_RMI_CALL(0x016f)
+
+#define SMC_RMI_PDEV_ABORT			SMC_RMI_CALL(0x0174)
+#define SMC_RMI_PDEV_COMMUNICATE		SMC_RMI_CALL(0x0175)
+#define SMC_RMI_PDEV_CREATE			SMC_RMI_CALL(0x0176)
+#define SMC_RMI_PDEV_DESTROY			SMC_RMI_CALL(0x0177)
+#define SMC_RMI_PDEV_GET_STATE			SMC_RMI_CALL(0x0178)
+
+#define SMC_RMI_PDEV_STREAM_KEY_REFRESH		SMC_RMI_CALL(0x017a)
+#define SMC_RMI_PDEV_SET_PUBKEY			SMC_RMI_CALL(0x017b)
+#define SMC_RMI_PDEV_STOP			SMC_RMI_CALL(0x017c)
+#define SMC_RMI_RTT_AUX_CREATE			SMC_RMI_CALL(0x017d)
+#define SMC_RMI_RTT_AUX_DESTROY			SMC_RMI_CALL(0x017e)
+#define SMC_RMI_RTT_AUX_FOLD			SMC_RMI_CALL(0x017f)
+
+#define SMC_RMI_VDEV_ABORT			SMC_RMI_CALL(0x0185)
+#define SMC_RMI_VDEV_COMMUNICATE		SMC_RMI_CALL(0x0186)
+#define SMC_RMI_VDEV_CREATE			SMC_RMI_CALL(0x0187)
+#define SMC_RMI_VDEV_DESTROY			SMC_RMI_CALL(0x0188)
+#define SMC_RMI_VDEV_GET_STATE			SMC_RMI_CALL(0x0189)
+#define SMC_RMI_VDEV_UNLOCK			SMC_RMI_CALL(0x018a)
+#define SMC_RMI_RTT_SET_S2AP			SMC_RMI_CALL(0x018b)
+#define SMC_RMI_VDEV_COMPLETE			SMC_RMI_CALL(0x018e)
+
+#define SMC_RMI_VDEV_GET_INTERFACE_REPORT	SMC_RMI_CALL(0x01d0)
+#define SMC_RMI_VDEV_GET_MEASUREMENTS		SMC_RMI_CALL(0x01d1)
+#define SMC_RMI_VDEV_LOCK			SMC_RMI_CALL(0x01d2)
+#define SMC_RMI_VDEV_START			SMC_RMI_CALL(0x01d3)
+
+#define SMC_RMI_VSMMU_EVENT_NOTIFY		SMC_RMI_CALL(0x01d6)
+#define SMC_RMI_PSMMU_ACTIVATE			SMC_RMI_CALL(0x01d7)
+#define SMC_RMI_PSMMU_DEACTIVATE		SMC_RMI_CALL(0x01d8)
+
+#define SMC_RMI_PSMMU_ST_L2_CREATE		SMC_RMI_CALL(0x01db)
+#define SMC_RMI_PSMMU_ST_L2_DESTROY		SMC_RMI_CALL(0x01dc)
+#define SMC_RMI_DPT_L0_CREATE			SMC_RMI_CALL(0x01dd)
+#define SMC_RMI_DPT_L0_DESTROY			SMC_RMI_CALL(0x01de)
+#define SMC_RMI_DPT_L1_CREATE			SMC_RMI_CALL(0x01df)
+#define SMC_RMI_DPT_L1_DESTROY			SMC_RMI_CALL(0x01e0)
+#define SMC_RMI_GRANULE_TRACKING_GET		SMC_RMI_CALL(0x01e1)
+
+#define SMC_RMI_GRANULE_TRACKING_SET		SMC_RMI_CALL(0x01e3)
+
+#define SMC_RMI_RMM_CONFIG_GET			SMC_RMI_CALL(0x01ec)
+
+#define SMC_RMI_RMM_STATE_GET			SMC_RMI_CALL(0x01ee)
+
+#define SMC_RMI_PSMMU_EVENT_CONSUME		SMC_RMI_CALL(0x01f0)
+#define SMC_RMI_GRANULE_RANGE_DELEGATE		SMC_RMI_CALL(0x01f1)
+#define SMC_RMI_GRANULE_RANGE_UNDELEGATE	SMC_RMI_CALL(0x01f2)
+#define SMC_RMI_GPT_L1_CREATE			SMC_RMI_CALL(0x01f3)
+#define SMC_RMI_GPT_L1_DESTROY			SMC_RMI_CALL(0x01f4)
+#define SMC_RMI_RTT_DATA_MAP			SMC_RMI_CALL(0x01f5)
+#define SMC_RMI_RTT_DATA_UNMAP			SMC_RMI_CALL(0x01f6)
+#define SMC_RMI_RTT_DEV_MAP			SMC_RMI_CALL(0x01f7)
+#define SMC_RMI_RTT_DEV_UNMAP			SMC_RMI_CALL(0x01f8)
+#define SMC_RMI_RTT_ARCH_DEV_MAP		SMC_RMI_CALL(0x01f9)
+#define SMC_RMI_RTT_ARCH_DEV_UNMAP		SMC_RMI_CALL(0x01fa)
+#define SMC_RMI_RTT_UNPROT_MAP			SMC_RMI_CALL(0x01fb)
+#define SMC_RMI_RTT_UNPROT_UNMAP		SMC_RMI_CALL(0x01fc)
+#define SMC_RMI_RTT_AUX_PROT_MAP		SMC_RMI_CALL(0x01fd)
+#define SMC_RMI_RTT_AUX_PROT_UNMAP		SMC_RMI_CALL(0x01fe)
+#define SMC_RMI_RTT_AUX_UNPROT_MAP		SMC_RMI_CALL(0x01ff)
+#define SMC_RMI_RTT_AUX_UNPROT_UNMAP		SMC_RMI_CALL(0x0200)
+#define SMC_RMI_REALM_TERMINATE			SMC_RMI_CALL(0x0201)
+#define SMC_RMI_RMM_ACTIVATE			SMC_RMI_CALL(0x0202)
+#define SMC_RMI_OP_CONTINUE			SMC_RMI_CALL(0x0203)
+#define SMC_RMI_PDEV_STREAM_CONNECT		SMC_RMI_CALL(0x0204)
+#define SMC_RMI_PDEV_STREAM_DISCONNECT		SMC_RMI_CALL(0x0205)
+#define SMC_RMI_PDEV_STREAM_COMPLETE		SMC_RMI_CALL(0x0206)
+#define SMC_RMI_PDEV_STREAM_KEY_PURGE		SMC_RMI_CALL(0x0207)
+#define SMC_RMI_OP_MEM_DONATE			SMC_RMI_CALL(0x0208)
+#define SMC_RMI_OP_MEM_RECLAIM			SMC_RMI_CALL(0x0209)
+#define SMC_RMI_OP_CANCEL			SMC_RMI_CALL(0x020a)
+#define SMC_RMI_VSMMU_FEATURES			SMC_RMI_CALL(0x020b)
+#define SMC_RMI_VSMMU_CMD_GET			SMC_RMI_CALL(0x020c)
+#define SMC_RMI_VSMMU_CMD_COMPLETE		SMC_RMI_CALL(0x020d)
+#define SMC_RMI_PSMMU_INFO			SMC_RMI_CALL(0x020e)
+
+#define RMI_ABI_MAJOR_VERSION	2
+#define RMI_ABI_MINOR_VERSION	0
+
+#define RMI_ABI_VERSION_GET_MAJOR(version) ((version) >> 16)
+#define RMI_ABI_VERSION_GET_MINOR(version) ((version) & 0xFFFF)
+#define RMI_ABI_VERSION(major, minor)      (((major) << 16) | (minor))
+
+#define RMI_UNASSIGNED			0
+#define RMI_ASSIGNED			1
+#define RMI_TABLE			2
+
+#define RMI_RETURN_STATUS(ret)		((ret) & 0xFF)
+#define RMI_RETURN_INDEX(ret)		(((ret) >> 8) & 0xFF)
+#define RMI_RETURN_MEMREQ(ret)		(((ret) >> 8) & 0x3)
+#define RMI_RETURN_CAN_CANCEL(ret)	(((ret) >> 10) & 0x1)
+
+#define RMI_SUCCESS			0
+#define RMI_ERROR_INPUT			1
+#define RMI_ERROR_REALM			2
+#define RMI_ERROR_REC			3
+#define RMI_ERROR_RTT			4
+#define RMI_ERROR_NOT_SUPPORTED		5
+#define RMI_ERROR_DEVICE		6
+#define RMI_ERROR_RTT_AUX		7
+#define RMI_ERROR_PSMMU_ST		8
+#define RMI_ERROR_DPT			9
+#define RMI_BUSY			10
+#define RMI_ERROR_GLOBAL		11
+#define RMI_ERROR_TRACKING		12
+#define RMI_INCOMPLETE			13
+#define RMI_BLOCKED			14
+#define RMI_ERROR_GPT			15
+#define RMI_ERROR_GRANULE		16
+
+#define RMI_OP_MEM_REQ_NONE		0
+#define RMI_OP_MEM_REQ_DONATE		1
+#define RMI_OP_MEM_REQ_RECLAIM		2
+
+#define RMI_DONATE_SIZE(req)		((req) & 0x3)
+#define RMI_DONATE_COUNT_MASK		GENMASK(15, 2)
+#define RMI_DONATE_COUNT(req)		(((req) & RMI_DONATE_COUNT_MASK) >> 2)
+#define RMI_DONATE_CONTIG(req)		(!!((req) & BIT(16)))
+#define RMI_DONATE_STATE(req)		(!!((req) & BIT(17)))
+
+#define RMI_OP_MEM_DELEGATED		0
+#define RMI_OP_MEM_UNDELEGATED		1
+
+#define RMI_ADDR_TYPE_NONE		0
+#define RMI_ADDR_TYPE_SINGLE		1
+#define RMI_ADDR_TYPE_LIST		2
+
+#define RMI_ADDR_RANGE_SIZE_MASK	GENMASK(1, 0)
+#define RMI_ADDR_RANGE_COUNT_MASK	GENMASK(PAGE_SHIFT - 1, 2)
+#define RMI_ADDR_RANGE_ADDR_MASK	(PAGE_MASK & GENMASK(51, 0))
+#define RMI_ADDR_RANGE_STATE_MASK	BIT(63)
+
+#define RMI_ADDR_RANGE_SIZE(ar)		(FIELD_GET(RMI_ADDR_RANGE_SIZE_MASK, \
+						   (ar)))
+#define RMI_ADDR_RANGE_COUNT(ar)	(FIELD_GET(RMI_ADDR_RANGE_COUNT_MASK, \
+						   (ar)))
+#define RMI_ADDR_RANGE_ADDR(ar)		((ar) & RMI_ADDR_RANGE_ADDR_MASK)
+#define RMI_ADDR_RANGE_STATE(ar)	(FIELD_GET(RMI_ADDR_RANGE_STATE_MASK, \
+						   (ar)))
+
+enum rmi_ripas {
+	RMI_EMPTY = 0,
+	RMI_RAM = 1,
+	RMI_DESTROYED = 2,
+	RMI_DEV = 3,
+};
+
+#define RMI_NO_MEASURE_CONTENT	0
+#define RMI_MEASURE_CONTENT	1
+
+#define RMI_FEATURE_REGISTER_0_S2SZ		GENMASK(7, 0)
+#define RMI_FEATURE_REGISTER_0_LPA2		BIT(8)
+#define RMI_FEATURE_REGISTER_0_SVE		BIT(9)
+#define RMI_FEATURE_REGISTER_0_SVE_VL		GENMASK(13, 10)
+#define RMI_FEATURE_REGISTER_0_NUM_BPS		GENMASK(19, 14)
+#define RMI_FEATURE_REGISTER_0_NUM_WPS		GENMASK(25, 20)
+#define RMI_FEATURE_REGISTER_0_PMU		BIT(26)
+#define RMI_FEATURE_REGISTER_0_PMU_NUM_CTRS	GENMASK(31, 27)
+
+#define RMI_FEATURE_REGISTER_1_RMI_GRAN_SZ_4KB	BIT(0)
+#define RMI_FEATURE_REGISTER_1_RMI_GRAN_SZ_16KB	BIT(1)
+#define RMI_FEATURE_REGISTER_1_RMI_GRAN_SZ_64KB	BIT(2)
+#define RMI_FEATURE_REGISTER_1_HASH_SHA_256	BIT(3)
+#define RMI_FEATURE_REGISTER_1_HASH_SHA_384	BIT(4)
+#define RMI_FEATURE_REGISTER_1_HASH_SHA_512	BIT(5)
+#define RMI_FEATURE_REGISTER_1_MAX_RECS_ORDER	GENMASK(9, 6)
+#define RMI_FEATURE_REGISTER_1_L0GPTSZ		GENMASK(13, 10)
+#define RMI_FEATURE_REGISTER_1_PPS		GENMASK(16, 14)
+
+#define RMI_FEATURE_REGISTER_2_DA		BIT(0)
+#define RMI_FEATURE_REGISTER_2_DA_COH		BIT(1)
+#define RMI_FEATURE_REGISTER_2_VSMMU		BIT(2)
+#define RMI_FEATURE_REGISTER_2_ATS		BIT(3)
+#define RMI_FEATURE_REGISTER_2_MAX_VDEVS_ORDER	GENMASK(7, 4)
+#define RMI_FEATURE_REGISTER_2_VDEV_KROU	BIT(8)
+#define RMI_FEATURE_REGISTER_2_NON_TEE_STREAM	BIT(9)
+
+#define RMI_FEATURE_REGISTER_3_MAX_NUM_AUX_PLANES	GENMASK(3, 0)
+#define RMI_FEATURE_REGISTER_3_RTT_PLAN			GENMASK(5, 4)
+#define RMI_FEATURE_REGISTER_3_RTT_S2AP_INDIRECT	BIT(6)
+
+#define RMI_FEATURE_REGISTER_4_MEC_COUNT		GENMASK(63, 0)
+
+#define RMI_MEM_CATEGORY_CONVENTIONAL		0
+#define RMI_MEM_CATEGORY_DEV_NCOH		1
+#define RMI_MEM_CATEGORY_DEV_COH		2
+
+#define RMI_TRACKING_RESERVED			0
+#define RMI_TRACKING_NONE			1
+#define RMI_TRACKING_FINE			2
+#define RMI_TRACKING_COARSE			3
+
+#define RMI_GRANULE_SIZE_4KB	0
+#define RMI_GRANULE_SIZE_16KB	1
+#define RMI_GRANULE_SIZE_64KB	2
+
+/*
+ * Note many of these fields are smaller than u64 but all fields have u64
+ * alignment, so use u64 to ensure correct alignment.
+ */
+struct rmm_config {
+	union { /* 0x0 */
+		struct {
+			u64 tracking_region_size;
+			u64 rmi_granule_size;
+		};
+		u8 sizer[0x1000];
+	};
+};
+
+#define RMI_REALM_PARAM_FLAG_LPA2		BIT(0)
+#define RMI_REALM_PARAM_FLAG_SVE		BIT(1)
+#define RMI_REALM_PARAM_FLAG_PMU		BIT(2)
+
+struct realm_params {
+	union { /* 0x0 */
+		struct {
+			u64 flags;
+			u64 s2sz;
+			u64 sve_vl;
+			u64 num_bps;
+			u64 num_wps;
+			u64 pmu_num_ctrs;
+			u64 hash_algo;
+			u64 num_aux_planes;
+		};
+		u8 padding0[0x400];
+	};
+	union { /* 0x400 */
+		struct {
+			u8 rpv[64];
+			u64 ats_plane;
+		};
+		u8 padding1[0x400];
+	};
+	union { /* 0x800 */
+		struct {
+			u64 padding;
+			u64 rtt_base;
+			s64 rtt_level_start;
+			u64 rtt_num_start;
+			u64 flags1;
+			u64 aux_rtt_base[3];
+		};
+		u8 padding2[0x800];
+	};
+};
+
+/*
+ * The number of GPRs (starting from X0) that are
+ * configured by the host when a REC is created.
+ */
+#define REC_CREATE_NR_GPRS		8
+
+#define REC_PARAMS_FLAG_RUNNABLE	BIT_ULL(0)
+
+struct rec_params {
+	union { /* 0x0 */
+		u64 flags;
+		u8 padding0[0x100];
+	};
+	union { /* 0x100 */
+		u64 mpidr;
+		u8 padding1[0x100];
+	};
+	union { /* 0x200 */
+		u64 pc;
+		u8 padding2[0x100];
+	};
+	union { /* 0x300 */
+		u64 gprs[REC_CREATE_NR_GPRS];
+		u8 padding3[0xd00];
+	};
+};
+
+#define REC_ENTER_FLAG_EMULATED_MMIO	BIT(0)
+#define REC_ENTER_FLAG_INJECT_SEA	BIT(1)
+#define REC_ENTER_FLAG_TRAP_WFI		BIT(2)
+#define REC_ENTER_FLAG_TRAP_WFE		BIT(3)
+#define REC_ENTER_FLAG_RIPAS_RESPONSE	BIT(4)
+#define REC_ENTER_FLAG_S2AP_RESPONSE	BIT(5)
+#define REC_ENTER_FLAG_DEV_MEM_RESPONSE	BIT(6)
+#define REC_ENTER_FLAG_FORCE_P0		BIT(7)
+
+#define REC_RUN_GPRS			31
+#define REC_MAX_GIC_NUM_LRS		16
+
+#define RMI_PERMITTED_GICV3_HCR_BITS	(ICH_HCR_EL2_UIE |		\
+					 ICH_HCR_EL2_LRENPIE |		\
+					 ICH_HCR_EL2_NPIE |		\
+					 ICH_HCR_EL2_VGrp0EIE |		\
+					 ICH_HCR_EL2_VGrp0DIE |		\
+					 ICH_HCR_EL2_VGrp1EIE |		\
+					 ICH_HCR_EL2_VGrp1DIE |		\
+					 ICH_HCR_EL2_TDIR)
+
+struct rec_enter {
+	union { /* 0x000 */
+		u64 flags;
+		u8 padding0[0x200];
+	};
+	union { /* 0x200 */
+		u64 gprs[REC_RUN_GPRS];
+		u8 padding1[0x100];
+	};
+	u8 padding3[0x500];
+};
+
+#define RMI_EXIT_SYNC			0x00
+#define RMI_EXIT_IRQ			0x01
+#define RMI_EXIT_FIQ			0x02
+#define RMI_EXIT_PSCI			0x03
+#define RMI_EXIT_RIPAS_CHANGE		0x04
+#define RMI_EXIT_HOST_CALL		0x05
+#define RMI_EXIT_SERROR			0x06
+#define RMI_EXIT_S2AP_CHANGE		0x07
+#define RMI_EXIT_VDEV_REQUEST		0x08
+#define RMI_EXIT_VDEV_VALIDATE_MAPPING	0x09
+#define RMI_EXIT_VSMMU_COMMAND		0x0a
+
+struct rec_exit {
+	union { /* 0x000 */
+		u8 exit_reason;
+		u8 padding0[0x100];
+	};
+	union { /* 0x100 */
+		struct {
+			u64 esr;
+			u64 far;
+			u64 hpfar;
+			u64 rtt_tree;
+		};
+		u8 padding1[0x100];
+	};
+	union { /* 0x200 */
+		u64 gprs[REC_RUN_GPRS];
+		u8 padding2[0x100];
+	};
+	union { /* 0x300 */
+		u8 padding3[0x100];
+	};
+	union { /* 0x400 */
+		struct {
+			u64 cntp_ctl;
+			u64 cntp_cval;
+			u64 cntv_ctl;
+			u64 cntv_cval;
+		};
+		u8 padding4[0x100];
+	};
+	union { /* 0x500 */
+		struct {
+			u64 ripas_base;
+			u64 ripas_top;
+			u8 ripas_value;
+			u8 padding8[15];
+			u64 s2ap_base;
+			u64 s2ap_top;
+			u64 vdev_id_1;
+			u64 vdev_id_2;
+			u64 dev_mem_base;
+			u64 dev_mem_top;
+			u64 dev_mem_pa;
+		};
+		u8 padding5[0x100];
+	};
+	union { /* 0x600 */
+		struct {
+			u16 imm;
+			u16 padding9;
+			u64 plane;
+		};
+		u8 padding6[0x100];
+	};
+	union { /* 0x700 */
+		struct {
+			u8 pmu_ovf_status;
+			u8 padding10[15];
+			u64 vsmmu;
+		};
+		u8 padding7[0x100];
+	};
+};
+
+struct rec_run {
+	struct rec_enter enter;
+	struct rec_exit exit;
+};
+
+/* RMI_RTT_UNPROT_MAP_FLAGS definitions */
+#define RMI_RTT_UNPROT_MAP_FLAGS_OADDR_TYPE	GENMASK(1, 0)
+#define RMI_RTT_UNPROT_MAP_FLAGS_LIST_COUNT	GENMASK(15, 2)
+#define RMI_RTT_UNPROT_MAP_FLAGS_MEMATTR	GENMASK(18, 16)
+#define RMI_RTT_UNPROT_MAP_FLAGS_S2AP		GENMASK(22, 19)
+
+/* S2AP Direct Encodings, used in RMI_RTT_UNPROT_MAP_FLAGS_S2AP */
+#define RMI_S2AP_DIRECT_WRITE			BIT(0)
+#define RMI_S2AP_DIRECT_READ			BIT(1)
+
+#endif /* __ASM_RMI_SMC_H */
-- 
2.43.0


^ permalink raw reply related

* [PATCH v14 03/44] arm64: RME: Handle Granule Protection Faults (GPFs)
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
  To: kvm, kvmarm
  Cc: Steven Price, Catalin Marinas, Marc Zyngier, Will Deacon,
	James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
	linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
	Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
	Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
	Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>

If the host attempts to access granules that have been delegated for use
in a realm these accesses will be caught and will trigger a Granule
Protection Fault (GPF).

A fault during a page walk signals a bug in the kernel and is handled by
oopsing the kernel. A non-page walk fault could be caused by user space
having access to a page which has been delegated to the kernel and will
trigger a SIGBUS to allow debugging why user space is trying to access a
delegated page.

Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Signed-off-by: Steven Price <steven.price@arm.com>
---
Changes since v10:
 * Don't call arm64_notify_die() in do_gpf() but simply return 1.
Changes since v2:
 * Include missing "Granule Protection Fault at level -1"
---
 arch/arm64/mm/fault.c | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 0f3c5c7ca054..6358ea4787ba 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -905,6 +905,22 @@ static int do_tag_check_fault(unsigned long far, unsigned long esr,
 	return 0;
 }
 
+static int do_gpf_ptw(unsigned long far, unsigned long esr, struct pt_regs *regs)
+{
+	const struct fault_info *inf = esr_to_fault_info(esr);
+
+	die_kernel_fault(inf->name, far, esr, regs);
+	return 0;
+}
+
+static int do_gpf(unsigned long far, unsigned long esr, struct pt_regs *regs)
+{
+	if (!is_el1_instruction_abort(esr) && fixup_exception(regs, esr))
+		return 0;
+
+	return 1;
+}
+
 static const struct fault_info fault_info[] = {
 	{ do_bad,		SIGKILL, SI_KERNEL,	"ttbr address size fault"	},
 	{ do_bad,		SIGKILL, SI_KERNEL,	"level 1 address size fault"	},
@@ -941,12 +957,12 @@ static const struct fault_info fault_info[] = {
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 32"			},
 	{ do_alignment_fault,	SIGBUS,  BUS_ADRALN,	"alignment fault"		},
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 34"			},
-	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 35"			},
-	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 36"			},
-	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 37"			},
-	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 38"			},
-	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 39"			},
-	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 40"			},
+	{ do_gpf_ptw,		SIGKILL, SI_KERNEL,	"Granule Protection Fault at level -1" },
+	{ do_gpf_ptw,		SIGKILL, SI_KERNEL,	"Granule Protection Fault at level 0" },
+	{ do_gpf_ptw,		SIGKILL, SI_KERNEL,	"Granule Protection Fault at level 1" },
+	{ do_gpf_ptw,		SIGKILL, SI_KERNEL,	"Granule Protection Fault at level 2" },
+	{ do_gpf_ptw,		SIGKILL, SI_KERNEL,	"Granule Protection Fault at level 3" },
+	{ do_gpf,		SIGBUS,  SI_KERNEL,	"Granule Protection Fault not on table walk" },
 	{ do_bad,		SIGKILL, SI_KERNEL,	"level -1 address size fault"	},
 	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 42"			},
 	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level -1 translation fault"	},
-- 
2.43.0


^ permalink raw reply related

* [PATCH v14 02/44] kvm: arm64: Avoid including linux/kvm_host.h in kvm_pgtable.h
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
  To: kvm, kvmarm
  Cc: Steven Price, Catalin Marinas, Marc Zyngier, Will Deacon,
	James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
	linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
	Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
	Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
	Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>

To avoid future include cycles, drop the linux/kvm_host.h include in
kvm_pgtable.h and include two _types.h headers for the types that are
actually used. Additionally provide a forward declaration for struct
kvm_s2_mmu as it's only used as a pointer in this file.

Both pgtable.c and kvm_pkvm.h relied on the indirect inclusion of
kvm_host.h, so make that explicit.

Signed-off-by: Steven Price <steven.price@arm.com>
---
New patch in v13
---
 arch/arm64/include/asm/kvm_pgtable.h | 5 ++++-
 arch/arm64/include/asm/kvm_pkvm.h    | 2 +-
 arch/arm64/kvm/hyp/pgtable.c         | 1 +
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 41a8687938eb..e4770ce2ccf6 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -8,9 +8,12 @@
 #define __ARM64_KVM_PGTABLE_H__
 
 #include <linux/bits.h>
-#include <linux/kvm_host.h>
+#include <linux/kvm_types.h>
+#include <linux/rbtree_types.h>
 #include <linux/types.h>
 
+struct kvm_s2_mmu;
+
 #define KVM_PGTABLE_FIRST_LEVEL		-1
 #define KVM_PGTABLE_LAST_LEVEL		3
 
diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h
index 2954b311128c..1bc6a6a34ec9 100644
--- a/arch/arm64/include/asm/kvm_pkvm.h
+++ b/arch/arm64/include/asm/kvm_pkvm.h
@@ -9,7 +9,7 @@
 #include <linux/arm_ffa.h>
 #include <linux/memblock.h>
 #include <linux/scatterlist.h>
-#include <asm/kvm_host.h>
+#include <linux/kvm_host.h>
 #include <asm/kvm_pgtable.h>
 
 /* Maximum number of VMs that can co-exist under pKVM. */
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 0c1defa5fb0f..0bcd6f06aafb 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/bitfield.h>
+#include <linux/kvm_host.h>
 #include <asm/kvm_pgtable.h>
 #include <asm/stage2_pgtable.h>
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH v14 01/44] kvm: arm64: Include kvm_emulate.h in kvm/arm_psci.h
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
  To: kvm, kvmarm
  Cc: Suzuki K Poulose, Catalin Marinas, Marc Zyngier, Will Deacon,
	James Morse, Oliver Upton, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Gavin Shan,
	Shanker Donthineni, Alper Gun, Aneesh Kumar K . V, Emi Kisanuki,
	Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2, Steven Price
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>

From: Suzuki K Poulose <suzuki.poulose@arm.com>

Fix a potential build error (like below, when asm/kvm_emulate.h gets
included after the kvm/arm_psci.h) by including the missing header file
in kvm/arm_psci.h:

./include/kvm/arm_psci.h: In function ‘kvm_psci_version’:
./include/kvm/arm_psci.h:29:13: error: implicit declaration of function
   ‘vcpu_has_feature’; did you mean ‘cpu_have_feature’? [-Werror=implicit-function-declaration]
   29 |         if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PSCI_0_2)) {
      |             ^~~~~~~~~~~~~~~~
      |             cpu_have_feature

Reviewed-by: Gavin Shan <gshan@redhat.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Steven Price <steven.price@arm.com>
---
 include/kvm/arm_psci.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/kvm/arm_psci.h b/include/kvm/arm_psci.h
index cbaec804eb83..38dab7add79b 100644
--- a/include/kvm/arm_psci.h
+++ b/include/kvm/arm_psci.h
@@ -10,6 +10,8 @@
 #include <linux/kvm_host.h>
 #include <uapi/linux/psci.h>
 
+#include <asm/kvm_emulate.h>
+
 #define KVM_ARM_PSCI_0_1	PSCI_VERSION(0, 1)
 #define KVM_ARM_PSCI_0_2	PSCI_VERSION(0, 2)
 #define KVM_ARM_PSCI_1_0	PSCI_VERSION(1, 0)
-- 
2.43.0


^ permalink raw reply related

* [PATCH v14 00/44] arm64: Support for Arm CCA in KVM
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
  To: kvm, kvmarm
  Cc: Steven Price, Catalin Marinas, Marc Zyngier, Will Deacon,
	James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
	linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
	Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
	Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
	Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2

This series adds support for running protected VMs using KVM under the
Arm Confidential Compute Architecture (CCA).

This is rebased on v7.1-rc1, but still targets RMM v2.0-bet1[1].

The major updates from v13 remain but have been more fully implemented:
the RMM uses the host's page size, range based RMI APIs mean we don't
have to break everything down to base page sizes, the GIC state is
passed via system registers, and the uAPI has been simplified.

The main changes since v13 are:

 * The RMI definitions and wrappers have been fully updated for RMM
   v2.0-bet1. In particular the temporary RMM v1.0 SMC compatibility
   patch has been dropped.

 * The PSCI completion ioctl has been removed. RMM v2.0-bet1 still
   requires the host to provide the target REC for PSCI calls which
   name another vCPU, but KVM now performs the RMI PSCI completion
   automatically before entering the REC again. Userspace no longer
   needs to issue KVM_ARM_VCPU_RMI_PSCI_COMPLETE. A future spec should
   remove the need for the host to provide the MPIDR mapping.

 * The generic RMI init, RMM configuration, GPT setup,
   delegate/undelegate helpers and SRO infrastructure have moved out of
   KVM into arch/arm64/kernel/rmi.c. RMI is expected to be used by
   features outside KVM, so this code should be available even when KVM
   is not built.

 * RMI_GRANULE_TRACKING_GET has been updated to work on a range. This
   allows it to work when the region is not aligned to the tracking
   size, which solves the problem reported by Mathieu[2].

 * SRO support has been moved earlier in the series and improved. It
   provides a cleaner way for the host to provide the RMM with the extra
   memory it requires. However, support is still incomplete for the
   paths which TF-RMM does not yet implement; these are marked by
   FIXMEs in the code.

 * The ARM VM type encoding has been reworked to coexist with the
   upstream pKVM KVM_VM_TYPE_ARM_PROTECTED bit.

 * The private-memory documentation now notes that arm64 uses
   KVM_CAP_MEMORY_ATTRIBUTES.

 * PMU support is dropped for now. It will be added later in a separate
   series. Similarly for selecting the hash algorithm and RPV.

There are also the usual rebase updates and smaller fixes, including
changes to the RMM v2.0-bet1 range APIs, removal of REC auxiliary
granule handling, fixes to the address range descriptor encoding, and
cleanups around realm stage-2 teardown.

Stateful RMI Operations
-----------------------

The RMM v2.0 spec introduces Stateful RMI Operations (SROs), which allow
the RMM to complete an operation over several SMC calls while requesting
or returning memory to the host. This allows interrupts to be handled in
the middle of an operation and lets the RMM dynamically allocate memory
for internal tracking purposes. For example, RMI_REC_CREATE no longer
needs auxiliary granules to be provided up front, and can instead
request memory during the operation.

This series includes the generic SRO infrastructure in
arch/arm64/kernel/rmi.c and uses it for REC create/destroy. The other
cases are not yet used by TF-RMM and a future revision will be needed to
finish those paths in Linux.

This series is based on v7.1-rc1. It is also available as a git
repository:

https://gitlab.arm.com/linux-arm/linux-cca cca-host/v14

Work in progress changes for kvmtool are available from the git
repository below:

https://gitlab.arm.com/linux-arm/kvmtool-cca cca/v12

The TF-RMM has not yet merged the RMM v2.0 support, so you will need to
use a branch with RMM v2.0-bet1 support. At the time of writing the
following branch is being used:

https://git.trustedfirmware.org/TF-RMM/tf-rmm.git topics/rmm-v2.0-poc_2
(tested on commit 3340667a291a)

A kvm-unit-tests branch which has been updated to support the
attestation used in RMM v2.0 is available here:

https://gitlab.arm.com/linux-arm/kvm-unit-tests-cca cca/v4

[1] https://developer.arm.com/documentation/den0137/2-0bet1/
[2] https://lore.kernel.org/all/acrj-cKphy4hJsEG@p14s/

Jean-Philippe Brucker (6):
  arm64: RMI: Propagate number of breakpoints and watchpoints to
    userspace
  arm64: RMI: Set breakpoint parameters through SET_ONE_REG
  arm64: RMI: Propagate max SVE vector length from RMM
  arm64: RMI: Configure max SVE vector length for a Realm
  arm64: RMI: Provide register list for unfinalized RMI RECs
  arm64: RMI: Provide accurate register list

Joey Gouly (2):
  arm64: RMI: allow userspace to inject aborts
  arm64: RMI: support RSI_HOST_CALL

Steven Price (33):
  kvm: arm64: Avoid including linux/kvm_host.h in kvm_pgtable.h
  arm64: RME: Handle Granule Protection Faults (GPFs)
  arm64: RMI: Add SMC definitions for calling the RMM
  arm64: RMI: Add wrappers for RMI calls
  arm64: RMI: Check for RMI support at init
  arm64: RMI: Configure the RMM with the host's page size
  arm64: RMI: Ensure that the RMM has GPT entries for memory
  arm64: RMI: Provide functions to delegate/undelegate ranges of memory
  arm64: RMI: Add support for SRO
  arm64: RMI: Check for RMI support at KVM init
  arm64: RMI: Check for LPA2 support
  arm64: RMI: Define the user ABI
  arm64: RMI: Basic infrastructure for creating a realm.
  KVM: arm64: Allow passing machine type in KVM creation
  arm64: RMI: RTT tear down
  arm64: RMI: Activate realm on first VCPU run
  arm64: RMI: Allocate/free RECs to match vCPUs
  arm64: RMI: Support for the VGIC in realms
  KVM: arm64: Support timers in realm RECs
  arm64: RMI: Handle realm enter/exit
  arm64: RMI: Handle RMI_EXIT_RIPAS_CHANGE
  KVM: arm64: Handle realm MMIO emulation
  KVM: arm64: Expose support for private memory
  arm64: RMI: Allow populating initial contents
  arm64: RMI: Set RIPAS of initial memslots
  arm64: RMI: Create the realm descriptor
  arm64: RMI: Runtime faulting of memory
  KVM: arm64: Handle realm VCPU load
  KVM: arm64: Validate register access for a Realm VM
  KVM: arm64: Handle Realm PSCI requests
  KVM: arm64: WARN on injected undef exceptions
  arm64: RMI: Prevent Device mappings for Realms
  arm64: RMI: Enable realms to be created

Suzuki K Poulose (3):
  kvm: arm64: Include kvm_emulate.h in kvm/arm_psci.h
  kvm: arm64: Don't expose unsupported capabilities for realm guests
  arm64: RMI: Allow checking SVE on VM instance

 Documentation/virt/kvm/api.rst       |   62 +-
 arch/arm64/include/asm/kvm_emulate.h |   37 +
 arch/arm64/include/asm/kvm_host.h    |   13 +-
 arch/arm64/include/asm/kvm_pgtable.h |    5 +-
 arch/arm64/include/asm/kvm_pkvm.h    |    2 +-
 arch/arm64/include/asm/kvm_rmi.h     |  127 +++
 arch/arm64/include/asm/rmi_cmds.h    |  680 +++++++++++++
 arch/arm64/include/asm/rmi_smc.h     |  448 ++++++++
 arch/arm64/include/asm/virt.h        |    1 +
 arch/arm64/kernel/Makefile           |    2 +-
 arch/arm64/kernel/cpufeature.c       |    1 +
 arch/arm64/kernel/rmi.c              |  605 +++++++++++
 arch/arm64/kvm/Kconfig               |    2 +
 arch/arm64/kvm/Makefile              |    2 +-
 arch/arm64/kvm/arch_timer.c          |   28 +-
 arch/arm64/kvm/arm.c                 |  140 ++-
 arch/arm64/kvm/guest.c               |   93 +-
 arch/arm64/kvm/hyp/pgtable.c         |    1 +
 arch/arm64/kvm/hypercalls.c          |    4 +-
 arch/arm64/kvm/inject_fault.c        |    5 +-
 arch/arm64/kvm/mmio.c                |   16 +-
 arch/arm64/kvm/mmu.c                 |  197 +++-
 arch/arm64/kvm/psci.c                |   15 +-
 arch/arm64/kvm/reset.c               |   13 +-
 arch/arm64/kvm/rmi-exit.c            |  215 ++++
 arch/arm64/kvm/rmi.c                 | 1401 ++++++++++++++++++++++++++
 arch/arm64/kvm/sys_regs.c            |   47 +-
 arch/arm64/kvm/vgic/vgic-init.c      |    2 +-
 arch/arm64/mm/fault.c                |   28 +-
 include/kvm/arm_arch_timer.h         |    2 +
 include/kvm/arm_psci.h               |    2 +
 include/uapi/linux/kvm.h             |   20 +-
 32 files changed, 4122 insertions(+), 94 deletions(-)
 create mode 100644 arch/arm64/include/asm/kvm_rmi.h
 create mode 100644 arch/arm64/include/asm/rmi_cmds.h
 create mode 100644 arch/arm64/include/asm/rmi_smc.h
 create mode 100644 arch/arm64/kernel/rmi.c
 create mode 100644 arch/arm64/kvm/rmi-exit.c
 create mode 100644 arch/arm64/kvm/rmi.c

-- 
2.43.0

^ permalink raw reply

* Re: [PATCH v2 2/2] x86/tdx: Fix zero-extension for 32-bit port I/O
From: Sean Christopherson @ 2026-05-13 12:48 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Dave Hansen, Kiryl Shutsemau (Meta), Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, Rick Edgecombe,
	Kuppuswamy Sathyanarayanan, Kai Huang, Borys Tsyrulnikov,
	linux-kernel, linux-coco, kvm, stable
In-Reply-To: <B8D6B43E-4C3D-4E1F-BD07-5632E1BBECEA@zytor.com>

On Tue, May 12, 2026, H. Peter Anvin wrote:
> On May 12, 2026 6:14:13 PM PDT, Dave Hansen <dave.hansen@intel.com> wrote:
> >On 4/28/26 05:56, Kiryl Shutsemau (Meta) wrote:
> >> +	if (size == 4)
> >> +		regs->ax = 0;
> >> +	else
> >> +		regs->ax &= ~mask;
> >
> >I haven't thought about this _that_ much, but this feels wrong. Why is
> >4 so special cased?
> >
> >Also, what _are_ the limits on the registers that 'in' can be used on?
> >
> >RAX - n/a, no 64-bit I/O
> >EAX - size=4
> >AX  - size=2
> >AH  - n/a no encoding for inb
> >AL  - size=1
> >
> >I'd find this much easier to grasp if there was a nice table of what the
> >registers, sizes, and masks ended up being usable. As usual, x86 is
> >"fun" here.
> 
> Because zero extension only applies to dwords.
> 
> x86-64 has three subregisters per GPR:

Aren't there four?  The fourth being 31:0, which is the one that is zero-extended
and so "clobbers" 63:32.

> Bits 7-0
> Bits 15-8
> Bits 63-16

I assume you mean 15:0?  63:16 isn't addressable.  And these are the ones that
aren't zero-extended, i.e. don't "clobber" other bits.

^ permalink raw reply

* Re: [PATCH v2 2/2] x86/tdx: Fix zero-extension for 32-bit port I/O
From: H. Peter Anvin @ 2026-05-13  2:32 UTC (permalink / raw)
  To: Dave Hansen, Kiryl Shutsemau (Meta), Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86
  Cc: Rick Edgecombe, Kuppuswamy Sathyanarayanan, Kai Huang,
	Borys Tsyrulnikov, linux-kernel, linux-coco, kvm, stable
In-Reply-To: <bf92ebbf-8d70-406a-aea1-c11ca576de90@intel.com>

On May 12, 2026 6:14:13 PM PDT, Dave Hansen <dave.hansen@intel.com> wrote:
>On 4/28/26 05:56, Kiryl Shutsemau (Meta) wrote:
>> +	if (size == 4)
>> +		regs->ax = 0;
>> +	else
>> +		regs->ax &= ~mask;
>
>I haven't thought about this _that_ much, but this feels wrong. Why is
>4 so special cased?
>
>Also, what _are_ the limits on the registers that 'in' can be used on?
>
>RAX - n/a, no 64-bit I/O
>EAX - size=4
>AX  - size=2
>AH  - n/a no encoding for inb
>AL  - size=1
>
>I'd find this much easier to grasp if there was a nice table of what the
>registers, sizes, and masks ended up being usable. As usual, x86 is
>"fun" here.

Because zero extension only applies to dwords.

x86-64 has three subregisters per GPR:

Bits 7-0
Bits 15-8
Bits 63-16

^ permalink raw reply

* Re: [PATCH v2 2/2] x86/tdx: Fix zero-extension for 32-bit port I/O
From: Dave Hansen @ 2026-05-13  1:14 UTC (permalink / raw)
  To: Kiryl Shutsemau (Meta), Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86
  Cc: H . Peter Anvin, Rick Edgecombe, Kuppuswamy Sathyanarayanan,
	Kai Huang, Borys Tsyrulnikov, linux-kernel, linux-coco, kvm,
	stable
In-Reply-To: <20260428125632.129770-3-kas@kernel.org>

On 4/28/26 05:56, Kiryl Shutsemau (Meta) wrote:
> +	if (size == 4)
> +		regs->ax = 0;
> +	else
> +		regs->ax &= ~mask;

I haven't thought about this _that_ much, but this feels wrong. Why is
4 so special cased?

Also, what _are_ the limits on the registers that 'in' can be used on?

RAX - n/a, no 64-bit I/O
EAX - size=4
AX  - size=2
AH  - n/a no encoding for inb
AL  - size=1

I'd find this much easier to grasp if there was a nice table of what the
registers, sizes, and masks ended up being usable. As usual, x86 is
"fun" here.

^ permalink raw reply

* Re: [PATCH] x86/tdx: Fix zero-extension for CPUID emulation
From: Edgecombe, Rick P @ 2026-05-12 22:43 UTC (permalink / raw)
  To: linux-coco@lists.linux.dev, Hansen, Dave, clopez@suse.de,
	kas@kernel.org, x86@kernel.org
  Cc: ak@linux.intel.com, bp@alien8.de, dave.hansen@linux.intel.com,
	hpa@zytor.com, mingo@redhat.com, linux-kernel@vger.kernel.org,
	Luck, Tony, tglx@kernel.org, stable@vger.kernel.org,
	kvm@vger.kernel.org
In-Reply-To: <d76284f1-79e2-4e7b-94e7-252ff3ee9e5e@intel.com>

On Tue, 2026-05-12 at 15:37 -0700, Dave Hansen wrote:
> On 5/12/26 15:24, Edgecombe, Rick P wrote:
> > On the other hand, the #VE handler is supposed to do the emulation of the
> > instruction, with the help of the TDVMCALL, so maybe the correctness should be
> > in the guest... Hmm...
> 
> Maybe we should just change the GHCI spec.
> 
> What if we said:
> 
>  | Operand 	       | ... |
>  | R12 (lower 32 bits) | EAX |
>  | R13 (lower 32 bits) | EBX |
>  | R14 (lower 32 bits) | ECX |
>  | R15 (lower 32 bits) | EDX |
> 
> Then said the upper 32 bits are undefined. Then the kernel *must* mask
> them to be correct. Then we don't have to do any checking at all and
> there's no ambiguity about what the VMM is allowed to do or what chaos
> it might cause.

Hmm, let me check. It intersects with the other guests/hosts, but hard to see
how the other ones could be out of spec and not be buggy.

^ permalink raw reply

* Re: [PATCH] x86/tdx: Fix zero-extension for CPUID emulation
From: Dave Hansen @ 2026-05-12 22:37 UTC (permalink / raw)
  To: Edgecombe, Rick P, linux-coco@lists.linux.dev, clopez@suse.de,
	kas@kernel.org, x86@kernel.org
  Cc: ak@linux.intel.com, bp@alien8.de, dave.hansen@linux.intel.com,
	hpa@zytor.com, mingo@redhat.com, linux-kernel@vger.kernel.org,
	Luck, Tony, tglx@kernel.org, stable@vger.kernel.org,
	kvm@vger.kernel.org
In-Reply-To: <43a913a1b4721c752443416a685631478bee2f10.camel@intel.com>

On 5/12/26 15:24, Edgecombe, Rick P wrote:
> On the other hand, the #VE handler is supposed to do the emulation of the
> instruction, with the help of the TDVMCALL, so maybe the correctness should be
> in the guest... Hmm...

Maybe we should just change the GHCI spec.

What if we said:

 | Operand 	       | ... |
 | R12 (lower 32 bits) | EAX |
 | R13 (lower 32 bits) | EBX |
 | R14 (lower 32 bits) | ECX |
 | R15 (lower 32 bits) | EDX |

Then said the upper 32 bits are undefined. Then the kernel *must* mask
them to be correct. Then we don't have to do any checking at all and
there's no ambiguity about what the VMM is allowed to do or what chaos
it might cause.

^ permalink raw reply

* Re: [PATCH] x86/tdx: Fix zero-extension for CPUID emulation
From: Carlos López @ 2026-05-12 22:33 UTC (permalink / raw)
  To: Dave Hansen, Edgecombe, Rick P, linux-coco@lists.linux.dev,
	kas@kernel.org, x86@kernel.org
  Cc: ak@linux.intel.com, bp@alien8.de, dave.hansen@linux.intel.com,
	hpa@zytor.com, mingo@redhat.com, linux-kernel@vger.kernel.org,
	Luck, Tony, tglx@kernel.org, stable@vger.kernel.org,
	kvm@vger.kernel.org
In-Reply-To: <7f7b8bfd-f39e-417c-991f-d224d58cb52a@intel.com>

On 5/13/26 12:14 AM, Dave Hansen wrote:
> On 5/12/26 14:48, Edgecombe, Rick P wrote:
>>> -	regs->ax = args.r12;
>>> -	regs->bx = args.r13;
>>> -	regs->cx = args.r14;
>>> -	regs->dx = args.r15;
>>> +	regs->ax = lower_32_bits(args.r12);
>>> +	regs->bx = lower_32_bits(args.r13);
>>> +	regs->cx = lower_32_bits(args.r14);
>>> +	regs->dx = lower_32_bits(args.r15);
>>>  
>> Can you explain the impact here? Why should the guest fixup what the VMM
>> emulates?
> 
> Oh boy.
> 
> args.r12-15 come from the VMM, right? So the VMM can put whatever it
> wants in there.

Yes, exactly.

> CPUID (the instruction) is defined to fill in eax/ebx/ecx/edx. Those are
> 32-bit registers so the normal register rules apply: "32-bit operands
> generate a 32-bit result, zero-extended to a 64-bit result in the
> destination general-purpose register."
> 
> So a properly-behaving CPUID implementation will always end up with the
> top 32 bits empty on the four CPUID registers after a CPUID is executed.
> 
> The VMM here obviously might be naughty and might put gunk in
> args.r12/r13/r14/r15 that gets copied to ptregs->ax/bx/cx/dx which are
> 'unsigned long' on 64-bit.
> 
> The end result is that a TDX guest can use CPUID and end up having bits
> set in rax/rbx/rcx/rdx that are architecturally impossible. This patch
> is effectively fixing up the VMM naughtiness before the guest CPUID
> instance can see it.
> 
> Does anybody disagree with any of that?
> 
> Do we *want* to fix this up silently? If we catch a malicious VMM trying
> to stuff garbage into the guest, shouldn't we be a bit more upset than
> silently papering over it?

Okay, how about this (on top of the changes I already sent)?

diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index 831475cf4313..cd33781c8d61 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -538,6 +538,13 @@ static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
        if (__tdx_hypercall(&args))
                return -EIO;
 
+       /* Emit a warning if the hypervisor tries to inject architecturally
+        * invalid (non-zero-extended) output values for CPUID */
+       if (upper_32_bits(args.r12) || upper_32_bits(args.r13)
+           || upper_32_bits(args.r14) || upper_32_bits(args.r15))
+               pr_warn("detected invalid CPUID result from VMM: eax=%lld ebx=%lld ecx=%lld edx=%lld",
+                                       args.r12, args.r13, args.r14, args.r15);
+
        /*
         * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
         * EAX, EBX, ECX, EDX registers after the CPUID instruction execution.


^ permalink raw reply related

* Re: [PATCH RFC v5 10/53] KVM: guest_memfd: Add basic support for KVM_SET_MEMORY_ATTRIBUTES2
From: Ackerley Tng @ 2026-05-12 22:30 UTC (permalink / raw)
  To: Liam R. Howlett
  Cc: aik, andrew.jones, binbin.wu, brauner, chao.p.peng, david,
	ira.weiny, jmattson, jthoughton, michael.roth, oupton,
	pankaj.gupta, qperret, rick.p.edgecombe, rientjes, shivankg,
	steven.price, tabba, willy, wyihan, yan.y.zhao, forkloop,
	pratyush, suzuki.poulose, aneesh.kumar, Paolo Bonzini,
	Sean Christopherson, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, x86, H. Peter Anvin, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Jonathan Corbet, Shuah Khan,
	Shuah Khan, Vishal Annapurve, Andrew Morton, Chris Li,
	Kairui Song, Kemeng Shi, Nhat Pham, Baoquan He, Barry Song,
	Axel Rasmussen, Yuanchu Xie, Wei Xu, Youngjun Park, Qi Zheng,
	Shakeel Butt, Kiryl Shutsemau, Jason Gunthorpe, Vlastimil Babka,
	kvm, linux-kernel, linux-trace-kernel, linux-doc, linux-kselftest,
	linux-mm, linux-coco
In-Reply-To: <1DAB05E2-7F30-45D7-B155-B66C59D31AFF@infradead.org>

"Liam R. Howlett" <liam@infradead.org> writes:

>
> [...snip...]
>
>>
>>The invariant in this maple tree is that contiguous ranges with the same
>>attribute are stored as a single range.
>>
>>The goal of this first part is to get the entry at the index just after
>>the requested range, and see what the attribute there is. If that
>>attribute is what we're about to set, extend the requested range for
>>storing to the end of that range.
>>
>>If there is another range higher than end + 1, with the invariant
>>maintained, that attribute has to be different than the attribute stored
>>at end. Hence, we only want to extend this requested range up till end.
>>
>
> mas_find() will look for an entry at the given address for the first search, and if it is not found it will continue to search upwards.  Since you limit the search to end, it will work as you want and there isn't a bug as I was thinking in my sleep deprived state.
>
> Since you are searching for exactly one address (end), it might serve you better to walk there.  Maybe walking is a better API for what you are doing here?
>

Thanks again for this tip! I'll try the walk API in the next revision
after v6 [1]

[1] https://lore.kernel.org/all/20260507-gmem-inplace-conversion-v6-0-91ab5a8b19a4@google.com/T/

>
>>> Do you have testing of these functions somewhere?
>>>
>>
>>GMEM_CONVERSION_MULTIPAGE_TEST_INIT_SHARED(indexing, 4) tests setting
>>attributes in ranges. If test_page is 2,
>>
>>1. [0, 4) starts off shared (4 is the number of pages in the guest_memfd)
>>2. [2, 3) is converted to private
>>    => so the ranges should now be [0, 2), [2, 3), [3, 4)
>>3. [2, 3) is converted back to shared
>>    => so the ranges should now be [0, 4)
>>
>>I verified this by inserting some trace_printk()s and inspecting manually.
>>
>
> Thanks.  I find the exclusive ranges a bit odd to think about in the maple tree context, but this test case makes sense.  This is especially odd to look at a single index entry, at least for me.
>
> I generally have a set of test cases and append any bug reproducers to that list so they are unlikely to reoccur.  My testing is certainly different from what you'll be doing, but this method has done well with the quality of code improving over time, and limited (if any) regressions.
>

I've not worked directly with the maple tree tests but the xarray tests
(similarly set up, I believe) are a joy to work with.

> I actually insist that any fix has a test before I accept them.  There are two reasons for this: 1. Avoiding the regression. 2. People really understand the bug if they can create a reproducer.
>
> I hope this helps.
>
>

The maple tree tests are set up to directly test maple tree code, but
KVM selftests test from the userspace interface, and it's hard to test
this invariant from userspace.

>>>> +	if (entry && xa_to_value(entry) == attributes)
>>>> +		last = mas->last;
>>>> +
>>>> +	if (start > 0) {
>>>> +		mas_set_range(mas, start - 1, start - 1);
>>>> +		entry = mas_find(mas, start - 1);
>>>> +		if (entry && xa_to_value(entry) == attributes)
>>>> +			start = mas->index;
>>>> +	}
>>>> +
>>>> +	mas_set_range(mas, start, last);
>>>> +	return mas_preallocate(mas, xa_mk_value(attributes), GFP_KERNEL);
>>>> +}
>>>> +
>>>>
>>>> [...snip...]
>>>>

^ permalink raw reply

* Re: [PATCH] x86/tdx: Fix zero-extension for CPUID emulation
From: Edgecombe, Rick P @ 2026-05-12 22:24 UTC (permalink / raw)
  To: linux-coco@lists.linux.dev, Hansen, Dave, clopez@suse.de,
	kas@kernel.org, x86@kernel.org
  Cc: ak@linux.intel.com, bp@alien8.de, dave.hansen@linux.intel.com,
	hpa@zytor.com, mingo@redhat.com, linux-kernel@vger.kernel.org,
	Luck, Tony, tglx@kernel.org, stable@vger.kernel.org,
	kvm@vger.kernel.org
In-Reply-To: <7f7b8bfd-f39e-417c-991f-d224d58cb52a@intel.com>

On Tue, 2026-05-12 at 15:14 -0700, Dave Hansen wrote:
> The end result is that a TDX guest can use CPUID and end up having bits
> set in rax/rbx/rcx/rdx that are architecturally impossible. This patch
> is effectively fixing up the VMM naughtiness before the guest CPUID
> instance can see it.

A naughty VMM could mess with the guest in a number of ways though. For example
setting impossible bits in specific leafs in the lower 32 bits. This patch is a
relatively simple sanity check compared to a complete check of CPUID arch
matching (or MSR, etc) of course.

> 
> Does anybody disagree with any of that?
> 
> Do we *want* to fix this up silently? If we catch a malicious VMM trying
> to stuff garbage into the guest, shouldn't we be a bit more upset than
> silently papering over it?

I agree a warning would be appropriate. This should probably trigger a bug fix
in the VMM. For example, BIOS might hit it too. So I kind of wonder, how
valuable is catching this specific bug in the guest? Do we need to worry about
the specific issue for some reason?

On the other hand, the #VE handler is supposed to do the emulation of the
instruction, with the help of the TDVMCALL, so maybe the correctness should be
in the guest... Hmm...

^ permalink raw reply

* Re: [PATCH] x86/tdx: Fix zero-extension for CPUID emulation
From: Carlos López @ 2026-05-12 22:15 UTC (permalink / raw)
  To: Edgecombe, Rick P, linux-coco@lists.linux.dev, kas@kernel.org,
	x86@kernel.org
  Cc: ak@linux.intel.com, bp@alien8.de, dave.hansen@linux.intel.com,
	hpa@zytor.com, mingo@redhat.com, linux-kernel@vger.kernel.org,
	Luck, Tony, tglx@kernel.org, stable@vger.kernel.org,
	kvm@vger.kernel.org
In-Reply-To: <81343db56b8df8f70a2e13a17e62c620bee36897.camel@intel.com>

On 5/12/26 11:48 PM, Edgecombe, Rick P wrote:
> On Tue, 2026-05-12 at 23:37 +0200, Carlos López wrote:
>> In the x86 architecture, 32-bit operations zero-extend the result in the
>> destination register to 64 bits. This includes the CPUID instruction,
>> which writes 32-bit values EAX/EBX/ECX/EDX.
>>
>> When handling the CPUID instruction via #VE, copy only the lower 32-bits
>> provided by the hypervisor for the output registers, and zero out the
>> upper half.
>>
>> Fixes: c141fa2c2bba ("x86/tdx: Handle CPUID via #VE")
>> Cc: stable@vger.kernel.org
>> Signed-off-by: Carlos López <clopez@suse.de>
>> ---
>>  arch/x86/coco/tdx/tdx.c | 8 ++++----
>>  1 file changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
>> index c8b9e86d0488..a2fe1ae019bd 100644
>> --- a/arch/x86/coco/tdx/tdx.c
>> +++ b/arch/x86/coco/tdx/tdx.c
>> @@ -543,10 +543,10 @@ static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
>>  	 * EAX, EBX, ECX, EDX registers after the CPUID instruction execution.
>>  	 * So copy the register contents back to pt_regs.
>>  	 */
>> -	regs->ax = args.r12;
>> -	regs->bx = args.r13;
>> -	regs->cx = args.r14;
>> -	regs->dx = args.r15;
>> +	regs->ax = lower_32_bits(args.r12);
>> +	regs->bx = lower_32_bits(args.r13);
>> +	regs->cx = lower_32_bits(args.r14);
>> +	regs->dx = lower_32_bits(args.r15);
>>  
> 
> Can you explain the impact here? Why should the guest fixup what the VMM
> emulates?

It's a correctness issue. The CPUID instruction has 32-bit operands,
which should be zero extended as per the SDM. Other code like read_msr()
in that same file does the same zero-extension. There was also a patch
sent for a similar issue in handle_in() not that long ago.

In terms of how this could materialize, if you have code like this:

	asm volatile("cpuid"
	    : "=a" (eax),
	      "=b" (ebx),
	      "=c" (ecx),
	      "=d" (edx)
	    : "0" (eax), "2" (ecx)
	    : "memory");

The compiler would be allowed to assume that e.g. RAX can be used as an
already-zero-extended register.

Best,
Carlos

^ permalink raw reply

* Re: [PATCH] x86/tdx: Fix zero-extension for CPUID emulation
From: Dave Hansen @ 2026-05-12 22:14 UTC (permalink / raw)
  To: Edgecombe, Rick P, linux-coco@lists.linux.dev, clopez@suse.de,
	kas@kernel.org, x86@kernel.org
  Cc: ak@linux.intel.com, bp@alien8.de, dave.hansen@linux.intel.com,
	hpa@zytor.com, mingo@redhat.com, linux-kernel@vger.kernel.org,
	Luck, Tony, tglx@kernel.org, stable@vger.kernel.org,
	kvm@vger.kernel.org
In-Reply-To: <81343db56b8df8f70a2e13a17e62c620bee36897.camel@intel.com>

On 5/12/26 14:48, Edgecombe, Rick P wrote:
>> -	regs->ax = args.r12;
>> -	regs->bx = args.r13;
>> -	regs->cx = args.r14;
>> -	regs->dx = args.r15;
>> +	regs->ax = lower_32_bits(args.r12);
>> +	regs->bx = lower_32_bits(args.r13);
>> +	regs->cx = lower_32_bits(args.r14);
>> +	regs->dx = lower_32_bits(args.r15);
>>  
> Can you explain the impact here? Why should the guest fixup what the VMM
> emulates?

Oh boy.

args.r12-15 come from the VMM, right? So the VMM can put whatever it
wants in there.

CPUID (the instruction) is defined to fill in eax/ebx/ecx/edx. Those are
32-bit registers so the normal register rules apply: "32-bit operands
generate a 32-bit result, zero-extended to a 64-bit result in the
destination general-purpose register."

So a properly-behaving CPUID implementation will always end up with the
top 32 bits empty on the four CPUID registers after a CPUID is executed.

The VMM here obviously might be naughty and might put gunk in
args.r12/r13/r14/r15 that gets copied to ptregs->ax/bx/cx/dx which are
'unsigned long' on 64-bit.

The end result is that a TDX guest can use CPUID and end up having bits
set in rax/rbx/rcx/rdx that are architecturally impossible. This patch
is effectively fixing up the VMM naughtiness before the guest CPUID
instance can see it.

Does anybody disagree with any of that?

Do we *want* to fix this up silently? If we catch a malicious VMM trying
to stuff garbage into the guest, shouldn't we be a bit more upset than
silently papering over it?

^ permalink raw reply

* Re: [PATCH] x86/tdx: Fix zero-extension for CPUID emulation
From: Edgecombe, Rick P @ 2026-05-12 21:48 UTC (permalink / raw)
  To: linux-coco@lists.linux.dev, clopez@suse.de, kas@kernel.org,
	x86@kernel.org
  Cc: ak@linux.intel.com, bp@alien8.de, dave.hansen@linux.intel.com,
	hpa@zytor.com, mingo@redhat.com, linux-kernel@vger.kernel.org,
	Luck, Tony, tglx@kernel.org, stable@vger.kernel.org,
	kvm@vger.kernel.org
In-Reply-To: <20260512213719.20974-1-clopez@suse.de>

On Tue, 2026-05-12 at 23:37 +0200, Carlos López wrote:
> In the x86 architecture, 32-bit operations zero-extend the result in the
> destination register to 64 bits. This includes the CPUID instruction,
> which writes 32-bit values EAX/EBX/ECX/EDX.
> 
> When handling the CPUID instruction via #VE, copy only the lower 32-bits
> provided by the hypervisor for the output registers, and zero out the
> upper half.
> 
> Fixes: c141fa2c2bba ("x86/tdx: Handle CPUID via #VE")
> Cc: stable@vger.kernel.org
> Signed-off-by: Carlos López <clopez@suse.de>
> ---
>  arch/x86/coco/tdx/tdx.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
> index c8b9e86d0488..a2fe1ae019bd 100644
> --- a/arch/x86/coco/tdx/tdx.c
> +++ b/arch/x86/coco/tdx/tdx.c
> @@ -543,10 +543,10 @@ static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
>  	 * EAX, EBX, ECX, EDX registers after the CPUID instruction execution.
>  	 * So copy the register contents back to pt_regs.
>  	 */
> -	regs->ax = args.r12;
> -	regs->bx = args.r13;
> -	regs->cx = args.r14;
> -	regs->dx = args.r15;
> +	regs->ax = lower_32_bits(args.r12);
> +	regs->bx = lower_32_bits(args.r13);
> +	regs->cx = lower_32_bits(args.r14);
> +	regs->dx = lower_32_bits(args.r15);
>  

Can you explain the impact here? Why should the guest fixup what the VMM
emulates?

^ permalink raw reply

* [PATCH] x86/tdx: Fix zero-extension for CPUID emulation
From: Carlos López @ 2026-05-12 21:37 UTC (permalink / raw)
  To: kas, rick.p.edgecombe, x86, linux-coco
  Cc: Carlos López, stable, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin, Andi Kleen,
	Tony Luck, open list:X86 ARCHITECTURE (32-BIT AND 64-BIT),
	open list:X86 TRUST DOMAIN EXTENSIONS (TDX)

In the x86 architecture, 32-bit operations zero-extend the result in the
destination register to 64 bits. This includes the CPUID instruction,
which writes 32-bit values EAX/EBX/ECX/EDX.

When handling the CPUID instruction via #VE, copy only the lower 32-bits
provided by the hypervisor for the output registers, and zero out the
upper half.

Fixes: c141fa2c2bba ("x86/tdx: Handle CPUID via #VE")
Cc: stable@vger.kernel.org
Signed-off-by: Carlos López <clopez@suse.de>
---
 arch/x86/coco/tdx/tdx.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
index c8b9e86d0488..a2fe1ae019bd 100644
--- a/arch/x86/coco/tdx/tdx.c
+++ b/arch/x86/coco/tdx/tdx.c
@@ -543,10 +543,10 @@ static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
 	 * EAX, EBX, ECX, EDX registers after the CPUID instruction execution.
 	 * So copy the register contents back to pt_regs.
 	 */
-	regs->ax = args.r12;
-	regs->bx = args.r13;
-	regs->cx = args.r14;
-	regs->dx = args.r15;
+	regs->ax = lower_32_bits(args.r12);
+	regs->bx = lower_32_bits(args.r13);
+	regs->cx = lower_32_bits(args.r14);
+	regs->dx = lower_32_bits(args.r15);
 
 	return ve_instr_len(ve);
 }
-- 
2.51.0


^ permalink raw reply related

* SVSM Development Call May 13th, 2026
From: Jörg Rödel @ 2026-05-12 16:29 UTC (permalink / raw)
  To: coconut-svsm, linux-coco

Hi,

Here is the call for agenda items for this week's SVSM development call.  Please
send any agenda items you have in mind as a reply to this email or raise them
in the meeting.

We will use the LF Zoom instance. Details of the meeting can be found in our
governance repository at:

	https://github.com/coconut-svsm/governance

The link to the COCONUT-SVSM calendar is:

	https://zoom-lfx.platform.linuxfoundation.org/meetings/coconut-svsm?view=week

The meeting will be recorded and the recording eventually published.

Regards,

	Jörg

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox