* [PATCH v14 30/44] KVM: arm64: Handle realm VCPU load
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
To: kvm, kvmarm
Cc: Steven Price, Catalin Marinas, Marc Zyngier, Will Deacon,
James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>
When loading a realm VCPU much of the work is handled by the RMM so only
some of the actions are required. Rearrange kvm_arch_vcpu_load()
slightly so we can bail out early for a realm guest.
Signed-off-by: Steven Price <steven.price@arm.com>
---
arch/arm64/kvm/arm.c | 19 ++++++++++++-------
1 file changed, 12 insertions(+), 7 deletions(-)
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 073ba9181da9..495082e601a9 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -702,7 +702,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
struct kvm_s2_mmu *mmu;
int *last_ran;
- if (is_protected_kvm_enabled())
+ if (is_protected_kvm_enabled() || kvm_is_realm(vcpu->kvm))
goto nommu;
if (vcpu_has_nv(vcpu))
@@ -746,12 +746,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_vgic_load(vcpu);
kvm_vcpu_load_debug(vcpu);
kvm_vcpu_load_fgt(vcpu);
- if (has_vhe())
- kvm_vcpu_load_vhe(vcpu);
- kvm_arch_vcpu_load_fp(vcpu);
- kvm_vcpu_pmu_restore_guest(vcpu);
- if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
- kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
if (kvm_vcpu_should_clear_twe(vcpu))
vcpu->arch.hcr_el2 &= ~HCR_TWE;
@@ -773,6 +767,17 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
&vcpu->arch.vgic_cpu.vgic_v3);
}
+ /* No additional state needs to be loaded on Realmed VMs */
+ if (vcpu_is_rec(vcpu))
+ return;
+
+ if (has_vhe())
+ kvm_vcpu_load_vhe(vcpu);
+ kvm_arch_vcpu_load_fp(vcpu);
+ kvm_vcpu_pmu_restore_guest(vcpu);
+ if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
+ kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
+
if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus))
vcpu_set_on_unsupported_cpu(vcpu);
--
2.43.0
^ permalink raw reply related
* [PATCH v14 31/44] KVM: arm64: Validate register access for a Realm VM
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
To: kvm, kvmarm
Cc: Steven Price, Catalin Marinas, Marc Zyngier, Will Deacon,
James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>
The RMM only allows setting the GPRS (x0-x30) and PC for a realm
guest. Check this in kvm_arm_set_reg() so that the VMM can receive a
suitable error return if other registers are written to.
The RMM makes similar restrictions for reading of the guest's registers
(this is *confidential* compute after all), however we don't impose the
restriction here. This allows the VMM to read (stale) values from the
registers which might be useful to read back the initial values even if
the RMM doesn't provide the latest version. For migration of a realm VM,
a new interface will be needed so that the VMM can receive an
(encrypted) blob of the VM's state.
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Joey Gouly <joey.gouly@arm.com>
Signed-off-by: Steven Price <steven.price@arm.com>
---
Changes since v5:
* Upper GPRS can be set as part of a HOST_CALL return, so fix up the
test to allow them.
---
arch/arm64/kvm/guest.c | 41 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 41 insertions(+)
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 332c453b87cf..e6682019ef6d 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -73,6 +73,25 @@ static u64 core_reg_offset_from_id(u64 id)
return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE);
}
+static bool kvm_realm_validate_core_reg(u64 off)
+{
+ /*
+ * Note that GPRs can only sometimes be controlled by the VMM.
+ * For PSCI only X0-X6 are used, higher registers are ignored (restored
+ * from the REC).
+ * For HOST_CALL all of X0-X30 are copied to the RsiHostCall structure.
+ * For emulated MMIO X0 is always used.
+ * PC can only be set before the realm is activated.
+ */
+ switch (off) {
+ case KVM_REG_ARM_CORE_REG(regs.regs[0]) ...
+ KVM_REG_ARM_CORE_REG(regs.regs[30]):
+ case KVM_REG_ARM_CORE_REG(regs.pc):
+ return true;
+ }
+ return false;
+}
+
static int core_reg_size_from_offset(const struct kvm_vcpu *vcpu, u64 off)
{
int size;
@@ -716,12 +735,34 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
return kvm_arm_sys_reg_get_reg(vcpu, reg);
}
+/*
+ * The RMI ABI only enables setting some GPRs and PC. The selection of GPRs
+ * that are available depends on the Realm state and the reason for the last
+ * exit. All other registers are reset to architectural or otherwise defined
+ * reset values by the RMM, except for a few configuration fields that
+ * correspond to Realm parameters.
+ */
+static bool validate_realm_set_reg(struct kvm_vcpu *vcpu,
+ const struct kvm_one_reg *reg)
+{
+ if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_CORE) {
+ u64 off = core_reg_offset_from_id(reg->id);
+
+ return kvm_realm_validate_core_reg(off);
+ }
+
+ return false;
+}
+
int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
/* We currently use nothing arch-specific in upper 32 bits */
if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32)
return -EINVAL;
+ if (kvm_is_realm(vcpu->kvm) && !validate_realm_set_reg(vcpu, reg))
+ return -EINVAL;
+
switch (reg->id & KVM_REG_ARM_COPROC_MASK) {
case KVM_REG_ARM_CORE: return set_core_reg(vcpu, reg);
case KVM_REG_ARM_FW:
--
2.43.0
^ permalink raw reply related
* [PATCH v14 32/44] KVM: arm64: Handle Realm PSCI requests
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
To: kvm, kvmarm
Cc: Steven Price, Catalin Marinas, Marc Zyngier, Will Deacon,
James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>
The RMM needs to be informed of the target REC when a PSCI call is made
with an MPIDR argument.
This requirement will be removed in a future release of the RMM 2.0
specification but is still required for v2.0-bet1.
Co-developed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Steven Price <steven.price@arm.com>
---
Chanegs since v13:
* The ioctl KVM_ARM_VCPU_RMI_PSCI_COMPLETE has gone. The RMI call is
made automatically just before entering the REC again.
Changes since v12:
* Chance return code for non-realms to -ENXIO to better represent that
the ioctl is invalid for non-realms (checkpatch is insistent that
"ENOSYS means 'invalid syscall nr' and nothing else").
Changes since v11:
* RMM->RMI renaming.
Changes since v6:
* Use vcpu_is_rec() rather than kvm_is_realm(vcpu->kvm).
* Minor renaming/formatting fixes.
---
arch/arm64/include/asm/kvm_rmi.h | 3 ++
arch/arm64/kvm/psci.c | 15 ++++++++-
arch/arm64/kvm/rmi.c | 58 ++++++++++++++++++++++++++++++++
3 files changed, 75 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/include/asm/kvm_rmi.h b/arch/arm64/include/asm/kvm_rmi.h
index b65cfec10dee..eacf82a7467d 100644
--- a/arch/arm64/include/asm/kvm_rmi.h
+++ b/arch/arm64/include/asm/kvm_rmi.h
@@ -109,6 +109,9 @@ int realm_map_non_secure(struct realm *realm,
unsigned long size,
enum kvm_pgtable_prot prot,
struct kvm_mmu_memory_cache *memcache);
+int realm_psci_complete(struct kvm_vcpu *source,
+ struct kvm_vcpu *target,
+ unsigned long status);
static inline bool kvm_realm_is_private_address(struct realm *realm,
unsigned long addr)
diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
index 3b5dbe9a0a0e..a2cd55dc7b5b 100644
--- a/arch/arm64/kvm/psci.c
+++ b/arch/arm64/kvm/psci.c
@@ -103,7 +103,6 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
reset_state->reset = true;
kvm_make_request(KVM_REQ_VCPU_RESET, vcpu);
-
/*
* Make sure the reset request is observed if the RUNNABLE mp_state is
* observed.
@@ -142,6 +141,20 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
/* Ignore other bits of target affinity */
target_affinity &= target_affinity_mask;
+ if (vcpu_is_rec(vcpu)) {
+ struct kvm_vcpu *target_vcpu;
+
+ /* RMM supports only zero affinity level */
+ if (lowest_affinity_level != 0)
+ return PSCI_RET_INVALID_PARAMS;
+
+ target_vcpu = kvm_mpidr_to_vcpu(kvm, target_affinity);
+ if (!target_vcpu)
+ return PSCI_RET_INVALID_PARAMS;
+
+ return PSCI_RET_SUCCESS;
+ }
+
/*
* If one or more VCPU matching target affinity are running
* then ON else OFF
diff --git a/arch/arm64/kvm/rmi.c b/arch/arm64/kvm/rmi.c
index 761b38a4071c..2b03e962ee41 100644
--- a/arch/arm64/kvm/rmi.c
+++ b/arch/arm64/kvm/rmi.c
@@ -3,6 +3,7 @@
* Copyright (C) 2023-2025 ARM Ltd.
*/
+#include <uapi/linux/psci.h>
#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>
@@ -127,6 +128,25 @@ static void free_rtt(phys_addr_t phys)
kvm_account_pgtable_pages(phys_to_virt(phys), -1);
}
+int realm_psci_complete(struct kvm_vcpu *source, struct kvm_vcpu *target,
+ unsigned long status)
+{
+ int ret;
+
+ /*
+ * XXX: RMM-v2.0 doesn't require the target REC address for completing
+ * PSCI requests. Temporary hack until RMM implementation catches up
+ * to the full spec.
+ */
+ ret = rmi_psci_complete(virt_to_phys(source->arch.rec.rec_page),
+ virt_to_phys(target->arch.rec.rec_page),
+ status);
+ if (ret)
+ return -EINVAL;
+
+ return 0;
+}
+
static int realm_rtt_create(struct realm *realm,
unsigned long addr,
int level,
@@ -1004,6 +1024,41 @@ static void kvm_complete_ripas_change(struct kvm_vcpu *vcpu)
rec->run->exit.ripas_base = base;
}
+static void kvm_rec_complete_psci(struct kvm_vcpu *vcpu)
+{
+ struct rec_run *run = vcpu->arch.rec.run;
+ unsigned long status = PSCI_RET_DENIED;
+ unsigned long ret = vcpu_get_reg(vcpu, 0);
+ struct kvm_vcpu *target;
+
+ switch (run->exit.gprs[0]) {
+ /*
+ * XXX: RMM-v2.0 doesn't cause RMI_EXIT_PSCI for AFFINITY_INFO
+ * Temporary hack until tf-RMM gets the REC to MPIDR mapping via
+ * RD Auxiliary granules.
+ * For now always report SUCCESS
+ */
+ case PSCI_0_2_FN64_AFFINITY_INFO:
+ status = PSCI_RET_SUCCESS;
+ break;
+ case PSCI_0_2_FN64_CPU_ON: {
+ if (ret != PSCI_RET_SUCCESS &&
+ ret != PSCI_RET_ALREADY_ON)
+ status = PSCI_RET_DENIED;
+ else
+ status = PSCI_RET_SUCCESS;
+ break;
+ }
+ default:
+ return;
+ }
+
+ target = kvm_mpidr_to_vcpu(vcpu->kvm, run->exit.gprs[1]);
+ /* RMM makes sure that we don't get RMI_EXIT_PSCI for invalid mpidrs */
+ if (target)
+ realm_psci_complete(vcpu, target, status);
+}
+
/*
* kvm_rec_pre_enter - Complete operations before entering a REC
*
@@ -1028,6 +1083,9 @@ int kvm_rec_pre_enter(struct kvm_vcpu *vcpu)
for (int i = 0; i < REC_RUN_GPRS; i++)
rec->run->enter.gprs[i] = vcpu_get_reg(vcpu, i);
break;
+ case RMI_EXIT_PSCI:
+ kvm_rec_complete_psci(vcpu);
+ break;
case RMI_EXIT_RIPAS_CHANGE:
kvm_complete_ripas_change(vcpu);
break;
--
2.43.0
^ permalink raw reply related
* [PATCH v14 33/44] KVM: arm64: WARN on injected undef exceptions
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
To: kvm, kvmarm
Cc: Steven Price, Catalin Marinas, Marc Zyngier, Will Deacon,
James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>
The RMM doesn't allow injection of a undefined exception into a realm
guest. Add a WARN to catch if this ever happens.
Signed-off-by: Steven Price <steven.price@arm.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
---
Changes since v6:
* if (x) WARN(1, ...) makes no sense, just WARN(x, ...)!
---
arch/arm64/kvm/inject_fault.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c
index 6492397b73d7..613f223bc7a3 100644
--- a/arch/arm64/kvm/inject_fault.c
+++ b/arch/arm64/kvm/inject_fault.c
@@ -327,6 +327,7 @@ void kvm_inject_size_fault(struct kvm_vcpu *vcpu)
*/
void kvm_inject_undefined(struct kvm_vcpu *vcpu)
{
+ WARN(vcpu_is_rec(vcpu), "Unexpected undefined exception injection to REC");
if (vcpu_el1_is_32bit(vcpu))
inject_undef32(vcpu);
else
--
2.43.0
^ permalink raw reply related
* [PATCH v14 34/44] arm64: RMI: allow userspace to inject aborts
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
To: kvm, kvmarm
Cc: Joey Gouly, Catalin Marinas, Marc Zyngier, Will Deacon,
James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
linux-arm-kernel, linux-kernel, Alexandru Elisei,
Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2,
Steven Price
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>
From: Joey Gouly <joey.gouly@arm.com>
Extend KVM_SET_VCPU_EVENTS to support realms, where KVM cannot set the
system registers, and the RMM must perform it on next REC entry.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Signed-off-by: Steven Price <steven.price@arm.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
---
Documentation/virt/kvm/api.rst | 2 ++
arch/arm64/kvm/guest.c | 24 ++++++++++++++++++++++++
2 files changed, 26 insertions(+)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index a47c60490475..4e0dcca0d261 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -1314,6 +1314,8 @@ User space may need to inject several types of events to the guest.
Set the pending SError exception state for this VCPU. It is not possible to
'cancel' an Serror that has been made pending.
+User space cannot inject SErrors into Realms.
+
If the guest performed an access to I/O memory which could not be handled by
userspace, for example because of missing instruction syndrome decode
information or because there is no device mapped at the accessed IPA, then
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index e6682019ef6d..447674373426 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -827,6 +827,30 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
u64 esr = events->exception.serror_esr;
int ret = 0;
+ if (vcpu_is_rec(vcpu)) {
+ /* Cannot inject SError into a Realm. */
+ if (serror_pending)
+ return -EINVAL;
+
+ /*
+ * If a data abort is pending, set the flag and let the RMM
+ * inject an SEA when the REC is scheduled to be run.
+ */
+ if (ext_dabt_pending) {
+ /*
+ * Can only inject SEA into a Realm if the previous exit
+ * was due to a data abort of an Unprotected IPA.
+ */
+ if (!(vcpu->arch.rec.run->enter.flags & REC_ENTER_FLAG_EMULATED_MMIO))
+ return -EINVAL;
+
+ vcpu->arch.rec.run->enter.flags &= ~REC_ENTER_FLAG_EMULATED_MMIO;
+ vcpu->arch.rec.run->enter.flags |= REC_ENTER_FLAG_INJECT_SEA;
+ }
+
+ return 0;
+ }
+
/*
* Immediately commit the pending SEA to the vCPU's architectural
* state which is necessary since we do not return a pending SEA
--
2.43.0
^ permalink raw reply related
* [PATCH v14 35/44] arm64: RMI: support RSI_HOST_CALL
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
To: kvm, kvmarm
Cc: Joey Gouly, Catalin Marinas, Marc Zyngier, Will Deacon,
James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
linux-arm-kernel, linux-kernel, Alexandru Elisei,
Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2,
Steven Price
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>
From: Joey Gouly <joey.gouly@arm.com>
Realm VMs can talk to the hypervisor using the RSI_HOST_CALL SMC. The
RMM forwards this to the host and KVM handles them as regular
hypercalls.
Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Signed-off-by: Steven Price <steven.price@arm.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
---
Changes since v7:
* Avoid turning a negative return from kvm_smccc_call_handler() into a
error response to the guest. Instead propogate the error back to user
space.
Changes since v4:
* Setting GPRS is now done by kvm_rec_enter() rather than
rec_exit_host_call() (see previous patch - arm64: RME: Handle realm
enter/exit). This fixes a bug where the registers set by user space
were being ignored.
---
arch/arm64/kvm/rmi-exit.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/arch/arm64/kvm/rmi-exit.c b/arch/arm64/kvm/rmi-exit.c
index 8ec0d179eba2..e5647aa004d3 100644
--- a/arch/arm64/kvm/rmi-exit.c
+++ b/arch/arm64/kvm/rmi-exit.c
@@ -116,6 +116,19 @@ static int rec_exit_ripas_change(struct kvm_vcpu *vcpu)
return -EFAULT;
}
+static int rec_exit_host_call(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct realm_rec *rec = &vcpu->arch.rec;
+
+ vcpu->stat.hvc_exit_stat++;
+
+ for (i = 0; i < REC_RUN_GPRS; i++)
+ vcpu_set_reg(vcpu, i, rec->run->exit.gprs[i]);
+
+ return kvm_smccc_call_handler(vcpu);
+}
+
static void update_arch_timer_irq_lines(struct kvm_vcpu *vcpu)
{
struct realm_rec *rec = &vcpu->arch.rec;
@@ -191,6 +204,8 @@ int handle_rec_exit(struct kvm_vcpu *vcpu, int rec_run_ret)
return rec_exit_psci(vcpu);
case RMI_EXIT_RIPAS_CHANGE:
return rec_exit_ripas_change(vcpu);
+ case RMI_EXIT_HOST_CALL:
+ return rec_exit_host_call(vcpu);
}
kvm_pr_unimpl("Unsupported exit reason: %u\n",
--
2.43.0
^ permalink raw reply related
* [PATCH v14 36/44] arm64: RMI: Allow checking SVE on VM instance
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
To: kvm, kvmarm
Cc: Suzuki K Poulose, Catalin Marinas, Marc Zyngier, Will Deacon,
James Morse, Oliver Upton, Zenghui Yu, linux-arm-kernel,
linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Gavin Shan,
Shanker Donthineni, Alper Gun, Aneesh Kumar K . V, Emi Kisanuki,
Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2, Steven Price
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>
From: Suzuki K Poulose <suzuki.poulose@arm.com>
Given we have different types of VMs supported, check the
support for SVE for the given instance of the VM to accurately
report the status.
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Steven Price <steven.price@arm.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Joey Gouly <joey.gouly@arm.com>
---
Changes since v10:
* RME->RMI renaming.
* Adapt to move CAP check to kvm_realm_ext_allowed().
---
arch/arm64/include/asm/kvm_rmi.h | 2 ++
arch/arm64/kvm/arm.c | 2 ++
arch/arm64/kvm/rmi.c | 5 +++++
3 files changed, 9 insertions(+)
diff --git a/arch/arm64/include/asm/kvm_rmi.h b/arch/arm64/include/asm/kvm_rmi.h
index eacf82a7467d..d641748b5306 100644
--- a/arch/arm64/include/asm/kvm_rmi.h
+++ b/arch/arm64/include/asm/kvm_rmi.h
@@ -79,6 +79,8 @@ struct realm_rec {
void kvm_init_rmi(void);
u32 kvm_realm_ipa_limit(void);
+bool kvm_rmi_supports_sve(void);
+
int kvm_init_realm(struct kvm *kvm);
int kvm_activate_realm(struct kvm *kvm);
void kvm_destroy_realm(struct kvm *kvm);
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 495082e601a9..aacbeb524b6a 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -148,6 +148,8 @@ static bool kvm_realm_ext_allowed(long ext)
case KVM_CAP_ARM_PTRAUTH_GENERIC:
case KVM_CAP_ARM_RMI:
return true;
+ case KVM_CAP_ARM_SVE:
+ return kvm_rmi_supports_sve();
}
return false;
}
diff --git a/arch/arm64/kvm/rmi.c b/arch/arm64/kvm/rmi.c
index 2b03e962ee41..678d775aa1c7 100644
--- a/arch/arm64/kvm/rmi.c
+++ b/arch/arm64/kvm/rmi.c
@@ -25,6 +25,11 @@ static bool rmi_has_feature(unsigned long feature)
return !!u64_get_bits(rmm_feat_reg0, feature);
}
+bool kvm_rmi_supports_sve(void)
+{
+ return rmi_has_feature(RMI_FEATURE_REGISTER_0_SVE);
+}
+
u32 kvm_realm_ipa_limit(void)
{
return u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_S2SZ);
--
2.43.0
^ permalink raw reply related
* [PATCH v14 37/44] arm64: RMI: Prevent Device mappings for Realms
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
To: kvm, kvmarm
Cc: Steven Price, Catalin Marinas, Marc Zyngier, Will Deacon,
James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>
Physical device assignment is not yet supported. RMM v2.0 does add the
relevant APIs, but device assignment is a big topic so will be handled
in a future patch series. For now prevent device mappings when the guest
is a realm.
Signed-off-by: Steven Price <steven.price@arm.com>
---
Changes from v6:
* Fix the check in user_mem_abort() to prevent all pages that are not
guest_memfd() from being mapped into the protected half of the IPA.
Changes from v5:
* Also prevent accesses in user_mem_abort()
---
arch/arm64/kvm/mmu.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 776ffe56d17e..7678226ffd38 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1230,6 +1230,10 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
if (is_protected_kvm_enabled())
return -EPERM;
+ /* We don't support mapping special pages into a Realm */
+ if (kvm_is_realm(kvm))
+ return -EPERM;
+
size += offset_in_page(guest_ipa);
guest_ipa &= PAGE_MASK;
--
2.43.0
^ permalink raw reply related
* [PATCH v14 38/44] arm64: RMI: Propagate number of breakpoints and watchpoints to userspace
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
To: kvm, kvmarm
Cc: Jean-Philippe Brucker, Catalin Marinas, Marc Zyngier, Will Deacon,
James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2,
Steven Price
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
The RMM describes the maximum number of BPs/WPs available to the guest
in the Feature Register 0. Propagate those numbers into ID_AA64DFR0_EL1,
which is visible to userspace. A VMM needs this information in order to
set up realm parameters.
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Steven Price <steven.price@arm.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Joey Gouly <joey.gouly@arm.com>
---
arch/arm64/include/asm/kvm_rmi.h | 2 ++
arch/arm64/kvm/rmi.c | 19 +++++++++++++++++++
arch/arm64/kvm/sys_regs.c | 3 +++
3 files changed, 24 insertions(+)
diff --git a/arch/arm64/include/asm/kvm_rmi.h b/arch/arm64/include/asm/kvm_rmi.h
index d641748b5306..568b0169ab46 100644
--- a/arch/arm64/include/asm/kvm_rmi.h
+++ b/arch/arm64/include/asm/kvm_rmi.h
@@ -79,6 +79,8 @@ struct realm_rec {
void kvm_init_rmi(void);
u32 kvm_realm_ipa_limit(void);
+u64 kvm_realm_reset_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val);
+
bool kvm_rmi_supports_sve(void);
int kvm_init_realm(struct kvm *kvm);
diff --git a/arch/arm64/kvm/rmi.c b/arch/arm64/kvm/rmi.c
index 678d775aa1c7..64e8e50f86d6 100644
--- a/arch/arm64/kvm/rmi.c
+++ b/arch/arm64/kvm/rmi.c
@@ -35,6 +35,25 @@ u32 kvm_realm_ipa_limit(void)
return u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_S2SZ);
}
+u64 kvm_realm_reset_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
+{
+ u32 bps = u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_NUM_BPS);
+ u32 wps = u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_NUM_WPS);
+ u32 ctx_cmps;
+
+ /* Ensure CTX_CMPs is still valid */
+ ctx_cmps = FIELD_GET(ID_AA64DFR0_EL1_CTX_CMPs, val);
+ ctx_cmps = min(bps, ctx_cmps);
+
+ val &= ~(ID_AA64DFR0_EL1_BRPs_MASK | ID_AA64DFR0_EL1_WRPs_MASK |
+ ID_AA64DFR0_EL1_CTX_CMPs);
+ val |= FIELD_PREP(ID_AA64DFR0_EL1_BRPs_MASK, bps) |
+ FIELD_PREP(ID_AA64DFR0_EL1_WRPs_MASK, wps) |
+ FIELD_PREP(ID_AA64DFR0_EL1_CTX_CMPs, ctx_cmps);
+
+ return val;
+}
+
static int get_start_level(struct realm *realm)
{
return 4 - stage2_pgtable_levels(realm->ia_bits);
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 148fc3400ea8..10d191f83bb0 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -2145,6 +2145,9 @@ static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
/* Hide BRBE from guests */
val &= ~ID_AA64DFR0_EL1_BRBE_MASK;
+ if (vcpu_is_rec(vcpu))
+ return kvm_realm_reset_id_aa64dfr0_el1(vcpu, val);
+
return val;
}
--
2.43.0
^ permalink raw reply related
* [PATCH v14 39/44] arm64: RMI: Set breakpoint parameters through SET_ONE_REG
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
To: kvm, kvmarm
Cc: Jean-Philippe Brucker, Catalin Marinas, Marc Zyngier, Will Deacon,
James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2,
Steven Price
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Allow userspace to configure the number of breakpoints and watchpoints
of a Realm VM through KVM_SET_ONE_REG ID_AA64DFR0_EL1.
The KVM sys_reg handler checks the user value against the maximum value
given by RMM (arm64_check_features() gets it from the
read_sanitised_id_aa64dfr0_el1() reset handler).
Userspace discovers that it can write these fields by issuing a
KVM_ARM_GET_REG_WRITABLE_MASKS ioctl.
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Steven Price <steven.price@arm.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
---
arch/arm64/kvm/guest.c | 7 +++++++
arch/arm64/kvm/rmi.c | 3 +++
arch/arm64/kvm/sys_regs.c | 17 +++++++++++------
3 files changed, 21 insertions(+), 6 deletions(-)
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 447674373426..fd7233e00215 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -735,6 +735,8 @@ int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
return kvm_arm_sys_reg_get_reg(vcpu, reg);
}
+#define KVM_REG_ARM_ID_AA64DFR0_EL1 ARM64_SYS_REG(3, 0, 0, 5, 0)
+
/*
* The RMI ABI only enables setting some GPRs and PC. The selection of GPRs
* that are available depends on the Realm state and the reason for the last
@@ -749,6 +751,11 @@ static bool validate_realm_set_reg(struct kvm_vcpu *vcpu,
u64 off = core_reg_offset_from_id(reg->id);
return kvm_realm_validate_core_reg(off);
+ } else {
+ switch (reg->id) {
+ case KVM_REG_ARM_ID_AA64DFR0_EL1:
+ return true;
+ }
}
return false;
diff --git a/arch/arm64/kvm/rmi.c b/arch/arm64/kvm/rmi.c
index 64e8e50f86d6..251de0a3425c 100644
--- a/arch/arm64/kvm/rmi.c
+++ b/arch/arm64/kvm/rmi.c
@@ -469,6 +469,7 @@ static int realm_create_rd(struct kvm *kvm)
void *rd = NULL;
phys_addr_t rd_phys, params_phys;
size_t pgd_size = kvm_pgtable_stage2_pgd_size(kvm->arch.mmu.vtcr);
+ u64 dfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1);
int r;
realm->ia_bits = VTCR_EL2_IPA(kvm->arch.mmu.vtcr);
@@ -495,6 +496,8 @@ static int realm_create_rd(struct kvm *kvm)
params->rtt_level_start = get_start_level(realm);
params->rtt_num_start = pgd_size / PAGE_SIZE;
params->rtt_base = kvm->arch.mmu.pgd_phys;
+ params->num_bps = SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr0);
+ params->num_wps = SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr0);
if (kvm->arch.arm_pmu) {
params->pmu_num_ctrs = kvm->arch.nr_pmu_counters;
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 10d191f83bb0..607396f378dc 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -2177,6 +2177,9 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu,
{
u8 debugver = SYS_FIELD_GET(ID_AA64DFR0_EL1, DebugVer, val);
u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val);
+ u8 bps = SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, val);
+ u8 wps = SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, val);
+ u8 ctx_cmps = SYS_FIELD_GET(ID_AA64DFR0_EL1, CTX_CMPs, val);
/*
* Prior to commit 3d0dba5764b9 ("KVM: arm64: PMU: Move the
@@ -2196,10 +2199,11 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu,
val &= ~ID_AA64DFR0_EL1_PMUVer_MASK;
/*
- * ID_AA64DFR0_EL1.DebugVer is one of those awkward fields with a
- * nonzero minimum safe value.
+ * ID_AA64DFR0_EL1.DebugVer, BRPs and WRPs all have to be greater than
+ * zero. CTX_CMPs is never greater than BRPs.
*/
- if (debugver < ID_AA64DFR0_EL1_DebugVer_IMP)
+ if (debugver < ID_AA64DFR0_EL1_DebugVer_IMP || !bps || !wps ||
+ ctx_cmps > bps)
return -EINVAL;
if (ignore_feat_doublelock(vcpu, val)) {
@@ -2432,10 +2436,11 @@ static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
mutex_lock(&vcpu->kvm->arch.config_lock);
/*
- * Once the VM has started the ID registers are immutable. Reject any
- * write that does not match the final register value.
+ * Once the VM has started or the Realm descriptor is created, the ID
+ * registers are immutable. Reject any write that does not match the
+ * final register value.
*/
- if (kvm_vm_has_ran_once(vcpu->kvm)) {
+ if (kvm_vm_has_ran_once(vcpu->kvm) || kvm_realm_is_created(vcpu->kvm)) {
if (val != read_id_reg(vcpu, rd))
ret = -EBUSY;
else
--
2.43.0
^ permalink raw reply related
* [PATCH v14 40/44] arm64: RMI: Propagate max SVE vector length from RMM
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
To: kvm, kvmarm
Cc: Jean-Philippe Brucker, Catalin Marinas, Marc Zyngier, Will Deacon,
James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2,
Steven Price
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
RMM provides the maximum vector length it supports for a guest in its
feature register. Make it visible to the rest of KVM and to userspace
via KVM_REG_ARM64_SVE_VLS.
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Steven Price <steven.price@arm.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
---
arch/arm64/include/asm/kvm_host.h | 2 +-
arch/arm64/include/asm/kvm_rmi.h | 1 +
arch/arm64/kvm/guest.c | 2 +-
arch/arm64/kvm/reset.c | 12 ++++++++++--
arch/arm64/kvm/rmi.c | 6 ++++++
5 files changed, 19 insertions(+), 4 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 11e7b629c950..94e83da160cc 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -78,9 +78,9 @@ enum kvm_mode kvm_get_mode(void);
static inline enum kvm_mode kvm_get_mode(void) { return KVM_MODE_NONE; };
#endif
-extern unsigned int __ro_after_init kvm_sve_max_vl;
extern unsigned int __ro_after_init kvm_host_sve_max_vl;
int __init kvm_arm_init_sve(void);
+unsigned int kvm_sve_get_max_vl(struct kvm *kvm);
u32 __attribute_const__ kvm_target_cpu(void);
void kvm_reset_vcpu(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/asm/kvm_rmi.h b/arch/arm64/include/asm/kvm_rmi.h
index 568b0169ab46..de56330e08c6 100644
--- a/arch/arm64/include/asm/kvm_rmi.h
+++ b/arch/arm64/include/asm/kvm_rmi.h
@@ -78,6 +78,7 @@ struct realm_rec {
void kvm_init_rmi(void);
u32 kvm_realm_ipa_limit(void);
+unsigned int kvm_realm_sve_max_vl(void);
u64 kvm_realm_reset_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val);
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index fd7233e00215..a92bd07ef53a 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -375,7 +375,7 @@ static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
if (vq_present(vqs, vq))
max_vq = vq;
- if (max_vq > sve_vq_from_vl(kvm_sve_max_vl))
+ if (max_vq > sve_vq_from_vl(kvm_sve_get_max_vl(vcpu->kvm)))
return -EINVAL;
/*
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index c18cdca7d125..7b8681a602d4 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -46,7 +46,7 @@ unsigned int __ro_after_init kvm_host_sve_max_vl;
#define VCPU_RESET_PSTATE_SVC (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \
PSR_AA32_I_BIT | PSR_AA32_F_BIT)
-unsigned int __ro_after_init kvm_sve_max_vl;
+static unsigned int __ro_after_init kvm_sve_max_vl;
int __init kvm_arm_init_sve(void)
{
@@ -76,9 +76,17 @@ int __init kvm_arm_init_sve(void)
return 0;
}
+unsigned int kvm_sve_get_max_vl(struct kvm *kvm)
+{
+ if (kvm_is_realm(kvm))
+ return kvm_realm_sve_max_vl();
+ else
+ return kvm_sve_max_vl;
+}
+
static void kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu)
{
- vcpu->arch.sve_max_vl = kvm_sve_max_vl;
+ vcpu->arch.sve_max_vl = kvm_sve_get_max_vl(vcpu->kvm);
/*
* Userspace can still customize the vector lengths by writing
diff --git a/arch/arm64/kvm/rmi.c b/arch/arm64/kvm/rmi.c
index 251de0a3425c..35ad65efa5db 100644
--- a/arch/arm64/kvm/rmi.c
+++ b/arch/arm64/kvm/rmi.c
@@ -35,6 +35,12 @@ u32 kvm_realm_ipa_limit(void)
return u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_S2SZ);
}
+unsigned int kvm_realm_sve_max_vl(void)
+{
+ return sve_vl_from_vq(u64_get_bits(rmm_feat_reg0,
+ RMI_FEATURE_REGISTER_0_SVE_VL) + 1);
+}
+
u64 kvm_realm_reset_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
{
u32 bps = u64_get_bits(rmm_feat_reg0, RMI_FEATURE_REGISTER_0_NUM_BPS);
--
2.43.0
^ permalink raw reply related
* [PATCH v14 41/44] arm64: RMI: Configure max SVE vector length for a Realm
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
To: kvm, kvmarm
Cc: Jean-Philippe Brucker, Catalin Marinas, Marc Zyngier, Will Deacon,
James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2,
Steven Price
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Obtain the max vector length configured by userspace on the vCPUs, and
write it into the Realm parameters. By default the vCPU is configured
with the max vector length reported by RMM, and userspace can reduce it
with a write to KVM_REG_ARM64_SVE_VLS.
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Steven Price <steven.price@arm.com>
---
Changes since v6:
* Rename max_vl/realm_max_vl to vl/last_vl - there is nothing "maximum"
about them, we're just checking that all realms have the same vector
length
---
arch/arm64/kvm/guest.c | 3 ++-
arch/arm64/kvm/rmi.c | 37 +++++++++++++++++++++++++++++++++++++
2 files changed, 39 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index a92bd07ef53a..5f451ee18649 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -361,7 +361,7 @@ static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
if (!vcpu_has_sve(vcpu))
return -ENOENT;
- if (kvm_arm_vcpu_sve_finalized(vcpu))
+ if (kvm_arm_vcpu_sve_finalized(vcpu) || kvm_realm_is_created(vcpu->kvm))
return -EPERM; /* too late! */
if (WARN_ON(vcpu->arch.sve_state))
@@ -754,6 +754,7 @@ static bool validate_realm_set_reg(struct kvm_vcpu *vcpu,
} else {
switch (reg->id) {
case KVM_REG_ARM_ID_AA64DFR0_EL1:
+ case KVM_REG_ARM64_SVE_VLS:
return true;
}
}
diff --git a/arch/arm64/kvm/rmi.c b/arch/arm64/kvm/rmi.c
index 35ad65efa5db..732cecb11355 100644
--- a/arch/arm64/kvm/rmi.c
+++ b/arch/arm64/kvm/rmi.c
@@ -468,6 +468,39 @@ static void realm_unmap_shared_range(struct kvm *kvm,
start, end);
}
+static int realm_init_sve_param(struct kvm *kvm, struct realm_params *params)
+{
+ unsigned long i;
+ struct kvm_vcpu *vcpu;
+ int vl, last_vl = -1;
+
+ if (!kvm_has_sve(kvm))
+ return 0;
+
+ /*
+ * Get the preferred SVE configuration, set by userspace with the
+ * KVM_ARM_VCPU_SVE feature and KVM_REG_ARM64_SVE_VLS pseudo-register.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_arm_vcpu_sve_finalized(vcpu))
+ return -EINVAL;
+
+ vl = vcpu->arch.sve_max_vl;
+
+ /* We need all vCPUs to have the same SVE config */
+ if (last_vl >= 0 && last_vl != vl)
+ return -EINVAL;
+
+ last_vl = vl;
+ }
+
+ if (last_vl > 0) {
+ params->sve_vl = sve_vq_from_vl(last_vl) - 1;
+ params->flags |= RMI_REALM_PARAM_FLAG_SVE;
+ }
+ return 0;
+}
+
static int realm_create_rd(struct kvm *kvm)
{
struct realm *realm = &kvm->arch.realm;
@@ -513,6 +546,10 @@ static int realm_create_rd(struct kvm *kvm)
if (kvm_lpa2_is_enabled())
params->flags |= RMI_REALM_PARAM_FLAG_LPA2;
+ r = realm_init_sve_param(kvm, params);
+ if (r)
+ goto out_undelegate_tables;
+
params_phys = virt_to_phys(params);
if (rmi_realm_create(rd_phys, params_phys)) {
--
2.43.0
^ permalink raw reply related
* [PATCH v14 42/44] arm64: RMI: Provide register list for unfinalized RMI RECs
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
To: kvm, kvmarm
Cc: Jean-Philippe Brucker, Catalin Marinas, Marc Zyngier, Will Deacon,
James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2,
Steven Price
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
KVM_GET_REG_LIST should not be called before SVE is finalized. The ioctl
handler currently returns -EPERM in this case. But because it uses
kvm_arm_vcpu_is_finalized(), it now also rejects the call for
unfinalized REC even though finalizing the REC can only be done late,
after Realm descriptor creation.
Move the check to copy_sve_reg_indices(). One adverse side effect of
this change is that a KVM_GET_REG_LIST call that only probes for the
array size will now succeed even if SVE is not finalized, but that seems
harmless since the following KVM_GET_REG_LIST with the full array will
fail.
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Steven Price <steven.price@arm.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
---
arch/arm64/kvm/arm.c | 4 ----
arch/arm64/kvm/guest.c | 10 +++++-----
2 files changed, 5 insertions(+), 9 deletions(-)
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index aacbeb524b6a..902ca4cf4fa5 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1944,10 +1944,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (unlikely(!kvm_vcpu_initialized(vcpu)))
break;
- r = -EPERM;
- if (!kvm_arm_vcpu_is_finalized(vcpu))
- break;
-
r = -EFAULT;
if (copy_from_user(®_list, user_list, sizeof(reg_list)))
break;
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 5f451ee18649..a55618cd7a27 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -617,8 +617,8 @@ static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu)
if (!vcpu_has_sve(vcpu))
return 0;
- /* Policed by KVM_GET_REG_LIST: */
- WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu));
+ if (!kvm_arm_vcpu_sve_finalized(vcpu))
+ return 1; /* KVM_REG_ARM64_SVE_VLS */
return slices * (SVE_NUM_PREGS + SVE_NUM_ZREGS + 1 /* FFR */)
+ 1; /* KVM_REG_ARM64_SVE_VLS */
@@ -635,9 +635,6 @@ static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu,
if (!vcpu_has_sve(vcpu))
return 0;
- /* Policed by KVM_GET_REG_LIST: */
- WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu));
-
/*
* Enumerate this first, so that userspace can save/restore in
* the order reported by KVM_GET_REG_LIST:
@@ -647,6 +644,9 @@ static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu,
return -EFAULT;
++num_regs;
+ if (!kvm_arm_vcpu_sve_finalized(vcpu))
+ return num_regs;
+
for (i = 0; i < slices; i++) {
for (n = 0; n < SVE_NUM_ZREGS; n++) {
reg = KVM_REG_ARM64_SVE_ZREG(n, i);
--
2.43.0
^ permalink raw reply related
* [PATCH v14 43/44] arm64: RMI: Provide accurate register list
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
To: kvm, kvmarm
Cc: Jean-Philippe Brucker, Catalin Marinas, Marc Zyngier, Will Deacon,
James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2,
Steven Price
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Userspace can set a few registers with KVM_SET_ONE_REG (9 GP registers
at runtime, and 3 system registers during initialization). Update the
register list returned by KVM_GET_REG_LIST.
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Steven Price <steven.price@arm.com>
---
Changes since v11:
* Reworked due to upstream changes.
Changes since v8:
* Minor type changes following review.
Changes since v7:
* Reworked on upstream changes.
---
arch/arm64/kvm/guest.c | 6 ++++++
arch/arm64/kvm/hypercalls.c | 4 ++--
arch/arm64/kvm/sys_regs.c | 27 +++++++++++++++++++++------
3 files changed, 29 insertions(+), 8 deletions(-)
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index a55618cd7a27..4f34eed9dbbb 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -620,6 +620,9 @@ static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu)
if (!kvm_arm_vcpu_sve_finalized(vcpu))
return 1; /* KVM_REG_ARM64_SVE_VLS */
+ if (kvm_is_realm(vcpu->kvm))
+ return 1; /* KVM_REG_ARM64_SVE_VLS */
+
return slices * (SVE_NUM_PREGS + SVE_NUM_ZREGS + 1 /* FFR */)
+ 1; /* KVM_REG_ARM64_SVE_VLS */
}
@@ -647,6 +650,9 @@ static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu,
if (!kvm_arm_vcpu_sve_finalized(vcpu))
return num_regs;
+ if (kvm_is_realm(vcpu->kvm))
+ return num_regs;
+
for (i = 0; i < slices; i++) {
for (n = 0; n < SVE_NUM_ZREGS; n++) {
reg = KVM_REG_ARM64_SVE_ZREG(n, i);
diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c
index 58c5fe7d7572..70ac7971416c 100644
--- a/arch/arm64/kvm/hypercalls.c
+++ b/arch/arm64/kvm/hypercalls.c
@@ -414,14 +414,14 @@ void kvm_arm_teardown_hypercalls(struct kvm *kvm)
int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu)
{
- return ARRAY_SIZE(kvm_arm_fw_reg_ids);
+ return kvm_is_realm(vcpu->kvm) ? 0 : ARRAY_SIZE(kvm_arm_fw_reg_ids);
}
int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
{
int i;
- for (i = 0; i < ARRAY_SIZE(kvm_arm_fw_reg_ids); i++) {
+ for (i = 0; i < kvm_arm_get_fw_num_regs(vcpu); i++) {
if (put_user(kvm_arm_fw_reg_ids[i], uindices++))
return -EFAULT;
}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 607396f378dc..2887f90b3b4e 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -5547,18 +5547,18 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
}
-static unsigned int num_demux_regs(void)
+static inline unsigned int num_demux_regs(struct kvm_vcpu *vcpu)
{
- return CSSELR_MAX;
+ return kvm_is_realm(vcpu->kvm) ? 0 : CSSELR_MAX;
}
-static int write_demux_regids(u64 __user *uindices)
+static int write_demux_regids(struct kvm_vcpu *vcpu, u64 __user *uindices)
{
u64 val = KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX;
unsigned int i;
val |= KVM_REG_ARM_DEMUX_ID_CCSIDR;
- for (i = 0; i < CSSELR_MAX; i++) {
+ for (i = 0; i < num_demux_regs(vcpu); i++) {
if (put_user(val | i, uindices))
return -EFAULT;
uindices++;
@@ -5602,11 +5602,26 @@ static bool copy_reg_to_user(const struct sys_reg_desc *reg, u64 __user **uind)
return true;
}
+static inline bool kvm_realm_sys_reg_hidden_user(const struct kvm_vcpu *vcpu,
+ u64 reg)
+{
+ switch (reg) {
+ case SYS_ID_AA64DFR0_EL1:
+ case SYS_PMCR_EL0:
+ return false;
+ }
+ return true;
+}
+
static int walk_one_sys_reg(const struct kvm_vcpu *vcpu,
const struct sys_reg_desc *rd,
u64 __user **uind,
unsigned int *total)
{
+ if (vcpu_is_rec(vcpu) &&
+ kvm_realm_sys_reg_hidden_user(vcpu, reg_to_encoding(rd)))
+ return 0;
+
/*
* Ignore registers we trap but don't save,
* and for which no custom user accessor is provided.
@@ -5644,7 +5659,7 @@ static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind)
unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu)
{
- return num_demux_regs()
+ return num_demux_regs(vcpu)
+ walk_sys_regs(vcpu, (u64 __user *)NULL);
}
@@ -5657,7 +5672,7 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
return err;
uindices += err;
- return write_demux_regids(uindices);
+ return write_demux_regids(vcpu, uindices);
}
#define KVM_ARM_FEATURE_ID_RANGE_INDEX(r) \
--
2.43.0
^ permalink raw reply related
* [PATCH v14 44/44] arm64: RMI: Enable realms to be created
From: Steven Price @ 2026-05-13 13:17 UTC (permalink / raw)
To: kvm, kvmarm
Cc: Steven Price, Catalin Marinas, Marc Zyngier, Will Deacon,
James Morse, Oliver Upton, Suzuki K Poulose, Zenghui Yu,
linux-arm-kernel, linux-kernel, Joey Gouly, Alexandru Elisei,
Christoffer Dall, Fuad Tabba, linux-coco, Ganapatrao Kulkarni,
Gavin Shan, Shanker Donthineni, Alper Gun, Aneesh Kumar K . V,
Emi Kisanuki, Vishal Annapurve, WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-1-steven.price@arm.com>
All the pieces are now in place, so enable kvm_rmi_is_available when the
RMM is detected.
Signed-off-by: Steven Price <steven.price@arm.com>
---
arch/arm64/kvm/rmi.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/kvm/rmi.c b/arch/arm64/kvm/rmi.c
index 732cecb11355..67c1d1526b07 100644
--- a/arch/arm64/kvm/rmi.c
+++ b/arch/arm64/kvm/rmi.c
@@ -1396,5 +1396,6 @@ void kvm_init_rmi(void)
if (rmm_check_features())
return;
- /* Future patch will enable static branch kvm_rmi_is_available */
+ kvm_info("Realm guests supported");
+ static_branch_enable(&kvm_rmi_is_available);
}
--
2.43.0
^ permalink raw reply related
* Re: [PATCH v4 01/13] dma-direct: swiotlb: handle swiotlb alloc/free outside __dma_direct_alloc_pages
From: Mostafa Saleh @ 2026-05-13 13:57 UTC (permalink / raw)
To: Aneesh Kumar K.V (Arm)
Cc: iommu, linux-arm-kernel, linux-kernel, linux-coco, Robin Murphy,
Marek Szyprowski, Will Deacon, Marc Zyngier, Steven Price,
Suzuki K Poulose, Catalin Marinas, Jiri Pirko, Jason Gunthorpe,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86
In-Reply-To: <20260512090408.794195-2-aneesh.kumar@kernel.org>
On Tue, May 12, 2026 at 02:33:56PM +0530, Aneesh Kumar K.V (Arm) wrote:
> Move swiotlb allocation out of __dma_direct_alloc_pages() and handle it in
> dma_direct_alloc() / dma_direct_alloc_pages().
>
> This is needed for follow-up changes that simplify the handling of
> memory encryption/decryption based on the DMA attribute flags.
>
> swiotlb backing pages are already mapped decrypted by
> swiotlb_update_mem_attributes() and rmem_swiotlb_device_init(), so
> dma-direct should not call dma_set_decrypted() on allocation nor
> dma_set_encrypted() on free for swiotlb-backed memory.
>
> Update alloc/free paths to detect swiotlb-backed pages and skip
> encrypt/decrypt transitions for those paths. Keep the existing highmem
> rejection in dma_direct_alloc_pages() for swiotlb allocations.
>
> Only for "restricted-dma-pool", we currently set `for_alloc = true`, while
> rmem_swiotlb_device_init() decrypts the whole pool up front. This pool is
> typically used together with "shared-dma-pool", where the shared region is
> accessed after remap/ioremap and the returned address is suitable for
> decrypted memory access. So existing code paths remain valid.
>
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> ---
> kernel/dma/direct.c | 44 +++++++++++++++++++++++++++++++++++++-------
> 1 file changed, 37 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index ec887f443741..b958f150718a 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -125,9 +125,6 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
>
> WARN_ON_ONCE(!PAGE_ALIGNED(size));
>
> - if (is_swiotlb_for_alloc(dev))
> - return dma_direct_alloc_swiotlb(dev, size);
> -
> gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit);
> page = dma_alloc_contiguous(dev, size, gfp);
> if (page) {
> @@ -204,6 +201,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
> dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
> {
> bool remap = false, set_uncached = false;
> + bool mark_mem_decrypt = true;
> struct page *page;
> void *ret;
>
> @@ -250,11 +248,21 @@ void *dma_direct_alloc(struct device *dev, size_t size,
> dma_direct_use_pool(dev, gfp))
> return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
>
> + if (is_swiotlb_for_alloc(dev)) {
> + page = dma_direct_alloc_swiotlb(dev, size);
> + if (page) {
> + mark_mem_decrypt = false;
> + goto setup_page;
> + }
> + return NULL;
> + }
> +
> /* we always manually zero the memory once we are done */
> page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO, true);
> if (!page)
> return NULL;
>
> +setup_page:
> /*
> * dma_alloc_contiguous can return highmem pages depending on a
> * combination the cma= arguments and per-arch setup. These need to be
> @@ -281,7 +289,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
> goto out_free_pages;
> } else {
> ret = page_address(page);
> - if (dma_set_decrypted(dev, ret, size))
> + if (mark_mem_decrypt && dma_set_decrypted(dev, ret, size))
I am ok with that approach, but Jason was mentioning we shouldn’t
special case swiotlb and make the allocator return the memory state
(similar to the dma_page [1]) . I am also OK if you want to merge that
part of my series with is.
[1] https://lore.kernel.org/linux-iommu/20260408194750.2280873-1-smostafa@google.com/
> goto out_leak_pages;
> }
>
> @@ -298,7 +306,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
> return ret;
>
> out_encrypt_pages:
> - if (dma_set_encrypted(dev, page_address(page), size))
> + if (mark_mem_decrypt && dma_set_encrypted(dev, page_address(page), size))
> return NULL;
> out_free_pages:
> __dma_direct_free_pages(dev, page, size);
> @@ -310,6 +318,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
> void dma_direct_free(struct device *dev, size_t size,
> void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
> {
> + bool mark_mem_encrypted = true;
> unsigned int page_order = get_order(size);
>
> if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
> @@ -338,12 +347,15 @@ void dma_direct_free(struct device *dev, size_t size,
> dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
> return;
>
> + if (swiotlb_find_pool(dev, dma_to_phys(dev, dma_addr)))
> + mark_mem_encrypted = false;
> +
> if (is_vmalloc_addr(cpu_addr)) {
> vunmap(cpu_addr);
> } else {
> if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
> arch_dma_clear_uncached(cpu_addr, size);
> - if (dma_set_encrypted(dev, cpu_addr, size))
> + if (mark_mem_encrypted && dma_set_encrypted(dev, cpu_addr, size))
> return;
> }
>
> @@ -359,6 +371,19 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
> if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
> return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
>
> + if (is_swiotlb_for_alloc(dev)) {
> + page = dma_direct_alloc_swiotlb(dev, size);
> + if (!page)
> + return NULL;
> +
> + if (PageHighMem(page)) {
My understanding is that rmem_swiotlb_device_init() asserts that there
is no PageHighMem()? Also a similar check doesn’t exist in
dma_direct_alloc().
Thanks,
Mostafa
> + swiotlb_free(dev, page, size);
> + return NULL;
> + }
> + ret = page_address(page);
> + goto setup_page;
> + }
> +
> page = __dma_direct_alloc_pages(dev, size, gfp, false);
> if (!page)
> return NULL;
> @@ -366,6 +391,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
> ret = page_address(page);
> if (dma_set_decrypted(dev, ret, size))
> goto out_leak_pages;
> +setup_page:
> memset(ret, 0, size);
> *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
> return page;
> @@ -378,13 +404,17 @@ void dma_direct_free_pages(struct device *dev, size_t size,
> enum dma_data_direction dir)
> {
> void *vaddr = page_address(page);
> + bool mark_mem_encrypted = true;
>
> /* If cpu_addr is not from an atomic pool, dma_free_from_pool() fails */
> if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
> dma_free_from_pool(dev, vaddr, size))
> return;
>
> - if (dma_set_encrypted(dev, vaddr, size))
> + if (swiotlb_find_pool(dev, page_to_phys(page)))
> + mark_mem_encrypted = false;
> +
> + if (mark_mem_encrypted && dma_set_encrypted(dev, vaddr, size))
> return;
> __dma_direct_free_pages(dev, page, size);
> }
> --
> 2.43.0
>
^ permalink raw reply
* Re: [PATCH v4 02/13] dma-direct: use DMA_ATTR_CC_SHARED in alloc/free paths
From: Mostafa Saleh @ 2026-05-13 13:58 UTC (permalink / raw)
To: Aneesh Kumar K.V (Arm)
Cc: iommu, linux-arm-kernel, linux-kernel, linux-coco, Robin Murphy,
Marek Szyprowski, Will Deacon, Marc Zyngier, Steven Price,
Suzuki K Poulose, Catalin Marinas, Jiri Pirko, Jason Gunthorpe,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86
In-Reply-To: <20260512090408.794195-3-aneesh.kumar@kernel.org>
On Tue, May 12, 2026 at 02:33:57PM +0530, Aneesh Kumar K.V (Arm) wrote:
> Propagate force_dma_unencrypted() into DMA_ATTR_CC_SHARED in the
> dma-direct allocation path and use the attribute to drive the related
> decisions.
>
> This updates dma_direct_alloc(), dma_direct_free(), and
> dma_direct_alloc_pages() to fold the forced unencrypted case into attrs.
>
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> ---
> kernel/dma/direct.c | 44 ++++++++++++++++++++++++++++++++++++--------
> 1 file changed, 36 insertions(+), 8 deletions(-)
>
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index b958f150718a..0c2e1f8436ce 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -201,16 +201,31 @@ void *dma_direct_alloc(struct device *dev, size_t size,
> dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
> {
> bool remap = false, set_uncached = false;
> - bool mark_mem_decrypt = true;
> + bool mark_mem_decrypt = false;
> struct page *page;
> void *ret;
>
> + /*
> + * DMA_ATTR_CC_SHARED is not a caller-visible dma_alloc_*()
> + * attribute. The direct allocator uses it internally after it has
> + * decided that the backing pages must be shared/decrypted, so the
> + * rest of the allocation path can consistently select DMA addresses,
> + * choose compatible pools and restore encryption on free.
> + */
> + if (attrs & DMA_ATTR_CC_SHARED)
> + return NULL;
> +
> + if (force_dma_unencrypted(dev)) {
> + attrs |= DMA_ATTR_CC_SHARED;
> + mark_mem_decrypt = true;
> + }
> +
> size = PAGE_ALIGN(size);
> if (attrs & DMA_ATTR_NO_WARN)
> gfp |= __GFP_NOWARN;
>
> - if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
> - !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev))
> + if (((attrs & (DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_CC_SHARED)) ==
> + DMA_ATTR_NO_KERNEL_MAPPING) && !is_swiotlb_for_alloc(dev))
> return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp);
>
> if (!dev_is_dma_coherent(dev)) {
> @@ -244,7 +259,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
> * Remapping or decrypting memory may block, allocate the memory from
> * the atomic pools instead if we aren't allowed block.
> */
> - if ((remap || force_dma_unencrypted(dev)) &&
> + if ((remap || (attrs & DMA_ATTR_CC_SHARED)) &&
> dma_direct_use_pool(dev, gfp))
> return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
>
> @@ -318,11 +333,20 @@ void *dma_direct_alloc(struct device *dev, size_t size,
> void dma_direct_free(struct device *dev, size_t size,
> void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs)
> {
> - bool mark_mem_encrypted = true;
> + bool mark_mem_encrypted = false;
> unsigned int page_order = get_order(size);
>
> - if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
> - !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) {
> + /*
> + * if the device had requested for an unencrypted buffer,
> + * convert it to encrypted on free
> + */
> + if (force_dma_unencrypted(dev)) {
> + attrs |= DMA_ATTR_CC_SHARED;
> + mark_mem_encrypted = true;
> + }
> +
> + if (((attrs & (DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_CC_SHARED)) ==
> + DMA_ATTR_NO_KERNEL_MAPPING) && !is_swiotlb_for_alloc(dev)) {
> /* cpu_addr is a struct page cookie, not a kernel address */
> dma_free_contiguous(dev, cpu_addr, size);
> return;
> @@ -365,10 +389,14 @@ void dma_direct_free(struct device *dev, size_t size,
> struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
> dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp)
> {
> + unsigned long attrs = 0;
> struct page *page;
> void *ret;
>
> - if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
> + if (force_dma_unencrypted(dev))
> + attrs |= DMA_ATTR_CC_SHARED;
> +
> + if ((attrs & DMA_ATTR_CC_SHARED) && dma_direct_use_pool(dev, gfp))
> return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
What about dma_direct_free_pages()? Nothing inside uses attrs, but
that’s quite similar to dma_direct_alloc_pages()
Also, at this point, shouldn’t this patch also remove
force_dma_unencrypted() calls from dma_set_decrypted() and
dma_set_encrypted()?
Thanks,
Mostafa
>
> if (is_swiotlb_for_alloc(dev)) {
> --
> 2.43.0
>
^ permalink raw reply
* Re: [PATCH v4 03/13] dma-pool: track decrypted atomic pools and select them via attrs
From: Mostafa Saleh @ 2026-05-13 14:00 UTC (permalink / raw)
To: Aneesh Kumar K.V (Arm)
Cc: iommu, linux-arm-kernel, linux-kernel, linux-coco, Robin Murphy,
Marek Szyprowski, Will Deacon, Marc Zyngier, Steven Price,
Suzuki K Poulose, Catalin Marinas, Jiri Pirko, Jason Gunthorpe,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86
In-Reply-To: <20260512090408.794195-4-aneesh.kumar@kernel.org>
On Tue, May 12, 2026 at 02:33:58PM +0530, Aneesh Kumar K.V (Arm) wrote:
> Teach the atomic DMA pool code to distinguish between encrypted and
> decrypted pools, and make pool allocation select the matching pool based
> on DMA attributes.
>
> Introduce a dma_gen_pool wrapper that records whether a pool is
> decrypted, initialize that state when the atomic pools are created, and
> use it when expanding and resizing the pools. Update dma_alloc_from_pool()
> to take attrs and skip pools whose encrypted/decrypted state does not
> match DMA_ATTR_CC_SHARED. Update dma_free_from_pool() accordingly.
>
> Also pass DMA_ATTR_CC_SHARED from the swiotlb atomic allocation path
> so decrypted swiotlb allocations are taken from the correct atomic pool.
>
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> ---
> drivers/iommu/dma-iommu.c | 2 +-
> include/linux/dma-map-ops.h | 2 +-
> kernel/dma/direct.c | 11 ++-
> kernel/dma/pool.c | 163 +++++++++++++++++++++++-------------
> kernel/dma/swiotlb.c | 7 +-
> 5 files changed, 122 insertions(+), 63 deletions(-)
>
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index 54d96e847f16..c2595bee3d41 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -1673,7 +1673,7 @@ void *iommu_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
> if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
> !gfpflags_allow_blocking(gfp) && !coherent)
> page = dma_alloc_from_pool(dev, PAGE_ALIGN(size), &cpu_addr,
> - gfp, NULL);
> + gfp, attrs, NULL);
> else
> cpu_addr = iommu_dma_alloc_pages(dev, size, &page, gfp, attrs);
> if (!cpu_addr)
> diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
> index 6a1832a73cad..696b2c3a2305 100644
> --- a/include/linux/dma-map-ops.h
> +++ b/include/linux/dma-map-ops.h
> @@ -212,7 +212,7 @@ void *dma_common_pages_remap(struct page **pages, size_t size, pgprot_t prot,
> void dma_common_free_remap(void *cpu_addr, size_t size);
>
> struct page *dma_alloc_from_pool(struct device *dev, size_t size,
> - void **cpu_addr, gfp_t flags,
> + void **cpu_addr, gfp_t flags, unsigned long attrs,
> bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t));
> bool dma_free_from_pool(struct device *dev, void *start, size_t size);
>
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 0c2e1f8436ce..dc2907439b3d 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -162,7 +162,7 @@ static bool dma_direct_use_pool(struct device *dev, gfp_t gfp)
> }
>
> static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
> - dma_addr_t *dma_handle, gfp_t gfp)
> + dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
> {
> struct page *page;
> u64 phys_limit;
> @@ -172,7 +172,8 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
> return NULL;
>
> gfp |= dma_direct_optimal_gfp_mask(dev, &phys_limit);
> - page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok);
> + page = dma_alloc_from_pool(dev, size, &ret, gfp, attrs,
> + dma_coherent_ok);
> if (!page)
> return NULL;
> *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
> @@ -261,7 +262,8 @@ void *dma_direct_alloc(struct device *dev, size_t size,
> */
> if ((remap || (attrs & DMA_ATTR_CC_SHARED)) &&
> dma_direct_use_pool(dev, gfp))
> - return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
> + return dma_direct_alloc_from_pool(dev, size, dma_handle,
> + gfp, attrs);
>
> if (is_swiotlb_for_alloc(dev)) {
> page = dma_direct_alloc_swiotlb(dev, size);
> @@ -397,7 +399,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
> attrs |= DMA_ATTR_CC_SHARED;
>
> if ((attrs & DMA_ATTR_CC_SHARED) && dma_direct_use_pool(dev, gfp))
> - return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
> + return dma_direct_alloc_from_pool(dev, size, dma_handle,
> + gfp, attrs);
>
> if (is_swiotlb_for_alloc(dev)) {
> page = dma_direct_alloc_swiotlb(dev, size);
> diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
> index 2b2fbb709242..75f0eba48a23 100644
> --- a/kernel/dma/pool.c
> +++ b/kernel/dma/pool.c
> @@ -12,12 +12,18 @@
> #include <linux/set_memory.h>
> #include <linux/slab.h>
> #include <linux/workqueue.h>
> +#include <linux/cc_platform.h>
>
> -static struct gen_pool *atomic_pool_dma __ro_after_init;
> +struct dma_gen_pool {
> + bool unencrypted;
> + struct gen_pool *pool;
> +};
> +
> +static struct dma_gen_pool atomic_pool_dma __ro_after_init;
> static unsigned long pool_size_dma;
> -static struct gen_pool *atomic_pool_dma32 __ro_after_init;
> +static struct dma_gen_pool atomic_pool_dma32 __ro_after_init;
> static unsigned long pool_size_dma32;
> -static struct gen_pool *atomic_pool_kernel __ro_after_init;
> +static struct dma_gen_pool atomic_pool_kernel __ro_after_init;
> static unsigned long pool_size_kernel;
>
> /* Size can be defined by the coherent_pool command line */
> @@ -76,7 +82,7 @@ static bool cma_in_zone(gfp_t gfp)
> return true;
> }
>
> -static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
> +static int atomic_pool_expand(struct dma_gen_pool *dma_pool, size_t pool_size,
> gfp_t gfp)
> {
> unsigned int order;
> @@ -113,12 +119,15 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
> * Memory in the atomic DMA pools must be unencrypted, the pools do not
> * shrink so no re-encryption occurs in dma_direct_free().
> */
> - ret = set_memory_decrypted((unsigned long)page_to_virt(page),
> + if (dma_pool->unencrypted) {
> + ret = set_memory_decrypted((unsigned long)page_to_virt(page),
> 1 << order);
> - if (ret)
> - goto remove_mapping;
> - ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page),
> - pool_size, NUMA_NO_NODE);
> + if (ret)
> + goto remove_mapping;
> + }
> +
> + ret = gen_pool_add_virt(dma_pool->pool, (unsigned long)addr,
> + page_to_phys(page), pool_size, NUMA_NO_NODE);
> if (ret)
> goto encrypt_mapping;
>
> @@ -126,11 +135,15 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
> return 0;
>
> encrypt_mapping:
> - ret = set_memory_encrypted((unsigned long)page_to_virt(page),
> - 1 << order);
> - if (WARN_ON_ONCE(ret)) {
> - /* Decrypt succeeded but encrypt failed, purposely leak */
> - goto out;
> + if (dma_pool->unencrypted) {
> + int rc;
> +
> + rc = set_memory_encrypted((unsigned long)page_to_virt(page),
> + 1 << order);
> + if (WARN_ON_ONCE(rc)) {
> + /* Decrypt succeeded but encrypt failed, purposely leak */
> + goto out;
> + }
> }
> remove_mapping:
> #ifdef CONFIG_DMA_DIRECT_REMAP
> @@ -142,46 +155,52 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
> return ret;
> }
>
> -static void atomic_pool_resize(struct gen_pool *pool, gfp_t gfp)
> +static void atomic_pool_resize(struct dma_gen_pool *dma_pool, gfp_t gfp)
> {
> - if (pool && gen_pool_avail(pool) < atomic_pool_size)
> - atomic_pool_expand(pool, gen_pool_size(pool), gfp);
> + if (dma_pool->pool && gen_pool_avail(dma_pool->pool) < atomic_pool_size)
> + atomic_pool_expand(dma_pool, gen_pool_size(dma_pool->pool), gfp);
> }
>
> static void atomic_pool_work_fn(struct work_struct *work)
> {
> if (IS_ENABLED(CONFIG_ZONE_DMA))
> - atomic_pool_resize(atomic_pool_dma,
> + atomic_pool_resize(&atomic_pool_dma,
> GFP_KERNEL | GFP_DMA);
> if (IS_ENABLED(CONFIG_ZONE_DMA32))
> - atomic_pool_resize(atomic_pool_dma32,
> + atomic_pool_resize(&atomic_pool_dma32,
> GFP_KERNEL | GFP_DMA32);
> - atomic_pool_resize(atomic_pool_kernel, GFP_KERNEL);
> + atomic_pool_resize(&atomic_pool_kernel, GFP_KERNEL);
> }
>
> -static __init struct gen_pool *__dma_atomic_pool_init(size_t pool_size,
> - gfp_t gfp)
> +static __init struct dma_gen_pool *__dma_atomic_pool_init(struct dma_gen_pool *dma_pool,
> + size_t pool_size, gfp_t gfp)
> {
> - struct gen_pool *pool;
> int ret;
>
> - pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE);
> - if (!pool)
> + dma_pool->pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE);
> + if (!dma_pool->pool)
> return NULL;
>
> - gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL);
> + gen_pool_set_algo(dma_pool->pool, gen_pool_first_fit_order_align, NULL);
> +
> + /* if platform is using memory encryption atomic pools are by default decrypted. */
> + if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
> + dma_pool->unencrypted = true;
> + else
> + dma_pool->unencrypted = false;
I believe that’s a good start, although we might need to have more
fine grained policies in the future as CC guests might need
encrypted pools
Reviewed-by: Mostafa Saleh <smostafa@google.com>
Thanks,
Mostafa
>
> - ret = atomic_pool_expand(pool, pool_size, gfp);
> + ret = atomic_pool_expand(dma_pool, pool_size, gfp);
> if (ret) {
> - gen_pool_destroy(pool);
> + gen_pool_destroy(dma_pool->pool);
> + dma_pool->pool = NULL;
> pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n",
> pool_size >> 10, &gfp);
> return NULL;
> }
>
> pr_info("DMA: preallocated %zu KiB %pGg pool for atomic allocations\n",
> - gen_pool_size(pool) >> 10, &gfp);
> - return pool;
> + gen_pool_size(dma_pool->pool) >> 10, &gfp);
> + return dma_pool;
> }
>
> #ifdef CONFIG_ZONE_DMA32
> @@ -207,21 +226,22 @@ static int __init dma_atomic_pool_init(void)
>
> /* All memory might be in the DMA zone(s) to begin with */
> if (has_managed_zone(ZONE_NORMAL)) {
> - atomic_pool_kernel = __dma_atomic_pool_init(atomic_pool_size,
> - GFP_KERNEL);
> - if (!atomic_pool_kernel)
> + __dma_atomic_pool_init(&atomic_pool_kernel, atomic_pool_size, GFP_KERNEL);
> + if (!atomic_pool_kernel.pool)
> ret = -ENOMEM;
> }
> +
> if (has_managed_dma()) {
> - atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size,
> - GFP_KERNEL | GFP_DMA);
> - if (!atomic_pool_dma)
> + __dma_atomic_pool_init(&atomic_pool_dma, atomic_pool_size,
> + GFP_KERNEL | GFP_DMA);
> + if (!atomic_pool_dma.pool)
> ret = -ENOMEM;
> }
> +
> if (has_managed_dma32) {
> - atomic_pool_dma32 = __dma_atomic_pool_init(atomic_pool_size,
> - GFP_KERNEL | GFP_DMA32);
> - if (!atomic_pool_dma32)
> + __dma_atomic_pool_init(&atomic_pool_dma32, atomic_pool_size,
> + GFP_KERNEL | GFP_DMA32);
> + if (!atomic_pool_dma32.pool)
> ret = -ENOMEM;
> }
>
> @@ -230,19 +250,44 @@ static int __init dma_atomic_pool_init(void)
> }
> postcore_initcall(dma_atomic_pool_init);
>
> -static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
> +static inline struct dma_gen_pool *__dma_guess_pool(struct dma_gen_pool *first,
> + struct dma_gen_pool *second, struct dma_gen_pool *third)
> +{
> + if (first->pool)
> + return first;
> + if (second && second->pool)
> + return second;
> + if (third && third->pool)
> + return third;
> + return NULL;
> +}
> +
> +static inline struct dma_gen_pool *dma_guess_pool(struct dma_gen_pool *prev,
> + gfp_t gfp)
> {
> - if (prev == NULL) {
> + if (!prev) {
> if (gfp & GFP_DMA)
> - return atomic_pool_dma ?: atomic_pool_dma32 ?: atomic_pool_kernel;
> + return __dma_guess_pool(&atomic_pool_dma,
> + &atomic_pool_dma32,
> + &atomic_pool_kernel);
> +
> if (gfp & GFP_DMA32)
> - return atomic_pool_dma32 ?: atomic_pool_dma ?: atomic_pool_kernel;
> - return atomic_pool_kernel ?: atomic_pool_dma32 ?: atomic_pool_dma;
> + return __dma_guess_pool(&atomic_pool_dma32,
> + &atomic_pool_dma,
> + &atomic_pool_kernel);
> +
> + return __dma_guess_pool(&atomic_pool_kernel,
> + &atomic_pool_dma32,
> + &atomic_pool_dma);
> }
> - if (prev == atomic_pool_kernel)
> - return atomic_pool_dma32 ? atomic_pool_dma32 : atomic_pool_dma;
> - if (prev == atomic_pool_dma32)
> - return atomic_pool_dma;
> +
> + if (prev == &atomic_pool_kernel)
> + return __dma_guess_pool(&atomic_pool_dma32,
> + &atomic_pool_dma, NULL);
> +
> + if (prev == &atomic_pool_dma32)
> + return __dma_guess_pool(&atomic_pool_dma, NULL, NULL);
> +
> return NULL;
> }
>
> @@ -272,16 +317,20 @@ static struct page *__dma_alloc_from_pool(struct device *dev, size_t size,
> }
>
> struct page *dma_alloc_from_pool(struct device *dev, size_t size,
> - void **cpu_addr, gfp_t gfp,
> + void **cpu_addr, gfp_t gfp, unsigned long attrs,
> bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t))
> {
> - struct gen_pool *pool = NULL;
> + struct dma_gen_pool *dma_pool = NULL;
> struct page *page;
> bool pool_found = false;
>
> - while ((pool = dma_guess_pool(pool, gfp))) {
> + while ((dma_pool = dma_guess_pool(dma_pool, gfp))) {
> +
> + if (dma_pool->unencrypted != !!(attrs & DMA_ATTR_CC_SHARED))
> + continue;
> +
nit: If we fail to find a matching pool, a slightly misleading message
is printed as pool_found = false
> pool_found = true;
> - page = __dma_alloc_from_pool(dev, size, pool, cpu_addr,
> + page = __dma_alloc_from_pool(dev, size, dma_pool->pool, cpu_addr,
> phys_addr_ok);
> if (page)
> return page;
> @@ -296,12 +345,14 @@ struct page *dma_alloc_from_pool(struct device *dev, size_t size,
>
> bool dma_free_from_pool(struct device *dev, void *start, size_t size)
> {
> - struct gen_pool *pool = NULL;
> + struct dma_gen_pool *dma_pool = NULL;
> +
> + while ((dma_pool = dma_guess_pool(dma_pool, 0))) {
>
> - while ((pool = dma_guess_pool(pool, 0))) {
> - if (!gen_pool_has_addr(pool, (unsigned long)start, size))
> + if (!gen_pool_has_addr(dma_pool->pool, (unsigned long)start, size))
> continue;
> - gen_pool_free(pool, (unsigned long)start, size);
> +
> + gen_pool_free(dma_pool->pool, (unsigned long)start, size);
> return true;
> }
>
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index 1abd3e6146f4..ab4eccbaa076 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -612,6 +612,7 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
> u64 phys_limit, gfp_t gfp)
> {
> struct page *page;
> + unsigned long attrs = 0;
>
> /*
> * Allocate from the atomic pools if memory is encrypted and
> @@ -623,8 +624,12 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
> if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
> return NULL;
>
> + /* swiotlb considered decrypted by default */
> + if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
> + attrs = DMA_ATTR_CC_SHARED;
> +
> return dma_alloc_from_pool(dev, bytes, &vaddr, gfp,
> - dma_coherent_ok);
> + attrs, dma_coherent_ok);
> }
>
> gfp &= ~GFP_ZONEMASK;
> --
> 2.43.0
>
^ permalink raw reply
* Re: [PATCH v4 04/13] dma: swiotlb: track pool encryption state and honor DMA_ATTR_CC_SHARED
From: Mostafa Saleh @ 2026-05-13 14:27 UTC (permalink / raw)
To: Aneesh Kumar K.V (Arm)
Cc: iommu, linux-arm-kernel, linux-kernel, linux-coco, Robin Murphy,
Marek Szyprowski, Will Deacon, Marc Zyngier, Steven Price,
Suzuki K Poulose, Catalin Marinas, Jiri Pirko, Jason Gunthorpe,
Petr Tesarik, Alexey Kardashevskiy, Dan Williams, Xu Yilun,
linuxppc-dev, linux-s390, Madhavan Srinivasan, Michael Ellerman,
Nicholas Piggin, Christophe Leroy (CS GROUP), Alexander Gordeev,
Gerald Schaefer, Heiko Carstens, Vasily Gorbik,
Christian Borntraeger, Sven Schnelle, x86
In-Reply-To: <20260512090408.794195-5-aneesh.kumar@kernel.org>
On Tue, May 12, 2026 at 02:33:59PM +0530, Aneesh Kumar K.V (Arm) wrote:
> Teach swiotlb to distinguish between encrypted and decrypted bounce
> buffer pools, and make allocation and mapping paths select a pool whose
> state matches the requested DMA attributes.
>
> Add a decrypted flag to io_tlb_mem, initialize it for the default and
> restricted pools, and propagate DMA_ATTR_CC_SHARED into swiotlb pool
> allocation. Reject swiotlb alloc/map requests when the selected pool does
> not match the required encrypted/decrypted state.
>
> Also return DMA addresses with the matching phys_to_dma_{encrypted,
> unencrypted} helper so the DMA address encoding stays consistent with the
> chosen pool.
>
> Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
> ---
> include/linux/dma-direct.h | 10 ++++
> include/linux/swiotlb.h | 8 ++-
> kernel/dma/direct.c | 14 +++--
> kernel/dma/swiotlb.c | 108 +++++++++++++++++++++++++++----------
> 4 files changed, 107 insertions(+), 33 deletions(-)
>
> diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
> index c249912456f9..94fad4e7c11e 100644
> --- a/include/linux/dma-direct.h
> +++ b/include/linux/dma-direct.h
> @@ -77,6 +77,10 @@ static inline dma_addr_t dma_range_map_max(const struct bus_dma_region *map)
> #ifndef phys_to_dma_unencrypted
> #define phys_to_dma_unencrypted phys_to_dma
> #endif
> +
> +#ifndef phys_to_dma_encrypted
> +#define phys_to_dma_encrypted phys_to_dma
> +#endif
> #else
> static inline dma_addr_t __phys_to_dma(struct device *dev, phys_addr_t paddr)
> {
> @@ -90,6 +94,12 @@ static inline dma_addr_t phys_to_dma_unencrypted(struct device *dev,
> {
> return dma_addr_unencrypted(__phys_to_dma(dev, paddr));
> }
> +
> +static inline dma_addr_t phys_to_dma_encrypted(struct device *dev,
> + phys_addr_t paddr)
> +{
> + return dma_addr_encrypted(__phys_to_dma(dev, paddr));
> +}
> /*
> * If memory encryption is supported, phys_to_dma will set the memory encryption
> * bit in the DMA address, and dma_to_phys will clear it.
> diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
> index 3dae0f592063..b3fa3c6e0169 100644
> --- a/include/linux/swiotlb.h
> +++ b/include/linux/swiotlb.h
> @@ -81,6 +81,7 @@ struct io_tlb_pool {
> struct list_head node;
> struct rcu_head rcu;
> bool transient;
> + bool unencrypted;
> #endif
> };
>
> @@ -111,6 +112,7 @@ struct io_tlb_mem {
> struct dentry *debugfs;
> bool force_bounce;
> bool for_alloc;
> + bool unencrypted;
> #ifdef CONFIG_SWIOTLB_DYNAMIC
> bool can_grow;
> u64 phys_limit;
> @@ -282,7 +284,8 @@ static inline void swiotlb_sync_single_for_cpu(struct device *dev,
> extern void swiotlb_print_info(void);
>
> #ifdef CONFIG_DMA_RESTRICTED_POOL
> -struct page *swiotlb_alloc(struct device *dev, size_t size);
> +struct page *swiotlb_alloc(struct device *dev, size_t size,
> + unsigned long attrs);
> bool swiotlb_free(struct device *dev, struct page *page, size_t size);
>
> static inline bool is_swiotlb_for_alloc(struct device *dev)
> @@ -290,7 +293,8 @@ static inline bool is_swiotlb_for_alloc(struct device *dev)
> return dev->dma_io_tlb_mem->for_alloc;
> }
> #else
> -static inline struct page *swiotlb_alloc(struct device *dev, size_t size)
> +static inline struct page *swiotlb_alloc(struct device *dev, size_t size,
> + unsigned long attrs)
> {
> return NULL;
> }
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index dc2907439b3d..97ae4fa10521 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -104,9 +104,10 @@ static void __dma_direct_free_pages(struct device *dev, struct page *page,
> dma_free_contiguous(dev, page, size);
> }
>
> -static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size)
> +static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size,
> + unsigned long attrs)
> {
> - struct page *page = swiotlb_alloc(dev, size);
> + struct page *page = swiotlb_alloc(dev, size, attrs);
>
> if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
> swiotlb_free(dev, page, size);
> @@ -266,8 +267,12 @@ void *dma_direct_alloc(struct device *dev, size_t size,
> gfp, attrs);
>
> if (is_swiotlb_for_alloc(dev)) {
> - page = dma_direct_alloc_swiotlb(dev, size);
> + page = dma_direct_alloc_swiotlb(dev, size, attrs);
> if (page) {
> + /*
> + * swiotlb allocations comes from pool already marked
> + * decrypted
> + */
> mark_mem_decrypt = false;
> goto setup_page;
> }
> @@ -374,6 +379,7 @@ void dma_direct_free(struct device *dev, size_t size,
> return;
>
> if (swiotlb_find_pool(dev, dma_to_phys(dev, dma_addr)))
> + /* Swiotlb doesn't need a page attribute update on free */
> mark_mem_encrypted = false;
>
> if (is_vmalloc_addr(cpu_addr)) {
> @@ -403,7 +409,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
> gfp, attrs);
>
> if (is_swiotlb_for_alloc(dev)) {
> - page = dma_direct_alloc_swiotlb(dev, size);
> + page = dma_direct_alloc_swiotlb(dev, size, attrs);
> if (!page)
> return NULL;
>
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index ab4eccbaa076..065663be282c 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -259,10 +259,21 @@ void __init swiotlb_update_mem_attributes(void)
> struct io_tlb_pool *mem = &io_tlb_default_mem.defpool;
> unsigned long bytes;
>
> + /*
> + * if platform support memory encryption, swiotlb buffers are
> + * decrypted by default.
> + */
> + if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
> + io_tlb_default_mem.unencrypted = true;
> + else
> + io_tlb_default_mem.unencrypted = false;
> +
> if (!mem->nslabs || mem->late_alloc)
> return;
> bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
> - set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT);
> +
> + if (io_tlb_default_mem.unencrypted)
> + set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT);
> }
>
> static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start,
> @@ -505,8 +516,10 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
> if (!mem->slots)
> goto error_slots;
>
> - set_memory_decrypted((unsigned long)vstart,
> - (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
> + if (io_tlb_default_mem.unencrypted)
> + set_memory_decrypted((unsigned long)vstart,
> + (nslabs << IO_TLB_SHIFT) >> PAGE_SHIFT);
> +
> swiotlb_init_io_tlb_pool(mem, virt_to_phys(vstart), nslabs, true,
> nareas);
> add_mem_pool(&io_tlb_default_mem, mem);
> @@ -539,7 +552,9 @@ void __init swiotlb_exit(void)
> tbl_size = PAGE_ALIGN(mem->end - mem->start);
> slots_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), mem->nslabs));
>
> - set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
> + if (io_tlb_default_mem.unencrypted)
> + set_memory_encrypted(tbl_vaddr, tbl_size >> PAGE_SHIFT);
> +
> if (mem->late_alloc) {
> area_order = get_order(array_size(sizeof(*mem->areas),
> mem->nareas));
> @@ -563,6 +578,7 @@ void __init swiotlb_exit(void)
> * @gfp: GFP flags for the allocation.
> * @bytes: Size of the buffer.
> * @phys_limit: Maximum allowed physical address of the buffer.
> + * @unencrypted: true to allocate unencrypted memory, false for encrypted memory
> *
> * Allocate pages from the buddy allocator. If successful, make the allocated
> * pages decrypted that they can be used for DMA.
> @@ -570,7 +586,8 @@ void __init swiotlb_exit(void)
> * Return: Decrypted pages, %NULL on allocation failure, or ERR_PTR(-EAGAIN)
> * if the allocated physical address was above @phys_limit.
> */
> -static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit)
> +static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes,
> + u64 phys_limit, bool unencrypted)
> {
> unsigned int order = get_order(bytes);
> struct page *page;
> @@ -588,13 +605,13 @@ static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit)
> }
>
> vaddr = phys_to_virt(paddr);
> - if (set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes)))
> + if (unencrypted && set_memory_decrypted((unsigned long)vaddr, PFN_UP(bytes)))
> goto error;
> return page;
>
> error:
> /* Intentional leak if pages cannot be encrypted again. */
> - if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
> + if (unencrypted && !set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
> __free_pages(page, order);
> return NULL;
> }
> @@ -604,30 +621,26 @@ static struct page *alloc_dma_pages(gfp_t gfp, size_t bytes, u64 phys_limit)
> * @dev: Device for which a memory pool is allocated.
> * @bytes: Size of the buffer.
> * @phys_limit: Maximum allowed physical address of the buffer.
> + * @attrs: DMA attributes for the allocation.
> * @gfp: GFP flags for the allocation.
> *
> * Return: Allocated pages, or %NULL on allocation failure.
> */
> static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
> - u64 phys_limit, gfp_t gfp)
> + u64 phys_limit, unsigned long attrs, gfp_t gfp)
> {
> struct page *page;
> - unsigned long attrs = 0;
>
> /*
> * Allocate from the atomic pools if memory is encrypted and
> * the allocation is atomic, because decrypting may block.
> */
> - if (!gfpflags_allow_blocking(gfp) && dev && force_dma_unencrypted(dev)) {
> + if (!gfpflags_allow_blocking(gfp) && (attrs & DMA_ATTR_CC_SHARED)) {
> void *vaddr;
>
> if (!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))
> return NULL;
>
> - /* swiotlb considered decrypted by default */
> - if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
> - attrs = DMA_ATTR_CC_SHARED;
> -
> return dma_alloc_from_pool(dev, bytes, &vaddr, gfp,
> attrs, dma_coherent_ok);
> }
> @@ -638,7 +651,8 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
> else if (phys_limit <= DMA_BIT_MASK(32))
> gfp |= __GFP_DMA32;
>
> - while (IS_ERR(page = alloc_dma_pages(gfp, bytes, phys_limit))) {
> + while (IS_ERR(page = alloc_dma_pages(gfp, bytes, phys_limit,
> + !!(attrs & DMA_ATTR_CC_SHARED)))) {
> if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
> phys_limit < DMA_BIT_MASK(64) &&
> !(gfp & (__GFP_DMA32 | __GFP_DMA)))
> @@ -657,15 +671,18 @@ static struct page *swiotlb_alloc_tlb(struct device *dev, size_t bytes,
> * swiotlb_free_tlb() - free a dynamically allocated IO TLB buffer
> * @vaddr: Virtual address of the buffer.
> * @bytes: Size of the buffer.
> + * @unencrypted: true if @vaddr was allocated decrypted and must be
> + * re-encrypted before being freed
> */
> -static void swiotlb_free_tlb(void *vaddr, size_t bytes)
> +static void swiotlb_free_tlb(void *vaddr, size_t bytes, bool unencrypted)
> {
> if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
> dma_free_from_pool(NULL, vaddr, bytes))
> return;
>
> /* Intentional leak if pages cannot be encrypted again. */
> - if (!set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
> + if (!unencrypted ||
> + !set_memory_encrypted((unsigned long)vaddr, PFN_UP(bytes)))
> __free_pages(virt_to_page(vaddr), get_order(bytes));
> }
>
> @@ -676,6 +693,7 @@ static void swiotlb_free_tlb(void *vaddr, size_t bytes)
> * @nslabs: Desired (maximum) number of slabs.
> * @nareas: Number of areas.
> * @phys_limit: Maximum DMA buffer physical address.
> + * @attrs: DMA attributes for the allocation.
> * @gfp: GFP flags for the allocations.
> *
> * Allocate and initialize a new IO TLB memory pool. The actual number of
> @@ -686,7 +704,8 @@ static void swiotlb_free_tlb(void *vaddr, size_t bytes)
> */
> static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
> unsigned long minslabs, unsigned long nslabs,
> - unsigned int nareas, u64 phys_limit, gfp_t gfp)
> + unsigned int nareas, u64 phys_limit, unsigned long attrs,
> + gfp_t gfp)
> {
> struct io_tlb_pool *pool;
> unsigned int slot_order;
> @@ -704,9 +723,10 @@ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
> if (!pool)
> goto error;
> pool->areas = (void *)pool + sizeof(*pool);
> + pool->unencrypted = !!(attrs & DMA_ATTR_CC_SHARED);
>
> tlb_size = nslabs << IO_TLB_SHIFT;
> - while (!(tlb = swiotlb_alloc_tlb(dev, tlb_size, phys_limit, gfp))) {
> + while (!(tlb = swiotlb_alloc_tlb(dev, tlb_size, phys_limit, attrs, gfp))) {
> if (nslabs <= minslabs)
> goto error_tlb;
> nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
> @@ -724,7 +744,8 @@ static struct io_tlb_pool *swiotlb_alloc_pool(struct device *dev,
> return pool;
>
> error_slots:
> - swiotlb_free_tlb(page_address(tlb), tlb_size);
> + swiotlb_free_tlb(page_address(tlb), tlb_size,
> + !!(attrs & DMA_ATTR_CC_SHARED));
> error_tlb:
> kfree(pool);
> error:
> @@ -742,7 +763,9 @@ static void swiotlb_dyn_alloc(struct work_struct *work)
> struct io_tlb_pool *pool;
>
> pool = swiotlb_alloc_pool(NULL, IO_TLB_MIN_SLABS, default_nslabs,
> - default_nareas, mem->phys_limit, GFP_KERNEL);
> + default_nareas, mem->phys_limit,
> + mem->unencrypted ? DMA_ATTR_CC_SHARED : 0,
> + GFP_KERNEL);
> if (!pool) {
> pr_warn_ratelimited("Failed to allocate new pool");
> return;
> @@ -762,7 +785,7 @@ static void swiotlb_dyn_free(struct rcu_head *rcu)
> size_t tlb_size = pool->end - pool->start;
>
> free_pages((unsigned long)pool->slots, get_order(slots_size));
> - swiotlb_free_tlb(pool->vaddr, tlb_size);
> + swiotlb_free_tlb(pool->vaddr, tlb_size, pool->unencrypted);
> kfree(pool);
> }
>
> @@ -1232,6 +1255,7 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
> nslabs = nr_slots(alloc_size);
> phys_limit = min_not_zero(*dev->dma_mask, dev->bus_dma_limit);
> pool = swiotlb_alloc_pool(dev, nslabs, nslabs, 1, phys_limit,
> + mem->unencrypted ? DMA_ATTR_CC_SHARED : 0,
> GFP_NOWAIT);
> if (!pool)
> return -1;
> @@ -1394,6 +1418,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
> enum dma_data_direction dir, unsigned long attrs)
> {
> struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
> + bool require_decrypted = false;
> unsigned int offset;
> struct io_tlb_pool *pool;
> unsigned int i;
> @@ -1411,6 +1436,16 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
> if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
> pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
>
> + /*
> + * if we are trying to swiotlb map a decrypted paddr or the paddr is encrypted
> + * but the device is forcing decryption, use decrypted io_tlb_mem
> + */
> + if ((attrs & DMA_ATTR_CC_SHARED) || force_dma_unencrypted(dev))
> + require_decrypted = true;
> +
> + if (require_decrypted != mem->unencrypted)
> + return (phys_addr_t)DMA_MAPPING_ERROR;
> +
> /*
> * The default swiotlb memory pool is allocated with PAGE_SIZE
> * alignment. If a mapping is requested with larger alignment,
> @@ -1608,8 +1643,14 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
> if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
> return DMA_MAPPING_ERROR;
>
> - /* Ensure that the address returned is DMA'ble */
> - dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
> + /*
> + * Use the allocated io_tlb_mem encryption type to determine dma addr.
> + */
> + if (dev->dma_io_tlb_mem->unencrypted)
> + dma_addr = phys_to_dma_unencrypted(dev, swiotlb_addr);
> + else
> + dma_addr = phys_to_dma_encrypted(dev, swiotlb_addr);
> +
> if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
> __swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, dir,
> attrs | DMA_ATTR_SKIP_CPU_SYNC,
> @@ -1773,7 +1814,8 @@ static inline void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
>
> #ifdef CONFIG_DMA_RESTRICTED_POOL
>
> -struct page *swiotlb_alloc(struct device *dev, size_t size)
> +struct page *swiotlb_alloc(struct device *dev, size_t size,
> + unsigned long attrs)
> {
> struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
> struct io_tlb_pool *pool;
> @@ -1784,6 +1826,9 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
> if (!mem)
> return NULL;
>
> + if (mem->unencrypted != !!(attrs & DMA_ATTR_CC_SHARED))
> + return NULL;
> +
> align = (1 << (get_order(size) + PAGE_SHIFT)) - 1;
> index = swiotlb_find_slots(dev, 0, size, align, &pool);
> if (index == -1)
> @@ -1853,9 +1898,18 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
> kfree(mem);
> return -ENOMEM;
> }
> + /*
> + * if platform supports memory encryption,
> + * restricted mem pool is decrypted by default
> + */
> + if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
> + mem->unencrypted = true;
> + set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
> + rmem->size >> PAGE_SHIFT);
> + } else {
> + mem->unencrypted = false;
> + }
This breaks pKVM as it doesn’t set CC_ATTR_MEM_ENCRYPT, so all virtio
traffic now fails.
Also, by design, some drivers are clueless about bouncing, so
I believe that the pool should have a way to control it’s property
(encrypted or decrypted) and that takes priority over whatever
attributes comes from allocation.
And that brings us to the same point whether it’s better to return
the memory along with it’s state or we pass the requested state.
I think for other cases it’s fine for the device/DMA-API to dictate
the attrs, but not in restricted-dma case, the firmware just knows better.
Thanks,
Mostafa
>
> - set_memory_decrypted((unsigned long)phys_to_virt(rmem->base),
> - rmem->size >> PAGE_SHIFT);
> swiotlb_init_io_tlb_pool(pool, rmem->base, nslabs,
> false, nareas);
> mem->force_bounce = true;
> --
> 2.43.0
>
^ permalink raw reply
* [PATCH v9 00/23] Runtime TDX module update support
From: Chao Gao @ 2026-05-13 15:09 UTC (permalink / raw)
To: kvm, linux-coco, x86, linux-kernel, linux-rt-devel, linux-doc
Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
yilun.xu, xiaoyao.li, yan.y.zhao, Chao Gao, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, H. Peter Anvin,
Sebastian Andrzej Siewior, Clark Williams, Steven Rostedt,
Jonathan Corbet, Shuah Khan
Hi Dave,
Thanks for your thorough review of v8. This v9 addresses the issues you
pointed out. In particular, it adopts the new tdx_blob format you
suggested, removes module version printing during updates, and reworks
the do-while loop in the update flow to improve readability. It also
adds the two cleanup patches you suggested as patches 1 and 2.
Please take a look at this new version. I hope it can still be merged
for 7.2.
---
(For transparency, note that I used AI tools to help proofread this
cover-letter and commit messages)
This series adds support for runtime TDX module updates that preserve
running TDX guests. It is also available at:
https://github.com/gaochaointel/linux-dev/commits/tdx-module-updates-v9/
== Background ==
Intel TDX isolates Trusted Domains (TDs), or confidential guests, from the
host. A key component of Intel TDX is the TDX module, which enforces
security policies to protect the memory and CPU states of TDs from the
host. However, the TDX module is software that requires updates.
== Problems ==
Currently, the TDX module is loaded by the BIOS at boot time, and the only
way to update it is through a reboot, which results in significant system
downtime. Users expect the TDX module to be updatable at runtime without
disrupting TDX guests.
== Solution ==
On TDX platforms, P-SEAMLDR[1] is a component within the protected SEAM
range. It is loaded by the BIOS and provides the host with functions to
install a TDX module at runtime.
This series implements runtime TDX module updates through the fw_upload
mechanism. That interface is a good fit because TDX module selection is not
a simple "load a known file from disk" problem. The update image to load
depends on module versioning, compatibility rules. fw_upload lets userspace
choose the module explicitly while the kernel provides the update
mechanism.
This design intentionally keeps most update validation/policy in userspace.
The kernel exposes the information userspace needs, such as TDX module
version and P-SEAMLDR information, but userspace is responsible for
understanding TDX module's versioning and compatibility rules and for
choosing an appropriate update image (see "TDX module versioning" below).
The kernel still enforces the pieces that must be handled in-kernel:
1. Validate the tdx_blob header fields that are not passed through tothe
TDX module. Just the standard overflow and reserved bits defensive ABI stuff.
2. Make sure no non-update SEAMCALLs are called during the update.
3. Make sure SEAMCALLs are on the right CPU, for any the user has made
available to the kernel.
4. Handle the race between updates and concurrent TD builds by
returning -EBUSY to userspace.
Everything else remains a userspace responsibility.
In the unlikely event the update fails, for example userspace picks an
incompatible update image, or the image is otherwise corrupted, all TDs
will experience SEAMCALL failures and be killed. The recovery of TD
operation from that event requires a reboot.
Given there is no mechanism to quiesce SEAMCALLs, the TDs themselves must
pause execution over an update. The most straightforward way to meet the
'pause TDs while update executes' constraint is to run the update in
stop_machine() context. All other evaluated solutions export more
complexity to KVM, or exports more fragility to userspace.
== How to test this series ==
NOTE: This v9 uses a new tdx_blob format. The scripts and module blobs in
https://github.com/intel/tdx-module-binaries have not yet been updated
to match this version. Those updates will be done separately later.
== Other information relevant to Runtime TDX module updates ==
=== TDX module versioning ===
Each TDX module is assigned a version number x.y.z, where x represents the
"major" version, y the "minor" version, and z the "update" version.
Runtime TDX module updates are restricted to Z-stream releases.
Note that Z-stream releases do not necessarily guarantee compatibility. A
new release may not be compatible with all previous versions. To address this,
Intel provides a separate file containing compatibility information, which
specifies the minimum module version required for a particular update. This
information is referenced by the tool to determine if two modules are
compatible.
=== TCB Stability ===
Updates change the TCB as viewed by attestation reports. In TDX there is
a distinction between launch-time version and current version where
runtime TDX module updates cause that latter version number to change,
subject to Z-stream constraints.
The concern that a malicious host may attack confidential VMs by loading
insecure updates was addressed by Alex in [3]. Similarly, the scenario
where some "theoretical paranoid tenant" in the cloud wants to audit
updates and stop trusting the host after updates until audit completion
was also addressed in [4]. Users not in the cloud control the host machine
and can manage updates themselves, so they don't have these concerns.
See more about the implications of current TCB version changes in
attestation as summarized by Dave in [5].
=== TDX module Distribution Model ===
At a high level, Intel publishes all TDX modules on the github [2], along
with a mapping_file.json which documents the compatibility information
about each TDX module and a userspace tool to install the TDX module. OS
vendors can package these modules and distribute them. Administrators
install the package and use the tool to select the appropriate TDX module
and install it via the interfaces exposed by this series.
[1]: https://cdrdv2.intel.com/v1/dl/getContent/733584
[2]: https://github.com/intel/tdx-module-binaries
[3]: https://lore.kernel.org/all/665c5ae0-4b7c-4852-8995-255adf7b3a2f@amazon.com/
[4]: https://lore.kernel.org/all/5d1da767-491b-4077-b472-2cc3d73246d6@amazon.com/
[5]: https://lore.kernel.org/all/94d6047e-3b7c-4bc1-819c-85c16ff85abf@intel.com/
Chao Gao (22):
x86/virt/tdx: Consolidate TDX global initialization states
x86/virt/tdx: Move TDX_FEATURES0 bits to asm/tdx.h
coco/tdx-host: Introduce a "tdx_host" device
coco/tdx-host: Expose TDX module version
x86/virt/seamldr: Introduce a wrapper for P-SEAMLDR SEAMCALLs
x86/virt/seamldr: Add a helper to retrieve P-SEAMLDR information
coco/tdx-host: Expose P-SEAMLDR information via sysfs
coco/tdx-host: Don't expose P-SEAMLDR information on CPUs with erratum
coco/tdx-host: Implement firmware upload sysfs ABI for TDX module
updates
x86/virt/seamldr: Allocate and populate a module update request
x86/virt/seamldr: Introduce skeleton for TDX module updates
x86/virt/seamldr: Abort updates after a failed step
x86/virt/seamldr: Shut down the current TDX module
x86/virt/tdx: Reset software states during TDX module shutdown
x86/virt/seamldr: Install a new TDX module
x86/virt/seamldr: Do TDX per-CPU initialization after module
installation
x86/virt/tdx: Restore TDX module state
x86/virt/tdx: Refresh TDX module version after update
x86/virt/tdx: Reject updates during compatibility-sensitive operations
x86/virt/tdx: Enable TDX module runtime updates
coco/tdx-host: Document TDX module update compatibility criteria
x86/virt/tdx: Document TDX module update
Kai Huang (1):
x86/virt/tdx: Move low level SEAMCALL helpers out of <asm/tdx.h>
.../ABI/testing/sysfs-devices-faux-tdx-host | 68 ++++
Documentation/arch/x86/tdx.rst | 34 ++
arch/x86/include/asm/cpufeatures.h | 1 +
arch/x86/include/asm/seamldr.h | 37 +++
arch/x86/include/asm/tdx.h | 67 ++--
arch/x86/include/asm/tdx_global_metadata.h | 4 +
arch/x86/include/asm/vmx.h | 1 +
arch/x86/virt/vmx/tdx/Makefile | 2 +-
arch/x86/virt/vmx/tdx/seamcall_internal.h | 109 +++++++
arch/x86/virt/vmx/tdx/seamldr.c | 306 ++++++++++++++++++
arch/x86/virt/vmx/tdx/tdx.c | 162 ++++++----
arch/x86/virt/vmx/tdx/tdx.h | 8 +-
arch/x86/virt/vmx/tdx/tdx_global_metadata.c | 17 +-
drivers/virt/coco/Kconfig | 2 +
drivers/virt/coco/Makefile | 1 +
drivers/virt/coco/tdx-host/Kconfig | 12 +
drivers/virt/coco/tdx-host/Makefile | 1 +
drivers/virt/coco/tdx-host/tdx-host.c | 221 +++++++++++++
18 files changed, 940 insertions(+), 113 deletions(-)
create mode 100644 Documentation/ABI/testing/sysfs-devices-faux-tdx-host
create mode 100644 arch/x86/include/asm/seamldr.h
create mode 100644 arch/x86/virt/vmx/tdx/seamcall_internal.h
create mode 100644 arch/x86/virt/vmx/tdx/seamldr.c
create mode 100644 drivers/virt/coco/tdx-host/Kconfig
create mode 100644 drivers/virt/coco/tdx-host/Makefile
create mode 100644 drivers/virt/coco/tdx-host/tdx-host.c
base-commit: 5209e5bfe5cab593476c3e7754e42c5e47ce36de
--
2.52.0
^ permalink raw reply
* [PATCH v9 01/23] x86/virt/tdx: Consolidate TDX global initialization states
From: Chao Gao @ 2026-05-13 15:09 UTC (permalink / raw)
To: kvm, linux-coco, linux-kernel
Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
yilun.xu, xiaoyao.li, yan.y.zhao, Chao Gao, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, x86, H. Peter Anvin
In-Reply-To: <20260513151045.1420990-1-chao.gao@intel.com>
The kernel uses several global flags to guard one-time TDX initialization
flows and prevent them from being repeated.
When the TDX module is updated, all of those states must be reset so that
the module can be initialized again. Today those states are kept as separate
global variables, which makes the reset path awkward and easy to miss when
a new state is added.
Group the states into a single structure so they can be reset together, for
example with memset(), and so a newly added state won't be missed.
Drop the __ro_after_init annotation from tdx_module_initialized because
the other two states do not have it. And with TDX module update support,
all the states need to be writable at runtime.
Signed-off-by: Chao Gao <chao.gao@intel.com>
---
arch/x86/virt/vmx/tdx/tdx.c | 24 ++++++++++++++----------
1 file changed, 14 insertions(+), 10 deletions(-)
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index c0c6281b08a5..0172b432f229 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -44,6 +44,13 @@
#include <asm/virt.h>
#include "tdx.h"
+struct tdx_module_state {
+ bool initialized;
+ bool sysinit_done;
+ int sysinit_ret;
+};
+
+static struct tdx_module_state tdx_module_state;
static u32 tdx_global_keyid __ro_after_init;
static u32 tdx_guest_keyid_start __ro_after_init;
static u32 tdx_nr_guest_keyids __ro_after_init;
@@ -58,7 +65,6 @@ static struct tdmr_info_list tdx_tdmr_list;
static LIST_HEAD(tdx_memlist);
static struct tdx_sys_info tdx_sysinfo __ro_after_init;
-static bool tdx_module_initialized __ro_after_init;
typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
@@ -113,30 +119,28 @@ static int try_init_module_global(void)
{
struct tdx_module_args args = {};
static DEFINE_RAW_SPINLOCK(sysinit_lock);
- static bool sysinit_done;
- static int sysinit_ret;
raw_spin_lock(&sysinit_lock);
- if (sysinit_done)
+ if (tdx_module_state.sysinit_done)
goto out;
/* RCX is module attributes and all bits are reserved */
args.rcx = 0;
- sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
+ tdx_module_state.sysinit_ret = seamcall_prerr(TDH_SYS_INIT, &args);
/*
* The first SEAMCALL also detects the TDX module, thus
* it can fail due to the TDX module is not loaded.
* Dump message to let the user know.
*/
- if (sysinit_ret == -ENODEV)
+ if (tdx_module_state.sysinit_ret == -ENODEV)
pr_err("module not loaded\n");
- sysinit_done = true;
+ tdx_module_state.sysinit_done = true;
out:
raw_spin_unlock(&sysinit_lock);
- return sysinit_ret;
+ return tdx_module_state.sysinit_ret;
}
/**
@@ -1299,7 +1303,7 @@ static __init int tdx_enable(void)
register_syscore(&tdx_syscore);
- tdx_module_initialized = true;
+ tdx_module_state.initialized = true;
pr_info("TDX-Module initialized\n");
return 0;
}
@@ -1554,7 +1558,7 @@ void __init tdx_init(void)
const struct tdx_sys_info *tdx_get_sysinfo(void)
{
- if (!tdx_module_initialized)
+ if (!tdx_module_state.initialized)
return NULL;
return (const struct tdx_sys_info *)&tdx_sysinfo;
--
2.52.0
^ permalink raw reply related
* [PATCH v9 02/23] x86/virt/tdx: Move TDX_FEATURES0 bits to asm/tdx.h
From: Chao Gao @ 2026-05-13 15:09 UTC (permalink / raw)
To: kvm, linux-coco, linux-kernel
Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
yilun.xu, xiaoyao.li, yan.y.zhao, Chao Gao, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, x86, H. Peter Anvin
In-Reply-To: <20260513151045.1420990-1-chao.gao@intel.com>
Move the TDX_FEATURES0 bit definitions from the private TDX header to
asm/tdx.h, and opportunistically switch to BIT_ULL() since TDX_FEATURES0 is
64-bit.
This prepares for TDX module update [1] and Dynamic PAMT [2] support. Both
add new TDX_FEATURES0 capability bits, and both need those capabilities to
be queried from code outside arch/x86/virt. The corresponding feature-query
helpers therefore need to live in the public asm/tdx.h header, so move the
existing bit definitions there first.
No functional change intended.
Signed-off-by: Chao Gao <chao.gao@intel.com>
Link: https://lore.kernel.org/kvm/20260427152854.101171-17-chao.gao@intel.com/ # [1]
Link: https://lore.kernel.org/kvm/20251121005125.417831-16-rick.p.edgecombe@intel.com/ # [2]
---
arch/x86/include/asm/tdx.h | 3 +++
arch/x86/virt/vmx/tdx/tdx.h | 3 ---
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 15eac89b0afb..e2430dd0e4d5 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -32,6 +32,9 @@
#define TDX_SUCCESS 0ULL
#define TDX_RND_NO_ENTROPY 0x8000020300000000ULL
+/* Bit definitions of TDX_FEATURES0 metadata field */
+#define TDX_FEATURES0_NO_RBP_MOD BIT_ULL(18)
+
#ifndef __ASSEMBLER__
#include <uapi/asm/mce.h>
diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h
index e2cf2dd48755..76c5fb1e1ffe 100644
--- a/arch/x86/virt/vmx/tdx/tdx.h
+++ b/arch/x86/virt/vmx/tdx/tdx.h
@@ -85,9 +85,6 @@ struct tdmr_info {
DECLARE_FLEX_ARRAY(struct tdmr_reserved_area, reserved_areas);
} __packed __aligned(TDMR_INFO_ALIGNMENT);
-/* Bit definitions of TDX_FEATURES0 metadata field */
-#define TDX_FEATURES0_NO_RBP_MOD BIT(18)
-
/*
* Do not put any hardware-defined TDX structure representations below
* this comment!
--
2.52.0
^ permalink raw reply related
* [PATCH v9 04/23] coco/tdx-host: Introduce a "tdx_host" device
From: Chao Gao @ 2026-05-13 15:09 UTC (permalink / raw)
To: kvm, linux-coco, linux-kernel
Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
yilun.xu, xiaoyao.li, yan.y.zhao, Chao Gao, Dan Williams,
Jonathan Cameron, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
x86, H. Peter Anvin
In-Reply-To: <20260513151045.1420990-1-chao.gao@intel.com>
TDX depends on a platform firmware module that is invoked via instructions
similar to vmenter (i.e. enter into a new privileged "root-mode" context to
manage private memory and private device mechanisms). It is a software
construct that depends on the CPU vmxon state to enable invocation of
TDX module ABIs. Unlike other Trusted Execution Environment (TEE) platform
implementations that employ a firmware module running on a PCI device with
an MMIO mailbox for communication, TDX has no hardware device to point to
as the TEE Secure Manager (TSM).
Create a virtual device not only to align with other implementations but
also to make it easier to
- expose metadata (e.g., TDX module version, seamldr version etc) to
the userspace as device attributes
- implement firmware uploader APIs which are tied to a device. This is
needed to support TDX module runtime updates
- enable TDX Connect which will share a common infrastructure with other
platform implementations. In the TDX Connect context, every
architecture has a TSM, represented by a PCIe or virtual device. The
new "tdx_host" device will serve the TSM role.
A faux device is used for TDX because the TDX module is singular within
the system and lacks associated platform resources. Using a faux device
eliminates the need to create a stub bus.
The call to tdx_get_sysinfo() ensures that the TDX module is ready to
provide services.
Note that AMD has a PCI device for the PSP for SEV and ARM CCA will
likely have a faux device [1].
Co-developed-by: Xu Yilun <yilun.xu@linux.intel.com>
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Chao Gao <chao.gao@intel.com>
Reviewed-by: Jonathan Cameron <jonathan.cameron@huawei.com>
Reviewed-by: Tony Lindgren <tony.lindgren@linux.intel.com>
Reviewed-by: Xu Yilun <yilun.xu@linux.intel.com>
Reviewed-by: Kai Huang <kai.huang@intel.com>
Reviewed-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Link: https://lore.kernel.org/all/2025073035-bulginess-rematch-b92e@gregkh/ # [1]
---
arch/x86/virt/vmx/tdx/tdx.c | 2 +-
drivers/virt/coco/Kconfig | 2 ++
drivers/virt/coco/Makefile | 1 +
drivers/virt/coco/tdx-host/Kconfig | 10 +++++++
drivers/virt/coco/tdx-host/Makefile | 1 +
drivers/virt/coco/tdx-host/tdx-host.c | 43 +++++++++++++++++++++++++++
6 files changed, 58 insertions(+), 1 deletion(-)
create mode 100644 drivers/virt/coco/tdx-host/Kconfig
create mode 100644 drivers/virt/coco/tdx-host/Makefile
create mode 100644 drivers/virt/coco/tdx-host/tdx-host.c
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index a0f8cf5e10d7..837e9b36e1ea 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -1520,7 +1520,7 @@ const struct tdx_sys_info *tdx_get_sysinfo(void)
return (const struct tdx_sys_info *)&tdx_sysinfo;
}
-EXPORT_SYMBOL_FOR_KVM(tdx_get_sysinfo);
+EXPORT_SYMBOL_FOR_MODULES(tdx_get_sysinfo, "kvm-intel,tdx-host");
u32 tdx_get_nr_guest_keyids(void)
{
diff --git a/drivers/virt/coco/Kconfig b/drivers/virt/coco/Kconfig
index df1cfaf26c65..f7691f64fbe3 100644
--- a/drivers/virt/coco/Kconfig
+++ b/drivers/virt/coco/Kconfig
@@ -17,5 +17,7 @@ source "drivers/virt/coco/arm-cca-guest/Kconfig"
source "drivers/virt/coco/guest/Kconfig"
endif
+source "drivers/virt/coco/tdx-host/Kconfig"
+
config TSM
bool
diff --git a/drivers/virt/coco/Makefile b/drivers/virt/coco/Makefile
index cb52021912b3..b323b0ae4f82 100644
--- a/drivers/virt/coco/Makefile
+++ b/drivers/virt/coco/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_EFI_SECRET) += efi_secret/
obj-$(CONFIG_ARM_PKVM_GUEST) += pkvm-guest/
obj-$(CONFIG_SEV_GUEST) += sev-guest/
obj-$(CONFIG_INTEL_TDX_GUEST) += tdx-guest/
+obj-$(CONFIG_INTEL_TDX_HOST) += tdx-host/
obj-$(CONFIG_ARM_CCA_GUEST) += arm-cca-guest/
obj-$(CONFIG_TSM) += tsm-core.o
obj-$(CONFIG_TSM_GUEST) += guest/
diff --git a/drivers/virt/coco/tdx-host/Kconfig b/drivers/virt/coco/tdx-host/Kconfig
new file mode 100644
index 000000000000..d35d85ef91c0
--- /dev/null
+++ b/drivers/virt/coco/tdx-host/Kconfig
@@ -0,0 +1,10 @@
+config TDX_HOST_SERVICES
+ tristate "TDX Host Services Driver"
+ depends on INTEL_TDX_HOST
+ default m
+ help
+ Enable access to TDX host services like module update and
+ extensions (e.g. TDX Connect).
+
+ Say y or m if enabling support for confidential virtual machine
+ support (CONFIG_INTEL_TDX_HOST). The module is called tdx_host.ko.
diff --git a/drivers/virt/coco/tdx-host/Makefile b/drivers/virt/coco/tdx-host/Makefile
new file mode 100644
index 000000000000..e61e749a8dff
--- /dev/null
+++ b/drivers/virt/coco/tdx-host/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_TDX_HOST_SERVICES) += tdx-host.o
diff --git a/drivers/virt/coco/tdx-host/tdx-host.c b/drivers/virt/coco/tdx-host/tdx-host.c
new file mode 100644
index 000000000000..c77885392b09
--- /dev/null
+++ b/drivers/virt/coco/tdx-host/tdx-host.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * TDX host user interface driver
+ *
+ * Copyright (C) 2025 Intel Corporation
+ */
+
+#include <linux/device/faux.h>
+#include <linux/module.h>
+#include <linux/mod_devicetable.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/tdx.h>
+
+static const struct x86_cpu_id tdx_host_ids[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_TDX_HOST_PLATFORM, NULL),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, tdx_host_ids);
+
+static struct faux_device *fdev;
+
+static int __init tdx_host_init(void)
+{
+ if (!x86_match_cpu(tdx_host_ids) || !tdx_get_sysinfo())
+ return -ENODEV;
+
+ fdev = faux_device_create(KBUILD_MODNAME, NULL, NULL);
+ if (!fdev)
+ return -ENODEV;
+
+ return 0;
+}
+module_init(tdx_host_init);
+
+static void __exit tdx_host_exit(void)
+{
+ faux_device_destroy(fdev);
+}
+module_exit(tdx_host_exit);
+
+MODULE_DESCRIPTION("TDX Host Services");
+MODULE_LICENSE("GPL");
--
2.52.0
^ permalink raw reply related
* [PATCH v9 03/23] x86/virt/tdx: Move low level SEAMCALL helpers out of <asm/tdx.h>
From: Chao Gao @ 2026-05-13 15:09 UTC (permalink / raw)
To: kvm, linux-coco, linux-kernel
Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
yilun.xu, xiaoyao.li, yan.y.zhao, Chao Gao, Zhenzhong Duan,
Thomas Gleixner, Ingo Molnar, Borislav Petkov, x86,
H. Peter Anvin
In-Reply-To: <20260513151045.1420990-1-chao.gao@intel.com>
From: Kai Huang <kai.huang@intel.com>
TDX host core code implements three seamcall*() helpers to make SEAMCALLs
to the TDX module. Currently, they are implemented in <asm/tdx.h> and
are exposed to other kernel code which includes <asm/tdx.h>.
However, other than the TDX host core, seamcall*() are not expected to
be used by other kernel code directly. For instance, for all SEAMCALLs
that are used by KVM, the TDX host core exports a wrapper function for
each of them.
Move seamcall*() and related code out of <asm/tdx.h> and make them only
visible to TDX host core.
Since TDX host core tdx.c is already very heavy, don't put low level
seamcall*() code there but to a new dedicated "seamcall_internal.h". Also,
currently tdx.c has seamcall_prerr*() helpers which additionally print
error message when calling seamcall*() fails. Move them to
"seamcall_internal.h" as well. In such way all low level SEAMCALL helpers
are in a dedicated place, which is much more readable.
Copy the copyright notice from the original files and consolidate the
date ranges to:
Copyright (C) 2021-2023 Intel Corporation
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Chao Gao <chao.gao@intel.com>
Reviewed-by: Zhenzhong Duan <zhenzhong.duan@intel.com>
Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
Reviewed-by: Tony Lindgren <tony.lindgren@linux.intel.com>
Reviewed-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Reviewed-by: Vishal Annapurve <vannapurve@google.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
---
arch/x86/include/asm/tdx.h | 47 ----------
arch/x86/virt/vmx/tdx/seamcall_internal.h | 109 ++++++++++++++++++++++
arch/x86/virt/vmx/tdx/tdx.c | 47 +---------
3 files changed, 111 insertions(+), 92 deletions(-)
create mode 100644 arch/x86/virt/vmx/tdx/seamcall_internal.h
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index e2430dd0e4d5..8b739ac01479 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -100,54 +100,7 @@ static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1,
#endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */
#ifdef CONFIG_INTEL_TDX_HOST
-u64 __seamcall(u64 fn, struct tdx_module_args *args);
-u64 __seamcall_ret(u64 fn, struct tdx_module_args *args);
-u64 __seamcall_saved_ret(u64 fn, struct tdx_module_args *args);
void tdx_init(void);
-
-#include <linux/preempt.h>
-#include <asm/archrandom.h>
-#include <asm/processor.h>
-
-typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args);
-
-static __always_inline u64 __seamcall_dirty_cache(sc_func_t func, u64 fn,
- struct tdx_module_args *args)
-{
- lockdep_assert_preemption_disabled();
-
- /*
- * SEAMCALLs are made to the TDX module and can generate dirty
- * cachelines of TDX private memory. Mark cache state incoherent
- * so that the cache can be flushed during kexec.
- *
- * This needs to be done before actually making the SEAMCALL,
- * because kexec-ing CPU could send NMI to stop remote CPUs,
- * in which case even disabling IRQ won't help here.
- */
- this_cpu_write(cache_state_incoherent, true);
-
- return func(fn, args);
-}
-
-static __always_inline u64 sc_retry(sc_func_t func, u64 fn,
- struct tdx_module_args *args)
-{
- int retry = RDRAND_RETRY_LOOPS;
- u64 ret;
-
- do {
- preempt_disable();
- ret = __seamcall_dirty_cache(func, fn, args);
- preempt_enable();
- } while (ret == TDX_RND_NO_ENTROPY && --retry);
-
- return ret;
-}
-
-#define seamcall(_fn, _args) sc_retry(__seamcall, (_fn), (_args))
-#define seamcall_ret(_fn, _args) sc_retry(__seamcall_ret, (_fn), (_args))
-#define seamcall_saved_ret(_fn, _args) sc_retry(__seamcall_saved_ret, (_fn), (_args))
const char *tdx_dump_mce_info(struct mce *m);
const struct tdx_sys_info *tdx_get_sysinfo(void);
diff --git a/arch/x86/virt/vmx/tdx/seamcall_internal.h b/arch/x86/virt/vmx/tdx/seamcall_internal.h
new file mode 100644
index 000000000000..be5f446467df
--- /dev/null
+++ b/arch/x86/virt/vmx/tdx/seamcall_internal.h
@@ -0,0 +1,109 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * SEAMCALL utilities for TDX host-side operations.
+ *
+ * Provides convenient wrappers around SEAMCALL assembly with retry logic,
+ * error reporting and cache coherency tracking.
+ *
+ * Copyright (C) 2021-2023 Intel Corporation
+ */
+
+#ifndef _X86_VIRT_SEAMCALL_INTERNAL_H
+#define _X86_VIRT_SEAMCALL_INTERNAL_H
+
+#include <linux/printk.h>
+#include <linux/types.h>
+#include <asm/archrandom.h>
+#include <asm/processor.h>
+#include <asm/tdx.h>
+
+u64 __seamcall(u64 fn, struct tdx_module_args *args);
+u64 __seamcall_ret(u64 fn, struct tdx_module_args *args);
+u64 __seamcall_saved_ret(u64 fn, struct tdx_module_args *args);
+
+typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args);
+
+static __always_inline u64 __seamcall_dirty_cache(sc_func_t func, u64 fn,
+ struct tdx_module_args *args)
+{
+ lockdep_assert_preemption_disabled();
+
+ /*
+ * SEAMCALLs are made to the TDX module and can generate dirty
+ * cachelines of TDX private memory. Mark cache state incoherent
+ * so that the cache can be flushed during kexec.
+ *
+ * This needs to be done before actually making the SEAMCALL,
+ * because kexec-ing CPU could send NMI to stop remote CPUs,
+ * in which case even disabling IRQ won't help here.
+ */
+ this_cpu_write(cache_state_incoherent, true);
+
+ return func(fn, args);
+}
+
+static __always_inline u64 sc_retry(sc_func_t func, u64 fn,
+ struct tdx_module_args *args)
+{
+ int retry = RDRAND_RETRY_LOOPS;
+ u64 ret;
+
+ do {
+ preempt_disable();
+ ret = __seamcall_dirty_cache(func, fn, args);
+ preempt_enable();
+ } while (ret == TDX_RND_NO_ENTROPY && --retry);
+
+ return ret;
+}
+
+#define seamcall(_fn, _args) sc_retry(__seamcall, (_fn), (_args))
+#define seamcall_ret(_fn, _args) sc_retry(__seamcall_ret, (_fn), (_args))
+#define seamcall_saved_ret(_fn, _args) sc_retry(__seamcall_saved_ret, (_fn), (_args))
+
+typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
+
+static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
+{
+ pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
+}
+
+static inline void seamcall_err_ret(u64 fn, u64 err,
+ struct tdx_module_args *args)
+{
+ seamcall_err(fn, err, args);
+ pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
+ args->rcx, args->rdx, args->r8);
+ pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
+ args->r9, args->r10, args->r11);
+}
+
+static __always_inline int sc_retry_prerr(sc_func_t func,
+ sc_err_func_t err_func,
+ u64 fn, struct tdx_module_args *args)
+{
+ u64 sret = sc_retry(func, fn, args);
+
+ if (sret == TDX_SUCCESS)
+ return 0;
+
+ if (sret == TDX_SEAMCALL_VMFAILINVALID)
+ return -ENODEV;
+
+ if (sret == TDX_SEAMCALL_GP)
+ return -EOPNOTSUPP;
+
+ if (sret == TDX_SEAMCALL_UD)
+ return -EACCES;
+
+ err_func(fn, sret, args);
+ return -EIO;
+}
+
+#define seamcall_prerr(__fn, __args) \
+ sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))
+
+#define seamcall_prerr_ret(__fn, __args) \
+ sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))
+
+#endif /* _X86_VIRT_SEAMCALL_INTERNAL_H */
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 0172b432f229..a0f8cf5e10d7 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -42,6 +42,8 @@
#include <asm/processor.h>
#include <asm/mce.h>
#include <asm/virt.h>
+
+#include "seamcall_internal.h"
#include "tdx.h"
struct tdx_module_state {
@@ -66,51 +68,6 @@ static LIST_HEAD(tdx_memlist);
static struct tdx_sys_info tdx_sysinfo __ro_after_init;
-typedef void (*sc_err_func_t)(u64 fn, u64 err, struct tdx_module_args *args);
-
-static inline void seamcall_err(u64 fn, u64 err, struct tdx_module_args *args)
-{
- pr_err("SEAMCALL (0x%016llx) failed: 0x%016llx\n", fn, err);
-}
-
-static inline void seamcall_err_ret(u64 fn, u64 err,
- struct tdx_module_args *args)
-{
- seamcall_err(fn, err, args);
- pr_err("RCX 0x%016llx RDX 0x%016llx R08 0x%016llx\n",
- args->rcx, args->rdx, args->r8);
- pr_err("R09 0x%016llx R10 0x%016llx R11 0x%016llx\n",
- args->r9, args->r10, args->r11);
-}
-
-static __always_inline int sc_retry_prerr(sc_func_t func,
- sc_err_func_t err_func,
- u64 fn, struct tdx_module_args *args)
-{
- u64 sret = sc_retry(func, fn, args);
-
- if (sret == TDX_SUCCESS)
- return 0;
-
- if (sret == TDX_SEAMCALL_VMFAILINVALID)
- return -ENODEV;
-
- if (sret == TDX_SEAMCALL_GP)
- return -EOPNOTSUPP;
-
- if (sret == TDX_SEAMCALL_UD)
- return -EACCES;
-
- err_func(fn, sret, args);
- return -EIO;
-}
-
-#define seamcall_prerr(__fn, __args) \
- sc_retry_prerr(__seamcall, seamcall_err, (__fn), (__args))
-
-#define seamcall_prerr_ret(__fn, __args) \
- sc_retry_prerr(__seamcall_ret, seamcall_err_ret, (__fn), (__args))
-
/*
* Do the module global initialization once and return its result.
* It can be done on any cpu, and from task or IRQ context.
--
2.52.0
^ permalink raw reply related
* [PATCH v9 05/23] coco/tdx-host: Expose TDX module version
From: Chao Gao @ 2026-05-13 15:09 UTC (permalink / raw)
To: kvm, linux-coco, linux-kernel
Cc: binbin.wu, dave.hansen, djbw, ira.weiny, kai.huang, kas,
nik.borisov, paulmck, pbonzini, reinette.chatre, rick.p.edgecombe,
sagis, seanjc, tony.lindgren, vannapurve, vishal.l.verma,
yilun.xu, xiaoyao.li, yan.y.zhao, Chao Gao, Thomas Gleixner,
Ingo Molnar, Borislav Petkov, x86, H. Peter Anvin
In-Reply-To: <20260513151045.1420990-1-chao.gao@intel.com>
For TDX module updates, userspace needs to select compatible update
versions based on the current module version. This design delegates
module selection complexity to userspace because TDX module update
policies are complex and version series are platform-specific.
For example, the 1.5.x series is for certain platform generations, while
the 2.0.x series is intended for others. And TDX module 1.5.x may be
updated to 1.5.y but not to 1.5.y+1.
Expose the TDX module version to userspace via sysfs to aid module
selection. Since the TDX faux device will drive module updates, expose
the version as its attribute.
One bonus of exposing TDX module version via sysfs is: TDX module
version information remains available even after dmesg logs are cleared.
Define TDX_VERSION_FMT macro for the TDX version format since it will be
used multiple times. Also convert an existing print statement to use it.
== Background ==
The "faux device + device attribute" approach compares to other update
mechanisms as follows:
1. AMD SEV leverages an existing PCI device for the PSP to expose
metadata. TDX uses a faux device as it doesn't have PCI device
in its architecture.
2. Microcode uses per-CPU virtual devices to report microcode revisions
because CPUs can have different revisions. But, there is only a
single TDX module, so exposing the TDX module version through a global
TDX faux device is appropriate
3. ARM's CCA implementation isn't in-tree yet, but will likely follow a
similar faux device approach, though it's unclear whether they need
to expose firmware version information
Signed-off-by: Chao Gao <chao.gao@intel.com>
Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
Reviewed-by: Tony Lindgren <tony.lindgren@linux.intel.com>
Reviewed-by: Xu Yilun <yilun.xu@linux.intel.com>
Reviewed-by: Kai Huang <kai.huang@intel.com>
Reviewed-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Link: https://lore.kernel.org/all/2025073035-bulginess-rematch-b92e@gregkh/ # [1]
---
.../ABI/testing/sysfs-devices-faux-tdx-host | 6 +++++
arch/x86/include/asm/tdx.h | 6 +++++
arch/x86/virt/vmx/tdx/tdx_global_metadata.c | 2 +-
drivers/virt/coco/tdx-host/tdx-host.c | 26 ++++++++++++++++++-
4 files changed, 38 insertions(+), 2 deletions(-)
create mode 100644 Documentation/ABI/testing/sysfs-devices-faux-tdx-host
diff --git a/Documentation/ABI/testing/sysfs-devices-faux-tdx-host b/Documentation/ABI/testing/sysfs-devices-faux-tdx-host
new file mode 100644
index 000000000000..2cf682b65acf
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-faux-tdx-host
@@ -0,0 +1,6 @@
+What: /sys/devices/faux/tdx_host/version
+Contact: linux-coco@lists.linux.dev
+Description: (RO) Report the version of the loaded TDX module. The TDX module
+ version is formatted as x.y.z, where "x" is the major version,
+ "y" is the minor version and "z" is the update version. Versions
+ are used for bug reporting, TDX module updates etc.
diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 8b739ac01479..b7f4396b5cc5 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -41,6 +41,12 @@
#include <asm/tdx_global_metadata.h>
#include <linux/pgtable.h>
+/*
+ * TDX module and P-SEAMLDR version convention: "major.minor.update"
+ * (e.g., "1.5.08") with zero-padded two-digit update field.
+ */
+#define TDX_VERSION_FMT "%u.%u.%02u"
+
/*
* Used by the #VE exception handler to gather the #VE exception
* info from the TDX module. This is a software only structure
diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
index c7db393a9cfb..d54d4227990c 100644
--- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
+++ b/arch/x86/virt/vmx/tdx/tdx_global_metadata.c
@@ -106,7 +106,7 @@ static __init int get_tdx_sys_info(struct tdx_sys_info *sysinfo)
ret = ret ?: get_tdx_sys_info_version(&sysinfo->version);
- pr_info("Module version: %u.%u.%02u\n",
+ pr_info("Module version: " TDX_VERSION_FMT "\n",
sysinfo->version.major_version,
sysinfo->version.minor_version,
sysinfo->version.update_version);
diff --git a/drivers/virt/coco/tdx-host/tdx-host.c b/drivers/virt/coco/tdx-host/tdx-host.c
index c77885392b09..ef117a836b3a 100644
--- a/drivers/virt/coco/tdx-host/tdx-host.c
+++ b/drivers/virt/coco/tdx-host/tdx-host.c
@@ -8,6 +8,7 @@
#include <linux/device/faux.h>
#include <linux/module.h>
#include <linux/mod_devicetable.h>
+#include <linux/sysfs.h>
#include <asm/cpu_device_id.h>
#include <asm/tdx.h>
@@ -18,6 +19,29 @@ static const struct x86_cpu_id tdx_host_ids[] = {
};
MODULE_DEVICE_TABLE(x86cpu, tdx_host_ids);
+static ssize_t version_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ const struct tdx_sys_info *tdx_sysinfo = tdx_get_sysinfo();
+ const struct tdx_sys_info_version *ver;
+
+ if (!tdx_sysinfo)
+ return -ENXIO;
+
+ ver = &tdx_sysinfo->version;
+
+ return sysfs_emit(buf, TDX_VERSION_FMT "\n", ver->major_version,
+ ver->minor_version,
+ ver->update_version);
+}
+static DEVICE_ATTR_RO(version);
+
+static struct attribute *tdx_host_attrs[] = {
+ &dev_attr_version.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(tdx_host);
+
static struct faux_device *fdev;
static int __init tdx_host_init(void)
@@ -25,7 +49,7 @@ static int __init tdx_host_init(void)
if (!x86_match_cpu(tdx_host_ids) || !tdx_get_sysinfo())
return -ENODEV;
- fdev = faux_device_create(KBUILD_MODNAME, NULL, NULL);
+ fdev = faux_device_create_with_groups(KBUILD_MODNAME, NULL, NULL, tdx_host_groups);
if (!fdev)
return -ENODEV;
--
2.52.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox