* [PATCH 01/22] KVM: arm64: nv: Introduce struct for stage-2 walk step
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 18:41 ` [PATCH 02/22] KVM: arm64: nv: Consolidate computation of stage-2 permissions Oliver Upton
` (20 subsequent siblings)
21 siblings, 0 replies; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
Factoring out helpers from walk_nested_s2_pgd() will require passing the
context of the current walk step. Prepare by reorganizing the relvant
information into a struct.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
arch/arm64/kvm/nested.c | 109 +++++++++++++++++++++-------------------
1 file changed, 57 insertions(+), 52 deletions(-)
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 94df26de6990..9e60c7c822ae 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -134,6 +134,12 @@ struct s2_walk_info {
bool ha;
};
+struct s2_walk_step {
+ u64 desc_pa;
+ u64 desc;
+ int level;
+};
+
static u32 compute_fsc(int level, u32 fsc)
{
return fsc | (level & 0x3);
@@ -199,13 +205,13 @@ static int check_output_size(struct s2_walk_info *wi, phys_addr_t output)
return 0;
}
-static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc,
+static int read_guest_s2_desc(struct kvm_vcpu *vcpu, struct s2_walk_step *ws,
struct s2_walk_info *wi)
{
u64 val;
int r;
- r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val));
+ r = kvm_read_guest(vcpu->kvm, ws->desc_pa, &val, sizeof(val));
if (r)
return r;
@@ -214,9 +220,9 @@ static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc,
* host and the guest hypervisor.
*/
if (wi->be)
- *desc = be64_to_cpu((__force __be64)val);
+ ws->desc = be64_to_cpu((__force __be64)val);
else
- *desc = le64_to_cpu((__force __le64)val);
+ ws->desc = le64_to_cpu((__force __le64)val);
return 0;
}
@@ -245,22 +251,22 @@ static int swap_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 old, u6
static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa,
struct s2_walk_info *wi, struct kvm_s2_trans *out)
{
- int first_block_level, level, stride, input_size, base_lower_bound;
+ int first_block_level, stride, input_size, base_lower_bound;
+ struct s2_walk_step ws = {};
phys_addr_t base_addr;
unsigned int addr_top, addr_bottom;
- u64 desc, new_desc; /* page table entry */
+ u64 new_desc; /* page table entry */
int ret;
- phys_addr_t paddr;
switch (BIT(wi->pgshift)) {
default:
case SZ_64K:
case SZ_16K:
- level = 3 - wi->sl;
+ ws.level = 3 - wi->sl;
first_block_level = 2;
break;
case SZ_4K:
- level = 2 - wi->sl;
+ ws.level = 2 - wi->sl;
first_block_level = 1;
break;
}
@@ -270,13 +276,13 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa,
if (input_size > 48 || input_size < 25)
return -EFAULT;
- ret = check_base_s2_limits(vcpu, wi, level, input_size, stride);
+ ret = check_base_s2_limits(vcpu, wi, ws.level, input_size, stride);
if (WARN_ON(ret)) {
out->esr = compute_fsc(0, ESR_ELx_FSC_FAULT);
return ret;
}
- base_lower_bound = 3 + input_size - ((3 - level) * stride +
+ base_lower_bound = 3 + input_size - ((3 - ws.level) * stride +
wi->pgshift);
base_addr = wi->baddr & GENMASK_ULL(47, base_lower_bound);
@@ -291,96 +297,95 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa,
while (1) {
phys_addr_t index;
- addr_bottom = (3 - level) * stride + wi->pgshift;
+ addr_bottom = (3 - ws.level) * stride + wi->pgshift;
index = (ipa & GENMASK_ULL(addr_top, addr_bottom))
>> (addr_bottom - 3);
- paddr = base_addr | index;
- ret = read_guest_s2_desc(vcpu, paddr, &desc, wi);
+ ws.desc_pa = base_addr | index;
+ ret = read_guest_s2_desc(vcpu, &ws, wi);
if (ret < 0) {
- out->esr = ESR_ELx_FSC_SEA_TTW(level);
+ out->esr = ESR_ELx_FSC_SEA_TTW(ws.level);
return ret;
}
- new_desc = desc;
+ new_desc = ws.desc;
/* Check for valid descriptor at this point */
- if (!(desc & KVM_PTE_VALID)) {
- out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
- out->desc = desc;
+ if (!(ws.desc & KVM_PTE_VALID)) {
+ out->esr = compute_fsc(ws.level, ESR_ELx_FSC_FAULT);
+ out->desc = ws.desc;
return 1;
}
- if (FIELD_GET(KVM_PTE_TYPE, desc) == KVM_PTE_TYPE_BLOCK) {
- if (level < 3)
+ if (FIELD_GET(KVM_PTE_TYPE, ws.desc) == KVM_PTE_TYPE_BLOCK) {
+ if (ws.level < 3)
break;
- out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
- out->desc = desc;
+ out->esr = compute_fsc(ws.level, ESR_ELx_FSC_FAULT);
+ out->desc = ws.desc;
return 1;
}
/* We're at the final level */
- if (level == 3)
+ if (ws.level == 3)
break;
- if (check_output_size(wi, desc)) {
- out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
- out->desc = desc;
+ if (check_output_size(wi, ws.desc)) {
+ out->esr = compute_fsc(ws.level, ESR_ELx_FSC_ADDRSZ);
+ out->desc = ws.desc;
return 1;
}
- base_addr = desc & GENMASK_ULL(47, wi->pgshift);
+ base_addr = ws.desc & GENMASK_ULL(47, wi->pgshift);
- level += 1;
+ ws.level += 1;
addr_top = addr_bottom - 1;
}
- if (level < first_block_level) {
- out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
- out->desc = desc;
+ if (ws.level < first_block_level) {
+ out->esr = compute_fsc(ws.level, ESR_ELx_FSC_FAULT);
+ out->desc = ws.desc;
return 1;
}
- if (check_output_size(wi, desc)) {
- out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
- out->desc = desc;
+ if (check_output_size(wi, ws.desc)) {
+ out->esr = compute_fsc(ws.level, ESR_ELx_FSC_ADDRSZ);
+ out->desc = ws.desc;
return 1;
}
if (wi->ha)
new_desc |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
- if (new_desc != desc) {
- ret = swap_guest_s2_desc(vcpu, paddr, desc, new_desc, wi);
+ if (new_desc != ws.desc) {
+ ret = swap_guest_s2_desc(vcpu, ws.desc_pa, ws.desc, new_desc, wi);
if (ret == -EAGAIN)
return ret;
if (ret) {
- out->esr = ESR_ELx_FSC_SEA_TTW(level);
- out->desc = desc;
+ out->esr = ESR_ELx_FSC_SEA_TTW(ws.level);
+ out->desc = ws.desc;
return 1;
}
- desc = new_desc;
+ ws.desc = new_desc;
}
- if (!(desc & KVM_PTE_LEAF_ATTR_LO_S2_AF)) {
- out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS);
- out->desc = desc;
+ if (!(ws.desc & KVM_PTE_LEAF_ATTR_LO_S2_AF)) {
+ out->esr = compute_fsc(ws.level, ESR_ELx_FSC_ACCESS);
+ out->desc = ws.desc;
return 1;
}
- addr_bottom += contiguous_bit_shift(desc, wi, level);
+ addr_bottom += contiguous_bit_shift(ws.desc, wi, ws.level);
/* Calculate and return the result */
- paddr = (desc & GENMASK_ULL(47, addr_bottom)) |
- (ipa & GENMASK_ULL(addr_bottom - 1, 0));
- out->output = paddr;
- out->block_size = 1UL << ((3 - level) * stride + wi->pgshift);
- out->readable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
- out->writable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
- out->level = level;
- out->desc = desc;
+ out->output = (ws.desc & GENMASK_ULL(47, addr_bottom)) |
+ (ipa & GENMASK_ULL(addr_bottom - 1, 0));
+ out->block_size = 1UL << ((3 - ws.level) * stride + wi->pgshift);
+ out->readable = ws.desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
+ out->writable = ws.desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
+ out->level = ws.level;
+ out->desc = ws.desc;
return 0;
}
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* [PATCH 02/22] KVM: arm64: nv: Consolidate computation of stage-2 permissions
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
2026-06-23 18:41 ` [PATCH 01/22] KVM: arm64: nv: Introduce struct for stage-2 walk step Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 18:57 ` sashiko-bot
2026-06-23 18:41 ` [PATCH 03/22] KVM: arm64: nv: Get rid of kvm_s2_trans*() accessors Oliver Upton
` (19 subsequent siblings)
21 siblings, 1 reply; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
Computing the output permissions from a translation requires
contextualization, as fields in the descriptors may change depending on
the MMU context. Centralize the computation of stage-2 permissions
within the table walk rather as opposed to inspecting the descriptor in
the case of pX and uX permissions.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
arch/arm64/include/asm/kvm_nested.h | 42 ++++++++--------------------
arch/arm64/include/asm/kvm_pgtable.h | 5 ++--
arch/arm64/kvm/nested.c | 33 ++++++++++++++++++++--
3 files changed, 45 insertions(+), 35 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index cbdaaa2a2903..aa27f12cf2d4 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -87,13 +87,15 @@ extern void kvm_nested_sync_hwstate(struct kvm_vcpu *vcpu);
extern void kvm_nested_setup_mdcr_el2(struct kvm_vcpu *vcpu);
struct kvm_s2_trans {
- phys_addr_t output;
- unsigned long block_size;
- bool writable;
- bool readable;
- int level;
- u32 esr;
- u64 desc;
+ u64 desc;
+ phys_addr_t output;
+ unsigned long block_size;
+ int level;
+ u32 esr;
+ bool writable;
+ bool readable;
+ bool px;
+ bool ux;
};
static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans)
@@ -129,34 +131,12 @@ static inline bool kvm_has_xnx(struct kvm *kvm)
static inline bool kvm_s2_trans_exec_el0(struct kvm *kvm, struct kvm_s2_trans *trans)
{
- u8 xn = FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, trans->desc);
-
- if (!kvm_has_xnx(kvm))
- xn &= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, 0b10);
-
- switch (xn) {
- case 0b00:
- case 0b01:
- return true;
- default:
- return false;
- }
+ return trans->ux;
}
static inline bool kvm_s2_trans_exec_el1(struct kvm *kvm, struct kvm_s2_trans *trans)
{
- u8 xn = FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, trans->desc);
-
- if (!kvm_has_xnx(kvm))
- xn &= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, 0b10);
-
- switch (xn) {
- case 0b00:
- case 0b11:
- return true;
- default:
- return false;
- }
+ return trans->px;
}
extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 41a8687938eb..22aeb2ed18d1 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -79,6 +79,8 @@ typedef u64 kvm_pte_t;
#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR GENMASK(5, 2)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R BIT(6)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W BIT(7)
+#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
+ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3
#define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10)
@@ -95,8 +97,7 @@ typedef u64 kvm_pte_t;
#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50)
-#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
- KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
+#define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP | \
KVM_PTE_LEAF_ATTR_HI_S2_XN)
/* pKVM invalid pte encodings */
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 9e60c7c822ae..c9300703bd0d 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -241,6 +241,36 @@ static int swap_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 old, u6
return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
}
+static void compute_s2_permissions(struct kvm_vcpu *vcpu, struct s2_walk_info *wi,
+ struct s2_walk_step *ws, struct kvm_s2_trans *trans)
+{
+ u8 s2ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_S2AP, ws->desc);
+ u8 xn = FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, ws->desc);
+
+ if (!kvm_has_xnx(vcpu->kvm))
+ xn &= 0b10;
+
+ switch (xn) {
+ case 0b00:
+ trans->px = trans->ux = true;
+ break;
+ case 0b01:
+ trans->px = false;
+ trans->ux = true;
+ break;
+ case 0b10:
+ trans->px = trans->ux = false;
+ break;
+ case 0b11:
+ trans->px = true;
+ trans->ux = false;
+ break;
+ }
+
+ trans->readable = s2ap & BIT(0);
+ trans->writable = s2ap & BIT(1);
+}
+
/*
* This is essentially a C-version of the pseudo code from the ARM ARM
* AArch64.TranslationTableWalk function. I strongly recommend looking at
@@ -382,8 +412,7 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa,
out->output = (ws.desc & GENMASK_ULL(47, addr_bottom)) |
(ipa & GENMASK_ULL(addr_bottom - 1, 0));
out->block_size = 1UL << ((3 - ws.level) * stride + wi->pgshift);
- out->readable = ws.desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
- out->writable = ws.desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
+ compute_s2_permissions(vcpu, wi, &ws, out);
out->level = ws.level;
out->desc = ws.desc;
return 0;
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* Re: [PATCH 02/22] KVM: arm64: nv: Consolidate computation of stage-2 permissions
2026-06-23 18:41 ` [PATCH 02/22] KVM: arm64: nv: Consolidate computation of stage-2 permissions Oliver Upton
@ 2026-06-23 18:57 ` sashiko-bot
0 siblings, 0 replies; 40+ messages in thread
From: sashiko-bot @ 2026-06-23 18:57 UTC (permalink / raw)
To: Oliver Upton; +Cc: kvmarm, Marc Zyngier
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
Pre-existing issues:
- [Critical] This problem wasn't introduced by this patch, but this patch silently fixes a critical pre-existing security vulnerability in nested KVM where stage-2 Execute-Never (XN) restrictions were ignored when the host lacks FEAT_XNX. The bug should be explicitly noted and split into a separate fix for stable backporting.
--
commit 60a1129130ae1d928ae1a5f97eb06ae059dacb1a
Author: Oliver Upton <oupton@kernel.org>
Subject: KVM: arm64: nv: Consolidate computation of stage-2 permissions
This commit centralizes the computation of stage-2 permissions within the
table walk rather than inspecting the descriptor directly for execution
permissions.
> diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
> --- a/arch/arm64/include/asm/kvm_nested.h
> +++ b/arch/arm64/include/asm/kvm_nested.h
> @@ -129,34 +131,12 @@ static inline bool kvm_has_xnx(struct kvm *kvm)
>
> static inline bool kvm_s2_trans_exec_el0(struct kvm *kvm, struct kvm_s2_trans *trans)
> {
> - u8 xn = FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, trans->desc);
> -
> - if (!kvm_has_xnx(kvm))
> - xn &= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, 0b10);
[ ... ]
> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> --- a/arch/arm64/kvm/nested.c
> +++ b/arch/arm64/kvm/nested.c
> @@ -241,6 +241,36 @@ static int swap_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 old, u6
> return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
> }
>
> +static void compute_s2_permissions(struct kvm_vcpu *vcpu, struct s2_walk_info *wi,
> + struct s2_walk_step *ws, struct kvm_s2_trans *trans)
> +{
> + u8 s2ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S2_S2AP, ws->desc);
> + u8 xn = FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, ws->desc);
> +
> + if (!kvm_has_xnx(vcpu->kvm))
> + xn &= 0b10;
[Severity: Critical]
This is a pre-existing issue, but this change silently fixes a regression
in nested virtualization isolation.
In the old code for kvm_s2_trans_exec_el0 and kvm_s2_trans_exec_el1, the
masking was done using FIELD_PREP. Because xn was a u8 and FIELD_PREP
returned a 64-bit value shifted by 53 bits, the bitwise AND always
evaluated to 0.
As a result, the check bypassed the L1 hypervisor's Execute-Never
restrictions entirely when the host lacked the hardware feature.
Since xn &= 0b10 correctly resolves this in the new compute_s2_permissions
function, does it make sense to split this specific correction into a
separate commit so it can be easily backported to stable trees?
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260623184201.1518871-1-oupton@kernel.org?part=2
^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH 03/22] KVM: arm64: nv: Get rid of kvm_s2_trans*() accessors
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
2026-06-23 18:41 ` [PATCH 01/22] KVM: arm64: nv: Introduce struct for stage-2 walk step Oliver Upton
2026-06-23 18:41 ` [PATCH 02/22] KVM: arm64: nv: Consolidate computation of stage-2 permissions Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 18:41 ` [PATCH 04/22] KVM: arm64: nv: Only shadow writable-dirty guest descs as writable Oliver Upton
` (18 subsequent siblings)
21 siblings, 0 replies; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
Access the aptly named fields in kvm_s2_trans directly rather than
indirecting through useless getters.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
arch/arm64/include/asm/kvm_nested.h | 35 -----------------------------
arch/arm64/kvm/at.c | 6 ++---
arch/arm64/kvm/mmu.c | 20 ++++++++---------
arch/arm64/kvm/nested.c | 4 ++--
4 files changed, 15 insertions(+), 50 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index aa27f12cf2d4..3b36ed7c7608 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -98,47 +98,12 @@ struct kvm_s2_trans {
bool ux;
};
-static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans)
-{
- return trans->output;
-}
-
-static inline unsigned long kvm_s2_trans_size(struct kvm_s2_trans *trans)
-{
- return trans->block_size;
-}
-
-static inline u32 kvm_s2_trans_esr(struct kvm_s2_trans *trans)
-{
- return trans->esr;
-}
-
-static inline bool kvm_s2_trans_readable(struct kvm_s2_trans *trans)
-{
- return trans->readable;
-}
-
-static inline bool kvm_s2_trans_writable(struct kvm_s2_trans *trans)
-{
- return trans->writable;
-}
-
static inline bool kvm_has_xnx(struct kvm *kvm)
{
return cpus_have_final_cap(ARM64_HAS_XNX) &&
kvm_has_feat(kvm, ID_AA64MMFR1_EL1, XNX, IMP);
}
-static inline bool kvm_s2_trans_exec_el0(struct kvm *kvm, struct kvm_s2_trans *trans)
-{
- return trans->ux;
-}
-
-static inline bool kvm_s2_trans_exec_el1(struct kvm *kvm, struct kvm_s2_trans *trans)
-{
- return trans->px;
-}
-
extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
struct kvm_s2_trans *result);
extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu,
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 8263c648207b..86b499e7a9a0 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -494,14 +494,14 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
return ret;
}
- if (!kvm_s2_trans_readable(&s2_trans)) {
+ if (!s2_trans.readable) {
fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
true);
return -EPERM;
}
- ipa = kvm_s2_trans_output(&s2_trans);
+ ipa = s2_trans.output;
}
if (wi->filter) {
@@ -582,7 +582,7 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
new_desc |= PTE_AF;
if (new_desc != desc) {
- if (wi->s2 && !kvm_s2_trans_writable(&s2_trans)) {
+ if (wi->s2 && !s2_trans.writable) {
fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), true);
return -EPERM;
}
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 8811ad60cf72..07bd1e3ae9fb 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1572,9 +1572,9 @@ static int topup_mmu_memcache(struct kvm_vcpu *vcpu, void *memcache)
static enum kvm_pgtable_prot adjust_nested_fault_perms(struct kvm_s2_trans *nested,
enum kvm_pgtable_prot prot)
{
- if (!kvm_s2_trans_writable(nested))
+ if (!nested->writable)
prot &= ~KVM_PGTABLE_PROT_W;
- if (!kvm_s2_trans_readable(nested))
+ if (!nested->readable)
prot &= ~KVM_PGTABLE_PROT_R;
return prot | kvm_encode_nested_level(nested);
@@ -1584,9 +1584,9 @@ static enum kvm_pgtable_prot adjust_nested_exec_perms(struct kvm *kvm,
struct kvm_s2_trans *nested,
enum kvm_pgtable_prot prot)
{
- if (!kvm_s2_trans_exec_el0(kvm, nested))
+ if (!nested->ux)
prot &= ~KVM_PGTABLE_PROT_UX;
- if (!kvm_s2_trans_exec_el1(kvm, nested))
+ if (!nested->px)
prot &= ~KVM_PGTABLE_PROT_PX;
return prot;
@@ -1623,7 +1623,7 @@ static int gmem_abort(const struct kvm_s2_fault_desc *s2fd)
}
if (s2fd->nested)
- gfn = kvm_s2_trans_output(s2fd->nested) >> PAGE_SHIFT;
+ gfn = s2fd->nested->output >> PAGE_SHIFT;
else
gfn = s2fd->fault_ipa >> PAGE_SHIFT;
@@ -1817,7 +1817,7 @@ static short kvm_s2_resolve_vma_size(const struct kvm_s2_fault_desc *s2fd,
* can only create a block mapping if the guest stage 2 page
* table uses at least as big a mapping.
*/
- max_map_size = min(kvm_s2_trans_size(s2fd->nested), max_map_size);
+ max_map_size = min(s2fd->nested->block_size, max_map_size);
/*
* Be careful that if the mapping size falls between
@@ -1891,7 +1891,7 @@ static gfn_t get_canonical_gfn(const struct kvm_s2_fault_desc *s2fd,
if (!s2fd->nested)
return s2vi->gfn;
- ipa = kvm_s2_trans_output(s2fd->nested);
+ ipa = s2fd->nested->output;
return ALIGN_DOWN(ipa, s2vi->vma_pagesize) >> PAGE_SHIFT;
}
@@ -2322,19 +2322,19 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
}
if (ret) {
- esr = kvm_s2_trans_esr(&nested_trans);
+ esr = nested_trans.esr;
kvm_inject_s2_fault(vcpu, esr);
goto out_unlock;
}
ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
if (ret) {
- esr = kvm_s2_trans_esr(&nested_trans);
+ esr = nested_trans.esr;
kvm_inject_s2_fault(vcpu, esr);
goto out_unlock;
}
- ipa = kvm_s2_trans_output(&nested_trans);
+ ipa = nested_trans.output;
nested = &nested_trans;
}
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index c9300703bd0d..b247bc1d83fa 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -954,9 +954,9 @@ int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans)
if (kvm_vcpu_trap_is_iabt(vcpu)) {
if (vcpu_mode_priv(vcpu))
- forward_fault = !kvm_s2_trans_exec_el1(vcpu->kvm, trans);
+ forward_fault = !trans->px;
else
- forward_fault = !kvm_s2_trans_exec_el0(vcpu->kvm, trans);
+ forward_fault = !trans->ux;
} else {
bool write_fault = kvm_is_write_fault(vcpu);
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* [PATCH 04/22] KVM: arm64: nv: Only shadow writable-dirty guest descs as writable
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (2 preceding siblings ...)
2026-06-23 18:41 ` [PATCH 03/22] KVM: arm64: nv: Get rid of kvm_s2_trans*() accessors Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 18:58 ` sashiko-bot
2026-06-23 18:41 ` [PATCH 05/22] KVM: arm64: nv: Pass an access descriptor for stage-2 walks Oliver Upton
` (17 subsequent siblings)
21 siblings, 1 reply; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
KVM will soon add support for hardware dirty state management for nested
guests. In order to emulate dirty state transitions on the guest
desciptor, KVM will need to use read-only hardware mappings and re-walk
the guest page tables upon taking a permission fault.
Prepare by limiting shadow stage-2 and shadow VNCR translations to
read-only for writable-clean guest translations.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
arch/arm64/include/asm/kvm_nested.h | 2 ++
arch/arm64/kvm/at.c | 1 +
arch/arm64/kvm/mmu.c | 2 +-
arch/arm64/kvm/nested.c | 4 +++-
4 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index 3b36ed7c7608..7fe6fb56c187 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -94,6 +94,7 @@ struct kvm_s2_trans {
u32 esr;
bool writable;
bool readable;
+ bool dirty;
bool px;
bool ux;
};
@@ -323,6 +324,7 @@ struct s1_walk_result {
bool pr;
bool pw;
bool px;
+ bool dirty;
};
struct {
u8 fst;
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 86b499e7a9a0..7a84495a2e6d 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -1317,6 +1317,7 @@ static void compute_s1_permissions(struct kvm_vcpu *vcpu,
(pan3_enabled(vcpu, wi->regime) && wr->ux));
wr->pw &= !pan;
wr->pr &= !pan;
+ wr->dirty = !(wr->desc & BIT(7));
}
static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par)
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 07bd1e3ae9fb..f35c4ce95473 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1572,7 +1572,7 @@ static int topup_mmu_memcache(struct kvm_vcpu *vcpu, void *memcache)
static enum kvm_pgtable_prot adjust_nested_fault_perms(struct kvm_s2_trans *nested,
enum kvm_pgtable_prot prot)
{
- if (!nested->writable)
+ if (!(nested->writable && nested->dirty))
prot &= ~KVM_PGTABLE_PROT_W;
if (!nested->readable)
prot &= ~KVM_PGTABLE_PROT_R;
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index b247bc1d83fa..dcc7d0cc7c95 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -269,6 +269,8 @@ static void compute_s2_permissions(struct kvm_vcpu *vcpu, struct s2_walk_info *w
trans->readable = s2ap & BIT(0);
trans->writable = s2ap & BIT(1);
+
+ trans->dirty = ws->desc & BIT(7);
}
/*
@@ -1603,7 +1605,7 @@ static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu)
vt->cpu = smp_processor_id();
- if (vt->hpa_writable && vt->wr.pw && vt->wr.pr)
+ if (vt->hpa_writable && vt->wr.pw && vt->wr.dirty && vt->wr.pr)
prot = PAGE_KERNEL;
else if (vt->wr.pr)
prot = PAGE_KERNEL_RO;
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* Re: [PATCH 04/22] KVM: arm64: nv: Only shadow writable-dirty guest descs as writable
2026-06-23 18:41 ` [PATCH 04/22] KVM: arm64: nv: Only shadow writable-dirty guest descs as writable Oliver Upton
@ 2026-06-23 18:58 ` sashiko-bot
2026-06-23 20:05 ` Oliver Upton
0 siblings, 1 reply; 40+ messages in thread
From: sashiko-bot @ 2026-06-23 18:58 UTC (permalink / raw)
To: Oliver Upton; +Cc: kvmarm, Marc Zyngier
Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
New issues:
- [High] Uninitialized or incorrectly evaluated `wr->dirty` field during Stage-1 page table walks causes an infinite VNCR trap loop.
Pre-existing issues:
- [High] Write-Only VNCR mappings cause an infinite trap loop because KVM falls back to `PAGE_KERNEL_NONE` but considers write faults resolved.
--
commit fb5a66a1eb4ac6175aad6cbbd38f45328f4f937b
Author: Oliver Upton <oupton@kernel.org>
KVM: arm64: nv: Only shadow writable-dirty guest descs as writable
KVM will soon add support for hardware dirty state management for nested
guests. In order to emulate dirty state transitions on the guest
desciptor, KVM will need to use read-only hardware mappings and re-walk
the guest page tables upon taking a permission fault.
Prepare by limiting shadow stage-2 and shadow VNCR translations to
read-only for writable-clean guest translations.
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> index 86b499e7a9a06..7a84495a2e6d4 100644
> --- a/arch/arm64/kvm/at.c
> +++ b/arch/arm64/kvm/at.c
[ ... ]
> @@ -1317,6 +1317,7 @@ static void compute_s1_permissions(struct kvm_vcpu *vcpu,
> (pan3_enabled(vcpu, wi->regime) && wr->ux));
> wr->pw &= !pan;
> wr->pr &= !pan;
> + wr->dirty = !(wr->desc & BIT(7));
[Severity: High]
Could the unconditional use of BIT(7) here cause issues when S1PIE is
enabled?
With S1PIE, BIT(7) becomes PIIndex[3], so using it to evaluate the dirty
state might be incorrect.
Also, if the Stage-1 MMU is disabled, __kvm_translate_va() returns early
without initializing wr->dirty:
arch/arm64/kvm/at.c:__kvm_translate_va() {
if (wr->level == S1_MMU_DISABLED) {
wr->ur = wr->uw = wr->ux = true;
wr->pr = wr->pw = wr->px = true;
return 0;
}
...
}
Since wr->dirty remains false (0-initialized), could this lead to an
infinite VNCR trap loop? If wr->dirty is false, the shadow fixmap mapping
is downgraded to PAGE_KERNEL_RO in kvm_map_l1_vncr().
When a guest writes to the redirected register, kvm_handle_vncr_abort()
sees vt->wr.pw is true, considers the fault handled without injecting a
permission fault, and the hardware immediately retries and traps again.
> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> index b247bc1d83fa7..dcc7d0cc7c952 100644
> --- a/arch/arm64/kvm/nested.c
> +++ b/arch/arm64/kvm/nested.c
[ ... ]
> @@ -1603,7 +1605,7 @@ static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu)
>
> vt->cpu = smp_processor_id();
>
> - if (vt->hpa_writable && vt->wr.pw && vt->wr.pr)
> + if (vt->hpa_writable && vt->wr.pw && vt->wr.dirty && vt->wr.pr)
> prot = PAGE_KERNEL;
> else if (vt->wr.pr)
> prot = PAGE_KERNEL_RO;
[Severity: High]
This is a pre-existing issue, but how does this code handle Write-Only VNCR
mappings?
If a nested guest maps its VNCR page as Write-Only (which is possible using
POE/S1PIE), vt->wr.pr will be false. The logic here then falls through to
mapping it as PAGE_NONE:
arch/arm64/kvm/nested.c:kvm_map_l1_vncr() {
...
else if (vt->wr.pr)
prot = PAGE_KERNEL_RO;
else
prot = PAGE_NONE;
...
}
When the guest writes to the register, kvm_handle_vncr_abort() sees
vt->wr.pw is true, assumes the write fault is permitted, and returns 1
without injecting a fault.
The hardware then retries against PAGE_NONE and immediately traps again,
creating an infinite loop that can lock up the host CPU.
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260623184201.1518871-1-oupton@kernel.org?part=4
^ permalink raw reply [flat|nested] 40+ messages in thread* Re: [PATCH 04/22] KVM: arm64: nv: Only shadow writable-dirty guest descs as writable
2026-06-23 18:58 ` sashiko-bot
@ 2026-06-23 20:05 ` Oliver Upton
0 siblings, 0 replies; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 20:05 UTC (permalink / raw)
To: sashiko-reviews; +Cc: kvmarm, Marc Zyngier
On Tue, Jun 23, 2026 at 06:58:09PM +0000, sashiko-bot@kernel.org wrote:
> Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
>
> New issues:
> - [High] Uninitialized or incorrectly evaluated `wr->dirty` field during Stage-1 page table walks causes an infinite VNCR trap loop.
>
> Pre-existing issues:
> - [High] Write-Only VNCR mappings cause an infinite trap loop because KVM falls back to `PAGE_KERNEL_NONE` but considers write faults resolved.
> --
>
> commit fb5a66a1eb4ac6175aad6cbbd38f45328f4f937b
> Author: Oliver Upton <oupton@kernel.org>
>
> KVM: arm64: nv: Only shadow writable-dirty guest descs as writable
>
> KVM will soon add support for hardware dirty state management for nested
> guests. In order to emulate dirty state transitions on the guest
> desciptor, KVM will need to use read-only hardware mappings and re-walk
> the guest page tables upon taking a permission fault.
>
> Prepare by limiting shadow stage-2 and shadow VNCR translations to
> read-only for writable-clean guest translations.
>
> > diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> > index 86b499e7a9a06..7a84495a2e6d4 100644
> > --- a/arch/arm64/kvm/at.c
> > +++ b/arch/arm64/kvm/at.c
> [ ... ]
> > @@ -1317,6 +1317,7 @@ static void compute_s1_permissions(struct kvm_vcpu *vcpu,
> > (pan3_enabled(vcpu, wi->regime) && wr->ux));
> > wr->pw &= !pan;
> > wr->pr &= !pan;
> > + wr->dirty = !(wr->desc & BIT(7));
>
> [Severity: High]
> Could the unconditional use of BIT(7) here cause issues when S1PIE is
> enabled?
>
> With S1PIE, BIT(7) becomes PIIndex[3], so using it to evaluate the dirty
> state might be incorrect.
Bit 7 is nDirty when using indirect permissions, so this is wrong.
Thanks,
Oliver
^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH 05/22] KVM: arm64: nv: Pass an access descriptor for stage-2 walks
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (3 preceding siblings ...)
2026-06-23 18:41 ` [PATCH 04/22] KVM: arm64: nv: Only shadow writable-dirty guest descs as writable Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 19:06 ` sashiko-bot
2026-06-23 18:41 ` [PATCH 06/22] KVM: arm64: nv: Use a helper for stage-2 descriptor updates Oliver Upton
` (16 subsequent siblings)
21 siblings, 1 reply; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
Pass sufficient context to the stage-2 PTW such that access-dependent
features like FEAT_HAFDBS can determine the correct behavior of the
walk.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
arch/arm64/include/asm/kvm_nested.h | 15 ++++++++++++++-
arch/arm64/kvm/at.c | 23 +++++++++++++++++++----
arch/arm64/kvm/mmu.c | 16 +++++++++++++++-
arch/arm64/kvm/nested.c | 10 +++++-----
4 files changed, 53 insertions(+), 11 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index 7fe6fb56c187..71814c4aac3e 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -105,7 +105,20 @@ static inline bool kvm_has_xnx(struct kvm *kvm)
kvm_has_feat(kvm, ID_AA64MMFR1_EL1, XNX, IMP);
}
-extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
+struct kvm_walk_access {
+ enum {
+ WALK_ACCESS_IFETCH,
+ WALK_ACCESS_LDST,
+ WALK_ACCESS_CMO,
+ WALK_ACCESS_AT,
+ WALK_ACCESS_S1PTW,
+ } type;
+
+ u64 ia;
+ bool write;
+};
+
+extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, struct kvm_walk_access *access,
struct kvm_s2_trans *result);
extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu,
struct kvm_s2_trans *trans);
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 7a84495a2e6d..083014e9d86a 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -483,7 +483,19 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
ipa = baddr | index;
if (wi->s2) {
- ret = kvm_walk_nested_s2(vcpu, ipa, &s2_trans);
+ struct kvm_walk_access s2_access = {
+ .type = WALK_ACCESS_S1PTW,
+ .ia = ipa,
+
+ /*
+ * R_JCXVS, stage-2 dirty state can be updated
+ * for an S1PTW even if the stage-1 descriptor
+ * isn't updated.
+ */
+ .write = wi->ha,
+ };
+
+ ret = kvm_walk_nested_s2(vcpu, &s2_access, &s2_trans);
if (ret == -EAGAIN)
return ret;
@@ -1597,8 +1609,9 @@ int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
+ struct kvm_walk_access access = {};
struct kvm_s2_trans out = {};
- u64 ipa, par;
+ u64 par;
bool write;
int ret;
@@ -1642,9 +1655,11 @@ int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
return 0;
/* Do the stage-2 translation */
- ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
+ access.type = WALK_ACCESS_AT;
+ access.ia = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
out.esr = 0;
- ret = kvm_walk_nested_s2(vcpu, ipa, &out);
+
+ ret = kvm_walk_nested_s2(vcpu, &access, &out);
if (ret < 0)
return ret;
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index f35c4ce95473..88998195274b 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -2313,9 +2313,23 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
*/
if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) &&
vcpu->arch.hw_mmu->nested_stage2_enabled) {
+ struct kvm_walk_access access = {
+ .ia = fault_ipa,
+ };
u32 esr;
- ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
+ if (kvm_vcpu_abt_iss1tw(vcpu))
+ access.type = WALK_ACCESS_S1PTW;
+ else if (is_iabt)
+ access.type = WALK_ACCESS_IFETCH;
+ else if (kvm_vcpu_dabt_is_cm(vcpu))
+ access.type = WALK_ACCESS_CMO;
+ else
+ access.type = WALK_ACCESS_LDST;
+
+ access.write = kvm_is_write_fault(vcpu);
+
+ ret = kvm_walk_nested_s2(vcpu, &access, &nested_trans);
if (ret == -EAGAIN) {
ret = 1;
goto out_unlock;
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index dcc7d0cc7c95..c2fb7290f0c8 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -280,7 +280,7 @@ static void compute_s2_permissions(struct kvm_vcpu *vcpu, struct s2_walk_info *w
*
* Must be called with the kvm->srcu read lock held
*/
-static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa,
+static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, struct kvm_walk_access *access,
struct s2_walk_info *wi, struct kvm_s2_trans *out)
{
int first_block_level, stride, input_size, base_lower_bound;
@@ -330,7 +330,7 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa,
phys_addr_t index;
addr_bottom = (3 - ws.level) * stride + wi->pgshift;
- index = (ipa & GENMASK_ULL(addr_top, addr_bottom))
+ index = (access->ia & GENMASK_ULL(addr_top, addr_bottom))
>> (addr_bottom - 3);
ws.desc_pa = base_addr | index;
@@ -412,7 +412,7 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa,
/* Calculate and return the result */
out->output = (ws.desc & GENMASK_ULL(47, addr_bottom)) |
- (ipa & GENMASK_ULL(addr_bottom - 1, 0));
+ (access->ia & GENMASK_ULL(addr_bottom - 1, 0));
out->block_size = 1UL << ((3 - ws.level) * stride + wi->pgshift);
compute_s2_permissions(vcpu, wi, &ws, out);
out->level = ws.level;
@@ -515,7 +515,7 @@ static void setup_s2_walk(struct kvm_vcpu *vcpu, struct s2_walk_info *wi)
wi->be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE;
}
-int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
+int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, struct kvm_walk_access *access,
struct kvm_s2_trans *result)
{
struct s2_walk_info wi;
@@ -528,7 +528,7 @@ int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
setup_s2_walk(vcpu, &wi);
- ret = walk_nested_s2_pgd(vcpu, gipa, &wi, result);
+ ret = walk_nested_s2_pgd(vcpu, access, &wi, result);
if (ret)
result->esr |= (kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC);
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* Re: [PATCH 05/22] KVM: arm64: nv: Pass an access descriptor for stage-2 walks
2026-06-23 18:41 ` [PATCH 05/22] KVM: arm64: nv: Pass an access descriptor for stage-2 walks Oliver Upton
@ 2026-06-23 19:06 ` sashiko-bot
0 siblings, 0 replies; 40+ messages in thread
From: sashiko-bot @ 2026-06-23 19:06 UTC (permalink / raw)
To: Oliver Upton; +Cc: Marc Zyngier, kvmarm
Thank you for your contribution! Sashiko AI review found 5 potential issue(s) to consider:
Pre-existing issues:
- [Critical] __kvm_at_s12 invokes a nested stage-2 page table walk without holding the KVM SRCU read lock, causing a Use-After-Free.
- [Critical] Missing SRCU read lock in Synchronous External Abort (SEA) handling causes a Use-After-Free on memslots during __kvm_find_s1_desc_level.
- [High] kvm_s2_handle_perm_fault incorrectly checks execute permissions instead of read/write permissions for a stage-1 PTW triggered by an instruction fetch.
- [High] __kvm_at_s12 corrupts the intermediate physical address (IPA) for guests using 16KB or 64KB translation granules.
- [High] __kvm_at_s12 silently truncates 52-bit intermediate physical addresses when reading the stage-1 translation result from PAR_EL1.
--
commit 2325baae11b7348a33c6e1cec8c8a1a565d36558
Author: Oliver Upton <oupton@kernel.org>
Subject: KVM: arm64: nv: Pass an access descriptor for stage-2 walks
This commit modifies the stage-2 page table walk API to take an access
descriptor instead of just the IPA. This provides sufficient context to the
PTW to determine the correct behavior for access-dependent features like
FEAT_HAFDBS.
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> index 7a84495a2e6d4..083014e9d86a3 100644
> --- a/arch/arm64/kvm/at.c
> +++ b/arch/arm64/kvm/at.c
> @@ -1642,9 +1655,11 @@ int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
> return 0;
>
> /* Do the stage-2 translation */
> - ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
> + access.type = WALK_ACCESS_AT;
> + access.ia = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
[Severity: High]
This is a pre-existing issue, but does this bitwise AND operation silently
truncate 52-bit intermediate physical addresses?
If the CPU and guest support 52-bit physical addresses (FEAT_LPA or
FEAT_LPA2), the physical address occupies bits up to 51. The
GENMASK_ULL(47, 12) truncates bits [51:48], feeding a 48-bit address into
kvm_walk_nested_s2() which breaks nested guests utilizing memory addresses
beyond the 48-bit boundary.
[Severity: High]
This isn't a bug introduced by this patch, but does this assume a 4KB granule
and corrupt the IPA for guests using 16KB or 64KB translation granules?
For 16KB and 64KB translation granules, the page offset spans 14 bits and
16 bits respectively. The hardware returns a PAR_EL1 where bits [15:12] are
RES0 for a 64KB granule. Because the offset is extracted exclusively from
vaddr & GENMASK_ULL(11, 0), bits [15:12] of the reconstructed IPA will be
zeroed out, causing the stage-2 walk to use the wrong address.
> out.esr = 0;
> - ret = kvm_walk_nested_s2(vcpu, ipa, &out);
> +
> + ret = kvm_walk_nested_s2(vcpu, &access, &out);
[Severity: Critical]
This is a pre-existing issue, but does this invoke a nested stage-2 page
table walk without holding the KVM SRCU read lock?
kvm_walk_nested_s2() reads from guest memory using kvm_read_guest(), which
dereferences the RCU-protected memslots array. If host userspace unmaps or
modifies the VM's memslots concurrently while the nested hypervisor executes
an AT S12 instruction, can this result in a use-after-free condition leading
to host kernel memory corruption?
[ ... ]
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index f35c4ce954731..88998195274b9 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -2313,9 +2313,23 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
> */
> if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) &&
> vcpu->arch.hw_mmu->nested_stage2_enabled) {
> + struct kvm_walk_access access = {
> + .ia = fault_ipa,
> + };
> u32 esr;
>
> - ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
> + if (kvm_vcpu_abt_iss1tw(vcpu))
> + access.type = WALK_ACCESS_S1PTW;
> + else if (is_iabt)
> + access.type = WALK_ACCESS_IFETCH;
> + else if (kvm_vcpu_dabt_is_cm(vcpu))
> + access.type = WALK_ACCESS_CMO;
> + else
> + access.type = WALK_ACCESS_LDST;
> +
> + access.write = kvm_is_write_fault(vcpu);
> +
> + ret = kvm_walk_nested_s2(vcpu, &access, &nested_trans);
> if (ret == -EAGAIN) {
> ret = 1;
> goto out_unlock;
[Severity: High]
This isn't a bug introduced by this patch, but does kvm_s2_handle_perm_fault()
incorrectly check execute permissions instead of read/write permissions for a
stage-1 PTW triggered by an instruction fetch?
When an instruction fetch triggers a stage-1 page table walk (S1PTW) that
causes a stage-2 permission fault, the resulting exception is reported as an
instruction abort (IABT) with the S1PTW bit set. A page table walk is a data
read, so the hardware checks stage-2 read/write permissions.
However, kvm_s2_handle_perm_fault() evaluates this as an instruction fetch:
arch/arm64/kvm/nested.c:kvm_s2_handle_perm_fault() {
...
if (kvm_vcpu_trap_is_iabt(vcpu)) {
if (vcpu_mode_priv(vcpu))
forward_fault = !trans->px;
else
forward_fault = !trans->ux;
} else {
...
}
Because stage-2 page tables mapping the stage-1 page tables are typically
marked Execute-Never (XN), will KVM spuriously inject a stage-2 permission
fault into the nested hypervisor?
[Severity: Critical]
This is a pre-existing issue, but does handling a Synchronous External Abort
(SEA) without the SRCU read lock cause a use-after-free during
__kvm_find_s1_desc_level()?
When KVM handles an SEA fault, kvm_handle_guest_abort() delegates to
kvm_inject_sea() before acquiring the SRCU read lock:
arch/arm64/kvm/mmu.c:kvm_handle_guest_abort() {
...
if (esr_fsc_is_translation_fault(esr)) {
/* Beyond sanitised PARange (which is the IPA limit) */
if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
kvm_inject_size_fault(vcpu);
return 1;
}
/* Falls between the IPA range and the PARange? */
if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) {
fault_ipa |= FAR_TO_FIPA_OFFSET(kvm_vcpu_get_hfar(vcpu));
return kvm_inject_sea(vcpu, is_iabt, fault_ipa);
}
}
...
}
If the SEA occurred during a stage-1 page table walk (S1PTW), it cascades
to inject_abt64(), which calls __kvm_find_s1_desc_level():
arch/arm64/kvm/inject_fault.c:inject_abt64() {
...
ret = __kvm_find_s1_desc_level(vcpu, addr, hpfar, &level);
...
}
This ultimately calls walk_s1() which relies on kvm_read_guest() to read the
descriptors. Because the SRCU read lock is not held, can the memslot lookup
race with concurrent host userspace memslot modifications, causing a
use-after-free?
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260623184201.1518871-1-oupton@kernel.org?part=5
^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH 06/22] KVM: arm64: nv: Use a helper for stage-2 descriptor updates
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (4 preceding siblings ...)
2026-06-23 18:41 ` [PATCH 05/22] KVM: arm64: nv: Pass an access descriptor for stage-2 walks Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 18:41 ` [PATCH 07/22] KVM: arm64: nv: Set dirty state at stage-2 Oliver Upton
` (15 subsequent siblings)
21 siblings, 0 replies; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
Use a helper for handling stage-2 descriptor updates so the
implementation can be shared with updates to table descriptors.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
arch/arm64/kvm/nested.c | 47 +++++++++++++++++++++++------------------
1 file changed, 26 insertions(+), 21 deletions(-)
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index c2fb7290f0c8..a70af3b3f05d 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -227,9 +227,23 @@ static int read_guest_s2_desc(struct kvm_vcpu *vcpu, struct s2_walk_step *ws,
return 0;
}
-static int swap_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 old, u64 new,
- struct s2_walk_info *wi)
+static int handle_desc_update(struct kvm_vcpu *vcpu, struct s2_walk_info *wi,
+ struct s2_walk_step *ws, struct kvm_s2_trans *out,
+ struct kvm_walk_access *access)
{
+ u64 old, new;
+ int ret;
+
+ old = new = ws->desc;
+
+ if (wi->ha)
+ new |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
+
+ if (old == new)
+ return 0;
+
+ ws->desc = new;
+
if (wi->be) {
old = (__force u64)cpu_to_be64(old);
new = (__force u64)cpu_to_be64(new);
@@ -238,7 +252,13 @@ static int swap_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 old, u6
new = (__force u64)cpu_to_le64(new);
}
- return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
+ ret = __kvm_at_swap_desc(vcpu->kvm, ws->desc_pa, old, new);
+ if (!ret || ret == -EAGAIN)
+ return ret;
+
+ out->esr = ESR_ELx_FSC_SEA_TTW(ws->level);
+ out->desc = ws->desc;
+ return 1;
}
static void compute_s2_permissions(struct kvm_vcpu *vcpu, struct s2_walk_info *wi,
@@ -287,7 +307,6 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, struct kvm_walk_access *acc
struct s2_walk_step ws = {};
phys_addr_t base_addr;
unsigned int addr_top, addr_bottom;
- u64 new_desc; /* page table entry */
int ret;
switch (BIT(wi->pgshift)) {
@@ -340,8 +359,6 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, struct kvm_walk_access *acc
return ret;
}
- new_desc = ws.desc;
-
/* Check for valid descriptor at this point */
if (!(ws.desc & KVM_PTE_VALID)) {
out->esr = compute_fsc(ws.level, ESR_ELx_FSC_FAULT);
@@ -386,21 +403,9 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, struct kvm_walk_access *acc
return 1;
}
- if (wi->ha)
- new_desc |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
-
- if (new_desc != ws.desc) {
- ret = swap_guest_s2_desc(vcpu, ws.desc_pa, ws.desc, new_desc, wi);
- if (ret == -EAGAIN)
- return ret;
- if (ret) {
- out->esr = ESR_ELx_FSC_SEA_TTW(ws.level);
- out->desc = ws.desc;
- return 1;
- }
-
- ws.desc = new_desc;
- }
+ ret = handle_desc_update(vcpu, wi, &ws, out, access);
+ if (ret)
+ return ret;
if (!(ws.desc & KVM_PTE_LEAF_ATTR_LO_S2_AF)) {
out->esr = compute_fsc(ws.level, ESR_ELx_FSC_ACCESS);
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* [PATCH 07/22] KVM: arm64: nv: Set dirty state at stage-2
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (5 preceding siblings ...)
2026-06-23 18:41 ` [PATCH 06/22] KVM: arm64: nv: Use a helper for stage-2 descriptor updates Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 19:03 ` sashiko-bot
2026-06-23 18:41 ` [PATCH 08/22] KVM: arm64: nv: Treat DBM as writable " Oliver Upton
` (14 subsequent siblings)
21 siblings, 1 reply; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
Set the dirty state on descriptors at stage-2 for write accesses when
DBM is set.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
arch/arm64/kvm/nested.c | 43 +++++++++++++++++++++++++++++++----------
1 file changed, 33 insertions(+), 10 deletions(-)
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index a70af3b3f05d..e5a407fc0880 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -132,6 +132,7 @@ struct s2_walk_info {
unsigned int t0sz;
bool be;
bool ha;
+ bool hd;
};
struct s2_walk_step {
@@ -227,6 +228,20 @@ static int read_guest_s2_desc(struct kvm_vcpu *vcpu, struct s2_walk_step *ws,
return 0;
}
+static bool should_set_dirty_state(struct s2_walk_info *wi, struct s2_walk_step *ws,
+ struct kvm_s2_trans *out, struct kvm_walk_access *access)
+{
+ switch (access->type) {
+ /* R_RKMHW */
+ case WALK_ACCESS_CMO:
+ case WALK_ACCESS_AT:
+ return false;
+ default:
+ /* R_NSXRD */
+ return access->write && wi->hd && out->writable;
+ }
+}
+
static int handle_desc_update(struct kvm_vcpu *vcpu, struct s2_walk_info *wi,
struct s2_walk_step *ws, struct kvm_s2_trans *out,
struct kvm_walk_access *access)
@@ -239,6 +254,9 @@ static int handle_desc_update(struct kvm_vcpu *vcpu, struct s2_walk_info *wi,
if (wi->ha)
new |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
+ if (should_set_dirty_state(wi, ws, out, access))
+ new |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
+
if (old == new)
return 0;
@@ -403,16 +421,6 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, struct kvm_walk_access *acc
return 1;
}
- ret = handle_desc_update(vcpu, wi, &ws, out, access);
- if (ret)
- return ret;
-
- if (!(ws.desc & KVM_PTE_LEAF_ATTR_LO_S2_AF)) {
- out->esr = compute_fsc(ws.level, ESR_ELx_FSC_ACCESS);
- out->desc = ws.desc;
- return 1;
- }
-
addr_bottom += contiguous_bit_shift(ws.desc, wi, ws.level);
/* Calculate and return the result */
@@ -422,6 +430,20 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, struct kvm_walk_access *acc
compute_s2_permissions(vcpu, wi, &ws, out);
out->level = ws.level;
out->desc = ws.desc;
+
+ ret = handle_desc_update(vcpu, wi, &ws, out, access);
+ if (ret)
+ return ret;
+
+ if (!(ws.desc & KVM_PTE_LEAF_ATTR_LO_S2_AF)) {
+ *out = (struct kvm_s2_trans) {
+ .esr = compute_fsc(ws.level, ESR_ELx_FSC_ACCESS),
+ .desc = ws.desc,
+ };
+
+ return 1;
+ }
+
return 0;
}
@@ -518,6 +540,7 @@ static void setup_s2_walk(struct kvm_vcpu *vcpu, struct s2_walk_info *wi)
ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr), false));
wi->ha = vtcr & VTCR_EL2_HA;
wi->be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE;
+ wi->hd = wi->ha && (vtcr & VTCR_EL2_HD);
}
int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, struct kvm_walk_access *access,
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* Re: [PATCH 07/22] KVM: arm64: nv: Set dirty state at stage-2
2026-06-23 18:41 ` [PATCH 07/22] KVM: arm64: nv: Set dirty state at stage-2 Oliver Upton
@ 2026-06-23 19:03 ` sashiko-bot
0 siblings, 0 replies; 40+ messages in thread
From: sashiko-bot @ 2026-06-23 19:03 UTC (permalink / raw)
To: Oliver Upton; +Cc: Marc Zyngier, kvmarm
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Medium] The kvm_s2_trans struct retains stale descriptor values and permissions because it is populated prior to handle_desc_update(), causing KVM to unnecessarily force a second VM exit for write faults on clean DBM pages.
--
commit c42a40795b5eef86805c06b6623a0ccb0c1d888d
Author: Oliver Upton <oupton@kernel.org>
KVM: arm64: nv: Set dirty state at stage-2
Set the dirty state on descriptors at stage-2 for write accesses when
DBM is set.
> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> index a70af3b3f05db..e5a407fc0880f 100644
> --- a/arch/arm64/kvm/nested.c
> +++ b/arch/arm64/kvm/nested.c
[ ... ]
> @@ -422,6 +430,20 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, struct kvm_walk_access *acc
> compute_s2_permissions(vcpu, wi, &ws, out);
> out->level = ws.level;
> out->desc = ws.desc;
> +
> + ret = handle_desc_update(vcpu, wi, &ws, out, access);
[Severity: Medium]
Will this result in KVM mapping the shadow PTE read-only on the first fault,
causing a second VM exit?
Because compute_s2_permissions() calculates the out->dirty state based on the
pre-update descriptor, it seems out->dirty will remain false even if
handle_desc_update() successfully sets the dirty bit in guest memory.
When the walk returns, KVM's MMU consumes this in adjust_nested_fault_perms()
and strips KVM_PGTABLE_PROT_W, causing the L2 guest to retry the write and
immediately trigger a second VM exit.
Should out->dirty be updated to reflect the newly set dirty bit after
handle_desc_update() succeeds?
> + if (ret)
> + return ret;
[ ... ]
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260623184201.1518871-1-oupton@kernel.org?part=7
^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH 08/22] KVM: arm64: nv: Treat DBM as writable at stage-2
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (6 preceding siblings ...)
2026-06-23 18:41 ` [PATCH 07/22] KVM: arm64: nv: Set dirty state at stage-2 Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 18:55 ` sashiko-bot
2026-06-23 18:41 ` [PATCH 09/22] KVM: arm64: Compute S1 permissions as part of s1_walk() Oliver Upton
` (13 subsequent siblings)
21 siblings, 1 reply; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
When using direct permissions, the DBM bit has the effect of granting
the write permission. And, depending on the access, it could grant write
permission without a dirty state update.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
arch/arm64/include/asm/kvm_pgtable.h | 1 +
arch/arm64/kvm/nested.c | 11 +++++++++++
2 files changed, 12 insertions(+)
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 22aeb2ed18d1..6ae36973686c 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -94,6 +94,7 @@ typedef u64 kvm_pte_t;
#define KVM_PTE_LEAF_ATTR_HI_S1_PXN BIT(53)
#define KVM_PTE_LEAF_ATTR_HI_S2_XN GENMASK(54, 53)
+#define KVM_PTE_LEAF_ATTR_HI_S2_DBM BIT(51)
#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50)
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index e5a407fc0880..4f13f37e560b 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -305,6 +305,17 @@ static void compute_s2_permissions(struct kvm_vcpu *vcpu, struct s2_walk_info *w
break;
}
+ /*
+ * Descriptors with the DBM bit set while hardware dirty state are
+ * considered writable, even though certain accesses (like AT instructions)
+ * don't actually update the dirty state.
+ *
+ * Assume that walk_nestd_s2_pgd() made the necessary descriptor updates
+ * for the access and just treat DBM as writable here.
+ */
+ if (wi->hd && ws->desc & KVM_PTE_LEAF_ATTR_HI_S2_DBM)
+ s2ap |= BIT(1);
+
trans->readable = s2ap & BIT(0);
trans->writable = s2ap & BIT(1);
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* Re: [PATCH 08/22] KVM: arm64: nv: Treat DBM as writable at stage-2
2026-06-23 18:41 ` [PATCH 08/22] KVM: arm64: nv: Treat DBM as writable " Oliver Upton
@ 2026-06-23 18:55 ` sashiko-bot
2026-06-23 20:08 ` Oliver Upton
0 siblings, 1 reply; 40+ messages in thread
From: sashiko-bot @ 2026-06-23 18:55 UTC (permalink / raw)
To: Oliver Upton; +Cc: kvmarm, Marc Zyngier
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Critical] DBM incorrectly grants write permission to "No access" stage-2 pages, breaking nested guest isolation.
--
commit 99b59d5177f5650f344f4c618e4c4dcd2d5d6d1c
Author: Oliver Upton <oupton@kernel.org>
KVM: arm64: nv: Treat DBM as writable at stage-2
This commit modifies the stage-2 permission computation to treat the DBM bit
as granting write permission when hardware dirty state is enabled.
> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> --- a/arch/arm64/kvm/nested.c
> +++ b/arch/arm64/kvm/nested.c
[ ... ]
> @@ -305,6 +305,17 @@ static void compute_s2_permissions(struct kvm_vcpu *vcpu, struct s2_walk_info *w
> break;
> }
>
> + /*
> + * Descriptors with the DBM bit set while hardware dirty state are
> + * considered writable, even though certain accesses (like AT instructions)
> + * don't actually update the dirty state.
> + *
> + * Assume that walk_nestd_s2_pgd() made the necessary descriptor updates
> + * for the access and just treat DBM as writable here.
> + */
> + if (wi->hd && ws->desc & KVM_PTE_LEAF_ATTR_HI_S2_DBM)
> + s2ap |= BIT(1);
[Severity: Critical]
If s2ap is initially 0b00 (No access), this unconditional OR operation will
upgrade it to 0b10 (Write-only) in compute_s2_permissions(). Does this
inadvertently bypass stage-2 memory protections if an L1 hypervisor maps a
page as No access but leaves the DBM bit set?
Without verifying that read access is already granted via s2ap & BIT(0),
this logic grants write permission to a No access page, allowing an L2 guest
to write to memory explicitly protected by the L1 hypervisor.
Additionally, mapping a stage-2 page as Write-only on hosts without
FEAT_S2FWB is architecturally unpredictable and might result in host-level
issues.
Could we gate the DBM write permission upgrade so it only applies if read
access is already established?
> trans->readable = s2ap & BIT(0);
> trans->writable = s2ap & BIT(1);
>
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260623184201.1518871-1-oupton@kernel.org?part=8
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 08/22] KVM: arm64: nv: Treat DBM as writable at stage-2
2026-06-23 18:55 ` sashiko-bot
@ 2026-06-23 20:08 ` Oliver Upton
0 siblings, 0 replies; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 20:08 UTC (permalink / raw)
To: sashiko-reviews; +Cc: kvmarm, Marc Zyngier
On Tue, Jun 23, 2026 at 06:55:33PM +0000, sashiko-bot@kernel.org wrote:
> Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
> - [Critical] DBM incorrectly grants write permission to "No access" stage-2 pages, breaking nested guest isolation.
> --
>
> commit 99b59d5177f5650f344f4c618e4c4dcd2d5d6d1c
> Author: Oliver Upton <oupton@kernel.org>
>
> KVM: arm64: nv: Treat DBM as writable at stage-2
>
> This commit modifies the stage-2 permission computation to treat the DBM bit
> as granting write permission when hardware dirty state is enabled.
>
> > diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> > --- a/arch/arm64/kvm/nested.c
> > +++ b/arch/arm64/kvm/nested.c
> [ ... ]
> > @@ -305,6 +305,17 @@ static void compute_s2_permissions(struct kvm_vcpu *vcpu, struct s2_walk_info *w
> > break;
> > }
> >
> > + /*
> > + * Descriptors with the DBM bit set while hardware dirty state are
> > + * considered writable, even though certain accesses (like AT instructions)
> > + * don't actually update the dirty state.
> > + *
> > + * Assume that walk_nestd_s2_pgd() made the necessary descriptor updates
> > + * for the access and just treat DBM as writable here.
> > + */
> > + if (wi->hd && ws->desc & KVM_PTE_LEAF_ATTR_HI_S2_DBM)
> > + s2ap |= BIT(1);
>
> [Severity: Critical]
> If s2ap is initially 0b00 (No access), this unconditional OR operation will
> upgrade it to 0b10 (Write-only) in compute_s2_permissions(). Does this
> inadvertently bypass stage-2 memory protections if an L1 hypervisor maps a
> page as No access but leaves the DBM bit set?
>
> Without verifying that read access is already granted via s2ap & BIT(0),
> this logic grants write permission to a No access page, allowing an L2 guest
> to write to memory explicitly protected by the L1 hypervisor.
This is wrong.
R_XZFQH defines a descriptor with DBM set that would otherwise generate
a write permission fault as writable-clean. That means the descriptor
grants write permission and hardware can relax it to writable-dirty
depending on the access.
Thanks,
Oliver
^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH 09/22] KVM: arm64: Compute S1 permissions as part of s1_walk()
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (7 preceding siblings ...)
2026-06-23 18:41 ` [PATCH 08/22] KVM: arm64: nv: Treat DBM as writable " Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 18:41 ` [PATCH 10/22] KVM: arm64: Plumb through access descriptor for stage-1 Oliver Upton
` (12 subsequent siblings)
21 siblings, 0 replies; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
Implementing support for hardware dirty state means that the table
walker needs to have visibility into the permissions on the final
translation.
Compute the S1 permissions as part of s1_walk() and initialize
s1_walk_result before checking all fault conditions. The appropriate
fields will be reinitialized if the walk happens to fail at a later
point.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
arch/arm64/kvm/at.c | 41 ++++++++++++++++++++---------------------
1 file changed, 20 insertions(+), 21 deletions(-)
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 083014e9d86a..6930bc3bc86b 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -461,6 +461,10 @@ static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new,
return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
}
+static void compute_s1_permissions(struct kvm_vcpu *vcpu,
+ struct s1_walk_info *wi,
+ struct s1_walk_result *wr);
+
static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
struct s1_walk_result *wr, u64 va)
{
@@ -590,6 +594,20 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
if (check_output_size(baddr & GENMASK(52, va_bottom), wi))
goto addrsz;
+ va_bottom += contiguous_bit_shift(desc, wi, level);
+
+ wr->failed = false;
+ wr->level = level;
+ wr->desc = desc;
+ wr->pa = baddr & GENMASK(52, va_bottom);
+ wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);
+
+ wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG);
+ if (wr->nG)
+ wr->asid = get_asid_by_regime(vcpu, wi->regime);
+
+ compute_s1_permissions(vcpu, wi, wr);
+
if (wi->ha)
new_desc |= PTE_AF;
@@ -615,18 +633,6 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
return -EACCES;
}
- va_bottom += contiguous_bit_shift(desc, wi, level);
-
- wr->failed = false;
- wr->level = level;
- wr->desc = desc;
- wr->pa = baddr & GENMASK(52, va_bottom);
- wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);
-
- wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG);
- if (wr->nG)
- wr->asid = get_asid_by_regime(vcpu, wi->regime);
-
return 0;
addrsz:
@@ -1365,8 +1371,6 @@ static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par)
if (ret)
goto compute_par;
- compute_s1_permissions(vcpu, &wi, &wr);
-
switch (op) {
case OP_AT_S1E1RP:
case OP_AT_S1E1R:
@@ -1690,15 +1694,10 @@ int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
if (wr->level == S1_MMU_DISABLED) {
wr->ur = wr->uw = wr->ux = true;
wr->pr = wr->pw = wr->px = true;
- } else {
- ret = walk_s1(vcpu, wi, wr, va);
- if (ret)
- return ret;
-
- compute_s1_permissions(vcpu, wi, wr);
+ return 0;
}
- return 0;
+ return walk_s1(vcpu, wi, wr, va);
}
struct desc_match {
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* [PATCH 10/22] KVM: arm64: Plumb through access descriptor for stage-1
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (8 preceding siblings ...)
2026-06-23 18:41 ` [PATCH 09/22] KVM: arm64: Compute S1 permissions as part of s1_walk() Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 18:41 ` [PATCH 11/22] KVM: arm64: Use a struct for stage-1 walk context Oliver Upton
` (11 subsequent siblings)
21 siblings, 0 replies; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
Pass sufficient context to the stage-1 walk such that access-dependent
features like FEAT_HAFDBS can implement the correct behavior.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
arch/arm64/include/asm/kvm_nested.h | 4 +++-
arch/arm64/kvm/at.c | 32 ++++++++++++++++++++++-------
arch/arm64/kvm/nested.c | 10 ++++++---
3 files changed, 35 insertions(+), 11 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index 71814c4aac3e..347d79fd350c 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -112,6 +112,8 @@ struct kvm_walk_access {
WALK_ACCESS_CMO,
WALK_ACCESS_AT,
WALK_ACCESS_S1PTW,
+ WALK_ACCESS_NV2,
+ WALK_ACCESS_NONARCH,
} type;
u64 ia;
@@ -357,7 +359,7 @@ static inline void fail_s1_walk(struct s1_walk_result *wr, u8 fst, bool s1ptw)
}
int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
- struct s1_walk_result *wr, u64 va);
+ struct s1_walk_result *wr, struct kvm_walk_access *access);
int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa,
int *level);
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 6930bc3bc86b..597e9cddfc7e 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -466,12 +466,13 @@ static void compute_s1_permissions(struct kvm_vcpu *vcpu,
struct s1_walk_result *wr);
static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
- struct s1_walk_result *wr, u64 va)
+ struct s1_walk_result *wr, struct kvm_walk_access *access)
{
- u64 va_top, va_bottom, baddr, desc, new_desc, ipa;
+ u64 va_top, va_bottom, baddr, desc, new_desc, ipa, va;
struct kvm_s2_trans s2_trans = {};
int level, stride, ret;
+ va = access->ia;
level = wi->sl;
stride = wi->pgshift - 3;
baddr = wi->baddr;
@@ -1340,6 +1341,7 @@ static void compute_s1_permissions(struct kvm_vcpu *vcpu,
static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par)
{
+ struct kvm_walk_access access = {};
struct s1_walk_result wr = {};
struct s1_walk_info wi = {};
bool perm_fail = false;
@@ -1357,9 +1359,21 @@ static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par)
if (wr.level == S1_MMU_DISABLED)
goto compute_par;
+ access.type = WALK_ACCESS_AT;
+ access.ia = vaddr;
+ switch (op) {
+ case OP_AT_S1E1WP:
+ case OP_AT_S1E1W:
+ case OP_AT_S1E2W:
+ case OP_AT_S1E0W:
+ access.write = true;
+ break;
+ default:
+ }
+
idx = srcu_read_lock(&vcpu->kvm->srcu);
- ret = walk_s1(vcpu, &wi, &wr, vaddr);
+ ret = walk_s1(vcpu, &wi, &wr, &access);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -1683,11 +1697,11 @@ int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
* set. The rest of the wi and wr should be 0-initialised.
*/
int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
- struct s1_walk_result *wr, u64 va)
+ struct s1_walk_result *wr, struct kvm_walk_access *access)
{
int ret;
- ret = setup_s1_walk(vcpu, wi, wr, va);
+ ret = setup_s1_walk(vcpu, wi, wr, access->ia);
if (ret)
return ret;
@@ -1697,7 +1711,7 @@ int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
return 0;
}
- return walk_s1(vcpu, wi, wr, va);
+ return walk_s1(vcpu, wi, wr, access);
}
struct desc_match {
@@ -1735,6 +1749,10 @@ int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level)
.as_el0 = false,
.pan = false,
};
+ struct kvm_walk_access access = {
+ .type = WALK_ACCESS_NONARCH,
+ .ia = va,
+ };
struct s1_walk_result wr = {};
int ret;
@@ -1754,7 +1772,7 @@ int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level)
}
/* Walk the guest's PT, looking for a match along the way */
- ret = walk_s1(vcpu, &wi, &wr, va);
+ ret = walk_s1(vcpu, &wi, &wr, &access);
switch (ret) {
case -EINTR:
/* We interrupted the walk on a match, return the level */
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 4f13f37e560b..54228db30371 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -1425,6 +1425,7 @@ static u64 read_vncr_el2(struct kvm_vcpu *vcpu)
static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem)
{
+ struct kvm_walk_access access = {};
struct kvm_memory_slot *memslot;
bool write_fault, writable;
unsigned long mmu_seq;
@@ -1434,6 +1435,7 @@ static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem)
int ret;
vt = vcpu->arch.vncr_tlb;
+ write_fault = kvm_is_write_fault(vcpu);
/*
* If we're about to walk the EL2 S1 PTs, we must invalidate the
@@ -1459,12 +1461,14 @@ static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem)
va = read_vncr_el2(vcpu);
- ret = __kvm_translate_va(vcpu, &vt->wi, &vt->wr, va);
+ access.type = WALK_ACCESS_NV2;
+ access.ia = va;
+ access.write = write_fault;
+
+ ret = __kvm_translate_va(vcpu, &vt->wi, &vt->wr, &access);
if (ret)
return ret;
- write_fault = kvm_is_write_fault(vcpu);
-
mmu_seq = vcpu->kvm->mmu_invalidate_seq;
smp_rmb();
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* [PATCH 11/22] KVM: arm64: Use a struct for stage-1 walk context
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (9 preceding siblings ...)
2026-06-23 18:41 ` [PATCH 10/22] KVM: arm64: Plumb through access descriptor for stage-1 Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 18:41 ` [PATCH 12/22] KVM: arm64: Create helper for stage-1 descriptor updates Oliver Upton
` (10 subsequent siblings)
21 siblings, 0 replies; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
Consolidate the current walk step in a struct such that helpers can be
spun off from the core implementation.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
arch/arm64/kvm/at.c | 94 ++++++++++++++++++++++++---------------------
1 file changed, 51 insertions(+), 43 deletions(-)
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 597e9cddfc7e..816d23e7752d 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -429,6 +429,14 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
return -EFAULT;
}
+struct s1_walk_step {
+ u64 desc;
+ u64 desc_ipa;
+ u64 desc_pa;
+ struct kvm_s2_trans s2_trans;
+ int level;
+};
+
static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc,
struct s1_walk_info *wi)
{
@@ -468,12 +476,12 @@ static void compute_s1_permissions(struct kvm_vcpu *vcpu,
static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
struct s1_walk_result *wr, struct kvm_walk_access *access)
{
- u64 va_top, va_bottom, baddr, desc, new_desc, ipa, va;
- struct kvm_s2_trans s2_trans = {};
- int level, stride, ret;
+ u64 va_top, va_bottom, baddr, new_desc, va;
+ struct s1_walk_step ws = {};
+ int stride, ret;
va = access->ia;
- level = wi->sl;
+ ws.level = wi->sl;
stride = wi->pgshift - 3;
baddr = wi->baddr;
@@ -482,15 +490,15 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
while (1) {
u64 index;
- va_bottom = (3 - level) * stride + wi->pgshift;
+ va_bottom = (3 - ws.level) * stride + wi->pgshift;
index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
- ipa = baddr | index;
+ ws.desc_ipa = ws.desc_pa = baddr | index;
if (wi->s2) {
struct kvm_walk_access s2_access = {
.type = WALK_ACCESS_S1PTW,
- .ia = ipa,
+ .ia = ws.desc_ipa,
/*
* R_JCXVS, stage-2 dirty state can be updated
@@ -500,25 +508,25 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
.write = wi->ha,
};
- ret = kvm_walk_nested_s2(vcpu, &s2_access, &s2_trans);
+ ret = kvm_walk_nested_s2(vcpu, &s2_access, &ws.s2_trans);
if (ret == -EAGAIN)
return ret;
if (ret) {
fail_s1_walk(wr,
- (s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | level,
+ (ws.s2_trans.esr & ~ESR_ELx_FSC_LEVEL) | ws.level,
true);
return ret;
}
- if (!s2_trans.readable) {
- fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level),
+ if (!ws.s2_trans.readable) {
+ fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(ws.level),
true);
return -EPERM;
}
- ipa = s2_trans.output;
+ ws.desc_pa = ws.s2_trans.output;
}
if (wi->filter) {
@@ -526,40 +534,40 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
{
.wi = wi,
.table_ipa = baddr,
- .level = level,
+ .level = ws.level,
}, wi->filter->priv);
if (ret)
return ret;
}
- ret = kvm_read_s1_desc(vcpu, ipa, &desc, wi);
+ ret = kvm_read_s1_desc(vcpu, ws.desc_pa, &ws.desc, wi);
if (ret) {
- fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false);
+ fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(ws.level), false);
return ret;
}
- new_desc = desc;
+ new_desc = ws.desc;
/* Invalid descriptor */
- if (!(desc & BIT(0)))
+ if (!(ws.desc & BIT(0)))
goto transfault;
/* Block mapping, check validity down the line */
- if (!(desc & BIT(1)))
+ if (!(ws.desc & BIT(1)))
break;
/* Page mapping */
- if (level == 3)
+ if (ws.level == 3)
break;
/* Table handling */
if (!wi->hpd) {
- wr->APTable |= FIELD_GET(S1_TABLE_AP, desc);
- wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, desc);
- wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, desc);
+ wr->APTable |= FIELD_GET(S1_TABLE_AP, ws.desc);
+ wr->UXNTable |= FIELD_GET(PMD_TABLE_UXN, ws.desc);
+ wr->PXNTable |= FIELD_GET(PMD_TABLE_PXN, ws.desc);
}
- baddr = desc_to_oa(wi, desc);
+ baddr = desc_to_oa(wi, ws.desc);
/* Check for out-of-range OA */
if (check_output_size(baddr, wi))
@@ -567,23 +575,23 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
/* Prepare for next round */
va_top = va_bottom - 1;
- level++;
+ ws.level++;
}
/* Block mapping, check the validity of the level */
- if (!(desc & BIT(1))) {
+ if (!(ws.desc & BIT(1))) {
bool valid_block = false;
bool lpa = kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, PARANGE, 52);
switch (BIT(wi->pgshift)) {
case SZ_4K:
- valid_block = level == 1 || level == 2 || (wi->pa52bit && level == 0);
+ valid_block = ws.level == 1 || ws.level == 2 || (wi->pa52bit && ws.level == 0);
break;
case SZ_16K:
- valid_block = level == 2 || (wi->pa52bit && level == 1);
+ valid_block = ws.level == 2 || (wi->pa52bit && ws.level == 1);
break;
case SZ_64K:
- valid_block = level == 2 || (lpa && level == 1);
+ valid_block = ws.level == 2 || (lpa && ws.level == 1);
break;
}
@@ -591,19 +599,19 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
goto transfault;
}
- baddr = desc_to_oa(wi, desc);
+ baddr = desc_to_oa(wi, ws.desc);
if (check_output_size(baddr & GENMASK(52, va_bottom), wi))
goto addrsz;
- va_bottom += contiguous_bit_shift(desc, wi, level);
+ va_bottom += contiguous_bit_shift(ws.desc, wi, ws.level);
wr->failed = false;
- wr->level = level;
- wr->desc = desc;
+ wr->level = ws.level;
+ wr->desc = ws.desc;
wr->pa = baddr & GENMASK(52, va_bottom);
wr->pa |= va & GENMASK_ULL(va_bottom - 1, 0);
- wr->nG = (wi->regime != TR_EL2) && (desc & PTE_NG);
+ wr->nG = (wi->regime != TR_EL2) && (ws.desc & PTE_NG);
if (wr->nG)
wr->asid = get_asid_by_regime(vcpu, wi->regime);
@@ -612,35 +620,35 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
if (wi->ha)
new_desc |= PTE_AF;
- if (new_desc != desc) {
- if (wi->s2 && !s2_trans.writable) {
- fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(level), true);
+ if (new_desc != ws.desc) {
+ if (wi->s2 && !ws.s2_trans.writable) {
+ fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(ws.level), true);
return -EPERM;
}
- ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi);
+ ret = kvm_swap_s1_desc(vcpu, ws.desc_pa, ws.desc, new_desc, wi);
if (ret == -EAGAIN)
return ret;
if (ret) {
- fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false);
+ fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(ws.level), false);
return ret;
}
- desc = new_desc;
+ ws.desc = new_desc;
}
- if (!(desc & PTE_AF)) {
- fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false);
+ if (!(ws.desc & PTE_AF)) {
+ fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(ws.level), false);
return -EACCES;
}
return 0;
addrsz:
- fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(level), false);
+ fail_s1_walk(wr, ESR_ELx_FSC_ADDRSZ_L(ws.level), false);
return -EINVAL;
transfault:
- fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(level), false);
+ fail_s1_walk(wr, ESR_ELx_FSC_FAULT_L(ws.level), false);
return -ENOENT;
}
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* [PATCH 12/22] KVM: arm64: Create helper for stage-1 descriptor updates
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (10 preceding siblings ...)
2026-06-23 18:41 ` [PATCH 11/22] KVM: arm64: Use a struct for stage-1 walk context Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 18:55 ` sashiko-bot
2026-06-23 18:41 ` [PATCH 13/22] KVM: arm64: Set dirty state at stage-1 Oliver Upton
` (9 subsequent siblings)
21 siblings, 1 reply; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
Prepare for FEAT_HAFT support by creating a helper for descriptor
updates to be used for both leaf and table descriptors.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
arch/arm64/kvm/at.c | 52 ++++++++++++++++++++++++++-------------------
1 file changed, 30 insertions(+), 22 deletions(-)
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 816d23e7752d..0218176107b5 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -455,9 +455,28 @@ static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc,
return 0;
}
-static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new,
- struct s1_walk_info *wi)
+static int handle_desc_update(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
+ struct s1_walk_step *ws, struct s1_walk_result *wr,
+ struct kvm_walk_access *access)
{
+ u64 old, new;
+ int ret;
+
+ old = new = ws->desc;
+
+ if (wi->ha)
+ new |= PTE_AF;
+
+ if (new == old)
+ return 0;
+
+ if (wi->s2 && !ws->s2_trans.writable) {
+ fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(ws->level), true);
+ return -EPERM;
+ }
+
+ ws->desc = new;
+
if (wi->be) {
old = (__force u64)cpu_to_be64(old);
new = (__force u64)cpu_to_be64(new);
@@ -466,7 +485,12 @@ static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new,
new = (__force u64)cpu_to_le64(new);
}
- return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
+ ret = __kvm_at_swap_desc(vcpu->kvm, ws->desc_pa, old, new);
+ if (!ret || ret == -EAGAIN)
+ return ret;
+
+ fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(ws->level), false);
+ return ret;
}
static void compute_s1_permissions(struct kvm_vcpu *vcpu,
@@ -617,25 +641,9 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
compute_s1_permissions(vcpu, wi, wr);
- if (wi->ha)
- new_desc |= PTE_AF;
-
- if (new_desc != ws.desc) {
- if (wi->s2 && !ws.s2_trans.writable) {
- fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(ws.level), true);
- return -EPERM;
- }
-
- ret = kvm_swap_s1_desc(vcpu, ws.desc_pa, ws.desc, new_desc, wi);
- if (ret == -EAGAIN)
- return ret;
- if (ret) {
- fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(ws.level), false);
- return ret;
- }
-
- ws.desc = new_desc;
- }
+ ret = handle_desc_update(vcpu, wi, &ws, wr, access);
+ if (ret)
+ return ret;
if (!(ws.desc & PTE_AF)) {
fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(ws.level), false);
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* Re: [PATCH 12/22] KVM: arm64: Create helper for stage-1 descriptor updates
2026-06-23 18:41 ` [PATCH 12/22] KVM: arm64: Create helper for stage-1 descriptor updates Oliver Upton
@ 2026-06-23 18:55 ` sashiko-bot
0 siblings, 0 replies; 40+ messages in thread
From: sashiko-bot @ 2026-06-23 18:55 UTC (permalink / raw)
To: Oliver Upton; +Cc: kvmarm, Marc Zyngier
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Low] Unused variable `new_desc` left behind after refactoring.
--
commit a77df785da34afbda05fb0cd124ac7baeff877cd
Author: Oliver Upton <oupton@kernel.org>
KVM: arm64: Create helper for stage-1 descriptor updates
This commit prepares for FEAT_HAFT support by creating a descriptor update
helper to be used for both leaf and table descriptors.
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> index 816d23e7752df..0218176107b52 100644
> --- a/arch/arm64/kvm/at.c
> +++ b/arch/arm64/kvm/at.c
[ ... ]
> @@ -617,25 +641,9 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
>
> compute_s1_permissions(vcpu, wi, wr);
>
> - if (wi->ha)
> - new_desc |= PTE_AF;
> -
> - if (new_desc != ws.desc) {
> - if (wi->s2 && !ws.s2_trans.writable) {
> - fail_s1_walk(wr, ESR_ELx_FSC_PERM_L(ws.level), true);
> - return -EPERM;
> - }
> -
> - ret = kvm_swap_s1_desc(vcpu, ws.desc_pa, ws.desc, new_desc, wi);
> - if (ret == -EAGAIN)
> - return ret;
> - if (ret) {
> - fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(ws.level), false);
> - return ret;
> - }
> -
> - ws.desc = new_desc;
> - }
> + ret = handle_desc_update(vcpu, wi, &ws, wr, access);
> + if (ret)
> + return ret;
[Severity: Low]
Does this leave new_desc unused in walk_s1?
Looking at walk_s1, it appears new_desc is still declared at the top of the
function and assigned later on, but is no longer read after this change:
walk_s1() {
u64 va_top, va_bottom, baddr, new_desc, va;
...
new_desc = ws.desc;
...
}
Will leaving the declaration and assignment trigger an unused-but-set-variable
compiler warning?
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260623184201.1518871-1-oupton@kernel.org?part=12
^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH 13/22] KVM: arm64: Set dirty state at stage-1
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (11 preceding siblings ...)
2026-06-23 18:41 ` [PATCH 12/22] KVM: arm64: Create helper for stage-1 descriptor updates Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 18:54 ` sashiko-bot
2026-06-23 18:41 ` [PATCH 14/22] KVM: arm64: Grant write permission when DBM is set at S1 Oliver Upton
` (8 subsequent siblings)
21 siblings, 1 reply; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
Mark the descriptor as dirty if the permissions are sufficient to grant
write access. Note that compute_s1_direct_permissions() already
considers the DBM bit as writable.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
Spotted right before posting:
TCR_ELx.HD is conditioned on TCR_ELx.HA being set, which is missing from
this patch. I'll address in v2.
arch/arm64/include/asm/kvm_arm.h | 1 +
arch/arm64/include/asm/kvm_nested.h | 1 +
arch/arm64/kvm/at.c | 23 +++++++++++++++++++++++
3 files changed, 25 insertions(+)
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 3f9233b5a130..3f7fa9c3e9c5 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -111,6 +111,7 @@
#define TCR_EL2_DS (1UL << 32)
#define TCR_EL2_RES1 ((1U << 31) | (1 << 23))
#define TCR_EL2_HPD (1 << 24)
+#define TCR_EL2_HD (1 << 22)
#define TCR_EL2_HA (1 << 21)
#define TCR_EL2_TBI (1 << 20)
#define TCR_EL2_PS_SHIFT 16
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index 347d79fd350c..1bb070328b1c 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -316,6 +316,7 @@ struct s1_walk_info {
bool s2;
bool pa52bit;
bool ha;
+ bool hd;
};
struct s1_walk_result {
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 0218176107b5..bfba31b270a9 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -412,6 +412,10 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
wi->ha &= (wi->regime == TR_EL2 ?
FIELD_GET(TCR_EL2_HA, tcr) :
FIELD_GET(TCR_HA, tcr));
+ wi->hd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, DBM);
+ wi->hd &= (wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_HD, tcr) :
+ FIELD_GET(TCR_HD, tcr));
return 0;
@@ -455,6 +459,22 @@ static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc,
return 0;
}
+static bool should_set_dirty_state(struct s1_walk_info *wi, struct s1_walk_step *ws,
+ struct s1_walk_result *wr, struct kvm_walk_access *access)
+{
+ bool perm = wi->as_el0 ? wr->uw : wr->pw;
+
+ switch (access->type) {
+ /* R_RKMHW */
+ case WALK_ACCESS_CMO:
+ case WALK_ACCESS_AT:
+ return false;
+ default:
+ /* R_NSXRD */
+ return access->write && wi->hd && perm;
+ }
+}
+
static int handle_desc_update(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
struct s1_walk_step *ws, struct s1_walk_result *wr,
struct kvm_walk_access *access)
@@ -467,6 +487,9 @@ static int handle_desc_update(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
if (wi->ha)
new |= PTE_AF;
+ if (should_set_dirty_state(wi, ws, wr, access))
+ new &= ~PTE_RDONLY;
+
if (new == old)
return 0;
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* Re: [PATCH 13/22] KVM: arm64: Set dirty state at stage-1
2026-06-23 18:41 ` [PATCH 13/22] KVM: arm64: Set dirty state at stage-1 Oliver Upton
@ 2026-06-23 18:54 ` sashiko-bot
0 siblings, 0 replies; 40+ messages in thread
From: sashiko-bot @ 2026-06-23 18:54 UTC (permalink / raw)
To: Oliver Upton; +Cc: kvmarm, Marc Zyngier
Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
- [High] Hardware management of the dirty state (wi->hd) is incorrectly enabled for stage-1 even if hardware management of the access flag (wi->ha) is disabled.
- [Medium] The walk result's descriptor and dirty state are not updated after a successful hardware update, leading to spurious secondary faults.
--
commit bfdb5faf043bd9330372f7cd1a89affe2bb61b35
Author: Oliver Upton <oupton@kernel.org>
KVM: arm64: Set dirty state at stage-1
This commit updates the KVM ARM64 software page table walker to clear
the PTE_RDONLY bit in guest page tables when write access is permitted
and DBM (Dirty Bit Management) is enabled. This simulates hardware
management of the dirty state during stage-1 translation walks.
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> index 0218176107b52..bfba31b270a95 100644
> --- a/arch/arm64/kvm/at.c
> +++ b/arch/arm64/kvm/at.c
> @@ -412,6 +412,10 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> wi->ha &= (wi->regime == TR_EL2 ?
> FIELD_GET(TCR_EL2_HA, tcr) :
> FIELD_GET(TCR_HA, tcr));
> + wi->hd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, DBM);
> + wi->hd &= (wi->regime == TR_EL2 ?
> + FIELD_GET(TCR_EL2_HD, tcr) :
> + FIELD_GET(TCR_HD, tcr));
[Severity: High]
Does this code incorrectly enable hardware management of the dirty state (wi->hd)
when hardware management of the access flag (wi->ha) is disabled?
The ARM Architecture Reference Manual (D8.2.7) states that if HA is 0, HD is
effectively 0 and no hardware updates are permitted.
If a guest enables TCR_ELx.HD but leaves TCR_ELx.HA disabled, wi->hd evaluates
to true while wi->ha is false in setup_s1_walk(). Upon a write fault,
handle_desc_update() would clear PTE_RDONLY but skip setting PTE_AF, causing KVM
to inject an Access Flag fault despite having already modified the guest page
table to clear PTE_RDONLY.
>
> return 0;
[ ... ]
> @@ -467,6 +487,9 @@ static int handle_desc_update(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> if (wi->ha)
> new |= PTE_AF;
>
> + if (should_set_dirty_state(wi, ws, wr, access))
> + new &= ~PTE_RDONLY;
> +
> if (new == old)
> return 0;
[Severity: Medium]
Are we failing to update the walk result (wr) after a successful hardware update?
When handle_desc_update() successfully clears PTE_RDONLY, it updates the
descriptor in guest memory and ws->desc, but wr->desc and wr->dirty are not
updated to reflect this change for walk_s1().
If kvm_translate_vncr() subsequently uses the stale wr.dirty == false, it maps
the VNCR page as PAGE_KERNEL_RO. When the guest retries the write, it triggers
a permission fault, forcing a second translation walk which then sees the updated
descriptor and maps it RW. Does this omission cause spurious secondary page
faults?
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260623184201.1518871-1-oupton@kernel.org?part=13
^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH 14/22] KVM: arm64: Grant write permission when DBM is set at S1
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (12 preceding siblings ...)
2026-06-23 18:41 ` [PATCH 13/22] KVM: arm64: Set dirty state at stage-1 Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 18:57 ` sashiko-bot
2026-06-23 18:41 ` [PATCH 15/22] KVM: arm64: Don't update descriptors for "non-arch" access Oliver Upton
` (7 subsequent siblings)
21 siblings, 1 reply; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
The DBM bit effectively behaves as a permission bit when using direct
permissions, especially so in the context of accesses that do not update
the dirty state (like AT).
Treat AP[1] == 0 for the purposes of computing permissions, relying on
the walker to abort if it fails to set the dirty state where required.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
arch/arm64/kvm/at.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index bfba31b270a9..faffb69c2eef 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -1077,11 +1077,23 @@ static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu,
struct s1_walk_info *wi,
struct s1_walk_result *wr)
{
+ u8 ap = FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc);
bool wxn;
+ /*
+ * Descriptors with the DBM bit set while hardware dirty state are
+ * considered writable, even though certain accesses (like AT instructions)
+ * don't actually update the dirty state.
+ *
+ * Assume that walk_s1() made the necessary descriptor updates for the
+ * access and just treat DBM as writable here.
+ */
+ if (wi->hd && (wr->desc & PTE_DBM))
+ ap &= ~BIT(1);
+
/* Non-hierarchical part of AArch64.S1DirectBasePermissions() */
if (wi->regime != TR_EL2) {
- switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) {
+ switch (ap) {
case 0b00:
wr->pr = wr->pw = true;
wr->ur = wr->uw = false;
@@ -1105,7 +1117,7 @@ static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu,
} else {
wr->ur = wr->uw = wr->ux = false;
- if (!(wr->desc & PTE_RDONLY)) {
+ if (!(ap & BIT(1))) {
wr->pr = wr->pw = true;
} else {
wr->pr = true;
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* Re: [PATCH 14/22] KVM: arm64: Grant write permission when DBM is set at S1
2026-06-23 18:41 ` [PATCH 14/22] KVM: arm64: Grant write permission when DBM is set at S1 Oliver Upton
@ 2026-06-23 18:57 ` sashiko-bot
0 siblings, 0 replies; 40+ messages in thread
From: sashiko-bot @ 2026-06-23 18:57 UTC (permalink / raw)
To: Oliver Upton; +Cc: Marc Zyngier, kvmarm
Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
- [High] The patch incorrectly grants write permissions based on DBM when hardware dirty state management is architecturally disabled due to HA=0.
- [High] Hardware dirty state management (FEAT_HAFDBS) is incorrectly emulated when the Permission Indirection Extension (FEAT_S1PIE) is enabled, leading to spurious permission faults and incorrect descriptor updates.
--
commit 7f44240a7dbcb830d02ca9b76f878070fdff7b00
Author: Oliver Upton <oupton@kernel.org>
KVM: arm64: Grant write permission when DBM is set at S1
The DBM bit effectively behaves as a permission bit when using direct
permissions, especially so in the context of accesses that do not update
the dirty state (like AT).
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> index bfba31b270a95..faffb69c2eef5 100644
> --- a/arch/arm64/kvm/at.c
> +++ b/arch/arm64/kvm/at.c
> @@ -1077,11 +1077,23 @@ static void compute_s1_direct_permissions(struct kvm_vcpu *vcpu,
> struct s1_walk_info *wi,
> struct s1_walk_result *wr)
> {
> + u8 ap = FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc);
> bool wxn;
>
> + /*
> + * Descriptors with the DBM bit set while hardware dirty state are
> + * considered writable, even though certain accesses (like AT instructions)
> + * don't actually update the dirty state.
> + *
> + * Assume that walk_s1() made the necessary descriptor updates for the
> + * access and just treat DBM as writable here.
> + */
> + if (wi->hd && (wr->desc & PTE_DBM))
> + ap &= ~BIT(1);
[Severity: High]
Does this incorrectly grant write permissions when hardware management of the
Access Flag is disabled?
According to the ARM Architecture Reference Manual, hardware management of the
dirty state (TCR_ELx.HD = 1) is ignored if hardware management of the Access
Flag is disabled (TCR_ELx.HA = 0).
In setup_s1_walk(), we compute wi->hd directly from TCR.HD without enforcing
this requirement:
arch/arm64/kvm/at.c:setup_s1_walk() {
...
wi->hd = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, DBM);
wi->hd &= (wi->regime == TR_EL2 ?
FIELD_GET(TCR_EL2_HD, tcr) :
FIELD_GET(TCR_HD, tcr));
...
}
If a guest OS configures TCR_ELx.HA = 0 and TCR_ELx.HD = 1, and creates a
stage-1 descriptor with RDONLY=1 and DBM=1, it should remain strictly
read-only and generate a Permission Fault on a write. By relying on the
ungated wi->hd here, does KVM bypass the Permission Fault and allow a
guest-visible privilege escalation?
[Severity: High]
Is hardware dirty state management incorrectly emulated when the Permission
Indirection Extension (FEAT_S1PIE) is enabled?
When FEAT_S1PIE is enabled, the DBM bit (bit 51) acts as Index[1] in the PI
index. If a write occurs to a "clean" page (DBM=1) and HD=1, the hardware
checks if clearing bit 51 would grant write access.
It looks like compute_s1_indirect_permissions() evaluates permissions using
the current PI index without simulating this dirty bit transition, which could
cause a spurious permission fault instead of updating the dirty state:
arch/arm64/kvm/at.c:compute_s1_indirect_permissions() {
...
idx = pte_pi_index(wr->desc);
...
set_perms(priv, wr, pp);
...
}
Additionally, if the dirty state needs to be updated, it appears we
unconditionally clear bit 7 (PTE_RDONLY) instead of bit 51 (PTE_DBM):
arch/arm64/kvm/at.c:handle_desc_update() {
...
if (should_set_dirty_state(wi, ws, wr, access))
new &= ~PTE_RDONLY;
...
}
For PIE, bit 7 is unrelated to the dirty state. Could this corrupt the
guest's PIE descriptor?
>
> /* Non-hierarchical part of AArch64.S1DirectBasePermissions() */
> if (wi->regime != TR_EL2) {
> - switch (FIELD_GET(PTE_USER | PTE_RDONLY, wr->desc)) {
> + switch (ap) {
> case 0b00:
> wr->pr = wr->pw = true;
> wr->ur = wr->uw = false;
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260623184201.1518871-1-oupton@kernel.org?part=14
^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH 15/22] KVM: arm64: Don't update descriptors for "non-arch" access
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (13 preceding siblings ...)
2026-06-23 18:41 ` [PATCH 14/22] KVM: arm64: Grant write permission when DBM is set at S1 Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 18:41 ` [PATCH 16/22] KVM: arm64: nv: Expose FEAT_HAFDBS Oliver Upton
` (6 subsequent siblings)
21 siblings, 0 replies; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
WALK_ACCESS_NONARCH is a special indicator that the page table walk
request is not due to an architectural behavior and instead used
internally for KVM. Currently, the only user of this is SEA injection to
identify the correct translation level that generated the abort.
Ensure "non-arch" table walks are not made visible to the guest and
explicitly avoid descriptor updates. These accesses are not expected to
be used for pseudo-TLB fills, nor are they used to dirty any memory.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
arch/arm64/kvm/at.c | 15 ++++++++++++++-
arch/arm64/kvm/nested.c | 12 +++++++++++-
2 files changed, 25 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index faffb69c2eef..8d76eaf463c9 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -459,6 +459,15 @@ static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc,
return 0;
}
+static bool should_set_access_flag(struct s1_walk_info *wi, struct s1_walk_step *ws,
+ struct kvm_walk_access *access)
+{
+ if (access->type == WALK_ACCESS_NONARCH)
+ return false;
+
+ return wi->ha;
+}
+
static bool should_set_dirty_state(struct s1_walk_info *wi, struct s1_walk_step *ws,
struct s1_walk_result *wr, struct kvm_walk_access *access)
{
@@ -468,6 +477,7 @@ static bool should_set_dirty_state(struct s1_walk_info *wi, struct s1_walk_step
/* R_RKMHW */
case WALK_ACCESS_CMO:
case WALK_ACCESS_AT:
+ case WALK_ACCESS_NONARCH:
return false;
default:
/* R_NSXRD */
@@ -484,7 +494,7 @@ static int handle_desc_update(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
old = new = ws->desc;
- if (wi->ha)
+ if (should_set_access_flag(wi, ws, access))
new |= PTE_AF;
if (should_set_dirty_state(wi, ws, wr, access))
@@ -555,6 +565,9 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
.write = wi->ha,
};
+ if (access->type == WALK_ACCESS_NONARCH)
+ s2_access.type = WALK_ACCESS_NONARCH;
+
ret = kvm_walk_nested_s2(vcpu, &s2_access, &ws.s2_trans);
if (ret == -EAGAIN)
return ret;
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 54228db30371..4d128003ae7a 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -228,6 +228,15 @@ static int read_guest_s2_desc(struct kvm_vcpu *vcpu, struct s2_walk_step *ws,
return 0;
}
+static bool should_set_access_flag(struct s2_walk_info *wi, struct s2_walk_step *ws,
+ struct kvm_walk_access *access)
+{
+ if (access->type == WALK_ACCESS_NONARCH)
+ return false;
+
+ return wi->ha;
+}
+
static bool should_set_dirty_state(struct s2_walk_info *wi, struct s2_walk_step *ws,
struct kvm_s2_trans *out, struct kvm_walk_access *access)
{
@@ -235,6 +244,7 @@ static bool should_set_dirty_state(struct s2_walk_info *wi, struct s2_walk_step
/* R_RKMHW */
case WALK_ACCESS_CMO:
case WALK_ACCESS_AT:
+ case WALK_ACCESS_NONARCH:
return false;
default:
/* R_NSXRD */
@@ -251,7 +261,7 @@ static int handle_desc_update(struct kvm_vcpu *vcpu, struct s2_walk_info *wi,
old = new = ws->desc;
- if (wi->ha)
+ if (should_set_access_flag(wi, ws, access))
new |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
if (should_set_dirty_state(wi, ws, out, access))
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* [PATCH 16/22] KVM: arm64: nv: Expose FEAT_HAFDBS
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (14 preceding siblings ...)
2026-06-23 18:41 ` [PATCH 15/22] KVM: arm64: Don't update descriptors for "non-arch" access Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 19:01 ` sashiko-bot
2026-06-23 18:41 ` [PATCH 17/22] KVM: arm64: Set Access flag on table descriptors at stage-1 Oliver Upton
` (5 subsequent siblings)
21 siblings, 1 reply; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
Everything is in place now to support hardware dirty state management.
Flip the corresponding bit and advertise the feature to the guest.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
arch/arm64/kvm/nested.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 4d128003ae7a..2408d604a1e1 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -1790,7 +1790,7 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features))
val &= ~ID_AA64MMFR1_EL1_VH;
- val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, AF);
+ val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, DBM);
break;
case SYS_ID_AA64MMFR2_EL1:
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* Re: [PATCH 16/22] KVM: arm64: nv: Expose FEAT_HAFDBS
2026-06-23 18:41 ` [PATCH 16/22] KVM: arm64: nv: Expose FEAT_HAFDBS Oliver Upton
@ 2026-06-23 19:01 ` sashiko-bot
0 siblings, 0 replies; 40+ messages in thread
From: sashiko-bot @ 2026-06-23 19:01 UTC (permalink / raw)
To: Oliver Upton; +Cc: kvmarm, Marc Zyngier
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
Pre-existing issues:
- [High] KVM will double-fault on the first write to any DBM-enabled page in a nested guest due to stale dirty state evaluation.
--
commit 559268b60f71702530e79a26ecefcf5b5cf06620
Author: Oliver Upton <oupton@kernel.org>
KVM: arm64: nv: Expose FEAT_HAFDBS
This commit enables FEAT_HAFDBS (Hardware Access Flag and Dirty Bit Management)
for nested virtual machines by advertising DBM support to the guest.
> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> index 4d128003ae7a5..2408d604a1e18 100644
> --- a/arch/arm64/kvm/nested.c
> +++ b/arch/arm64/kvm/nested.c
> @@ -1790,7 +1790,7 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
> if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features))
> val &= ~ID_AA64MMFR1_EL1_VH;
>
> - val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, AF);
> + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, DBM);
[Severity: High]
This is a pre-existing issue, but exposing DBM here appears to cause a
performance regression where a nested guest will double-fault on the first
write to any DBM-enabled page.
In walk_nested_s2_pgd(), the code evaluates the stale descriptor before
atomic-swapping the dirty bit into memory:
arch/arm64/kvm/nested.c:walk_nested_s2_pgd() {
...
compute_s2_permissions(vcpu, wi, &ws, out);
out->level = ws.level;
out->desc = ws.desc;
ret = handle_desc_update(vcpu, wi, &ws, out, access);
if (ret)
return ret;
...
}
If handle_desc_update() successfully updates the descriptor, it returns 0
without updating out->dirty or out->desc:
arch/arm64/kvm/nested.c:handle_desc_update() {
...
ret = __kvm_at_swap_desc(vcpu->kvm, ws->desc_pa, old, new);
if (!ret || ret == -EAGAIN)
return ret;
out->esr = ESR_ELx_FSC_SEA_TTW(ws->level);
out->desc = ws->desc;
return 1;
}
Because out->dirty remains false after the successful update, does
adjust_nested_fault_perms() strip the writable permission, forcing a second
VM exit when the guest resumes?
Should walk_nested_s2_pgd() or handle_desc_update() be modified to re-evaluate
the permissions upon success before enabling DBM?
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260623184201.1518871-1-oupton@kernel.org?part=16
^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH 17/22] KVM: arm64: Set Access flag on table descriptors at stage-1
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (15 preceding siblings ...)
2026-06-23 18:41 ` [PATCH 16/22] KVM: arm64: nv: Expose FEAT_HAFDBS Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 20:56 ` sashiko-bot
2026-06-23 18:41 ` [PATCH 18/22] KVM: arm64: nv: Set access flag on table descriptors at stage-2 Oliver Upton
` (4 subsequent siblings)
21 siblings, 1 reply; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
Implement access flag updates on table descriptors as required by
FEAT_HAFT. Note that unlike leaf descriptors, access flags on table
descriptors never generate a fault.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
Same comment as TCR_ELx.HD, will fix in v2.
arch/arm64/include/asm/kvm_nested.h | 1 +
arch/arm64/include/asm/kvm_pgtable.h | 11 +++++++++++
arch/arm64/kvm/at.c | 11 ++++++++++-
arch/arm64/kvm/hyp/pgtable.c | 11 -----------
4 files changed, 22 insertions(+), 12 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
index 1bb070328b1c..b52cf869333c 100644
--- a/arch/arm64/include/asm/kvm_nested.h
+++ b/arch/arm64/include/asm/kvm_nested.h
@@ -317,6 +317,7 @@ struct s1_walk_info {
bool pa52bit;
bool ha;
bool hd;
+ bool haft;
};
struct s1_walk_result {
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 6ae36973686c..67601ed6370d 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -131,6 +131,17 @@ static inline bool kvm_pte_valid(kvm_pte_t pte)
return pte & KVM_PTE_VALID;
}
+static inline bool kvm_pte_table(kvm_pte_t pte, s8 level)
+{
+ if (level == KVM_PGTABLE_LAST_LEVEL)
+ return false;
+
+ if (!kvm_pte_valid(pte))
+ return false;
+
+ return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
+}
+
static inline u64 kvm_pte_to_phys(kvm_pte_t pte)
{
u64 pa;
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 8d76eaf463c9..31a55b9d3385 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -416,6 +416,8 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
wi->hd &= (wi->regime == TR_EL2 ?
FIELD_GET(TCR_EL2_HD, tcr) :
FIELD_GET(TCR_HD, tcr));
+ wi->haft = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, HAFT) &&
+ FIELD_GET(TCR2_EL1_HAFT, effective_tcr2(vcpu, wi->regime));
return 0;
@@ -465,7 +467,7 @@ static bool should_set_access_flag(struct s1_walk_info *wi, struct s1_walk_step
if (access->type == WALK_ACCESS_NONARCH)
return false;
- return wi->ha;
+ return kvm_pte_table(ws->desc, ws->level) ? wi->haft : wi->ha;
}
static bool should_set_dirty_state(struct s1_walk_info *wi, struct s1_walk_step *ws,
@@ -473,6 +475,9 @@ static bool should_set_dirty_state(struct s1_walk_info *wi, struct s1_walk_step
{
bool perm = wi->as_el0 ? wr->uw : wr->pw;
+ if (kvm_pte_table(ws->desc, ws->level))
+ return false;
+
switch (access->type) {
/* R_RKMHW */
case WALK_ACCESS_CMO:
@@ -620,6 +625,10 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
if (ws.level == 3)
break;
+ ret = handle_desc_update(vcpu, wi, &ws, wr, access);
+ if (ret)
+ return ret;
+
/* Table handling */
if (!wi->hpd) {
wr->APTable |= FIELD_GET(S1_TABLE_AP, ws.desc);
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 0c1defa5fb0f..173135ea1d0c 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -71,17 +71,6 @@ static u32 kvm_pgd_pages(u32 ia_bits, s8 start_level)
return kvm_pgd_page_idx(&pgt, -1ULL) + 1;
}
-static bool kvm_pte_table(kvm_pte_t pte, s8 level)
-{
- if (level == KVM_PGTABLE_LAST_LEVEL)
- return false;
-
- if (!kvm_pte_valid(pte))
- return false;
-
- return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
-}
-
static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
{
return mm_ops->phys_to_virt(kvm_pte_to_phys(pte));
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* Re: [PATCH 17/22] KVM: arm64: Set Access flag on table descriptors at stage-1
2026-06-23 18:41 ` [PATCH 17/22] KVM: arm64: Set Access flag on table descriptors at stage-1 Oliver Upton
@ 2026-06-23 20:56 ` sashiko-bot
0 siblings, 0 replies; 40+ messages in thread
From: sashiko-bot @ 2026-06-23 20:56 UTC (permalink / raw)
To: Oliver Upton; +Cc: Marc Zyngier, kvmarm
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [High] KVM incorrectly applies stage-1 Access Flag updates to table descriptors when hardware AF updates are globally disabled, causing spurious permission faults and bypassing nested stage-2 dirty tracking.
--
commit 3e7cbff07bcf9d864e2cd9a33c92f040a6496081
Author: Oliver Upton <oupton@kernel.org>
KVM: arm64: Set Access flag on table descriptors at stage-1
This commit implements access flag updates on table descriptors as required
by FEAT_HAFT. Note that unlike leaf descriptors, access flags on table
descriptors never generate a fault.
> diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
> --- a/arch/arm64/kvm/at.c
> +++ b/arch/arm64/kvm/at.c
> @@ -416,6 +416,8 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
> wi->hd &= (wi->regime == TR_EL2 ?
> FIELD_GET(TCR_EL2_HD, tcr) :
> FIELD_GET(TCR_HD, tcr));
> + wi->haft = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, HAFT) &&
> + FIELD_GET(TCR2_EL1_HAFT, effective_tcr2(vcpu, wi->regime));
[Severity: High]
Should wi->haft initialization be gated by wi->ha?
Per the ARM ARM, TCR2_ELx.HAFT is IGNORED if TCR_ELx.HA is 0. Because
wi->haft is evaluated independently of wi->ha here, the guest could configure
TCR_ELx.HA = 0 but TCR2_ELx.HAFT = 1.
[ ... ]
> @@ -465,7 +467,7 @@ static bool should_set_access_flag(struct s1_walk_info *wi, struct s1_walk_step
> if (access->type == WALK_ACCESS_NONARCH)
> return false;
>
> - return wi->ha;
> + return kvm_pte_table(ws->desc, ws->level) ? wi->haft : wi->ha;
> }
If the guest configures TCR_ELx.HA = 0 and TCR2_ELx.HAFT = 1, this will
return true for table descriptors even when hardware AF updates are disabled,
leading to spurious descriptor writes.
This can cause KVM to trigger stage-2 permission faults if the stage-2
mapping is read-only.
Furthermore, if the stage-2 mapping relies on hardware dirty state management
(DBM), KVM's nested stage-2 walk in walk_s1() treats the access as a read
because wi->ha is false:
arch/arm64/kvm/at.c:walk_s1() {
...
struct kvm_walk_access s2_access = {
.type = WALK_ACCESS_S1PTW,
.ia = ws.desc_ipa,
/*
* R_JCXVS, stage-2 dirty state can be updated
* for an S1PTW even if the stage-1 descriptor
* isn't updated.
*/
.write = wi->ha,
};
...
}
This will silently bypass the nested hypervisor's stage-2 dirty tracking
while still modifying the guest memory in host memory.
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260623184201.1518871-1-oupton@kernel.org?part=17
^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH 18/22] KVM: arm64: nv: Set access flag on table descriptors at stage-2
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (16 preceding siblings ...)
2026-06-23 18:41 ` [PATCH 17/22] KVM: arm64: Set Access flag on table descriptors at stage-1 Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 19:05 ` sashiko-bot
2026-06-23 18:41 ` [PATCH 19/22] KVM: arm64: nv: Expose FEAT_HAFT Oliver Upton
` (3 subsequent siblings)
21 siblings, 1 reply; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
Implement access flag updates on table descriptors as required by
FEAT_HAFT.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
arch/arm64/kvm/nested.c | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 2408d604a1e1..6f540d3481d5 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -133,6 +133,7 @@ struct s2_walk_info {
bool be;
bool ha;
bool hd;
+ bool haft;
};
struct s2_walk_step {
@@ -234,12 +235,15 @@ static bool should_set_access_flag(struct s2_walk_info *wi, struct s2_walk_step
if (access->type == WALK_ACCESS_NONARCH)
return false;
- return wi->ha;
+ return kvm_pte_table(ws->desc, ws->level) ? wi->haft : wi->ha;
}
static bool should_set_dirty_state(struct s2_walk_info *wi, struct s2_walk_step *ws,
struct kvm_s2_trans *out, struct kvm_walk_access *access)
{
+ if (kvm_pte_table(ws->desc, ws->level))
+ return false;
+
switch (access->type) {
/* R_RKMHW */
case WALK_ACCESS_CMO:
@@ -424,6 +428,10 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, struct kvm_walk_access *acc
return 1;
}
+ ret = handle_desc_update(vcpu, wi, &ws, out, access);
+ if (ret)
+ return ret;
+
base_addr = ws.desc & GENMASK_ULL(47, wi->pgshift);
ws.level += 1;
@@ -562,6 +570,7 @@ static void setup_s2_walk(struct kvm_vcpu *vcpu, struct s2_walk_info *wi)
wi->ha = vtcr & VTCR_EL2_HA;
wi->be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE;
wi->hd = wi->ha && (vtcr & VTCR_EL2_HD);
+ wi->haft = wi->ha && (vtcr & VTCR_EL2_HAFT);
}
int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, struct kvm_walk_access *access,
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* Re: [PATCH 18/22] KVM: arm64: nv: Set access flag on table descriptors at stage-2
2026-06-23 18:41 ` [PATCH 18/22] KVM: arm64: nv: Set access flag on table descriptors at stage-2 Oliver Upton
@ 2026-06-23 19:05 ` sashiko-bot
2026-06-23 20:14 ` Oliver Upton
0 siblings, 1 reply; 40+ messages in thread
From: sashiko-bot @ 2026-06-23 19:05 UTC (permalink / raw)
To: Oliver Upton; +Cc: kvmarm, Marc Zyngier
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [High] Stage-2 table descriptors fail to generate an Access Flag fault when VTCR_EL2.HAFT == 1, VTCR_EL2.HA == 0, and the descriptor's AF bit is 0.
--
commit 7d14ee3ad8d74a4b9734d83c712bb19ae89c1090
Author: Oliver Upton <oupton@kernel.org>
KVM: arm64: nv: Set access flag on table descriptors at stage-2
This commit implements access flag updates on stage-2 table descriptors for
nested virtualization, as required by FEAT_HAFT.
> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> index 2408d604a1e18..6f540d3481d5a 100644
> --- a/arch/arm64/kvm/nested.c
> +++ b/arch/arm64/kvm/nested.c
[ ... ]
> @@ -424,6 +428,10 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, struct kvm_walk_access *acc
> return 1;
> }
>
> + ret = handle_desc_update(vcpu, wi, &ws, out, access);
> + if (ret)
> + return ret;
> +
[Severity: High]
When VTCR_EL2.HAFT == 1 but VTCR_EL2.HA == 0, and the table descriptor's AF
bit is 0, will this silently bypass the required Access Flag fault?
If wi->haft evaluates to false (such as when HA == 0), handle_desc_update()
returns 0 without updating the descriptor. The walk then proceeds to the next
level without checking if the AF bit is 0, which misses the injection of the
architectural fault.
Should there be an explicit check for the AF bit here similar to the one at
the end of the walk for leaf entries, conditioned on VTCR_EL2.HAFT?
> base_addr = ws.desc & GENMASK_ULL(47, wi->pgshift);
>
> ws.level += 1;
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260623184201.1518871-1-oupton@kernel.org?part=18
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 18/22] KVM: arm64: nv: Set access flag on table descriptors at stage-2
2026-06-23 19:05 ` sashiko-bot
@ 2026-06-23 20:14 ` Oliver Upton
0 siblings, 0 replies; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 20:14 UTC (permalink / raw)
To: sashiko-reviews; +Cc: kvmarm, Marc Zyngier
On Tue, Jun 23, 2026 at 07:05:18PM +0000, sashiko-bot@kernel.org wrote:
> Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
> - [High] Stage-2 table descriptors fail to generate an Access Flag fault when VTCR_EL2.HAFT == 1, VTCR_EL2.HA == 0, and the descriptor's AF bit is 0.
> --
>
> commit 7d14ee3ad8d74a4b9734d83c712bb19ae89c1090
> Author: Oliver Upton <oupton@kernel.org>
>
> KVM: arm64: nv: Set access flag on table descriptors at stage-2
>
> This commit implements access flag updates on stage-2 table descriptors for
> nested virtualization, as required by FEAT_HAFT.
>
> > diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> > index 2408d604a1e18..6f540d3481d5a 100644
> > --- a/arch/arm64/kvm/nested.c
> > +++ b/arch/arm64/kvm/nested.c
> [ ... ]
> > @@ -424,6 +428,10 @@ static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, struct kvm_walk_access *acc
> > return 1;
> > }
> >
> > + ret = handle_desc_update(vcpu, wi, &ws, out, access);
> > + if (ret)
> > + return ret;
> > +
>
> [Severity: High]
> When VTCR_EL2.HAFT == 1 but VTCR_EL2.HA == 0, and the table descriptor's AF
> bit is 0, will this silently bypass the required Access Flag fault?
>
> If wi->haft evaluates to false (such as when HA == 0), handle_desc_update()
> returns 0 without updating the descriptor. The walk then proceeds to the next
> level without checking if the AF bit is 0, which misses the injection of the
> architectural fault.
>
> Should there be an explicit check for the AF bit here similar to the one at
> the end of the walk for leaf entries, conditioned on VTCR_EL2.HAFT?
I_HZHJT, the table access flag never generates a fault.
Thanks,
Oliver
^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH 19/22] KVM: arm64: nv: Expose FEAT_HAFT
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (17 preceding siblings ...)
2026-06-23 18:41 ` [PATCH 18/22] KVM: arm64: nv: Set access flag on table descriptors at stage-2 Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 19:05 ` sashiko-bot
2026-06-23 18:41 ` [PATCH 20/22] KVM: arm64: selftests: Only test AF behavior for emulated AT insns Oliver Upton
` (2 subsequent siblings)
21 siblings, 1 reply; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
Both stages of translation now support the access flag on table
descriptors, advertise the feature.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
arch/arm64/kvm/nested.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 6f540d3481d5..dc2a8b6483c2 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -1799,7 +1799,7 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features))
val &= ~ID_AA64MMFR1_EL1_VH;
- val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, DBM);
+ val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, HAFT);
break;
case SYS_ID_AA64MMFR2_EL1:
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* Re: [PATCH 19/22] KVM: arm64: nv: Expose FEAT_HAFT
2026-06-23 18:41 ` [PATCH 19/22] KVM: arm64: nv: Expose FEAT_HAFT Oliver Upton
@ 2026-06-23 19:05 ` sashiko-bot
0 siblings, 0 replies; 40+ messages in thread
From: sashiko-bot @ 2026-06-23 19:05 UTC (permalink / raw)
To: Oliver Upton; +Cc: kvmarm, Marc Zyngier
Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
- [Medium] KVM's AT instruction emulation incorrectly evaluates HD and HAFT without ensuring the Hardware Access Flag (HA) is enabled, leading to unauthorized hardware updates of the Access flag on table descriptors.
--
commit e646aa23aa5bcaa5e6bc04545e6c0d319fc3a058
Author: Oliver Upton <oupton@kernel.org>
KVM: arm64: nv: Expose FEAT_HAFT
Both stages of translation now support the access flag on table descriptors,
advertise the feature.
> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> index 6f540d3481d5a..dc2a8b6483c2a 100644
> --- a/arch/arm64/kvm/nested.c
> +++ b/arch/arm64/kvm/nested.c
> @@ -1799,7 +1799,7 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
> if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features))
> val &= ~ID_AA64MMFR1_EL1_VH;
>
> - val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, DBM);
> + val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, HAFT);
[Severity: Medium]
Does exposing HAFT here uncover an issue in AT instruction emulation where
hardware updates are authorized even when the hardware access flag is
disabled?
Looking at setup_s1_walk() in arch/arm64/kvm/at.c, wi->hd and wi->haft
are evaluated without checking if the hardware access flag (HA) is enabled:
arch/arm64/kvm/at.c:setup_s1_walk() {
...
wi->haft = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, HAFT) &&
FIELD_GET(TCR2_EL1_HAFT, effective_tcr2(vcpu, wi->regime));
...
}
Then in should_set_access_flag(), this could return wi->haft for table
descriptors, inadvertently authorizing hardware updates even when HA == 0:
arch/arm64/kvm/at.c:should_set_access_flag() {
...
return kvm_pte_table(ws->desc, ws->level) ? wi->haft : wi->ha;
}
If a nested guest sets TCR_ELx.HD = 1 or TCR2_ELx.HAFT = 1 but leaves
TCR_ELx.HA = 0, and then executes an AT instruction, could KVM incorrectly
evaluate the guest's page table permissions as writable or perform an
unauthorized hardware update of the Access flag on table descriptors?
> break;
>
> case SYS_ID_AA64MMFR2_EL1:
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260623184201.1518871-1-oupton@kernel.org?part=19
^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH 20/22] KVM: arm64: selftests: Only test AF behavior for emulated AT insns
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (18 preceding siblings ...)
2026-06-23 18:41 ` [PATCH 19/22] KVM: arm64: nv: Expose FEAT_HAFT Oliver Upton
@ 2026-06-23 18:41 ` Oliver Upton
2026-06-23 18:42 ` [PATCH 21/22] KVM: arm64: selftests: Test AT emulation for FEAT_HAFT Oliver Upton
2026-06-23 18:42 ` [PATCH 22/22] HACK: KVM: arm64: nv: Set the dirty state for CMOs that fetch for write Oliver Upton
21 siblings, 0 replies; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:41 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
The original intent of the test was to test KVM's slowpath AT emulation,
specifically including a test for KVM's IMP DEF choice that AT
instructions set the access flag.
While it is possible to force AT instructions to take the slow path in
KVM, this is only effective for an out-of-context regime wherein the
stage-2 translations are guaranteed to be invalid. As such, the
inclusion of AT S1E2* instructions muddied this a bit, as they operate
on the in-context EL2&0 regime.
Restore the original intent of the test and get rid of the AT S1E2* test
cases entirely.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
tools/testing/selftests/kvm/arm64/at.c | 38 ++++++++++++++++----------
1 file changed, 24 insertions(+), 14 deletions(-)
diff --git a/tools/testing/selftests/kvm/arm64/at.c b/tools/testing/selftests/kvm/arm64/at.c
index ce5d312ef6ba..d7289f3df04f 100644
--- a/tools/testing/selftests/kvm/arm64/at.c
+++ b/tools/testing/selftests/kvm/arm64/at.c
@@ -13,6 +13,7 @@
enum {
CLEAR_ACCESS_FLAG,
+ TEST_ACCESS_FLAG,
};
static u64 *ptep_hva;
@@ -48,21 +49,12 @@ do { \
GUEST_ASSERT_EQ(FIELD_GET(SYS_PAR_EL1_ATTR, par), MAIR_ATTR_NORMAL); \
GUEST_ASSERT_EQ(FIELD_GET(SYS_PAR_EL1_SH, par), PTE_SHARED >> 8); \
GUEST_ASSERT_EQ(par & SYS_PAR_EL1_PA, TEST_ADDR); \
+ GUEST_SYNC(TEST_ACCESS_FLAG); \
} \
} while (0)
static void test_at(bool expect_fault)
{
- test_at_insn(S1E2R, expect_fault);
- test_at_insn(S1E2W, expect_fault);
-
- /* Reuse the stage-1 MMU context from EL2 at EL1 */
- copy_el2_to_el1(SCTLR);
- copy_el2_to_el1(MAIR);
- copy_el2_to_el1(TCR);
- copy_el2_to_el1(TTBR0);
- copy_el2_to_el1(TTBR1);
-
/* Disable stage-2 translation and enter a non-host context */
write_sysreg(0, vtcr_el2);
write_sysreg(0, vttbr_el2);
@@ -71,11 +63,21 @@ static void test_at(bool expect_fault)
test_at_insn(S1E1R, expect_fault);
test_at_insn(S1E1W, expect_fault);
+
+ sysreg_clear_set(hcr_el2, 0, HCR_EL2_TGE | HCR_EL2_VM);
+ isb();
}
static void guest_code(void)
{
- sysreg_clear_set(tcr_el1, TCR_HA, 0);
+ /* Reuse the stage-1 MMU context from EL2 at EL1 */
+ copy_el2_to_el1(SCTLR);
+ copy_el2_to_el1(MAIR);
+ copy_el2_to_el1(TCR);
+ copy_el2_to_el1(TTBR0);
+ copy_el2_to_el1(TTBR1);
+
+ sysreg_clear_set_s(SYS_TCR_EL12, TCR_HA, 0);
isb();
test_at(true);
@@ -83,7 +85,11 @@ static void guest_code(void)
if (!SYS_FIELD_GET(ID_AA64MMFR1_EL1, HAFDBS, read_sysreg(id_aa64mmfr1_el1)))
GUEST_DONE();
- sysreg_clear_set(tcr_el1, 0, TCR_HA);
+ /*
+ * KVM's software PTW makes the implementation choice that the AT
+ * instruction sets the access flag.
+ */
+ sysreg_clear_set_s(SYS_TCR_EL12, 0, TCR_HA);
isb();
test_at(false);
@@ -96,8 +102,8 @@ static void handle_sync(struct kvm_vcpu *vcpu, struct ucall *uc)
case CLEAR_ACCESS_FLAG:
/*
* Delete + reinstall the memslot to invalidate stage-2
- * mappings of the stage-1 page tables, allowing KVM to
- * potentially use the 'slow' AT emulation path.
+ * mappings of the stage-1 page tables, forcing KVM to
+ * use the 'slow' AT emulation path.
*
* This and clearing the access flag from host userspace
* ensures that the access flag cannot be set speculatively
@@ -106,6 +112,10 @@ static void handle_sync(struct kvm_vcpu *vcpu, struct ucall *uc)
clear_bit(__ffs(PTE_AF), ptep_hva);
vm_mem_region_reload(vcpu->vm, vcpu->vm->memslots[MEM_REGION_PT]);
break;
+ case TEST_ACCESS_FLAG:
+ TEST_ASSERT(test_bit(__ffs(PTE_AF), ptep_hva),
+ "Expected access flag to be set (desc: %lu)", *ptep_hva);
+ break;
default:
TEST_FAIL("Unexpected SYNC arg: %lu", uc->args[1]);
}
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* [PATCH 21/22] KVM: arm64: selftests: Test AT emulation for FEAT_HAFT
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (19 preceding siblings ...)
2026-06-23 18:41 ` [PATCH 20/22] KVM: arm64: selftests: Only test AF behavior for emulated AT insns Oliver Upton
@ 2026-06-23 18:42 ` Oliver Upton
2026-06-23 19:05 ` sashiko-bot
2026-06-23 18:42 ` [PATCH 22/22] HACK: KVM: arm64: nv: Set the dirty state for CMOs that fetch for write Oliver Upton
21 siblings, 1 reply; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:42 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
Test that KVM's AT emulation sets the access flag on table descriptors
when FEAT_HAFT is enabled at stage-1. Additionally, add test coverage
that asserts the access flag is clear on table/page descriptors when the
AT instruction generates an access flag fault.
Signed-off-by: Oliver Upton <oupton@kernel.org>
---
tools/testing/selftests/kvm/arm64/at.c | 74 +++++++++++++++++++++++---
1 file changed, 66 insertions(+), 8 deletions(-)
diff --git a/tools/testing/selftests/kvm/arm64/at.c b/tools/testing/selftests/kvm/arm64/at.c
index d7289f3df04f..fb7399736f44 100644
--- a/tools/testing/selftests/kvm/arm64/at.c
+++ b/tools/testing/selftests/kvm/arm64/at.c
@@ -13,10 +13,11 @@
enum {
CLEAR_ACCESS_FLAG,
- TEST_ACCESS_FLAG,
+ ASSERT_ACCESS_FLAG_SET,
+ ASSERT_ACCESS_FLAG_CLEAR,
};
-static u64 *ptep_hva;
+static u64 *page_ptep, *table_ptep;
#define copy_el2_to_el1(reg) \
write_sysreg_s(read_sysreg_s(SYS_##reg##_EL1), SYS_##reg##_EL12)
@@ -45,11 +46,12 @@ do { \
__GUEST_ASSERT(fsc == ESR_ELx_FSC_ACCESS_L(3), \
"AT "#op": expected access flag fault (par: %lx)", \
par); \
+ GUEST_SYNC(ASSERT_ACCESS_FLAG_CLEAR); \
} else { \
GUEST_ASSERT_EQ(FIELD_GET(SYS_PAR_EL1_ATTR, par), MAIR_ATTR_NORMAL); \
GUEST_ASSERT_EQ(FIELD_GET(SYS_PAR_EL1_SH, par), PTE_SHARED >> 8); \
GUEST_ASSERT_EQ(par & SYS_PAR_EL1_PA, TEST_ADDR); \
- GUEST_SYNC(TEST_ACCESS_FLAG); \
+ GUEST_SYNC(ASSERT_ACCESS_FLAG_SET); \
} \
} while (0)
@@ -68,6 +70,14 @@ static void test_at(bool expect_fault)
isb();
}
+static bool guest_has_haft(void)
+{
+ u64 mmfr1 = read_sysreg(id_aa64mmfr1_el1);
+
+ return SYS_FIELD_GET(ID_AA64MMFR1_EL1, HAFDBS, mmfr1) >=
+ ID_AA64MMFR1_EL1_HAFDBS_HAFT;
+}
+
static void guest_code(void)
{
/* Reuse the stage-1 MMU context from EL2 at EL1 */
@@ -93,9 +103,43 @@ static void guest_code(void)
isb();
test_at(false);
+ if (!guest_has_haft())
+ GUEST_DONE();
+
+ sysrec_clear_set_s(SYS_HCRX_EL2, 0, HCRX_EL2_TCR2En);
+ sysreg_clear_set_s(SYS_TCR2_EL12, 0, TCR2_EL1_HAFT);
+ isb();
+ test_at(false);
+
+ /* The effective value of HAFT is 0 if HA is 0 */
+ sysreg_clear_set_s(SYS_TCR_EL12, TCR_HA, 0);
+ isb();
+ test_at(true);
+
+ /* The effective value of HAFT is 0 if HCRX_EL2.TCR2En is 0 */
+ sysreg_clear_set_s(SYS_HCRX_EL2, HCRX_EL2_TCR2En, 0);
+ sysreg_clear_set_s(SYS_TCR_EL12, 0, TCR_HA);
+ isb();
+ test_at(false);
+
GUEST_DONE();
}
+static bool vcpu_haft_enabled(struct kvm_vcpu *vcpu)
+{
+ u64 mmfr1 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64MMFR1_EL1));
+ u8 hafdbs = SYS_FIELD_GET(ID_AA64MMFR1_EL1, HAFDBS, mmfr1);
+ u64 tcr2, hcrx;
+
+ /* FEAT_HAFT implies FEAT_TCRX, FEAT_HCX */
+ if (hafdbs < ID_AA64MMFR1_EL1_HAFDBS_HAFT)
+ return false;
+
+ hcrx = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_HCRX_EL2));
+ tcr2 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_TCR2_EL1));
+ return hcrx & HCRX_EL2_TCR2En && tcr2 & TCR2_EL1_HAFT;
+}
+
static void handle_sync(struct kvm_vcpu *vcpu, struct ucall *uc)
{
switch (uc->args[1]) {
@@ -109,12 +153,25 @@ static void handle_sync(struct kvm_vcpu *vcpu, struct ucall *uc)
* ensures that the access flag cannot be set speculatively
* and is reliably cleared at the time of the AT instruction.
*/
- clear_bit(__ffs(PTE_AF), ptep_hva);
+ clear_bit(__ffs(PTE_AF), page_ptep);
+ clear_bit(__ffs(PTE_AF), table_ptep);
vm_mem_region_reload(vcpu->vm, vcpu->vm->memslots[MEM_REGION_PT]);
break;
- case TEST_ACCESS_FLAG:
- TEST_ASSERT(test_bit(__ffs(PTE_AF), ptep_hva),
- "Expected access flag to be set (desc: %lu)", *ptep_hva);
+ case ASSERT_ACCESS_FLAG_SET:
+ TEST_ASSERT(test_bit(__ffs(PTE_AF), page_ptep),
+ "Expected access flag to be set (desc: %lu)", *page_ptep);
+ if (!vcpu_haft_enabled(vcpu))
+ TEST_ASSERT(!test_bit(__ffs(PTE_AF), table_ptep),
+ "Expected access flag to be clear (desc: %lu)", *table_ptep);
+ else
+ TEST_ASSERT(test_bit(__ffs(PTE_AF), table_ptep),
+ "Expected access flag to be set (desc: %lu)", *table_ptep);
+ break;
+ case ASSERT_ACCESS_FLAG_CLEAR:
+ TEST_ASSERT(!test_bit(__ffs(PTE_AF), page_ptep),
+ "Expected access flag to be clear (desc: %lu)", *page_ptep);
+ TEST_ASSERT(!test_bit(__ffs(PTE_AF), table_ptep),
+ "Expected access flag to be clear (desc: %lu)", *table_ptep);
break;
default:
TEST_FAIL("Unexpected SYNC arg: %lu", uc->args[1]);
@@ -158,7 +215,8 @@ int main(void)
kvm_arch_vm_finalize_vcpus(vm);
virt_map(vm, TEST_ADDR, TEST_ADDR, 1);
- ptep_hva = virt_get_pte_hva_at_level(vm, TEST_ADDR, 3);
+ page_ptep = virt_get_pte_hva_at_level(vm, TEST_ADDR, 3);
+ table_ptep = virt_get_pte_hva_at_level(vm, TEST_ADDR, 2);
run_test(vcpu);
kvm_vm_free(vm);
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread* Re: [PATCH 21/22] KVM: arm64: selftests: Test AT emulation for FEAT_HAFT
2026-06-23 18:42 ` [PATCH 21/22] KVM: arm64: selftests: Test AT emulation for FEAT_HAFT Oliver Upton
@ 2026-06-23 19:05 ` sashiko-bot
2026-06-23 20:17 ` Oliver Upton
0 siblings, 1 reply; 40+ messages in thread
From: sashiko-bot @ 2026-06-23 19:05 UTC (permalink / raw)
To: Oliver Upton; +Cc: Marc Zyngier, kvmarm
Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
- [Medium] Missing `isb()` instructions between consecutive writes to control-plane system registers.
- [Low] Typo `sysrec_clear_set_s` instead of `sysreg_clear_set_s` causes a build failure.
--
commit 72540780d6ff6cb4b6ed1c45760fb4372cfa7e9b
Author: Oliver Upton <oupton@kernel.org>
KVM: arm64: selftests: Test AT emulation for FEAT_HAFT
Test that KVM's AT emulation sets the access flag on table descriptors
when FEAT_HAFT is enabled at stage-1. Additionally, add test coverage
that asserts the access flag is clear on table/page descriptors when the
AT instruction generates an access flag fault.
> diff --git a/tools/testing/selftests/kvm/arm64/at.c b/tools/testing/selftests/kvm/arm64/at.c
> index d7289f3df04fa..fb7399736f442 100644
> --- a/tools/testing/selftests/kvm/arm64/at.c
> +++ b/tools/testing/selftests/kvm/arm64/at.c
[ ... ]
> @@ -93,9 +103,43 @@ static void guest_code(void)
> isb();
> test_at(false);
>
> + if (!guest_has_haft())
> + GUEST_DONE();
> +
> + sysrec_clear_set_s(SYS_HCRX_EL2, 0, HCRX_EL2_TCR2En);
[Severity: Low]
There is a typo here: sysrec_clear_set_s instead of sysreg_clear_set_s.
> + sysreg_clear_set_s(SYS_TCR2_EL12, 0, TCR2_EL1_HAFT);
[Severity: Medium]
Does this sequence need an isb() between the register updates?
Updates to control-plane system registers must be synchronized with an
explicit Context Synchronization Event before any subsequent sysreg accesses.
Because sysreg_clear_set_s() performs reads, issuing back-to-back calls
without an intervening isb() causes the second access to execute against an
architecturally undefined pipeline state.
> + isb();
> + test_at(false);
> +
> + /* The effective value of HAFT is 0 if HA is 0 */
> + sysreg_clear_set_s(SYS_TCR_EL12, TCR_HA, 0);
> + isb();
> + test_at(true);
> +
> + /* The effective value of HAFT is 0 if HCRX_EL2.TCR2En is 0 */
> + sysreg_clear_set_s(SYS_HCRX_EL2, HCRX_EL2_TCR2En, 0);
> + sysreg_clear_set_s(SYS_TCR_EL12, 0, TCR_HA);
[Severity: Medium]
Similar to the above, does this code need an isb() between modifying
SYS_HCRX_EL2 and SYS_TCR_EL12?
> + isb();
> + test_at(false);
> +
> GUEST_DONE();
> }
--
Sashiko AI review · https://sashiko.dev/#/patchset/20260623184201.1518871-1-oupton@kernel.org?part=21
^ permalink raw reply [flat|nested] 40+ messages in thread
* Re: [PATCH 21/22] KVM: arm64: selftests: Test AT emulation for FEAT_HAFT
2026-06-23 19:05 ` sashiko-bot
@ 2026-06-23 20:17 ` Oliver Upton
0 siblings, 0 replies; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 20:17 UTC (permalink / raw)
To: sashiko-reviews; +Cc: Marc Zyngier, kvmarm
On Tue, Jun 23, 2026 at 07:05:40PM +0000, sashiko-bot@kernel.org wrote:
> Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
> - [Medium] Missing `isb()` instructions between consecutive writes to control-plane system registers.
> - [Low] Typo `sysrec_clear_set_s` instead of `sysreg_clear_set_s` causes a build failure.
> --
>
> commit 72540780d6ff6cb4b6ed1c45760fb4372cfa7e9b
> Author: Oliver Upton <oupton@kernel.org>
>
> KVM: arm64: selftests: Test AT emulation for FEAT_HAFT
>
> Test that KVM's AT emulation sets the access flag on table descriptors
> when FEAT_HAFT is enabled at stage-1. Additionally, add test coverage
> that asserts the access flag is clear on table/page descriptors when the
> AT instruction generates an access flag fault.
>
> > diff --git a/tools/testing/selftests/kvm/arm64/at.c b/tools/testing/selftests/kvm/arm64/at.c
> > index d7289f3df04fa..fb7399736f442 100644
> > --- a/tools/testing/selftests/kvm/arm64/at.c
> > +++ b/tools/testing/selftests/kvm/arm64/at.c
>
> [ ... ]
>
> > @@ -93,9 +103,43 @@ static void guest_code(void)
> > isb();
> > test_at(false);
> >
> > + if (!guest_has_haft())
> > + GUEST_DONE();
> > +
> > + sysrec_clear_set_s(SYS_HCRX_EL2, 0, HCRX_EL2_TCR2En);
>
> [Severity: Low]
> There is a typo here: sysrec_clear_set_s instead of sysreg_clear_set_s.
Oops!
> > + sysreg_clear_set_s(SYS_TCR2_EL12, 0, TCR2_EL1_HAFT);
>
> [Severity: Medium]
> Does this sequence need an isb() between the register updates?
>
> Updates to control-plane system registers must be synchronized with an
> explicit Context Synchronization Event before any subsequent sysreg accesses.
> Because sysreg_clear_set_s() performs reads, issuing back-to-back calls
> without an intervening isb() causes the second access to execute against an
> architecturally undefined pipeline state.
This is wrong, direct reads / writes of a system register through the
same alias appear in program order.
Thanks,
Oliver
^ permalink raw reply [flat|nested] 40+ messages in thread
* [PATCH 22/22] HACK: KVM: arm64: nv: Set the dirty state for CMOs that fetch for write
2026-06-23 18:41 [PATCH 00/22] KVM: arm64: nv: Implement FEAT_HAFDBS, FEAT_HAFT Oliver Upton
` (20 preceding siblings ...)
2026-06-23 18:42 ` [PATCH 21/22] KVM: arm64: selftests: Test AT emulation for FEAT_HAFT Oliver Upton
@ 2026-06-23 18:42 ` Oliver Upton
21 siblings, 0 replies; 40+ messages in thread
From: Oliver Upton @ 2026-06-23 18:42 UTC (permalink / raw)
To: kvmarm
Cc: Marc Zyngier, Joey Gouly, Suzuki K Poulose, Zenghui Yu,
Wei-Lin Chang, Steffen Eiden, Oliver Upton
From: Oliver Upton <oliver.upton@linux.dev>
Cache maintenance instructions that fetch for write do not set the dirty
state on descriptors, effectively passing the permission check and
leaving the descriptor in a writable-clean state. While this may make
some sense as the literal write has already occurred, there is no way to
correctly represent a "writable, but only for CMOs" permission in the
shadow stage-2 in such a way that the next genuine write generates a
permission fault.
A possible alternative would be to treat CMOs using writable-clean
translations as NOPs, relying on FWB to force everything coherent on the
CPU. This mostly works but will lead to breakage for VMs that have
assigned devices performing non-coherent DMA.
Another option would be to set HCR_EL2.TPCP and do full trap+emulate of
CMOs. And yes, dear reader, that would suck.
Just do the obvious thing instead and mark the descriptor as dirty for
CMOs. Maybe we can get an architectural relaxation if we're lucky...
Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
---
arch/arm64/kvm/at.c | 1 -
arch/arm64/kvm/nested.c | 1 -
2 files changed, 2 deletions(-)
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index 31a55b9d3385..2be9100cb84e 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -480,7 +480,6 @@ static bool should_set_dirty_state(struct s1_walk_info *wi, struct s1_walk_step
switch (access->type) {
/* R_RKMHW */
- case WALK_ACCESS_CMO:
case WALK_ACCESS_AT:
case WALK_ACCESS_NONARCH:
return false;
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index dc2a8b6483c2..54c5966e1a6f 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -246,7 +246,6 @@ static bool should_set_dirty_state(struct s2_walk_info *wi, struct s2_walk_step
switch (access->type) {
/* R_RKMHW */
- case WALK_ACCESS_CMO:
case WALK_ACCESS_AT:
case WALK_ACCESS_NONARCH:
return false;
--
2.47.3
^ permalink raw reply related [flat|nested] 40+ messages in thread