* [PATCH v4 01/10] KVM: s390: vsie: Fix dat_split_ste()
2026-03-25 17:13 [PATCH v4 00/10] KVM: s390: More memory management fixes Claudio Imbrenda
@ 2026-03-25 17:13 ` Claudio Imbrenda
2026-03-25 17:13 ` [PATCH v4 02/10] KVM: s390: Remove non-atomic dat_crstep_xchg() Claudio Imbrenda
` (8 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Claudio Imbrenda @ 2026-03-25 17:13 UTC (permalink / raw)
To: kvm
Cc: linux-kernel, linux-s390, borntraeger, frankja, nrb, seiden, gra,
schlameuss, hca, david
If the guest misbehaves and puts the page tables for its nested guest
inside the memory of the nested guest itself, and the guest and nested
guest are being mapped with large pages, the shadow mapping will
lose synchronization with the actual mapping, since this will cause the
large page with the vsie notification bit to be split, but the
vsie notification bit will not be propagated to the resulting small
pages.
Fix this by propagating the vsie_notif bit from large pages to normal
pages when splitting a large page.
Fixes: 2db149a0a6c5 ("KVM: s390: KVM page table management functions: walks")
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Reviewed-by: Christoph Schlameuss <schlameuss@linux.ibm.com>
Reviewed-by: Steffen Eiden <seiden@linux.ibm.com>
Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
---
arch/s390/kvm/dat.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c
index 670404d4fa44..48b5f2bcf172 100644
--- a/arch/s390/kvm/dat.c
+++ b/arch/s390/kvm/dat.c
@@ -292,6 +292,7 @@ static int dat_split_ste(struct kvm_s390_mmu_cache *mc, union pmd *pmdp, gfn_t g
pt->ptes[i].val = init.val | i * PAGE_SIZE;
/* No need to take locks as the page table is not installed yet. */
pgste_init.prefix_notif = old.s.fc1.prefix_notif;
+ pgste_init.vsie_notif = old.s.fc1.vsie_notif;
pgste_init.pcl = uses_skeys && init.h.i;
dat_init_pgstes(pt, pgste_init.val);
} else {
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v4 02/10] KVM: s390: Remove non-atomic dat_crstep_xchg()
2026-03-25 17:13 [PATCH v4 00/10] KVM: s390: More memory management fixes Claudio Imbrenda
2026-03-25 17:13 ` [PATCH v4 01/10] KVM: s390: vsie: Fix dat_split_ste() Claudio Imbrenda
@ 2026-03-25 17:13 ` Claudio Imbrenda
2026-03-25 17:13 ` [PATCH v4 03/10] KVM: s390: vsie: Fix check for pre-existing shadow mapping Claudio Imbrenda
` (7 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Claudio Imbrenda @ 2026-03-25 17:13 UTC (permalink / raw)
To: kvm
Cc: linux-kernel, linux-s390, borntraeger, frankja, nrb, seiden, gra,
schlameuss, hca, david
In practice dat_crstep_xchg() is racy and hard to use correctly. Simply
remove it and replace its uses with dat_crstep_xchg_atomic().
This solves some actual races that lead to system hangs / crashes.
Opportunistically fix an alignment issue in _gmap_crstep_xchg_atomic().
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Fixes: 589071eaaa8f ("KVM: s390: KVM page table management functions: clear and replace")
Fixes: 94fd9b16cc67 ("KVM: s390: KVM page table management functions: lifecycle management")
Reviewed-by: Steffen Eiden <seiden@linux.ibm.com>
---
arch/s390/kvm/dat.c | 51 +++++++------------------
arch/s390/kvm/dat.h | 9 +++--
arch/s390/kvm/gaccess.c | 26 +++++++------
arch/s390/kvm/gmap.c | 82 ++++++++++++++++++++++++-----------------
arch/s390/kvm/gmap.h | 29 +++++++++------
5 files changed, 100 insertions(+), 97 deletions(-)
diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c
index 48b5f2bcf172..4d44c0f9ad45 100644
--- a/arch/s390/kvm/dat.c
+++ b/arch/s390/kvm/dat.c
@@ -134,32 +134,6 @@ int dat_set_asce_limit(struct kvm_s390_mmu_cache *mc, union asce *asce, int newt
return 0;
}
-/**
- * dat_crstep_xchg() - Exchange a gmap CRSTE with another.
- * @crstep: Pointer to the CRST entry
- * @new: Replacement entry.
- * @gfn: The affected guest address.
- * @asce: The ASCE of the address space.
- *
- * Context: This function is assumed to be called with kvm->mmu_lock held.
- */
-void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce asce)
-{
- if (crstep->h.i) {
- WRITE_ONCE(*crstep, new);
- return;
- } else if (cpu_has_edat2()) {
- crdte_crste(crstep, *crstep, new, gfn, asce);
- return;
- }
-
- if (machine_has_tlb_guest())
- idte_crste(crstep, gfn, IDTE_GUEST_ASCE, asce, IDTE_GLOBAL);
- else
- idte_crste(crstep, gfn, 0, NULL_ASCE, IDTE_GLOBAL);
- WRITE_ONCE(*crstep, new);
-}
-
/**
* dat_crstep_xchg_atomic() - Atomically exchange a gmap CRSTE with another.
* @crstep: Pointer to the CRST entry.
@@ -175,8 +149,8 @@ void dat_crstep_xchg(union crste *crstep, union crste new, gfn_t gfn, union asce
*
* Return: %true if the exchange was successful.
*/
-bool dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new, gfn_t gfn,
- union asce asce)
+bool __must_check dat_crstep_xchg_atomic(union crste *crstep, union crste old, union crste new,
+ gfn_t gfn, union asce asce)
{
if (old.h.i)
return arch_try_cmpxchg((long *)crstep, &old.val, new.val);
@@ -894,7 +868,8 @@ static long _dat_slot_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct d
/* This table entry needs to be updated. */
if (walk->start <= gfn && walk->end >= next) {
- dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce);
+ if (!dat_crstep_xchg_atomic(crstep, crste, new_crste, gfn, walk->asce))
+ return -EINVAL;
/* A lower level table was present, needs to be freed. */
if (!crste.h.fc && !crste.h.i) {
if (is_pmd(crste))
@@ -1072,17 +1047,19 @@ int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,
static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
- union crste crste = READ_ONCE(*crstep);
+ union crste newcrste, oldcrste;
int *n = walk->priv;
- if (!crste.h.fc || crste.h.i || crste.h.p)
- return 0;
-
+ do {
+ oldcrste = READ_ONCE(*crstep);
+ if (!oldcrste.h.fc || oldcrste.h.i || oldcrste.h.p)
+ return 0;
+ if (oldcrste.s.fc1.prefix_notif)
+ break;
+ newcrste = oldcrste;
+ newcrste.s.fc1.prefix_notif = 1;
+ } while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, walk->asce));
*n = 2;
- if (crste.s.fc1.prefix_notif)
- return 0;
- crste.s.fc1.prefix_notif = 1;
- dat_crstep_xchg(crstep, crste, gfn, walk->asce);
return 0;
}
diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h
index 123e11dcd70d..22dafc775335 100644
--- a/arch/s390/kvm/dat.h
+++ b/arch/s390/kvm/dat.h
@@ -938,11 +938,14 @@ static inline bool dat_pudp_xchg_atomic(union pud *pudp, union pud old, union pu
return dat_crstep_xchg_atomic(_CRSTEP(pudp), _CRSTE(old), _CRSTE(new), gfn, asce);
}
-static inline void dat_crstep_clear(union crste *crstep, gfn_t gfn, union asce asce)
+static inline union crste dat_crstep_clear_atomic(union crste *crstep, gfn_t gfn, union asce asce)
{
- union crste newcrste = _CRSTE_EMPTY(crstep->h.tt);
+ union crste oldcrste, empty = _CRSTE_EMPTY(crstep->h.tt);
- dat_crstep_xchg(crstep, newcrste, gfn, asce);
+ do {
+ oldcrste = READ_ONCE(*crstep);
+ } while (!dat_crstep_xchg_atomic(crstep, oldcrste, empty, gfn, asce));
+ return oldcrste;
}
static inline int get_level(union crste *crstep, union pte *ptep)
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index a9da9390867d..4ee862424ca0 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1456,7 +1456,7 @@ static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union
static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, union crste *table,
struct guest_fault *f, bool p)
{
- union crste newcrste;
+ union crste newcrste, oldcrste;
gfn_t gfn;
int rc;
@@ -1469,16 +1469,20 @@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni
if (rc)
return rc;
- newcrste = _crste_fc1(f->pfn, host->h.tt, f->writable, !p);
- newcrste.s.fc1.d |= host->s.fc1.d;
- newcrste.s.fc1.sd |= host->s.fc1.sd;
- newcrste.h.p &= host->h.p;
- newcrste.s.fc1.vsie_notif = 1;
- newcrste.s.fc1.prefix_notif = host->s.fc1.prefix_notif;
- _gmap_crstep_xchg(sg->parent, host, newcrste, f->gfn, false);
-
- newcrste = _crste_fc1(f->pfn, host->h.tt, 0, !p);
- dat_crstep_xchg(table, newcrste, gpa_to_gfn(raddr), sg->asce);
+ do {
+ oldcrste = READ_ONCE(*host);
+ newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, f->writable, !p);
+ newcrste.s.fc1.d |= oldcrste.s.fc1.d;
+ newcrste.s.fc1.sd |= oldcrste.s.fc1.sd;
+ newcrste.h.p &= oldcrste.h.p;
+ newcrste.s.fc1.vsie_notif = 1;
+ newcrste.s.fc1.prefix_notif = oldcrste.s.fc1.prefix_notif;
+ } while (!_gmap_crstep_xchg_atomic(sg->parent, host, oldcrste, newcrste, f->gfn, false));
+
+ newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, 0, !p);
+ gfn = gpa_to_gfn(raddr);
+ while (!dat_crstep_xchg_atomic(table, READ_ONCE(*table), newcrste, gfn, sg->asce))
+ ;
return 0;
}
diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c
index ef0c6ebfdde2..956be4c01797 100644
--- a/arch/s390/kvm/gmap.c
+++ b/arch/s390/kvm/gmap.c
@@ -313,13 +313,16 @@ static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, st
struct clear_young_pte_priv *priv = walk->priv;
union crste crste, new;
- crste = READ_ONCE(*crstep);
+ do {
+ crste = READ_ONCE(*crstep);
+
+ if (!crste.h.fc)
+ return 0;
+ if (!crste.s.fc1.y && crste.h.i)
+ return 0;
+ if (crste_prefix(crste) && !gmap_mkold_prefix(priv->gmap, gfn, end))
+ break;
- if (!crste.h.fc)
- return 0;
- if (!crste.s.fc1.y && crste.h.i)
- return 0;
- if (!crste_prefix(crste) || gmap_mkold_prefix(priv->gmap, gfn, end)) {
new = crste;
new.h.i = 1;
new.s.fc1.y = 0;
@@ -328,8 +331,8 @@ static long gmap_clear_young_crste(union crste *crstep, gfn_t gfn, gfn_t end, st
folio_set_dirty(phys_to_folio(crste_origin_large(crste)));
new.s.fc1.d = 0;
new.h.p = 1;
- dat_crstep_xchg(crstep, new, gfn, walk->asce);
- }
+ } while (!dat_crstep_xchg_atomic(crstep, crste, new, gfn, walk->asce));
+
priv->young = 1;
return 0;
}
@@ -391,14 +394,18 @@ static long _gmap_unmap_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct
{
struct gmap_unmap_priv *priv = walk->priv;
struct folio *folio = NULL;
+ union crste old = *crstep;
- if (crstep->h.fc) {
- if (crstep->s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
- folio = phys_to_folio(crste_origin_large(*crstep));
- gmap_crstep_xchg(priv->gmap, crstep, _CRSTE_EMPTY(crstep->h.tt), gfn);
- if (folio)
- uv_convert_from_secure_folio(folio);
- }
+ if (!old.h.fc)
+ return 0;
+
+ if (old.s.fc1.pr && test_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &priv->gmap->flags))
+ folio = phys_to_folio(crste_origin_large(old));
+ /* No races should happen because kvm->mmu_lock is held in write mode */
+ KVM_BUG_ON(!gmap_crstep_xchg_atomic(priv->gmap, crstep, old, _CRSTE_EMPTY(old.h.tt), gfn),
+ priv->gmap->kvm);
+ if (folio)
+ uv_convert_from_secure_folio(folio);
return 0;
}
@@ -474,23 +481,24 @@ static long _crste_test_and_clear_softdirty(union crste *table, gfn_t gfn, gfn_t
if (fatal_signal_pending(current))
return 1;
- crste = READ_ONCE(*table);
- if (!crste.h.fc)
- return 0;
- if (crste.h.p && !crste.s.fc1.sd)
- return 0;
+ do {
+ crste = READ_ONCE(*table);
+ if (!crste.h.fc)
+ return 0;
+ if (crste.h.p && !crste.s.fc1.sd)
+ return 0;
- /*
- * If this large page contains one or more prefixes of vCPUs that are
- * currently running, do not reset the protection, leave it marked as
- * dirty.
- */
- if (!crste.s.fc1.prefix_notif || gmap_mkold_prefix(gmap, gfn, end)) {
+ /*
+ * If this large page contains one or more prefixes of vCPUs that are
+ * currently running, do not reset the protection, leave it marked as
+ * dirty.
+ */
+ if (crste.s.fc1.prefix_notif && !gmap_mkold_prefix(gmap, gfn, end))
+ break;
new = crste;
new.h.p = 1;
new.s.fc1.sd = 0;
- gmap_crstep_xchg(gmap, table, new, gfn);
- }
+ } while (!gmap_crstep_xchg_atomic(gmap, table, crste, new, gfn));
for ( ; gfn < end; gfn++)
mark_page_dirty(gmap->kvm, gfn);
@@ -646,8 +654,8 @@ int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fau
static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
gfn_t p_gfn, gfn_t c_gfn, bool force_alloc)
{
+ union crste newcrste, oldcrste;
struct page_table *pt;
- union crste newcrste;
union crste *crstep;
union pte *ptep;
int rc;
@@ -673,7 +681,11 @@ static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
&crstep, &ptep);
if (rc)
return rc;
- dat_crstep_xchg(crstep, newcrste, c_gfn, gmap->asce);
+ do {
+ oldcrste = READ_ONCE(*crstep);
+ if (oldcrste.val == newcrste.val)
+ break;
+ } while (!dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, c_gfn, gmap->asce));
return 0;
}
@@ -777,8 +789,10 @@ static void gmap_ucas_unmap_one(struct gmap *gmap, gfn_t c_gfn)
int rc;
rc = dat_entry_walk(NULL, c_gfn, gmap->asce, 0, TABLE_TYPE_SEGMENT, &crstep, &ptep);
- if (!rc)
- dat_crstep_xchg(crstep, _PMD_EMPTY, c_gfn, gmap->asce);
+ if (rc)
+ return;
+ while (!dat_crstep_xchg_atomic(crstep, READ_ONCE(*crstep), _PMD_EMPTY, c_gfn, gmap->asce))
+ ;
}
void gmap_ucas_unmap(struct gmap *gmap, gfn_t c_gfn, unsigned long count)
@@ -1017,8 +1031,8 @@ static void gmap_unshadow_level(struct gmap *sg, gfn_t r_gfn, int level)
dat_ptep_xchg(ptep, _PTE_EMPTY, r_gfn, sg->asce, uses_skeys(sg));
return;
}
- crste = READ_ONCE(*crstep);
- dat_crstep_clear(crstep, r_gfn, sg->asce);
+
+ crste = dat_crstep_clear_atomic(crstep, r_gfn, sg->asce);
if (crste_leaf(crste) || crste.h.i)
return;
if (is_pmd(crste))
diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h
index ccb5cd751e31..150e91e15ee0 100644
--- a/arch/s390/kvm/gmap.h
+++ b/arch/s390/kvm/gmap.h
@@ -194,35 +194,40 @@ static inline union pgste gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, uni
return _gmap_ptep_xchg(gmap, ptep, newpte, pgste, gfn, true);
}
-static inline void _gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne,
- gfn_t gfn, bool needs_lock)
+static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep,
+ union crste oldcrste, union crste newcrste,
+ gfn_t gfn, bool needs_lock)
{
- unsigned long align = 8 + (is_pmd(*crstep) ? 0 : 11);
+ unsigned long align = is_pmd(newcrste) ? _PAGE_ENTRIES : _PAGE_ENTRIES * _CRST_ENTRIES;
+
+ if (KVM_BUG_ON(crstep->h.tt != oldcrste.h.tt || newcrste.h.tt != oldcrste.h.tt, gmap->kvm))
+ return true;
lockdep_assert_held(&gmap->kvm->mmu_lock);
if (!needs_lock)
lockdep_assert_held(&gmap->children_lock);
gfn = ALIGN_DOWN(gfn, align);
- if (crste_prefix(*crstep) && (ne.h.p || ne.h.i || !crste_prefix(ne))) {
- ne.s.fc1.prefix_notif = 0;
+ if (crste_prefix(oldcrste) && (newcrste.h.p || newcrste.h.i || !crste_prefix(newcrste))) {
+ newcrste.s.fc1.prefix_notif = 0;
gmap_unmap_prefix(gmap, gfn, gfn + align);
}
- if (crste_leaf(*crstep) && crstep->s.fc1.vsie_notif &&
- (ne.h.p || ne.h.i || !ne.s.fc1.vsie_notif)) {
- ne.s.fc1.vsie_notif = 0;
+ if (crste_leaf(oldcrste) && oldcrste.s.fc1.vsie_notif &&
+ (newcrste.h.p || newcrste.h.i || !newcrste.s.fc1.vsie_notif)) {
+ newcrste.s.fc1.vsie_notif = 0;
if (needs_lock)
gmap_handle_vsie_unshadow_event(gmap, gfn);
else
_gmap_handle_vsie_unshadow_event(gmap, gfn);
}
- dat_crstep_xchg(crstep, ne, gfn, gmap->asce);
+ return dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, gmap->asce);
}
-static inline void gmap_crstep_xchg(struct gmap *gmap, union crste *crstep, union crste ne,
- gfn_t gfn)
+static inline bool __must_check gmap_crstep_xchg_atomic(struct gmap *gmap, union crste *crstep,
+ union crste oldcrste, union crste newcrste,
+ gfn_t gfn)
{
- return _gmap_crstep_xchg(gmap, crstep, ne, gfn, true);
+ return _gmap_crstep_xchg_atomic(gmap, crstep, oldcrste, newcrste, gfn, true);
}
/**
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v4 03/10] KVM: s390: vsie: Fix check for pre-existing shadow mapping
2026-03-25 17:13 [PATCH v4 00/10] KVM: s390: More memory management fixes Claudio Imbrenda
2026-03-25 17:13 ` [PATCH v4 01/10] KVM: s390: vsie: Fix dat_split_ste() Claudio Imbrenda
2026-03-25 17:13 ` [PATCH v4 02/10] KVM: s390: Remove non-atomic dat_crstep_xchg() Claudio Imbrenda
@ 2026-03-25 17:13 ` Claudio Imbrenda
2026-03-25 17:13 ` [PATCH v4 04/10] KVM: s390: Fix gmap_link() Claudio Imbrenda
` (6 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Claudio Imbrenda @ 2026-03-25 17:13 UTC (permalink / raw)
To: kvm
Cc: linux-kernel, linux-s390, borntraeger, frankja, nrb, seiden, gra,
schlameuss, hca, david
When shadowing a nested guest, a check is performed and no shadowing is
attempted if the nested guest is already shadowed.
The existing check was incomplete; fix it by also checking whether the
leaf DAT table entry in the existing shadow gmap has the same protection
as the one specified in the guest DAT entry.
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Fixes: e38c884df921 ("KVM: s390: Switch to new gmap")
Reviewed-by: Steffen Eiden <seiden@linux.ibm.com>
Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
---
arch/s390/kvm/gaccess.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 4ee862424ca0..8fd690255e1b 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1506,8 +1506,9 @@ static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
if (rc)
return rc;
- /* A race occourred. The shadow mapping is already valid, nothing to do */
- if ((ptep && !ptep->h.i) || (!ptep && crste_leaf(*table)))
+ /* A race occurred. The shadow mapping is already valid, nothing to do */
+ if ((ptep && !ptep->h.i && ptep->h.p == w->p) ||
+ (!ptep && crste_leaf(*table) && !table->h.i && table->h.p == w->p))
return 0;
gl = get_level(table, ptep);
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v4 04/10] KVM: s390: Fix gmap_link()
2026-03-25 17:13 [PATCH v4 00/10] KVM: s390: More memory management fixes Claudio Imbrenda
` (2 preceding siblings ...)
2026-03-25 17:13 ` [PATCH v4 03/10] KVM: s390: vsie: Fix check for pre-existing shadow mapping Claudio Imbrenda
@ 2026-03-25 17:13 ` Claudio Imbrenda
2026-03-25 17:13 ` [PATCH v4 05/10] KVM: s390: Correctly handle guest mappings without struct page Claudio Imbrenda
` (5 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Claudio Imbrenda @ 2026-03-25 17:13 UTC (permalink / raw)
To: kvm
Cc: linux-kernel, linux-s390, borntraeger, frankja, nrb, seiden, gra,
schlameuss, hca, david
The slow path of the fault handler ultimately called gmap_link(), which
assumed the fault was a major fault, and blindly called dat_link().
In case of minor faults, things were not always handled properly; in
particular the prefix and vsie marker bits were ignored.
Move dat_link() into gmap.c, renaming it accordingly. Once moved, the
new _gmap_link() function will be able to correctly honour the prefix
and vsie markers.
This will cause spurious unshadows in some uncommon cases.
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Fixes: 94fd9b16cc67 ("KVM: s390: KVM page table management functions: lifecycle management")
Fixes: a2c17f9270cc ("KVM: s390: New gmap code")
Reviewed-by: Steffen Eiden <seiden@linux.ibm.com>
---
arch/s390/kvm/dat.c | 48 -------------------------------------
arch/s390/kvm/dat.h | 2 --
arch/s390/kvm/gmap.c | 56 ++++++++++++++++++++++++++++++++++++++++----
3 files changed, 52 insertions(+), 54 deletions(-)
diff --git a/arch/s390/kvm/dat.c b/arch/s390/kvm/dat.c
index 4d44c0f9ad45..7b8d70fe406d 100644
--- a/arch/s390/kvm/dat.c
+++ b/arch/s390/kvm/dat.c
@@ -997,54 +997,6 @@ bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end)
return _dat_walk_gfn_range(start, end, asce, &test_age_ops, 0, NULL) > 0;
}
-int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,
- bool uses_skeys, struct guest_fault *f)
-{
- union crste oldval, newval;
- union pte newpte, oldpte;
- union pgste pgste;
- int rc = 0;
-
- rc = dat_entry_walk(mc, f->gfn, asce, DAT_WALK_ALLOC_CONTINUE, level, &f->crstep, &f->ptep);
- if (rc == -EINVAL || rc == -ENOMEM)
- return rc;
- if (rc)
- return -EAGAIN;
-
- if (WARN_ON_ONCE(unlikely(get_level(f->crstep, f->ptep) > level)))
- return -EINVAL;
-
- if (f->ptep) {
- pgste = pgste_get_lock(f->ptep);
- oldpte = *f->ptep;
- newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
- newpte.s.sd = oldpte.s.sd;
- oldpte.s.sd = 0;
- if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
- pgste = __dat_ptep_xchg(f->ptep, pgste, newpte, f->gfn, asce, uses_skeys);
- if (f->callback)
- f->callback(f);
- } else {
- rc = -EAGAIN;
- }
- pgste_set_unlock(f->ptep, pgste);
- } else {
- oldval = READ_ONCE(*f->crstep);
- newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
- f->write_attempt | oldval.s.fc1.d);
- newval.s.fc1.sd = oldval.s.fc1.sd;
- if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
- crste_origin_large(oldval) != crste_origin_large(newval))
- return -EAGAIN;
- if (!dat_crstep_xchg_atomic(f->crstep, oldval, newval, f->gfn, asce))
- return -EAGAIN;
- if (f->callback)
- f->callback(f);
- }
-
- return rc;
-}
-
static long dat_set_pn_crste(union crste *crstep, gfn_t gfn, gfn_t next, struct dat_walk *walk)
{
union crste newcrste, oldcrste;
diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h
index 22dafc775335..efedcf96110c 100644
--- a/arch/s390/kvm/dat.h
+++ b/arch/s390/kvm/dat.h
@@ -540,8 +540,6 @@ int dat_set_slot(struct kvm_s390_mmu_cache *mc, union asce asce, gfn_t start, gf
u16 type, u16 param);
int dat_set_prefix_notif_bit(union asce asce, gfn_t gfn);
bool dat_test_age_gfn(union asce asce, gfn_t start, gfn_t end);
-int dat_link(struct kvm_s390_mmu_cache *mc, union asce asce, int level,
- bool uses_skeys, struct guest_fault *f);
int dat_perform_essa(union asce asce, gfn_t gfn, int orc, union essa_state *state, bool *dirty);
long dat_reset_cmma(union asce asce, gfn_t start_gfn);
diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c
index 956be4c01797..03e15b5e0b9a 100644
--- a/arch/s390/kvm/gmap.c
+++ b/arch/s390/kvm/gmap.c
@@ -631,10 +631,60 @@ static inline bool gmap_1m_allowed(struct gmap *gmap, gfn_t gfn)
return test_bit(GMAP_FLAG_ALLOW_HPAGE_1M, &gmap->flags);
}
+static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int level,
+ struct guest_fault *f)
+{
+ union crste oldval, newval;
+ union pte newpte, oldpte;
+ union pgste pgste;
+ int rc = 0;
+
+ rc = dat_entry_walk(mc, f->gfn, gmap->asce, DAT_WALK_ALLOC_CONTINUE, level,
+ &f->crstep, &f->ptep);
+ if (rc == -ENOMEM)
+ return rc;
+ if (KVM_BUG_ON(rc == -EINVAL, gmap->kvm))
+ return rc;
+ if (rc)
+ return -EAGAIN;
+ if (KVM_BUG_ON(get_level(f->crstep, f->ptep) > level, gmap->kvm))
+ return -EINVAL;
+
+ if (f->ptep) {
+ pgste = pgste_get_lock(f->ptep);
+ oldpte = *f->ptep;
+ newpte = _pte(f->pfn, f->writable, f->write_attempt | oldpte.s.d, !f->page);
+ newpte.s.sd = oldpte.s.sd;
+ oldpte.s.sd = 0;
+ if (oldpte.val == _PTE_EMPTY.val || oldpte.h.pfra == f->pfn) {
+ pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, pgste, f->gfn);
+ if (f->callback)
+ f->callback(f);
+ } else {
+ rc = -EAGAIN;
+ }
+ pgste_set_unlock(f->ptep, pgste);
+ } else {
+ do {
+ oldval = READ_ONCE(*f->crstep);
+ newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
+ f->write_attempt | oldval.s.fc1.d);
+ newval.s.fc1.sd = oldval.s.fc1.sd;
+ if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
+ crste_origin_large(oldval) != crste_origin_large(newval))
+ return -EAGAIN;
+ } while (!gmap_crstep_xchg_atomic(gmap, f->crstep, oldval, newval, f->gfn));
+ if (f->callback)
+ f->callback(f);
+ }
+
+ return rc;
+}
+
int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fault *f)
{
unsigned int order;
- int rc, level;
+ int level;
lockdep_assert_held(&gmap->kvm->mmu_lock);
@@ -646,9 +696,7 @@ int gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, struct guest_fau
else if (order >= get_order(_SEGMENT_SIZE) && gmap_1m_allowed(gmap, f->gfn))
level = TABLE_TYPE_SEGMENT;
}
- rc = dat_link(mc, gmap->asce, level, uses_skeys(gmap), f);
- KVM_BUG_ON(rc == -EINVAL, gmap->kvm);
- return rc;
+ return _gmap_link(mc, gmap, level, f);
}
static int gmap_ucas_map_one(struct kvm_s390_mmu_cache *mc, struct gmap *gmap,
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v4 05/10] KVM: s390: Correctly handle guest mappings without struct page
2026-03-25 17:13 [PATCH v4 00/10] KVM: s390: More memory management fixes Claudio Imbrenda
` (3 preceding siblings ...)
2026-03-25 17:13 ` [PATCH v4 04/10] KVM: s390: Fix gmap_link() Claudio Imbrenda
@ 2026-03-25 17:13 ` Claudio Imbrenda
2026-03-26 9:45 ` Christian Borntraeger
2026-03-25 17:13 ` [PATCH v4 06/10] KVM: s390: vsie: Fix nested guest memory shadowing Claudio Imbrenda
` (4 subsequent siblings)
9 siblings, 1 reply; 12+ messages in thread
From: Claudio Imbrenda @ 2026-03-25 17:13 UTC (permalink / raw)
To: kvm
Cc: linux-kernel, linux-s390, borntraeger, frankja, nrb, seiden, gra,
schlameuss, hca, david
Introduce a new special softbit for large pages, like the one already present
for normal pages, and use it to mark guest mappings that do not have
struct pages.
Whenever a leaf DAT entry becomes dirty, check the special softbit and
only call SetPageDirty() if there is an actual struct page.
Move the logic to mark pages dirty inside _gmap_ptep_xchg() and
_gmap_crstep_xchg_atomic(), to avoid needlessly duplicating the code.
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Fixes: 5a74e3d93417 ("KVM: s390: KVM-specific bitfields and helper functions")
Fixes: a2c17f9270cc ("KVM: s390: New gmap code")
---
arch/s390/kvm/dat.h | 12 ++++++------
arch/s390/kvm/gmap.c | 5 +----
arch/s390/kvm/gmap.h | 4 ++++
3 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/arch/s390/kvm/dat.h b/arch/s390/kvm/dat.h
index efedcf96110c..874cc962e196 100644
--- a/arch/s390/kvm/dat.h
+++ b/arch/s390/kvm/dat.h
@@ -160,14 +160,14 @@ union pmd {
unsigned long :44; /* HW */
unsigned long : 3; /* Unused */
unsigned long : 1; /* HW */
+ unsigned long s : 1; /* Special */
unsigned long w : 1; /* Writable soft-bit */
unsigned long r : 1; /* Readable soft-bit */
unsigned long d : 1; /* Dirty */
unsigned long y : 1; /* Young */
- unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
unsigned long : 3; /* HW */
+ unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
unsigned long vsie_notif : 1; /* Referenced in a shadow table */
- unsigned long : 1; /* Unused */
unsigned long : 4; /* HW */
unsigned long sd : 1; /* Soft-Dirty */
unsigned long pr : 1; /* Present */
@@ -183,14 +183,14 @@ union pud {
unsigned long :33; /* HW */
unsigned long :14; /* Unused */
unsigned long : 1; /* HW */
+ unsigned long s : 1; /* Special */
unsigned long w : 1; /* Writable soft-bit */
unsigned long r : 1; /* Readable soft-bit */
unsigned long d : 1; /* Dirty */
unsigned long y : 1; /* Young */
- unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
unsigned long : 3; /* HW */
+ unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
unsigned long vsie_notif : 1; /* Referenced in a shadow table */
- unsigned long : 1; /* Unused */
unsigned long : 4; /* HW */
unsigned long sd : 1; /* Soft-Dirty */
unsigned long pr : 1; /* Present */
@@ -254,14 +254,14 @@ union crste {
struct {
unsigned long :47;
unsigned long : 1; /* HW (should be 0) */
+ unsigned long s : 1; /* Special */
unsigned long w : 1; /* Writable */
unsigned long r : 1; /* Readable */
unsigned long d : 1; /* Dirty */
unsigned long y : 1; /* Young */
- unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
unsigned long : 3; /* HW */
+ unsigned long prefix_notif : 1; /* Guest prefix invalidation notification */
unsigned long vsie_notif : 1; /* Referenced in a shadow table */
- unsigned long : 1;
unsigned long : 4; /* HW */
unsigned long sd : 1; /* Soft-Dirty */
unsigned long pr : 1; /* Present */
diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c
index 03e15b5e0b9a..e9fc471a5141 100644
--- a/arch/s390/kvm/gmap.c
+++ b/arch/s390/kvm/gmap.c
@@ -544,8 +544,6 @@ static int gmap_handle_minor_crste_fault(union asce asce, struct guest_fault *f)
newcrste.s.fc1.d = 1;
newcrste.s.fc1.sd = 1;
}
- if (!oldcrste.s.fc1.d && newcrste.s.fc1.d)
- SetPageDirty(phys_to_page(crste_origin_large(newcrste)));
/* In case of races, let the slow path deal with it. */
return !dat_crstep_xchg_atomic(f->crstep, oldcrste, newcrste, f->gfn, asce);
}
@@ -576,8 +574,6 @@ static int _gmap_handle_minor_pte_fault(struct gmap *gmap, union pgste *pgste,
newpte.s.d = 1;
newpte.s.sd = 1;
}
- if (!oldpte.s.d && newpte.s.d)
- SetPageDirty(pfn_to_page(newpte.h.pfra));
*pgste = gmap_ptep_xchg(gmap, f->ptep, newpte, *pgste, f->gfn);
return 0;
@@ -669,6 +665,7 @@ static int _gmap_link(struct kvm_s390_mmu_cache *mc, struct gmap *gmap, int leve
oldval = READ_ONCE(*f->crstep);
newval = _crste_fc1(f->pfn, oldval.h.tt, f->writable,
f->write_attempt | oldval.s.fc1.d);
+ newval.s.fc1.s = !f->page;
newval.s.fc1.sd = oldval.s.fc1.sd;
if (oldval.val != _CRSTE_EMPTY(oldval.h.tt).val &&
crste_origin_large(oldval) != crste_origin_large(newval))
diff --git a/arch/s390/kvm/gmap.h b/arch/s390/kvm/gmap.h
index 150e91e15ee0..1b900819bab2 100644
--- a/arch/s390/kvm/gmap.h
+++ b/arch/s390/kvm/gmap.h
@@ -185,6 +185,8 @@ static inline union pgste _gmap_ptep_xchg(struct gmap *gmap, union pte *ptep, un
else
_gmap_handle_vsie_unshadow_event(gmap, gfn);
}
+ if (!ptep->s.d && newpte.s.d && !ptep->s.s)
+ SetPageDirty(pfn_to_page(newpte.h.pfra));
return __dat_ptep_xchg(ptep, pgste, newpte, gfn, gmap->asce, uses_skeys(gmap));
}
@@ -220,6 +222,8 @@ static inline bool __must_check _gmap_crstep_xchg_atomic(struct gmap *gmap, unio
else
_gmap_handle_vsie_unshadow_event(gmap, gfn);
}
+ if (!oldcrste.s.fc1.d && newcrste.s.fc1.d && !oldcrste.s.fc1.s)
+ SetPageDirty(phys_to_page(crste_origin_large(newcrste)));
return dat_crstep_xchg_atomic(crstep, oldcrste, newcrste, gfn, gmap->asce);
}
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* Re: [PATCH v4 05/10] KVM: s390: Correctly handle guest mappings without struct page
2026-03-25 17:13 ` [PATCH v4 05/10] KVM: s390: Correctly handle guest mappings without struct page Claudio Imbrenda
@ 2026-03-26 9:45 ` Christian Borntraeger
0 siblings, 0 replies; 12+ messages in thread
From: Christian Borntraeger @ 2026-03-26 9:45 UTC (permalink / raw)
To: Claudio Imbrenda, kvm
Cc: linux-kernel, linux-s390, frankja, nrb, seiden, gra, schlameuss,
hca, david
Am 25.03.26 um 18:13 schrieb Claudio Imbrenda:
> Introduce a new special softbit for large pages, like already presend
> for normal pages, and use it to mark guest mappings that do not have
> struct pages.
>
> Whenever a leaf DAT entry becomes dirty, check the special softbit and
> only call SetPageDirty() if there is an actual struct page.
>
> Move the logic to mark pages dirty inside _gmap_ptep_xchg() and
> _gmap_crstep_xchg_atomic(), to avoid needlessly duplicating the code.
>
> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> Fixes: 5a74e3d93417 ("KVM: s390: KVM-specific bitfields and helper functions")
> Fixes: a2c17f9270cc ("KVM: s390: New gmap code")
Reviewed-by: Christian Borntraeger <borntraeger@linux.ibm.com>
> ---
> arch/s390/kvm/dat.h | 12 ++++++------
> arch/s390/kvm/gmap.c | 5 +----
> arch/s390/kvm/gmap.h | 4 ++++
> 3 files changed, 11 insertions(+), 10 deletions(-)
[..]
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v4 06/10] KVM: s390: vsie: Fix nested guest memory shadowing
2026-03-25 17:13 [PATCH v4 00/10] KVM: s390: More memory management fixes Claudio Imbrenda
` (4 preceding siblings ...)
2026-03-25 17:13 ` [PATCH v4 05/10] KVM: s390: Correctly handle guest mappings without struct page Claudio Imbrenda
@ 2026-03-25 17:13 ` Claudio Imbrenda
2026-03-25 17:13 ` [PATCH v4 07/10] KVM: s390: vsie: Fix refcount overflow for shadow gmaps Claudio Imbrenda
` (3 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Claudio Imbrenda @ 2026-03-25 17:13 UTC (permalink / raw)
To: kvm
Cc: linux-kernel, linux-s390, borntraeger, frankja, nrb, seiden, gra,
schlameuss, hca, david
Fix _do_shadow_pte() to use the correct pointer (guest pte instead of
nested guest) to set up the new pte.
Add a check to return -EOPNOTSUPP if the mapping for the nested guest
is writable but the same page in the guest is read-only.
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Fixes: e38c884df921 ("KVM: s390: Switch to new gmap")
---
arch/s390/kvm/gaccess.c | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 8fd690255e1b..6655c214f2d7 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1436,10 +1436,12 @@ static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union
if (!pgste_get_trylock(ptep_h, &pgste))
return -EAGAIN;
- newpte = _pte(f->pfn, f->writable, !p, 0);
- newpte.s.d |= ptep->s.d;
- newpte.s.sd |= ptep->s.sd;
- newpte.h.p &= ptep->h.p;
+ newpte = _pte(f->pfn, f->writable, !p, ptep_h->s.s);
+ newpte.s.d |= ptep_h->s.d;
+ newpte.s.sd |= ptep_h->s.sd;
+ newpte.h.p &= ptep_h->h.p;
+ if (!newpte.h.p && !f->writable)
+ return -EOPNOTSUPP;
pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false);
pgste.vsie_notif = 1;
pgste_set_unlock(ptep_h, pgste);
@@ -1477,6 +1479,9 @@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni
newcrste.h.p &= oldcrste.h.p;
newcrste.s.fc1.vsie_notif = 1;
newcrste.s.fc1.prefix_notif = oldcrste.s.fc1.prefix_notif;
+ newcrste.s.fc1.s = oldcrste.s.fc1.s;
+ if (!newcrste.h.p && !f->writable)
+ return -EOPNOTSUPP;
} while (!_gmap_crstep_xchg_atomic(sg->parent, host, oldcrste, newcrste, f->gfn, false));
newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, 0, !p);
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v4 07/10] KVM: s390: vsie: Fix refcount overflow for shadow gmaps
2026-03-25 17:13 [PATCH v4 00/10] KVM: s390: More memory management fixes Claudio Imbrenda
` (5 preceding siblings ...)
2026-03-25 17:13 ` [PATCH v4 06/10] KVM: s390: vsie: Fix nested guest memory shadowing Claudio Imbrenda
@ 2026-03-25 17:13 ` Claudio Imbrenda
2026-03-25 17:13 ` [PATCH v4 08/10] KVM: s390: vsie: Fix unshadowing while shadowing Claudio Imbrenda
` (2 subsequent siblings)
9 siblings, 0 replies; 12+ messages in thread
From: Claudio Imbrenda @ 2026-03-25 17:13 UTC (permalink / raw)
To: kvm
Cc: linux-kernel, linux-s390, borntraeger, frankja, nrb, seiden, gra,
schlameuss, hca, david
In most cases gmap_put() was not called when it should have been.
Add the missing gmap_put() in vsie_run().
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Fixes: e38c884df921 ("KVM: s390: Switch to new gmap")
Reviewed-by: Steffen Eiden <seiden@linux.ibm.com>
Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
---
arch/s390/kvm/vsie.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index 0330829b4046..72895dddc39a 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -1328,7 +1328,7 @@ static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
{
struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
- struct gmap *sg;
+ struct gmap *sg = NULL;
int rc = 0;
while (1) {
@@ -1368,6 +1368,8 @@ static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
sg = gmap_put(sg);
cond_resched();
}
+ if (sg)
+ sg = gmap_put(sg);
if (rc == -EFAULT) {
/*
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v4 08/10] KVM: s390: vsie: Fix unshadowing while shadowing
2026-03-25 17:13 [PATCH v4 00/10] KVM: s390: More memory management fixes Claudio Imbrenda
` (6 preceding siblings ...)
2026-03-25 17:13 ` [PATCH v4 07/10] KVM: s390: vsie: Fix refcount overflow for shadow gmaps Claudio Imbrenda
@ 2026-03-25 17:13 ` Claudio Imbrenda
2026-03-25 17:13 ` [PATCH v4 09/10] KVM: s390: vsie: Fix guest page tables protection Claudio Imbrenda
2026-03-25 17:13 ` [PATCH v4 10/10] KVM: s390: Fix KVM_S390_VCPU_FAULT ioctl Claudio Imbrenda
9 siblings, 0 replies; 12+ messages in thread
From: Claudio Imbrenda @ 2026-03-25 17:13 UTC (permalink / raw)
To: kvm
Cc: linux-kernel, linux-s390, borntraeger, frankja, nrb, seiden, gra,
schlameuss, hca, david
If shadowing causes the shadow gmap to get unshadowed, exit early to
prevent an attempt to dereference the parent pointer, which at this
point is NULL.
Opportunistically add some more checks to prevent NULL parents.
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Fixes: a2c17f9270cc ("KVM: s390: New gmap code")
Fixes: e5f98a6899bd ("KVM: s390: Add some helper functions needed for vSIE")
Fixes: e38c884df921 ("KVM: s390: Switch to new gmap")
---
arch/s390/kvm/gaccess.c | 5 +++++
arch/s390/kvm/gmap.c | 11 ++++++++++-
2 files changed, 15 insertions(+), 1 deletion(-)
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 6655c214f2d7..d14c2b40a5f6 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1472,6 +1472,9 @@ static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, uni
return rc;
do {
+ /* _gmap_crstep_xchg_atomic() could have unshadowed this shadow gmap */
+ if (!sg->parent)
+ return -EAGAIN;
oldcrste = READ_ONCE(*host);
newcrste = _crste_fc1(f->pfn, oldcrste.h.tt, f->writable, !p);
newcrste.s.fc1.d |= oldcrste.s.fc1.d;
@@ -1527,6 +1530,8 @@ static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
entries[i - 1].pfn, i, entries[i - 1].writable);
if (rc)
return rc;
+ if (!sg->parent)
+ return -EAGAIN;
}
rc = dat_entry_walk(NULL, entries[LEVEL_MEM].gfn, sg->parent->asce, DAT_WALK_LEAF,
diff --git a/arch/s390/kvm/gmap.c b/arch/s390/kvm/gmap.c
index e9fc471a5141..b19020470b13 100644
--- a/arch/s390/kvm/gmap.c
+++ b/arch/s390/kvm/gmap.c
@@ -1160,6 +1160,7 @@ struct gmap_protect_asce_top_level {
static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
struct gmap_protect_asce_top_level *context)
{
+ struct gmap *parent;
int rc, i;
guard(write_lock)(&sg->kvm->mmu_lock);
@@ -1167,7 +1168,12 @@ static inline int __gmap_protect_asce_top_level(struct kvm_s390_mmu_cache *mc, s
if (kvm_s390_array_needs_retry_safe(sg->kvm, context->seq, context->f))
return -EAGAIN;
- scoped_guard(spinlock, &sg->parent->children_lock) {
+ parent = READ_ONCE(sg->parent);
+ if (!parent)
+ return -EAGAIN;
+ scoped_guard(spinlock, &parent->children_lock) {
+ if (READ_ONCE(sg->parent) != parent)
+ return -EAGAIN;
for (i = 0; i < CRST_TABLE_PAGES; i++) {
if (!context->f[i].valid)
continue;
@@ -1250,6 +1256,9 @@ struct gmap *gmap_create_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *pare
struct gmap *sg, *new;
int rc;
+ if (WARN_ON(!parent))
+ return ERR_PTR(-EINVAL);
+
scoped_guard(spinlock, &parent->children_lock) {
sg = gmap_find_shadow(parent, asce, edat_level);
if (sg) {
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v4 09/10] KVM: s390: vsie: Fix guest page tables protection
2026-03-25 17:13 [PATCH v4 00/10] KVM: s390: More memory management fixes Claudio Imbrenda
` (7 preceding siblings ...)
2026-03-25 17:13 ` [PATCH v4 08/10] KVM: s390: vsie: Fix unshadowing while shadowing Claudio Imbrenda
@ 2026-03-25 17:13 ` Claudio Imbrenda
2026-03-25 17:13 ` [PATCH v4 10/10] KVM: s390: Fix KVM_S390_VCPU_FAULT ioctl Claudio Imbrenda
9 siblings, 0 replies; 12+ messages in thread
From: Claudio Imbrenda @ 2026-03-25 17:13 UTC (permalink / raw)
To: kvm
Cc: linux-kernel, linux-s390, borntraeger, frankja, nrb, seiden, gra,
schlameuss, hca, david
When shadowing, the guest page tables are write-protected, in order to
trap changes and properly unshadow the shadow mapping for the nested
guest. Already shadowed levels are skipped, so that only the needed
levels are write protected.
Currently the levels that get write protected are exactly one level too
deep: the last level (nested guest memory) gets protected in the wrong
way, and will be protected again correctly a few lines afterwards; most
importantly, the highest non-shadowed level does *not* get write
protected.
Moreover, if the nested guest is running in a real address space, there
are no DAT tables to shadow.
Write protect the correct levels, so that all the levels that need to
be protected are protected, and avoid double protecting the last level;
skip attempting to shadow the DAT tables when the nested guest is
running in a real address space.
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Fixes: e38c884df921 ("KVM: s390: Switch to new gmap")
Tested-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Reviewed-by: Janosch Frank <frankja@linux.ibm.com>
---
arch/s390/kvm/gaccess.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index d14c2b40a5f6..93d03e0fd704 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -1521,13 +1521,20 @@ static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
gl = get_level(table, ptep);
+ /* In case of a real address space */
+ if (w->level <= LEVEL_MEM) {
+ l = TABLE_TYPE_PAGE_TABLE;
+ hl = TABLE_TYPE_REGION1;
+ goto real_address_space;
+ }
+
/*
* Skip levels that are already protected. For each level, protect
* only the page containing the entry, not the whole table.
*/
for (i = gl ; i >= w->level; i--) {
- rc = gmap_protect_rmap(mc, sg, entries[i - 1].gfn, gpa_to_gfn(saddr),
- entries[i - 1].pfn, i, entries[i - 1].writable);
+ rc = gmap_protect_rmap(mc, sg, entries[i].gfn, gpa_to_gfn(saddr),
+ entries[i].pfn, i + 1, entries[i].writable);
if (rc)
return rc;
if (!sg->parent)
@@ -1543,6 +1550,7 @@ static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg,
/* Get the smallest granularity */
l = min3(gl, hl, w->level);
+real_address_space:
flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0);
/* If necessary, create the shadow mapping */
if (l < gl) {
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v4 10/10] KVM: s390: Fix KVM_S390_VCPU_FAULT ioctl
2026-03-25 17:13 [PATCH v4 00/10] KVM: s390: More memory management fixes Claudio Imbrenda
` (8 preceding siblings ...)
2026-03-25 17:13 ` [PATCH v4 09/10] KVM: s390: vsie: Fix guest page tables protection Claudio Imbrenda
@ 2026-03-25 17:13 ` Claudio Imbrenda
9 siblings, 0 replies; 12+ messages in thread
From: Claudio Imbrenda @ 2026-03-25 17:13 UTC (permalink / raw)
To: kvm
Cc: linux-kernel, linux-s390, borntraeger, frankja, nrb, seiden, gra,
schlameuss, hca, david
A previous commit changed the behaviour of the KVM_S390_VCPU_FAULT
ioctl. The current (wrong) implementation will trigger a guest
addressing exception if the requested address lies outside of a
memslot, unless the VM is UCONTROL.
Restore the previous behaviour by open coding the fault-in logic.
Fixes: 3762e905ec2e ("KVM: s390: use __kvm_faultin_pfn()")
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Reviewed-by: Steffen Eiden <seiden@linux.ibm.com>
---
arch/s390/kvm/kvm-s390.c | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ebcb0ef8835e..62f04931b54d 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -5520,9 +5520,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
#endif
case KVM_S390_VCPU_FAULT: {
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = vcpu_dat_fault_handler(vcpu, arg, 0);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ gpa_t gaddr = arg;
+
+ scoped_guard(srcu, &vcpu->kvm->srcu) {
+ r = vcpu_ucontrol_translate(vcpu, &gaddr);
+ if (r)
+ break;
+
+ r = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(gaddr), false);
+ if (r == PGM_ADDRESSING)
+ r = -EFAULT;
+ if (r <= 0)
+ break;
+ r = -EIO;
+ KVM_BUG_ON(r, vcpu->kvm);
+ }
break;
}
case KVM_ENABLE_CAP:
--
2.53.0
^ permalink raw reply related [flat|nested] 12+ messages in thread