From: Christoph Schlameuss <schlameuss@linux.ibm.com>
To: kvm@vger.kernel.org
Cc: linux-s390@vger.kernel.org, Heiko Carstens <hca@linux.ibm.com>,
Vasily Gorbik <gor@linux.ibm.com>,
Alexander Gordeev <agordeev@linux.ibm.com>,
Christian Borntraeger <borntraeger@linux.ibm.com>,
Janosch Frank <frankja@linux.ibm.com>,
Claudio Imbrenda <imbrenda@linux.ibm.com>,
Nico Boehr <nrb@linux.ibm.com>,
David Hildenbrand <david@redhat.com>,
Sven Schnelle <svens@linux.ibm.com>,
Paolo Bonzini <pbonzini@redhat.com>,
Shuah Khan <shuah@kernel.org>,
Christoph Schlameuss <schlameuss@linux.ibm.com>
Subject: [PATCH RFC v2 07/11] KVM: s390: Shadow VSIE SCA in guest-1
Date: Mon, 10 Nov 2025 18:16:47 +0100
Message-ID: <20251110-vsieie-v2-7-9e53a3618c8c@linux.ibm.com>
In-Reply-To: <20251110-vsieie-v2-0-9e53a3618c8c@linux.ibm.com>
Restructure kvm_s390_handle_vsie() to create a guest-1 shadow of the SCA
if guest-2 attempts to enter SIE with an SCA. If an SCA is used, the
vsie_pages are stored in a new vsie_sca struct instead of the arch vsie
struct.
When the VSIE-Interpretation-Extension Facility is active (z17 and
later) a shadow SCA (ssca_block) is created, together with shadows of
all CPUs defined in the configuration.
SCAOL/H in the VSIE control block are overwritten with references to the
shadow SCA.
The shadow SCA contains the addresses of the original guest-3 SCA as
well as the original VSIE control blocks. With these addresses the
machine can directly monitor the intervention bits within the original
SCA entries, enabling it to handle the SIGP SENSE_RUNNING and
EXTERNAL_CALL orders without exiting VSIE.
The original SCA is pinned in guest-2 memory and only unpinned before
reuse. This means some pages might still be pinned even after the
guest-3 VM no longer exists.
The vsie_sca structs are also kept in a radix tree so that already
existing ssca_blocks can be reused efficiently. The radix tree and the
array with references to the vsie_sca structs are held in the arch vsie
struct. The use of a vsie_sca is tracked with a ref_count.
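For reference, the resulting entry path in kvm_s390_handle_vsie() looks
roughly as follows (simplified sketch of the flow implemented by this
patch; locking, validity intercepts and error handling are omitted):

	vsie_page = get_vsie_page(vcpu, scb_addr); /* pins scb_o, reads SCAO */
	if (vsie_page->sca_gpa)
		/* look up in osca_to_sca, or pin the SCA and create a shadow */
		sca = get_vsie_sca(vcpu, vsie_page, vsie_page->sca_gpa);
	shadow_scb(vcpu, vsie_page);
	shadow_sca(vcpu, vsie_page, sca);  /* fill ssca_block, rewrite SCAOL/H */
	pin_blocks(vcpu, vsie_page);
	register_shadow_scb(vcpu, vsie_page);
	rc = vsie_run(vcpu, vsie_page);
	unregister_shadow_scb(vcpu);
	unpin_blocks(vcpu, vsie_page);
	unshadow_scb(vcpu, vsie_page);
	put_vsie_sca(sca);            /* original SCA stays pinned for reuse */
	put_vsie_page(vsie_page);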
Signed-off-by: Christoph Schlameuss <schlameuss@linux.ibm.com>
---
arch/s390/include/asm/kvm_host.h | 11 +-
arch/s390/include/asm/kvm_host_types.h | 5 +-
arch/s390/kvm/kvm-s390.c | 6 +-
arch/s390/kvm/kvm-s390.h | 2 +-
arch/s390/kvm/vsie.c | 672 ++++++++++++++++++++++++++++-----
5 files changed, 596 insertions(+), 100 deletions(-)
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 647014edd3de8abc15067e7203c4855c066c53ad..191b23edf0ac7e9a3e1fd9cdc6fc4c9a9e6769f8 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -597,13 +597,22 @@ struct sie_page2 {
};
struct vsie_page;
+struct vsie_sca;
+/*
+ * vsie_pages, scas and accompanying management vars
+ */
struct kvm_s390_vsie {
struct mutex mutex;
struct radix_tree_root addr_to_page;
int page_count;
int next;
- struct vsie_page *pages[KVM_MAX_VCPUS];
+ struct vsie_page *pages[KVM_S390_MAX_VSIE_VCPUS];
+ struct rw_semaphore ssca_lock;
+ struct radix_tree_root osca_to_sca;
+ int sca_count;
+ int sca_next;
+ struct vsie_sca *scas[KVM_S390_MAX_VSIE_VCPUS];
};
struct kvm_s390_gisa_iam {
diff --git a/arch/s390/include/asm/kvm_host_types.h b/arch/s390/include/asm/kvm_host_types.h
index ce52608449735d6ca629008c554e7df09f97e67b..3141a7163518c8fc1584c36efe216ff237722b7e 100644
--- a/arch/s390/include/asm/kvm_host_types.h
+++ b/arch/s390/include/asm/kvm_host_types.h
@@ -6,6 +6,9 @@
#include <linux/atomic.h>
#include <linux/types.h>
+#define KVM_S390_MAX_VSIE_VCPUS 256
+#define KVM_S390_MAX_SCA_PAGES 5
+
#define KVM_S390_BSCA_CPU_SLOTS 64
#define KVM_S390_ESCA_CPU_SLOTS 248
@@ -102,7 +105,7 @@ struct esca_block {
struct ssca_block {
__u64 osca;
__u64 reserved08[7];
- struct ssca_entry cpu[KVM_MAX_VCPUS];
+ struct ssca_entry cpu[KVM_S390_MAX_VSIE_VCPUS];
};
/*
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ab672aa93f758711af4defb13875fd49a6609758..e3fc53e33e90be7dab75f73ebd0b949c13d22939 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -470,6 +470,9 @@ static void __init kvm_s390_cpu_feat_init(void)
allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS);
if (sclp.has_kss)
allow_cpu_feat(KVM_S390_VM_CPU_FEAT_KSS);
+ if (sclp.has_vsie_sigpif)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIGPIF);
+
/*
* KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make
* all skey handling functions read/set the skey from the PGSTE
@@ -484,9 +487,6 @@ static void __init kvm_s390_cpu_feat_init(void)
* For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and
* KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be
* correctly shadowed. We can do that for the PGSTE but not for PTE.I.
- *
- * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We
- * cannot easily shadow the SCA because of the ipte lock.
*/
}
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 65c950760993467398b68f3763d6f81f52c52385..0e33f00cd63e8b9f261a0c52add86560f2918d05 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -577,7 +577,7 @@ static inline int kvm_s390_use_sca_entries(void)
* might use the entries. By not setting the entries and keeping them
* invalid, hardware will not access them but intercept.
*/
- return sclp.has_sigpif && sclp.has_esca;
+ return sclp.has_sigpif;
}
void kvm_s390_reinject_machine_check(struct kvm_vcpu *vcpu,
struct mcck_volatile_info *mcck_info);
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index e86fef0fa3919668902c766813991572c2311b09..72c794945be916cc107aba74e1609d3b4780d4b9 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -26,6 +26,12 @@
enum vsie_page_flags {
VSIE_PAGE_IN_USE = 0,
+ VSIE_PAGE_PINNED = 1,
+};
+
+enum vsie_sca_flags {
+ VSIE_SCA_ESCA = 0,
+ VSIE_SCA_PINNED = 1,
};
struct vsie_page {
@@ -62,7 +68,9 @@ struct vsie_page {
* looked up by other CPUs.
*/
unsigned long flags; /* 0x0260 */
- __u8 reserved[0x0700 - 0x0268]; /* 0x0268 */
+ /* vsie system control area */
+ struct vsie_sca *sca; /* 0x0268 */
+ __u8 reserved[0x0700 - 0x0270]; /* 0x0270 */
struct kvm_s390_crypto_cb crycb; /* 0x0700 */
__u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */
};
@@ -72,6 +80,41 @@ struct kvm_address_pair {
hpa_t hpa;
};
+/*
+ * Store the vsie system configuration data.
+ */
+struct vsie_sca {
+ /* calculated guest addresses of the sca */
+ gpa_t sca_gpa;
+ atomic_t ref_count;
+ /* defined in enum vsie_sca_flags */
+ unsigned long flags;
+ unsigned long sca_o_nr_pages;
+ struct kvm_address_pair sca_o_pages[KVM_S390_MAX_SCA_PAGES];
+ u64 mcn[4];
+ struct ssca_block *ssca;
+ int page_count;
+ int page_next;
+ struct vsie_page *pages[KVM_S390_MAX_VSIE_VCPUS];
+};
+
+static inline bool use_vsie_sigpif(struct kvm *kvm)
+{
+ return kvm->arch.use_vsie_sigpif;
+}
+
+static inline bool use_vsie_sigpif_for(struct kvm *kvm, struct vsie_page *vsie_page)
+{
+ return use_vsie_sigpif(kvm) &&
+ (vsie_page->scb_o->eca & ECA_SIGPI) &&
+ (vsie_page->scb_o->ecb & ECB_SRSI);
+}
+
+static inline bool sie_uses_esca(struct kvm_s390_sie_block *scb)
+{
+ return (scb->ecb2 & ECB2_ESCA);
+}
+
/**
* gmap_shadow_valid() - check if a shadow guest address space matches the
* given properties and is still valid
@@ -630,6 +673,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
scb_s->icpua = scb_o->icpua;
+ write_scao(scb_s, virt_to_phys(vsie_page->sca->ssca));
+ scb_s->osda = virt_to_phys(scb_o);
if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM))
new_mso = READ_ONCE(scb_o->mso) & 0xfffffffffff00000UL;
@@ -681,6 +726,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
/* Instruction Execution Prevention */
if (test_kvm_facility(vcpu->kvm, 130))
scb_s->ecb2 |= scb_o->ecb2 & ECB2_IEP;
+ /* extended SCA */
+ scb_s->ecb2 |= scb_o->ecb2 & ECB2_ESCA;
/* Guarded Storage */
if (test_kvm_facility(vcpu->kvm, 133)) {
scb_s->ecb |= scb_o->ecb & ECB_GS;
@@ -713,12 +760,250 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
return rc;
}
+/* Called with ssca_lock held. */
+static void unpin_sca(struct kvm *kvm, struct vsie_sca *sca)
+{
+ if (!test_bit(VSIE_SCA_PINNED, &sca->flags))
+ return;
+
+ unpin_guest_pages(kvm, sca->sca_o_pages, sca->sca_o_nr_pages);
+ sca->sca_o_nr_pages = 0;
+
+ __clear_bit(VSIE_SCA_PINNED, &sca->flags);
+}
+
+/* pin g2's original sca in g1 memory */
+static int pin_sca(struct kvm *kvm, struct vsie_page *vsie_page, struct vsie_sca *sca)
+{
+ bool is_esca = sie_uses_esca(vsie_page->scb_o);
+ int nr_pages = KVM_S390_MAX_SCA_PAGES;
+
+ if (test_bit(VSIE_SCA_PINNED, &sca->flags))
+ return 0;
+
+ if (!is_esca) {
+ nr_pages = 1;
+ if ((sca->sca_gpa & ~PAGE_MASK) + sizeof(struct bsca_block) > PAGE_SIZE)
+ nr_pages = 2;
+ }
+
+ sca->sca_o_nr_pages = pin_guest_pages(kvm, sca->sca_gpa, nr_pages, sca->sca_o_pages);
+ if (WARN_ON_ONCE(sca->sca_o_nr_pages != nr_pages)) {
+ set_validity_icpt(&vsie_page->scb_s, 0x0034U);
+ return -EIO;
+ }
+ __set_bit(VSIE_SCA_PINNED, &sca->flags);
+
+ return 0;
+}
+
+static void get_sca_entry_addr(struct kvm *kvm, struct vsie_page *vsie_page, struct vsie_sca *sca,
+ u16 cpu_nr, gpa_t *gpa, hpa_t *hpa)
+{
+ hpa_t offset;
+ int pn;
+
+ /*
+ * We cannot simply access the hva since the esca_block typically has
+ * 4 pages (arch max 5 pages) that might not be contiguous in g1 memory.
+ * The bsca_block may also be stretched over two pages. Only the header
+ * is guaranteed to be on the same page.
+ */
+ if (test_bit(VSIE_SCA_ESCA, &sca->flags))
+ offset = offsetof(struct esca_block, cpu[cpu_nr]);
+ else
+ offset = offsetof(struct bsca_block, cpu[cpu_nr]);
+ pn = ((vsie_page->sca->sca_gpa & ~PAGE_MASK) + offset) >> PAGE_SHIFT;
+ if (WARN_ON_ONCE(pn >= sca->sca_o_nr_pages))
+ return;
+
+ if (gpa)
+ *gpa = sca->sca_o_pages[pn].gpa + offset;
+ if (hpa)
+ *hpa = sca->sca_o_pages[pn].hpa + offset;
+}
+
+/*
+ * Try to find the address of an existing shadow system control area.
+ * @sca_o_gpa: original system control area address; guest-2 physical
+ *
+ * Called with ssca_lock held.
+ */
+static struct vsie_sca *get_existing_vsie_sca(struct kvm *kvm, gpa_t sca_o_gpa)
+{
+ struct vsie_sca *sca = radix_tree_lookup(&kvm->arch.vsie.osca_to_sca, sca_o_gpa);
+
+ if (sca)
+ WARN_ON_ONCE(atomic_inc_return(&sca->ref_count) < 1);
+ return sca;
+}
+
+/*
+ * Try to find a currently unused vsie_sca in the vsie struct.
+ *
+ * Called with ssca_lock held.
+ */
+static struct vsie_sca *get_free_existing_vsie_sca(struct kvm *kvm)
+{
+ struct vsie_sca *sca;
+ int i, ref_count;
+
+ for (i = 0; i < kvm->arch.vsie.sca_count; i++) {
+ sca = kvm->arch.vsie.scas[kvm->arch.vsie.sca_next];
+ kvm->arch.vsie.sca_next++;
+ kvm->arch.vsie.sca_next %= kvm->arch.vsie.sca_count;
+ ref_count = atomic_inc_return(&sca->ref_count);
+ WARN_ON_ONCE(ref_count < 1);
+ if (ref_count == 1)
+ return sca;
+ atomic_dec(&sca->ref_count);
+ }
+ return ERR_PTR(-EFAULT);
+}
+
+static void destroy_vsie_sca(struct kvm *kvm, struct vsie_sca *sca)
+{
+ radix_tree_delete(&kvm->arch.vsie.osca_to_sca, sca->sca_gpa);
+ if (sca->ssca)
+ free_pages_exact(sca->ssca, sizeof(*sca->ssca));
+ sca->ssca = NULL;
+ free_page((unsigned long)sca);
+}
+
+static void put_vsie_sca(struct vsie_sca *sca)
+{
+ if (!sca)
+ return;
+
+ WARN_ON_ONCE(atomic_dec_return(&sca->ref_count) < 0);
+}
+
+/*
+ * Pin and get an existing or new guest system control area.
+ *
+ * May sleep.
+ */
+static struct vsie_sca *get_vsie_sca(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+ gpa_t sca_addr)
+{
+ struct vsie_sca *sca, *sca_new = NULL;
+ struct kvm *kvm = vcpu->kvm;
+ unsigned int max_sca;
+ int rc;
+
+ rc = validate_scao(vcpu, vsie_page->scb_o, vsie_page->sca_gpa);
+ if (rc)
+ return ERR_PTR(rc);
+
+ /* get existing sca */
+ down_read(&kvm->arch.vsie.ssca_lock);
+ sca = get_existing_vsie_sca(kvm, sca_addr);
+ up_read(&kvm->arch.vsie.ssca_lock);
+ if (sca)
+ return sca;
+
+ /*
+ * Allocate new ssca, it will likely be needed below.
+ * We want at least #online_vcpus shadows, so every VCPU can execute the
+ * VSIE in parallel. (Worst case all single core VMs.)
+ */
+ max_sca = MIN(atomic_read(&kvm->online_vcpus), KVM_S390_MAX_VSIE_VCPUS);
+ if (kvm->arch.vsie.sca_count < max_sca) {
+ BUILD_BUG_ON(sizeof(struct vsie_sca) > PAGE_SIZE);
+ sca_new = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!sca_new)
+ return ERR_PTR(-ENOMEM);
+
+ if (use_vsie_sigpif(vcpu->kvm)) {
+ BUILD_BUG_ON(offsetof(struct ssca_block, cpu) != 64);
+ sca_new->ssca = alloc_pages_exact(sizeof(*sca_new->ssca),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!sca_new->ssca) {
+ free_page((unsigned long)sca_new);
+ sca_new = NULL;
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+ }
+
+ /* enter write lock and recheck to make sure the ssca has not been created by another cpu */
+ down_write(&kvm->arch.vsie.ssca_lock);
+ sca = get_existing_vsie_sca(kvm, sca_addr);
+ if (sca)
+ goto out;
+
+ /* check again under write lock if we are still under our sca_count limit */
+ if (sca_new && kvm->arch.vsie.sca_count < max_sca) {
+ /* make use of vsie_sca just created */
+ sca = sca_new;
+ sca_new = NULL;
+
+ kvm->arch.vsie.scas[kvm->arch.vsie.sca_count] = sca;
+ } else {
+ /* reuse previously created vsie_sca allocation for different osca */
+ sca = get_free_existing_vsie_sca(kvm);
+ /* with nr_vcpus scas one must be free */
+ if (IS_ERR(sca))
+ goto out;
+
+ unpin_sca(kvm, sca);
+ radix_tree_delete(&kvm->arch.vsie.osca_to_sca, sca->sca_gpa);
+ memset(sca, 0, sizeof(struct vsie_sca));
+ }
+
+ /* use ECB of shadow scb to determine SCA type */
+ if (sie_uses_esca(vsie_page->scb_o))
+ __set_bit(VSIE_SCA_ESCA, &sca->flags);
+ sca->sca_gpa = sca_addr;
+ sca->pages[vsie_page->scb_o->icpua] = vsie_page;
+
+ if (sca->sca_gpa != 0) {
+ /*
+ * The pinned original sca will only be unpinned lazily to limit the
+ * required amount of pins/unpins on each vsie entry/exit.
+ * The unpin is done in the reuse vsie_sca allocation path above and
+ * kvm_s390_vsie_destroy().
+ */
+ rc = pin_sca(kvm, vsie_page, sca);
+ if (rc) {
+ sca = ERR_PTR(rc);
+ goto out;
+ }
+ }
+
+ atomic_set(&sca->ref_count, 1);
+ radix_tree_insert(&kvm->arch.vsie.osca_to_sca, sca->sca_gpa, sca);
+
+out:
+ up_write(&kvm->arch.vsie.ssca_lock);
+ if (sca_new)
+ destroy_vsie_sca(kvm, sca_new);
+ return sca;
+}
+
+static void kvm_s390_vsie_gmap_donotify(struct gmap *gmap, unsigned long start,
+ unsigned long end, struct vsie_page *cur_page)
+{
+ unsigned long prefix;
+
+ if (!cur_page)
+ return;
+ if (READ_ONCE(cur_page->gmap) != gmap)
+ return;
+ prefix = cur_page->scb_s.prefix << GUEST_PREFIX_SHIFT;
+ /* with mso/msl, the prefix lies at an offset */
+ prefix += cur_page->scb_s.mso;
+ if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1)
+ prefix_unmapped_sync(cur_page);
+}
+
void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
unsigned long end)
{
struct kvm *kvm = gmap->private;
- struct vsie_page *cur;
- unsigned long prefix;
+ struct vsie_page *cur_page;
+ struct vsie_sca *sca;
+ unsigned int cpu_nr;
int i;
if (!gmap_is_shadow(gmap))
@@ -728,16 +1013,17 @@ void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
* therefore we can safely reference them all the time.
*/
for (i = 0; i < kvm->arch.vsie.page_count; i++) {
- cur = READ_ONCE(kvm->arch.vsie.pages[i]);
- if (!cur)
- continue;
- if (READ_ONCE(cur->gmap) != gmap)
+ cur_page = READ_ONCE(kvm->arch.vsie.pages[i]);
+ kvm_s390_vsie_gmap_donotify(gmap, start, end, cur_page);
+ }
+ for (i = 0; i < kvm->arch.vsie.sca_count; i++) {
+ sca = READ_ONCE(kvm->arch.vsie.scas[i]);
+ if (!sca || !atomic_read(&sca->ref_count))
continue;
- prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT;
- /* with mso/msl, the prefix lies at an offset */
- prefix += cur->scb_s.mso;
- if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1)
- prefix_unmapped_sync(cur);
+ for_each_set_bit_inv(cpu_nr, (unsigned long *)sca->mcn, KVM_S390_MAX_VSIE_VCPUS) {
+ cur_page = sca->pages[cpu_nr];
+ kvm_s390_vsie_gmap_donotify(gmap, start, end, cur_page);
+ }
}
}
@@ -789,13 +1075,6 @@ static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
hpa_t hpa;
- hpa = read_scao(vcpu->kvm, scb_s);
- if (hpa) {
- unpin_guest_page(vcpu->kvm, vsie_page->sca_gpa, hpa);
- vsie_page->sca_gpa = 0;
- write_scao(scb_s, 0);
- }
-
hpa = scb_s->itdba;
if (hpa) {
unpin_guest_page(vcpu->kvm, vsie_page->itdba_gpa, hpa);
@@ -847,20 +1126,6 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
gpa_t gpa;
int rc = 0;
- gpa = read_scao(vcpu->kvm, scb_o);
- if (gpa) {
- rc = validate_scao(vcpu, scb_o, gpa);
- if (!rc) {
- rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
- if (rc)
- rc = set_validity_icpt(scb_s, 0x0034U);
- }
- if (rc)
- goto unpin;
- vsie_page->sca_gpa = gpa;
- write_scao(scb_s, hpa);
- }
-
gpa = READ_ONCE(scb_o->itdba) & ~0xffUL;
if (gpa && (scb_s->ecb & ECB_TE)) {
if (gpa < 2 * PAGE_SIZE) {
@@ -948,14 +1213,18 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
}
/* unpin the scb provided by guest 2, marking it as dirty */
-static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
- gpa_t gpa)
+static void unpin_scb(struct kvm *kvm, struct vsie_page *vsie_page)
{
- hpa_t hpa = virt_to_phys(vsie_page->scb_o);
+ hpa_t hpa;
+
+ if (!test_bit(VSIE_PAGE_PINNED, &vsie_page->flags))
+ return;
+ hpa = virt_to_phys(vsie_page->scb_o);
if (hpa)
- unpin_guest_page(vcpu->kvm, gpa, hpa);
+ unpin_guest_page(kvm, vsie_page->scb_gpa, hpa);
vsie_page->scb_o = NULL;
+ clear_bit(VSIE_PAGE_PINNED, &vsie_page->flags);
}
/*
@@ -964,19 +1233,22 @@ static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
* Returns: - 0 if the scb was pinned.
* - > 0 if control has to be given to guest 2
*/
-static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
- gpa_t gpa)
+static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
{
hpa_t hpa;
int rc;
- rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (test_bit(VSIE_PAGE_PINNED, &vsie_page->flags))
+ return 0;
+
+ rc = pin_guest_page(vcpu->kvm, vsie_page->scb_gpa, &hpa);
if (rc) {
rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
WARN_ON_ONCE(rc);
return 1;
}
vsie_page->scb_o = phys_to_virt(hpa);
+ __set_bit(VSIE_PAGE_PINNED, &vsie_page->flags);
return 0;
}
@@ -1453,75 +1725,129 @@ static void put_vsie_page(struct vsie_page *vsie_page)
clear_bit(VSIE_PAGE_IN_USE, &vsie_page->flags);
}
+static void free_vsie_page(struct vsie_page *vsie_page)
+{
+ free_page((unsigned long)vsie_page);
+}
+
+static struct vsie_page *malloc_vsie_page(struct kvm *kvm)
+{
+ struct vsie_page *vsie_page;
+
+ vsie_page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA);
+ if (!vsie_page)
+ return ERR_PTR(-ENOMEM);
+
+ /* Mark it as invalid until it resides in the tree. */
+ vsie_page->scb_gpa = ULONG_MAX;
+ return vsie_page;
+}
+
/*
* Get or create a vsie page for a scb address.
*
+ * Original control blocks are pinned when the vsie_page pointing to them is
+ * returned.
+ * Newly created vsie_pages only have vsie_page->scb_gpa and vsie_page->sca_gpa
+ * set.
+ *
* Returns: - address of a vsie page (cached or new one)
* - NULL if the same scb address is already used by another VCPU
* - ERR_PTR(-ENOMEM) if out of memory
*/
-static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
+static struct vsie_page *get_vsie_page(struct kvm_vcpu *vcpu, unsigned long addr)
{
- struct vsie_page *vsie_page;
- int nr_vcpus;
+ struct vsie_page *vsie_page, *vsie_page_new = NULL;
+ struct kvm *kvm = vcpu->kvm;
+ unsigned int max_vsie_page;
+ int rc, pages_idx;
+ gpa_t sca_addr;
- rcu_read_lock();
vsie_page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9);
- rcu_read_unlock();
- if (vsie_page) {
- if (try_get_vsie_page(vsie_page)) {
- if (vsie_page->scb_gpa == addr)
- return vsie_page;
- /*
- * We raced with someone reusing + putting this vsie
- * page before we grabbed it.
- */
- put_vsie_page(vsie_page);
- }
+ if (vsie_page && try_get_vsie_page(vsie_page)) {
+ if (vsie_page->scb_gpa == addr)
+ return vsie_page;
+ /*
+ * We raced with someone reusing + putting this vsie
+ * page before we grabbed it.
+ */
+ put_vsie_page(vsie_page);
}
- /*
- * We want at least #online_vcpus shadows, so every VCPU can execute
- * the VSIE in parallel.
- */
- nr_vcpus = atomic_read(&kvm->online_vcpus);
+ max_vsie_page = MIN(atomic_read(&kvm->online_vcpus), KVM_S390_MAX_VSIE_VCPUS);
+
+ /* allocate new vsie_page - we will likely need it */
+ if (addr || kvm->arch.vsie.page_count < max_vsie_page) {
+ vsie_page_new = malloc_vsie_page(kvm);
+ if (IS_ERR(vsie_page_new))
+ return vsie_page_new;
+ }
mutex_lock(&kvm->arch.vsie.mutex);
- if (kvm->arch.vsie.page_count < nr_vcpus) {
- vsie_page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO | GFP_DMA);
- if (!vsie_page) {
- mutex_unlock(&kvm->arch.vsie.mutex);
- return ERR_PTR(-ENOMEM);
- }
- __set_bit(VSIE_PAGE_IN_USE, &vsie_page->flags);
- kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = vsie_page;
+ if (addr || kvm->arch.vsie.page_count < max_vsie_page) {
+ pages_idx = kvm->arch.vsie.page_count;
+ vsie_page = vsie_page_new;
+ vsie_page_new = NULL;
+ kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = vsie_page;
kvm->arch.vsie.page_count++;
} else {
/* reuse an existing entry that belongs to nobody */
+ if (vsie_page_new)
+ free_vsie_page(vsie_page_new);
while (true) {
vsie_page = kvm->arch.vsie.pages[kvm->arch.vsie.next];
- if (try_get_vsie_page(vsie_page))
+ if (try_get_vsie_page(vsie_page)) {
+ pages_idx = kvm->arch.vsie.next;
break;
+ }
kvm->arch.vsie.next++;
- kvm->arch.vsie.next %= nr_vcpus;
+ kvm->arch.vsie.next %= max_vsie_page;
}
+
+ unpin_scb(kvm, vsie_page);
if (vsie_page->scb_gpa != ULONG_MAX)
radix_tree_delete(&kvm->arch.vsie.addr_to_page,
vsie_page->scb_gpa >> 9);
}
- /* Mark it as invalid until it resides in the tree. */
- vsie_page->scb_gpa = ULONG_MAX;
+
+ vsie_page->scb_gpa = addr;
+ rc = pin_scb(vcpu, vsie_page);
+ if (rc) {
+ vsie_page->scb_gpa = ULONG_MAX;
+ free_vsie_page(vsie_page);
+ mutex_unlock(&kvm->arch.vsie.mutex);
+ return ERR_PTR(-ENOMEM);
+ }
+ sca_addr = read_scao(kvm, vsie_page->scb_o);
+ vsie_page->sca_gpa = sca_addr;
+ __set_bit(VSIE_PAGE_IN_USE, &vsie_page->flags);
/* Double use of the same address or allocation failure. */
if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9,
vsie_page)) {
+ unpin_scb(kvm, vsie_page);
put_vsie_page(vsie_page);
mutex_unlock(&kvm->arch.vsie.mutex);
return NULL;
}
- vsie_page->scb_gpa = addr;
mutex_unlock(&kvm->arch.vsie.mutex);
+ /*
+ * If the vsie cb does use a sca we store the vsie_page within the
+ * vsie_sca later. But we need to allocate an empty page to leave no
+ * hole in the arch.vsie.pages.
+ */
+ if (sca_addr) {
+ vsie_page_new = malloc_vsie_page(kvm);
+ if (IS_ERR(vsie_page_new)) {
+ unpin_scb(kvm, vsie_page);
+ put_vsie_page(vsie_page);
+ return vsie_page_new;
+ }
+ kvm->arch.vsie.pages[pages_idx] = vsie_page_new;
+ vsie_page_new = NULL;
+ }
+
memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
release_gmap_shadow(vsie_page);
vsie_page->fault_addr = 0;
@@ -1529,11 +1855,124 @@ static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
return vsie_page;
}
+static struct vsie_page *get_vsie_page_cpu_nr(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+ gpa_t scb_o_gpa, u16 cpu_nr)
+{
+ struct vsie_page *vsie_page_n;
+
+ vsie_page_n = get_vsie_page(vcpu, scb_o_gpa);
+ if (IS_ERR(vsie_page_n))
+ return vsie_page_n;
+ shadow_scb(vcpu, vsie_page_n);
+ vsie_page_n->scb_s.eca |= vsie_page->scb_o->eca & ECA_SIGPI;
+ vsie_page_n->scb_s.ecb |= vsie_page->scb_o->ecb & ECB_SRSI;
+ put_vsie_page(vsie_page_n);
+ WARN_ON_ONCE(!((u64)vsie_page_n->scb_gpa & PAGE_MASK));
+ WARN_ON_ONCE(!((u64)vsie_page_n & PAGE_MASK));
+
+ return vsie_page_n;
+}
+
+/*
+ * Fill the shadow system control area used for vsie sigpif.
+ */
+static int init_ssca(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct vsie_sca *sca)
+{
+ hpa_t sca_o_entry_hpa, osca = sca->sca_o_pages[0].hpa;
+ bool is_esca = sie_uses_esca(vsie_page->scb_o);
+ unsigned int cpu_nr, cpu_slots;
+ struct vsie_page *vsie_page_n;
+ gpa_t scb_o_gpa;
+ int i;
+
+ /* copy mcn to detect updates */
+ if (is_esca)
+ for (i = 0; i < 4; i++)
+ sca->mcn[i] = ((struct esca_block *)phys_to_virt(osca))->mcn[i];
+ else
+ sca->mcn[0] = ((struct bsca_block *)phys_to_virt(osca))->mcn;
+
+ /* pin and make minimal shadow for ALL scb in the sca */
+ cpu_slots = is_esca ? KVM_S390_MAX_VSIE_VCPUS : KVM_S390_BSCA_CPU_SLOTS;
+ for_each_set_bit_inv(cpu_nr, (unsigned long *)&vsie_page->sca->mcn, cpu_slots) {
+ get_sca_entry_addr(vcpu->kvm, vsie_page, sca, cpu_nr, NULL, &sca_o_entry_hpa);
+ if (is_esca)
+ scb_o_gpa = ((struct esca_entry *)phys_to_virt(sca_o_entry_hpa))->sda;
+ else
+ scb_o_gpa = ((struct bsca_entry *)phys_to_virt(sca_o_entry_hpa))->sda;
+
+ if (vsie_page->scb_s.icpua == cpu_nr)
+ vsie_page_n = vsie_page;
+ else
+ vsie_page_n = get_vsie_page_cpu_nr(vcpu, vsie_page, scb_o_gpa, cpu_nr);
+ if (IS_ERR(vsie_page_n))
+ goto err;
+
+ if (!sca->pages[vsie_page_n->scb_o->icpua])
+ sca->pages[vsie_page_n->scb_o->icpua] = vsie_page_n;
+ WARN_ON_ONCE(sca->pages[vsie_page_n->scb_o->icpua] != vsie_page_n);
+ sca->ssca->cpu[cpu_nr].ssda = virt_to_phys(&vsie_page_n->scb_s);
+ sca->ssca->cpu[cpu_nr].ossea = sca_o_entry_hpa;
+ }
+
+ sca->ssca->osca = osca;
+ return 0;
+
+err:
+ for_each_set_bit_inv(cpu_nr, (unsigned long *)&vsie_page->sca->mcn, cpu_slots) {
+ sca->ssca->cpu[cpu_nr].ssda = 0;
+ sca->ssca->cpu[cpu_nr].ossea = 0;
+ }
+ return PTR_ERR(vsie_page_n);
+}
+
+/*
+ * Shadow the sca on vsie enter.
+ */
+static int shadow_sca(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct vsie_sca *sca)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ int rc;
+
+ vsie_page->sca = sca;
+ if (!sca)
+ return false;
+
+ if (!sca->pages[vsie_page->scb_o->icpua])
+ sca->pages[vsie_page->scb_o->icpua] = vsie_page;
+ WARN_ON_ONCE(sca->pages[vsie_page->scb_o->icpua] != vsie_page);
+
+ if (!sca->ssca)
+ return false;
+ if (!use_vsie_sigpif_for(vcpu->kvm, vsie_page))
+ return false;
+
+ /* skip if the guest does not have a usable sca */
+ if (!sca->ssca->osca) {
+ rc = init_ssca(vcpu, vsie_page, sca);
+ if (rc)
+ return rc;
+ }
+
+ /*
+ * only shadow sigpif if we actually have a sca that we can properly
+ * shadow with vsie_sigpif
+ */
+ scb_s->eca |= vsie_page->scb_o->eca & ECA_SIGPI;
+ scb_s->ecb |= vsie_page->scb_o->ecb & ECB_SRSI;
+
+ WRITE_ONCE(scb_s->osda, virt_to_phys(vsie_page->scb_o));
+ write_scao(scb_s, virt_to_phys(sca->ssca));
+
+ return false;
+}
+
int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
{
struct vsie_page *vsie_page;
- unsigned long scb_addr;
- int rc;
+ struct vsie_sca *sca = NULL;
+ gpa_t scb_addr;
+ int rc = 0;
vcpu->stat.instruction_sie++;
if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2))
@@ -1554,31 +1993,45 @@ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
return 0;
}
- vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
+ /* get the vsie_page including the vsie control block */
+ vsie_page = get_vsie_page(vcpu, scb_addr);
if (IS_ERR(vsie_page))
return PTR_ERR(vsie_page);
- else if (!vsie_page)
+ if (!vsie_page)
/* double use of sie control block - simply do nothing */
return 0;
- rc = pin_scb(vcpu, vsie_page, scb_addr);
- if (rc)
- goto out_put;
+ /* get the vsie_sca including references to the original sca and all cbs */
+ if (vsie_page->sca_gpa) {
+ sca = get_vsie_sca(vcpu, vsie_page, vsie_page->sca_gpa);
+ if (IS_ERR(sca)) {
+ rc = PTR_ERR(sca);
+ goto out_put_vsie_page;
+ }
+ }
+
+ /* shadow scb and sca for vsie_run */
rc = shadow_scb(vcpu, vsie_page);
if (rc)
- goto out_unpin_scb;
+ goto out_put_vsie_sca;
+ rc = shadow_sca(vcpu, vsie_page, sca);
+ if (rc)
+ goto out_unshadow_scb;
+
rc = pin_blocks(vcpu, vsie_page);
if (rc)
- goto out_unshadow;
+ goto out_unshadow_scb;
register_shadow_scb(vcpu, vsie_page);
+
rc = vsie_run(vcpu, vsie_page);
+
unregister_shadow_scb(vcpu);
unpin_blocks(vcpu, vsie_page);
-out_unshadow:
+out_unshadow_scb:
unshadow_scb(vcpu, vsie_page);
-out_unpin_scb:
- unpin_scb(vcpu, vsie_page, scb_addr);
-out_put:
+out_put_vsie_sca:
+ put_vsie_sca(sca);
+out_put_vsie_page:
put_vsie_page(vsie_page);
return rc < 0 ? rc : 0;
@@ -1589,27 +2042,58 @@ void kvm_s390_vsie_init(struct kvm *kvm)
{
mutex_init(&kvm->arch.vsie.mutex);
INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL_ACCOUNT);
+ init_rwsem(&kvm->arch.vsie.ssca_lock);
+ INIT_RADIX_TREE(&kvm->arch.vsie.osca_to_sca, GFP_KERNEL_ACCOUNT);
+}
+
+static void kvm_s390_vsie_destroy_page(struct kvm *kvm, struct vsie_page *vsie_page)
+{
+ if (!vsie_page)
+ return;
+ unpin_scb(kvm, vsie_page);
+ release_gmap_shadow(vsie_page);
+ /* free the radix tree entry */
+ if (vsie_page->scb_gpa != ULONG_MAX)
+ radix_tree_delete(&kvm->arch.vsie.addr_to_page,
+ vsie_page->scb_gpa >> 9);
+ free_vsie_page(vsie_page);
}
/* Destroy the vsie data structures. To be called when a vm is destroyed. */
void kvm_s390_vsie_destroy(struct kvm *kvm)
{
struct vsie_page *vsie_page;
- int i;
+ struct vsie_sca *sca;
+ int i, j;
mutex_lock(&kvm->arch.vsie.mutex);
for (i = 0; i < kvm->arch.vsie.page_count; i++) {
vsie_page = kvm->arch.vsie.pages[i];
kvm->arch.vsie.pages[i] = NULL;
- release_gmap_shadow(vsie_page);
- /* free the radix tree entry */
- if (vsie_page->scb_gpa != ULONG_MAX)
- radix_tree_delete(&kvm->arch.vsie.addr_to_page,
- vsie_page->scb_gpa >> 9);
- free_page((unsigned long)vsie_page);
+ kvm_s390_vsie_destroy_page(kvm, vsie_page);
}
- kvm->arch.vsie.page_count = 0;
mutex_unlock(&kvm->arch.vsie.mutex);
+ down_write(&kvm->arch.vsie.ssca_lock);
+ for (i = 0; i < kvm->arch.vsie.sca_count; i++) {
+ sca = kvm->arch.vsie.scas[i];
+ kvm->arch.vsie.scas[i] = NULL;
+
+ mutex_lock(&kvm->arch.vsie.mutex);
+ for (j = 0; j < KVM_S390_MAX_VSIE_VCPUS; j++) {
+ vsie_page = sca->pages[j];
+ sca->pages[j] = NULL;
+ kvm_s390_vsie_destroy_page(kvm, vsie_page);
+ }
+ sca->page_count = 0;
+ mutex_unlock(&kvm->arch.vsie.mutex);
+
+ unpin_sca(kvm, sca);
+ atomic_set(&sca->ref_count, 0);
+ destroy_vsie_sca(kvm, sca);
+ }
+ kvm->arch.vsie.sca_count = 0;
+ up_write(&kvm->arch.vsie.ssca_lock);
}
void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu)
--
2.51.1