public inbox for linux-s390@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v1 1/1] KVM: s390: pv: fix asynchronous teardown for small VMs
@ 2023-04-20 16:01 Claudio Imbrenda
  2023-04-20 16:15 ` Marc Hartmayer
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Claudio Imbrenda @ 2023-04-20 16:01 UTC (permalink / raw)
  To: kvm; +Cc: borntraeger, nrb, nsg, frankja, mhartmay, kvm390-list, linux-s390

On machines without the Destroy Secure Configuration Fast UVC, the
topmost level of page tables is set aside and freed asynchronously
as last step of the asynchronous teardown.

Each gmap has a host_to_guest radix tree mapping host (userspace)
addresses (with 1M granularity) to gmap segment table entries (pmds).

If a guest is smaller than 2GB, the topmost level of page tables is the
segment table (i.e. there are only 2 levels). Replacing it means that
the pointers in the host_to_guest mapping would become stale and cause
all kinds of nasty issues.

This patch fixes the issue by synchronously destroying all guests with
only 2 levels of page tables in kvm_s390_pv_set_aside. This will
speed up the process and avoid the issue altogether.

Update s390_replace_asce so it refuses to replace segment type ASCEs.

Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Fixes: fb491d5500a7 ("KVM: s390: pv: asynchronous destroy for reboot")
---
 arch/s390/kvm/pv.c  | 35 ++++++++++++++++++++---------------
 arch/s390/mm/gmap.c |  7 +++++++
 2 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index e032ebbf51b9..ceb8cb628d62 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -39,6 +39,7 @@ struct pv_vm_to_be_destroyed {
 	u64 handle;
 	void *stor_var;
 	unsigned long stor_base;
+	bool small;
 };
 
 static void kvm_s390_clear_pv_state(struct kvm *kvm)
@@ -318,7 +319,11 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
 	if (!priv)
 		return -ENOMEM;
 
-	if (is_destroy_fast_available()) {
+	if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) {
+		/* No need to do things asynchronously for VMs under 2GB */
+		res = kvm_s390_pv_deinit_vm(kvm, rc, rrc);
+		priv->small = true;
+	} else if (is_destroy_fast_available()) {
 		res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc);
 	} else {
 		priv->stor_var = kvm->arch.pv.stor_var;
@@ -335,7 +340,8 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
 		return res;
 	}
 
-	kvm_s390_destroy_lower_2g(kvm);
+	if (!priv->small)
+		kvm_s390_destroy_lower_2g(kvm);
 	kvm_s390_clear_pv_state(kvm);
 	kvm->arch.pv.set_aside = priv;
 
@@ -418,7 +424,10 @@ int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc)
 
 	/* If a previous protected VM was set aside, put it in the need_cleanup list */
 	if (kvm->arch.pv.set_aside) {
-		list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
+		if (((struct pv_vm_to_be_destroyed *)kvm->arch.pv.set_aside)->small)
+			kfree(kvm->arch.pv.set_aside);
+		else
+			list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
 		kvm->arch.pv.set_aside = NULL;
 	}
 
@@ -485,26 +494,22 @@ int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
 	if (!p)
 		return -EINVAL;
 
-	/* When a fatal signal is received, stop immediately */
-	if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX))
+	if (p->small)
 		goto done;
-	if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
-		ret = -EIO;
-	kfree(p);
-	p = NULL;
-done:
-	/*
-	 * p is not NULL if we aborted because of a fatal signal, in which
-	 * case queue the leftover for later cleanup.
-	 */
-	if (p) {
+	/* When a fatal signal is received, stop immediately */
+	if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) {
 		mutex_lock(&kvm->lock);
 		list_add(&p->list, &kvm->arch.pv.need_cleanup);
 		mutex_unlock(&kvm->lock);
 		/* Did not finish, but pretend things went well */
 		*rc = UVC_RC_EXECUTED;
 		*rrc = 42;
+		return 0;
 	}
+	if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
+		ret = -EIO;
+done:
+	kfree(p);
 	return ret;
 }
 
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 5a716bdcba05..2267cf9819b2 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2833,6 +2833,9 @@ EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
  * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
  * @gmap: the gmap whose ASCE needs to be replaced
  *
+ * If the ASCE is a SEGMENT type then this function will return -EINVAL,
+ * otherwise the pointers in the host_to_guest radix tree will keep pointing
+ * to the wrong pages, causing use-after-free and memory corruption.
  * If the allocation of the new top level page table fails, the ASCE is not
  * replaced.
  * In any case, the old ASCE is always removed from the gmap CRST list.
@@ -2847,6 +2850,10 @@ int s390_replace_asce(struct gmap *gmap)
 
 	s390_unlist_old_asce(gmap);
 
+	/* Replacing segment type ASCEs would cause serious issues */
+	if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
+		return -EINVAL;
+
 	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
 	if (!page)
 		return -ENOMEM;
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2023-04-21  8:18 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-04-20 16:01 [PATCH v1 1/1] KVM: s390: pv: fix asynchronous teardown for small VMs Claudio Imbrenda
2023-04-20 16:15 ` Marc Hartmayer
2023-04-21  7:35   ` Claudio Imbrenda
2023-04-21  8:04 ` Janosch Frank
2023-04-21  8:17   ` Claudio Imbrenda
2023-04-21  8:07 ` Christian Borntraeger
2023-04-21  8:17   ` Claudio Imbrenda

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox