public inbox for linux-s390@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v1 1/1] KVM: s390: pv: fix asynchronous teardown for small VMs
@ 2023-04-20 16:01 Claudio Imbrenda
  2023-04-20 16:15 ` Marc Hartmayer
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Claudio Imbrenda @ 2023-04-20 16:01 UTC (permalink / raw)
  To: kvm; +Cc: borntraeger, nrb, nsg, frankja, mhartmay, kvm390-list, linux-s390

On machines without the Destroy Secure Configuration Fast UVC, the
topmost level of page tables is set aside and freed asynchronously
as last step of the asynchronous teardown.

Each gmap has a host_to_guest radix tree mapping host (userspace)
addresses (with 1M granularity) to gmap segment table entries (pmds).

If a guest is smaller than 2GB, the topmost level of page tables is the
segment table (i.e. there are only 2 levels). Replacing it means that
the pointers in the host_to_guest mapping would become stale and cause
all kinds of nasty issues.

This patch fixes the issue by synchronously destroying all guests with
only 2 levels of page tables in kvm_s390_pv_set_aside. This will
speed up the process and avoid the issue altogether.

Update s390_replace_asce so it refuses to replace segment type ASCEs.

Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Fixes: fb491d5500a7 ("KVM: s390: pv: asynchronous destroy for reboot")
---
 arch/s390/kvm/pv.c  | 35 ++++++++++++++++++++---------------
 arch/s390/mm/gmap.c |  7 +++++++
 2 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
index e032ebbf51b9..ceb8cb628d62 100644
--- a/arch/s390/kvm/pv.c
+++ b/arch/s390/kvm/pv.c
@@ -39,6 +39,7 @@ struct pv_vm_to_be_destroyed {
 	u64 handle;
 	void *stor_var;
 	unsigned long stor_base;
+	bool small;
 };
 
 static void kvm_s390_clear_pv_state(struct kvm *kvm)
@@ -318,7 +319,11 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
 	if (!priv)
 		return -ENOMEM;
 
-	if (is_destroy_fast_available()) {
+	if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) {
+		/* No need to do things asynchronously for VMs under 2GB */
+		res = kvm_s390_pv_deinit_vm(kvm, rc, rrc);
+		priv->small = true;
+	} else if (is_destroy_fast_available()) {
 		res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc);
 	} else {
 		priv->stor_var = kvm->arch.pv.stor_var;
@@ -335,7 +340,8 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
 		return res;
 	}
 
-	kvm_s390_destroy_lower_2g(kvm);
+	if (!priv->small)
+		kvm_s390_destroy_lower_2g(kvm);
 	kvm_s390_clear_pv_state(kvm);
 	kvm->arch.pv.set_aside = priv;
 
@@ -418,7 +424,10 @@ int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc)
 
 	/* If a previous protected VM was set aside, put it in the need_cleanup list */
 	if (kvm->arch.pv.set_aside) {
-		list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
+		if (((struct pv_vm_to_be_destroyed *)kvm->arch.pv.set_aside)->small)
+			kfree(kvm->arch.pv.set_aside);
+		else
+			list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
 		kvm->arch.pv.set_aside = NULL;
 	}
 
@@ -485,26 +494,22 @@ int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
 	if (!p)
 		return -EINVAL;
 
-	/* When a fatal signal is received, stop immediately */
-	if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX))
+	if (p->small)
 		goto done;
-	if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
-		ret = -EIO;
-	kfree(p);
-	p = NULL;
-done:
-	/*
-	 * p is not NULL if we aborted because of a fatal signal, in which
-	 * case queue the leftover for later cleanup.
-	 */
-	if (p) {
+	/* When a fatal signal is received, stop immediately */
+	if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) {
 		mutex_lock(&kvm->lock);
 		list_add(&p->list, &kvm->arch.pv.need_cleanup);
 		mutex_unlock(&kvm->lock);
 		/* Did not finish, but pretend things went well */
 		*rc = UVC_RC_EXECUTED;
 		*rrc = 42;
+		return 0;
 	}
+	if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
+		ret = -EIO;
+done:
+	kfree(p);
 	return ret;
 }
 
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 5a716bdcba05..2267cf9819b2 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2833,6 +2833,9 @@ EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
  * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
  * @gmap: the gmap whose ASCE needs to be replaced
  *
+ * If the ASCE is a SEGMENT type then this function will return -EINVAL,
+ * otherwise the pointers in the host_to_guest radix tree will keep pointing
+ * to the wrong pages, causing use-after-free and memory corruption.
  * If the allocation of the new top level page table fails, the ASCE is not
  * replaced.
  * In any case, the old ASCE is always removed from the gmap CRST list.
@@ -2847,6 +2850,10 @@ int s390_replace_asce(struct gmap *gmap)
 
 	s390_unlist_old_asce(gmap);
 
+	/* Replacing segment type ASCEs would cause serious issues */
+	if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
+		return -EINVAL;
+
 	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
 	if (!page)
 		return -ENOMEM;
-- 
2.39.2


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v1 1/1] KVM: s390: pv: fix asynchronous teardown for small VMs
  2023-04-20 16:01 [PATCH v1 1/1] KVM: s390: pv: fix asynchronous teardown for small VMs Claudio Imbrenda
@ 2023-04-20 16:15 ` Marc Hartmayer
  2023-04-21  7:35   ` Claudio Imbrenda
  2023-04-21  8:04 ` Janosch Frank
  2023-04-21  8:07 ` Christian Borntraeger
  2 siblings, 1 reply; 7+ messages in thread
From: Marc Hartmayer @ 2023-04-20 16:15 UTC (permalink / raw)
  To: Claudio Imbrenda, kvm
  Cc: borntraeger, nrb, nsg, frankja, kvm390-list, linux-s390

Claudio Imbrenda <imbrenda@linux.ibm.com> writes:

> On machines without the Destroy Secure Configuration Fast UVC, the
> topmost level of page tables is set aside and freed asynchronously
> as last step of the asynchronous teardown.
>
> Each gmap has a host_to_guest radix tree mapping host (userspace)
> addresses (with 1M granularity) to gmap segment table entries (pmds).
>
> If a guest is smaller than 2GB, the topmost level of page tables is the
> segment table (i.e. there are only 2 levels). Replacing it means that
> the pointers in the host_to_guest mapping would become stale and cause
> all kinds of nasty issues.
>
> This patch fixes the issue by synchronously destroying all guests with
> only 2 levels of page tables in kvm_s390_pv_set_aside. This will
> speed up the process and avoid the issue altogether.
>
> Update s390_replace_asce so it refuses to replace segment type ASCEs.
>
> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> Fixes: fb491d5500a7 ("KVM: s390: pv: asynchronous destroy for reboot")
> ---
>  arch/s390/kvm/pv.c  | 35 ++++++++++++++++++++---------------
>  arch/s390/mm/gmap.c |  7 +++++++
>  2 files changed, 27 insertions(+), 15 deletions(-)
>
> diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
> index e032ebbf51b9..ceb8cb628d62 100644
> --- a/arch/s390/kvm/pv.c
> +++ b/arch/s390/kvm/pv.c
> @@ -39,6 +39,7 @@ struct pv_vm_to_be_destroyed {
>  	u64 handle;
>  	void *stor_var;
>  	unsigned long stor_base;
> +	bool small;

I would rather use a more verbose variable name (maybe ‘lower_2g‘
accordingly to the `kvm_s390_destroy_lower_2g` function name or even
better something indicating the actual implications).

At least, I would add a comment what ‘small‘ means and the implications
of it.

[…snip]


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v1 1/1] KVM: s390: pv: fix asynchronous teardown for small VMs
  2023-04-20 16:15 ` Marc Hartmayer
@ 2023-04-21  7:35   ` Claudio Imbrenda
  0 siblings, 0 replies; 7+ messages in thread
From: Claudio Imbrenda @ 2023-04-21  7:35 UTC (permalink / raw)
  To: Marc Hartmayer
  Cc: kvm, borntraeger, nrb, nsg, frankja, kvm390-list, linux-s390

On Thu, 20 Apr 2023 18:15:36 +0200
"Marc Hartmayer" <mhartmay@linux.ibm.com> wrote:

> Claudio Imbrenda <imbrenda@linux.ibm.com> writes:
> 
> > On machines without the Destroy Secure Configuration Fast UVC, the
> > topmost level of page tables is set aside and freed asynchronously
> > as last step of the asynchronous teardown.
> >
> > Each gmap has a host_to_guest radix tree mapping host (userspace)
> > addresses (with 1M granularity) to gmap segment table entries (pmds).
> >
> > If a guest is smaller than 2GB, the topmost level of page tables is the
> > segment table (i.e. there are only 2 levels). Replacing it means that
> > the pointers in the host_to_guest mapping would become stale and cause
> > all kinds of nasty issues.
> >
> > This patch fixes the issue by synchronously destroying all guests with
> > only 2 levels of page tables in kvm_s390_pv_set_aside. This will
> > speed up the process and avoid the issue altogether.
> >
> > Update s390_replace_asce so it refuses to replace segment type ASCEs.
> >
> > Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> > Fixes: fb491d5500a7 ("KVM: s390: pv: asynchronous destroy for reboot")
> > ---
> >  arch/s390/kvm/pv.c  | 35 ++++++++++++++++++++---------------
> >  arch/s390/mm/gmap.c |  7 +++++++
> >  2 files changed, 27 insertions(+), 15 deletions(-)
> >
> > diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
> > index e032ebbf51b9..ceb8cb628d62 100644
> > --- a/arch/s390/kvm/pv.c
> > +++ b/arch/s390/kvm/pv.c
> > @@ -39,6 +39,7 @@ struct pv_vm_to_be_destroyed {
> >  	u64 handle;
> >  	void *stor_var;
> >  	unsigned long stor_base;
> > +	bool small;  
> 
> I would rather use a more verbose variable name (maybe ‘lower_2g‘

lower_2g would mean that there is more above 2g, but in this case the VM
is smaller than 2g (which is the whole point)

> accordingly to the `kvm_s390_destroy_lower_2g` function name or even
> better something indicating the actual implications).
> 
> At least, I would add a comment what ‘small‘ means and the implications

I'll add a comment

> of it.
> 
> […snip]
> 


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v1 1/1] KVM: s390: pv: fix asynchronous teardown for small VMs
  2023-04-20 16:01 [PATCH v1 1/1] KVM: s390: pv: fix asynchronous teardown for small VMs Claudio Imbrenda
  2023-04-20 16:15 ` Marc Hartmayer
@ 2023-04-21  8:04 ` Janosch Frank
  2023-04-21  8:17   ` Claudio Imbrenda
  2023-04-21  8:07 ` Christian Borntraeger
  2 siblings, 1 reply; 7+ messages in thread
From: Janosch Frank @ 2023-04-21  8:04 UTC (permalink / raw)
  To: Claudio Imbrenda, kvm
  Cc: borntraeger, nrb, nsg, mhartmay, kvm390-list, linux-s390

On 4/20/23 18:01, Claudio Imbrenda wrote:
> On machines without the Destroy Secure Configuration Fast UVC, the
> topmost level of page tables is set aside and freed asynchronously
> as last step of the asynchronous teardown.
> 
> Each gmap has a host_to_guest radix tree mapping host (userspace)
> addresses (with 1M granularity) to gmap segment table entries (pmds).
> 
> If a guest is smaller than 2GB, the topmost level of page tables is the
> segment table (i.e. there are only 2 levels). Replacing it means that
> the pointers in the host_to_guest mapping would become stale and cause
> all kinds of nasty issues.

Ouff

> 
> This patch fixes the issue by synchronously destroying all guests with
> only 2 levels of page tables in kvm_s390_pv_set_aside. This will
> speed up the process and avoid the issue altogether.
> 
> Update s390_replace_asce so it refuses to replace segment type ASCEs.
> 
> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> Fixes: fb491d5500a7 ("KVM: s390: pv: asynchronous destroy for reboot")
> ---
>   arch/s390/kvm/pv.c  | 35 ++++++++++++++++++++---------------
>   arch/s390/mm/gmap.c |  7 +++++++
>   2 files changed, 27 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
> index e032ebbf51b9..ceb8cb628d62 100644
> --- a/arch/s390/kvm/pv.c
> +++ b/arch/s390/kvm/pv.c
> @@ -39,6 +39,7 @@ struct pv_vm_to_be_destroyed {
>   	u64 handle;
>   	void *stor_var;
>   	unsigned long stor_base;
> +	bool small;

I second Marc's complaints :)

There's no way that the gmap can be manipulated to cause the 
use-after-free problems by adding/removing memory? I.e. changing to >2GB 
memory before your checks and then to < 2GB after the checks?

>   };
>   
>   static void kvm_s390_clear_pv_state(struct kvm *kvm)
> @@ -318,7 +319,11 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
>   	if (!priv)
>   		return -ENOMEM;
>   
> -	if (is_destroy_fast_available()) {
> +	if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) {

How about adding this to gmap.h?

bool gmap_asce_non_replaceable(struct gmap *gmap)
{
	return (gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT;
}

> +		/* No need to do things asynchronously for VMs under 2GB */
> +		res = kvm_s390_pv_deinit_vm(kvm, rc, rrc);
> +		priv->small = true;
> +	} else if (is_destroy_fast_available()) {
>   		res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc);
>   	} else {
>   		priv->stor_var = kvm->arch.pv.stor_var;
> @@ -335,7 +340,8 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
>   		return res;
>   	}
>   
> -	kvm_s390_destroy_lower_2g(kvm);
> +	if (!priv->small)
> +		kvm_s390_destroy_lower_2g(kvm);
>   	kvm_s390_clear_pv_state(kvm);
>   	kvm->arch.pv.set_aside = priv;
>   
> @@ -418,7 +424,10 @@ int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc)
>   
>   	/* If a previous protected VM was set aside, put it in the need_cleanup list */
>   	if (kvm->arch.pv.set_aside) {
> -		list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
> +		if (((struct pv_vm_to_be_destroyed *)kvm->arch.pv.set_aside)->small)

cur = (struct pv_vm_to_be_destroyed *)kvm->arch.pv.set_aside;

if (cur->small)
[...]


> +			kfree(kvm->arch.pv.set_aside);
> +		else
> +			list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
>   		kvm->arch.pv.set_aside = NULL;
>   	}
>   
> @@ -485,26 +494,22 @@ int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
>   	if (!p)
>   		return -EINVAL;
>   
> -	/* When a fatal signal is received, stop immediately */
> -	if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX))
> +	if (p->small)
>   		goto done;
> -	if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
> -		ret = -EIO;
> -	kfree(p);
> -	p = NULL;
> -done:
> -	/*
> -	 * p is not NULL if we aborted because of a fatal signal, in which
> -	 * case queue the leftover for later cleanup.
> -	 */
> -	if (p) {
> +	/* When a fatal signal is received, stop immediately */
> +	if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) {
>   		mutex_lock(&kvm->lock);
>   		list_add(&p->list, &kvm->arch.pv.need_cleanup);
>   		mutex_unlock(&kvm->lock);
>   		/* Did not finish, but pretend things went well */
>   		*rc = UVC_RC_EXECUTED;
>   		*rrc = 42;
> +		return 0;
>   	}
> +	if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
> +		ret = -EIO;
> +done:
> +	kfree(p);
>   	return ret;
>   }
>   
> diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> index 5a716bdcba05..2267cf9819b2 100644
> --- a/arch/s390/mm/gmap.c
> +++ b/arch/s390/mm/gmap.c
> @@ -2833,6 +2833,9 @@ EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
>    * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
>    * @gmap: the gmap whose ASCE needs to be replaced
>    *
> + * If the ASCE is a SEGMENT type then this function will return -EINVAL,
> + * otherwise the pointers in the host_to_guest radix tree will keep pointing
> + * to the wrong pages, causing use-after-free and memory corruption.
>    * If the allocation of the new top level page table fails, the ASCE is not
>    * replaced.
>    * In any case, the old ASCE is always removed from the gmap CRST list.
> @@ -2847,6 +2850,10 @@ int s390_replace_asce(struct gmap *gmap)
>   
>   	s390_unlist_old_asce(gmap);
>   
> +	/* Replacing segment type ASCEs would cause serious issues */
> +	if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
> +		return -EINVAL;
> +
>   	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
>   	if (!page)
>   		return -ENOMEM;


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v1 1/1] KVM: s390: pv: fix asynchronous teardown for small VMs
  2023-04-20 16:01 [PATCH v1 1/1] KVM: s390: pv: fix asynchronous teardown for small VMs Claudio Imbrenda
  2023-04-20 16:15 ` Marc Hartmayer
  2023-04-21  8:04 ` Janosch Frank
@ 2023-04-21  8:07 ` Christian Borntraeger
  2023-04-21  8:17   ` Claudio Imbrenda
  2 siblings, 1 reply; 7+ messages in thread
From: Christian Borntraeger @ 2023-04-21  8:07 UTC (permalink / raw)
  To: Claudio Imbrenda, kvm
  Cc: nrb, nsg, frankja, mhartmay, kvm390-list, linux-s390



Am 20.04.23 um 18:01 schrieb Claudio Imbrenda:
> On machines without the Destroy Secure Configuration Fast UVC, the
> topmost level of page tables is set aside and freed asynchronously
> as last step of the asynchronous teardown.
> 
> Each gmap has a host_to_guest radix tree mapping host (userspace)
> addresses (with 1M granularity) to gmap segment table entries (pmds).
> 
> If a guest is smaller than 2GB, the topmost level of page tables is the
> segment table (i.e. there are only 2 levels). Replacing it means that
> the pointers in the host_to_guest mapping would become stale and cause
> all kinds of nasty issues.
> 
> This patch fixes the issue by synchronously destroying all guests with
> only 2 levels of page tables in kvm_s390_pv_set_aside. This will
> speed up the process and avoid the issue altogether.
> 
> Update s390_replace_asce so it refuses to replace segment type ASCEs.
> 
> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> Fixes: fb491d5500a7 ("KVM: s390: pv: asynchronous destroy for reboot")
> ---
>   arch/s390/kvm/pv.c  | 35 ++++++++++++++++++++---------------
>   arch/s390/mm/gmap.c |  7 +++++++
>   2 files changed, 27 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
> index e032ebbf51b9..ceb8cb628d62 100644
> --- a/arch/s390/kvm/pv.c
> +++ b/arch/s390/kvm/pv.c
> @@ -39,6 +39,7 @@ struct pv_vm_to_be_destroyed {
>   	u64 handle;
>   	void *stor_var;
>   	unsigned long stor_base;
> +	bool small;
>   };
>   
>   static void kvm_s390_clear_pv_state(struct kvm *kvm)
> @@ -318,7 +319,11 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
>   	if (!priv)
>   		return -ENOMEM;
>   
> -	if (is_destroy_fast_available()) {
> +	if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) {
> +		/* No need to do things asynchronously for VMs under 2GB */
> +		res = kvm_s390_pv_deinit_vm(kvm, rc, rrc);
> +		priv->small = true;
> +	} else if (is_destroy_fast_available()) {
>   		res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc);
>   	} else {
>   		priv->stor_var = kvm->arch.pv.stor_var;
> @@ -335,7 +340,8 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
>   		return res;
>   	}
>   
> -	kvm_s390_destroy_lower_2g(kvm);
> +	if (!priv->small)
> +		kvm_s390_destroy_lower_2g(kvm);
>   	kvm_s390_clear_pv_state(kvm);
>   	kvm->arch.pv.set_aside = priv;
>   
> @@ -418,7 +424,10 @@ int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc)
>   
>   	/* If a previous protected VM was set aside, put it in the need_cleanup list */
>   	if (kvm->arch.pv.set_aside) {
> -		list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
> +		if (((struct pv_vm_to_be_destroyed *)kvm->arch.pv.set_aside)->small)
why do we need a cast here?

> +			kfree(kvm->arch.pv.set_aside);
> +		else
> +			list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
>   		kvm->arch.pv.set_aside = NULL;
>   	}
>   

With the comment added that Marc asked for

Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v1 1/1] KVM: s390: pv: fix asynchronous teardown for small VMs
  2023-04-21  8:04 ` Janosch Frank
@ 2023-04-21  8:17   ` Claudio Imbrenda
  0 siblings, 0 replies; 7+ messages in thread
From: Claudio Imbrenda @ 2023-04-21  8:17 UTC (permalink / raw)
  To: Janosch Frank
  Cc: kvm, borntraeger, nrb, nsg, mhartmay, kvm390-list, linux-s390

On Fri, 21 Apr 2023 10:04:50 +0200
Janosch Frank <frankja@linux.ibm.com> wrote:

> On 4/20/23 18:01, Claudio Imbrenda wrote:
> > On machines without the Destroy Secure Configuration Fast UVC, the
> > topmost level of page tables is set aside and freed asynchronously
> > as last step of the asynchronous teardown.
> > 
> > Each gmap has a host_to_guest radix tree mapping host (userspace)
> > addresses (with 1M granularity) to gmap segment table entries (pmds).
> > 
> > If a guest is smaller than 2GB, the topmost level of page tables is the
> > segment table (i.e. there are only 2 levels). Replacing it means that
> > the pointers in the host_to_guest mapping would become stale and cause
> > all kinds of nasty issues.  
> 
> Ouff
> 
> > 
> > This patch fixes the issue by synchronously destroying all guests with
> > only 2 levels of page tables in kvm_s390_pv_set_aside. This will
> > speed up the process and avoid the issue altogether.
> > 
> > Update s390_replace_asce so it refuses to replace segment type ASCEs.
> > 
> > Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> > Fixes: fb491d5500a7 ("KVM: s390: pv: asynchronous destroy for reboot")
> > ---
> >   arch/s390/kvm/pv.c  | 35 ++++++++++++++++++++---------------
> >   arch/s390/mm/gmap.c |  7 +++++++
> >   2 files changed, 27 insertions(+), 15 deletions(-)
> > 
> > diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
> > index e032ebbf51b9..ceb8cb628d62 100644
> > --- a/arch/s390/kvm/pv.c
> > +++ b/arch/s390/kvm/pv.c
> > @@ -39,6 +39,7 @@ struct pv_vm_to_be_destroyed {
> >   	u64 handle;
> >   	void *stor_var;
> >   	unsigned long stor_base;
> > +	bool small;  
> 
> I second Marc's complaints :)
> 
> There's no way that the gmap can be manipulated to cause the 
> use-after-free problems by adding/removing memory? I.e. changing to >2GB 
> memory before your checks and then to < 2GB after the checks?

no, the gmap __limit__ cannot be changed after CPUs are created.

if the gmap is smaller than 2GB but has a limit >2GB then it will have
3 levels.

this is moot, though, I have found a much smaller and simpler solution
after talking with Marc

> 
> >   };
> >   
> >   static void kvm_s390_clear_pv_state(struct kvm *kvm)
> > @@ -318,7 +319,11 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
> >   	if (!priv)
> >   		return -ENOMEM;
> >   
> > -	if (is_destroy_fast_available()) {
> > +	if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) {  
> 
> How about adding this to gmap.h?
> 
> bool gmap_asce_non_replaceable(struct gmap *gmap)
> {
> 	return (gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT;
> }

do you really want a function / macro for that?

> 
> > +		/* No need to do things asynchronously for VMs under 2GB */
> > +		res = kvm_s390_pv_deinit_vm(kvm, rc, rrc);
> > +		priv->small = true;
> > +	} else if (is_destroy_fast_available()) {
> >   		res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc);
> >   	} else {
> >   		priv->stor_var = kvm->arch.pv.stor_var;
> > @@ -335,7 +340,8 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
> >   		return res;
> >   	}
> >   
> > -	kvm_s390_destroy_lower_2g(kvm);
> > +	if (!priv->small)
> > +		kvm_s390_destroy_lower_2g(kvm);
> >   	kvm_s390_clear_pv_state(kvm);
> >   	kvm->arch.pv.set_aside = priv;
> >   
> > @@ -418,7 +424,10 @@ int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc)
> >   
> >   	/* If a previous protected VM was set aside, put it in the need_cleanup list */
> >   	if (kvm->arch.pv.set_aside) {
> > -		list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
> > +		if (((struct pv_vm_to_be_destroyed *)kvm->arch.pv.set_aside)->small)  
> 
> cur = (struct pv_vm_to_be_destroyed *)kvm->arch.pv.set_aside;
> 
> if (cur->small)
> [...]

this will go

> 
> 
> > +			kfree(kvm->arch.pv.set_aside);
> > +		else
> > +			list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
> >   		kvm->arch.pv.set_aside = NULL;
> >   	}
> >   
> > @@ -485,26 +494,22 @@ int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
> >   	if (!p)
> >   		return -EINVAL;
> >   
> > -	/* When a fatal signal is received, stop immediately */
> > -	if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX))
> > +	if (p->small)
> >   		goto done;
> > -	if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
> > -		ret = -EIO;
> > -	kfree(p);
> > -	p = NULL;
> > -done:
> > -	/*
> > -	 * p is not NULL if we aborted because of a fatal signal, in which
> > -	 * case queue the leftover for later cleanup.
> > -	 */
> > -	if (p) {
> > +	/* When a fatal signal is received, stop immediately */
> > +	if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) {
> >   		mutex_lock(&kvm->lock);
> >   		list_add(&p->list, &kvm->arch.pv.need_cleanup);
> >   		mutex_unlock(&kvm->lock);
> >   		/* Did not finish, but pretend things went well */
> >   		*rc = UVC_RC_EXECUTED;
> >   		*rrc = 42;
> > +		return 0;
> >   	}
> > +	if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
> > +		ret = -EIO;
> > +done:
> > +	kfree(p);
> >   	return ret;
> >   }
> >   
> > diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> > index 5a716bdcba05..2267cf9819b2 100644
> > --- a/arch/s390/mm/gmap.c
> > +++ b/arch/s390/mm/gmap.c
> > @@ -2833,6 +2833,9 @@ EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
> >    * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
> >    * @gmap: the gmap whose ASCE needs to be replaced
> >    *
> > + * If the ASCE is a SEGMENT type then this function will return -EINVAL,
> > + * otherwise the pointers in the host_to_guest radix tree will keep pointing
> > + * to the wrong pages, causing use-after-free and memory corruption.
> >    * If the allocation of the new top level page table fails, the ASCE is not
> >    * replaced.
> >    * In any case, the old ASCE is always removed from the gmap CRST list.
> > @@ -2847,6 +2850,10 @@ int s390_replace_asce(struct gmap *gmap)
> >   
> >   	s390_unlist_old_asce(gmap);
> >   
> > +	/* Replacing segment type ASCEs would cause serious issues */
> > +	if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
> > +		return -EINVAL;
> > +
> >   	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
> >   	if (!page)
> >   		return -ENOMEM;  
> 


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v1 1/1] KVM: s390: pv: fix asynchronous teardown for small VMs
  2023-04-21  8:07 ` Christian Borntraeger
@ 2023-04-21  8:17   ` Claudio Imbrenda
  0 siblings, 0 replies; 7+ messages in thread
From: Claudio Imbrenda @ 2023-04-21  8:17 UTC (permalink / raw)
  To: Christian Borntraeger
  Cc: kvm, nrb, nsg, frankja, mhartmay, kvm390-list, linux-s390

On Fri, 21 Apr 2023 10:07:22 +0200
Christian Borntraeger <borntraeger@de.ibm.com> wrote:

> Am 20.04.23 um 18:01 schrieb Claudio Imbrenda:
> > On machines without the Destroy Secure Configuration Fast UVC, the
> > topmost level of page tables is set aside and freed asynchronously
> > as last step of the asynchronous teardown.
> > 
> > Each gmap has a host_to_guest radix tree mapping host (userspace)
> > addresses (with 1M granularity) to gmap segment table entries (pmds).
> > 
> > If a guest is smaller than 2GB, the topmost level of page tables is the
> > segment table (i.e. there are only 2 levels). Replacing it means that
> > the pointers in the host_to_guest mapping would become stale and cause
> > all kinds of nasty issues.
> > 
> > This patch fixes the issue by synchronously destroying all guests with
> > only 2 levels of page tables in kvm_s390_pv_set_aside. This will
> > speed up the process and avoid the issue altogether.
> > 
> > Update s390_replace_asce so it refuses to replace segment type ASCEs.
> > 
> > Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> > Fixes: fb491d5500a7 ("KVM: s390: pv: asynchronous destroy for reboot")
> > ---
> >   arch/s390/kvm/pv.c  | 35 ++++++++++++++++++++---------------
> >   arch/s390/mm/gmap.c |  7 +++++++
> >   2 files changed, 27 insertions(+), 15 deletions(-)
> > 
> > diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
> > index e032ebbf51b9..ceb8cb628d62 100644
> > --- a/arch/s390/kvm/pv.c
> > +++ b/arch/s390/kvm/pv.c
> > @@ -39,6 +39,7 @@ struct pv_vm_to_be_destroyed {
> >   	u64 handle;
> >   	void *stor_var;
> >   	unsigned long stor_base;
> > +	bool small;
> >   };
> >   
> >   static void kvm_s390_clear_pv_state(struct kvm *kvm)
> > @@ -318,7 +319,11 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
> >   	if (!priv)
> >   		return -ENOMEM;
> >   
> > -	if (is_destroy_fast_available()) {
> > +	if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) {
> > +		/* No need to do things asynchronously for VMs under 2GB */
> > +		res = kvm_s390_pv_deinit_vm(kvm, rc, rrc);
> > +		priv->small = true;
> > +	} else if (is_destroy_fast_available()) {
> >   		res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc);
> >   	} else {
> >   		priv->stor_var = kvm->arch.pv.stor_var;
> > @@ -335,7 +340,8 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
> >   		return res;
> >   	}
> >   
> > -	kvm_s390_destroy_lower_2g(kvm);
> > +	if (!priv->small)
> > +		kvm_s390_destroy_lower_2g(kvm);
> >   	kvm_s390_clear_pv_state(kvm);
> >   	kvm->arch.pv.set_aside = priv;
> >   
> > @@ -418,7 +424,10 @@ int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc)
> >   
> >   	/* If a previous protected VM was set aside, put it in the need_cleanup list */
> >   	if (kvm->arch.pv.set_aside) {
> > -		list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
> > +		if (((struct pv_vm_to_be_destroyed *)kvm->arch.pv.set_aside)->small)  
> why do we need a cast here?

it's a void *

but it's not important, after talking with Marc I have found a much
simpler solution (will post soon)

> 
> > +			kfree(kvm->arch.pv.set_aside);
> > +		else
> > +			list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
> >   		kvm->arch.pv.set_aside = NULL;
> >   	}
> >     
> 
> With the comment added that Marc asked for
> 
> Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
> 


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2023-04-21  8:18 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-04-20 16:01 [PATCH v1 1/1] KVM: s390: pv: fix asynchronous teardown for small VMs Claudio Imbrenda
2023-04-20 16:15 ` Marc Hartmayer
2023-04-21  7:35   ` Claudio Imbrenda
2023-04-21  8:04 ` Janosch Frank
2023-04-21  8:17   ` Claudio Imbrenda
2023-04-21  8:07 ` Christian Borntraeger
2023-04-21  8:17   ` Claudio Imbrenda

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox