public inbox for linux-s390@vger.kernel.org
 help / color / mirror / Atom feed
From: Janosch Frank <frankja@linux.ibm.com>
To: Claudio Imbrenda <imbrenda@linux.ibm.com>, kvm@vger.kernel.org
Cc: borntraeger@de.ibm.com, nrb@linux.ibm.com, nsg@linux.ibm.com,
	mhartmay@linux.ibm.com,
	kvm390-list@tuxmaker.boeblingen.de.ibm.com,
	linux-s390@vger.kernel.org
Subject: Re: [PATCH v1 1/1] KVM: s390: pv: fix asynchronous teardown for small VMs
Date: Fri, 21 Apr 2023 10:04:50 +0200	[thread overview]
Message-ID: <a1ab46b3-da2b-f815-be15-1294f95d598f@linux.ibm.com> (raw)
In-Reply-To: <20230420160149.51728-1-imbrenda@linux.ibm.com>

On 4/20/23 18:01, Claudio Imbrenda wrote:
> On machines without the Destroy Secure Configuration Fast UVC, the
> topmost level of page tables is set aside and freed asynchronously
> as last step of the asynchronous teardown.
> 
> Each gmap has a host_to_guest radix tree mapping host (userspace)
> addresses (with 1M granularity) to gmap segment table entries (pmds).
> 
> If a guest is smaller than 2GB, the topmost level of page tables is the
> segment table (i.e. there are only 2 levels). Replacing it means that
> the pointers in the host_to_guest mapping would become stale and cause
> all kinds of nasty issues.

Ouff

> 
> This patch fixes the issue by synchronously destroying all guests with
> only 2 levels of page tables in kvm_s390_pv_set_aside. This will
> speed up the process and avoid the issue altogether.
> 
> Update s390_replace_asce so it refuses to replace segment type ASCEs.
> 
> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
> Fixes: fb491d5500a7 ("KVM: s390: pv: asynchronous destroy for reboot")
> ---
>   arch/s390/kvm/pv.c  | 35 ++++++++++++++++++++---------------
>   arch/s390/mm/gmap.c |  7 +++++++
>   2 files changed, 27 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c
> index e032ebbf51b9..ceb8cb628d62 100644
> --- a/arch/s390/kvm/pv.c
> +++ b/arch/s390/kvm/pv.c
> @@ -39,6 +39,7 @@ struct pv_vm_to_be_destroyed {
>   	u64 handle;
>   	void *stor_var;
>   	unsigned long stor_base;
> +	bool small;

I second Marc's complaints :)

Is there really no way that the gmap can be manipulated to cause
use-after-free problems by adding/removing memory? E.g. by changing to >2GB
of memory before your checks and then to <2GB after the checks?

>   };
>   
>   static void kvm_s390_clear_pv_state(struct kvm *kvm)
> @@ -318,7 +319,11 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
>   	if (!priv)
>   		return -ENOMEM;
>   
> -	if (is_destroy_fast_available()) {
> +	if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) {

How about adding this to gmap.h?

bool gmap_asce_non_replaceable(struct gmap *gmap)
{
	return (gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT;
}

> +		/* No need to do things asynchronously for VMs under 2GB */
> +		res = kvm_s390_pv_deinit_vm(kvm, rc, rrc);
> +		priv->small = true;
> +	} else if (is_destroy_fast_available()) {
>   		res = kvm_s390_pv_deinit_vm_fast(kvm, rc, rrc);
>   	} else {
>   		priv->stor_var = kvm->arch.pv.stor_var;
> @@ -335,7 +340,8 @@ int kvm_s390_pv_set_aside(struct kvm *kvm, u16 *rc, u16 *rrc)
>   		return res;
>   	}
>   
> -	kvm_s390_destroy_lower_2g(kvm);
> +	if (!priv->small)
> +		kvm_s390_destroy_lower_2g(kvm);
>   	kvm_s390_clear_pv_state(kvm);
>   	kvm->arch.pv.set_aside = priv;
>   
> @@ -418,7 +424,10 @@ int kvm_s390_pv_deinit_cleanup_all(struct kvm *kvm, u16 *rc, u16 *rrc)
>   
>   	/* If a previous protected VM was set aside, put it in the need_cleanup list */
>   	if (kvm->arch.pv.set_aside) {
> -		list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
> +		if (((struct pv_vm_to_be_destroyed *)kvm->arch.pv.set_aside)->small)

cur = (struct pv_vm_to_be_destroyed *)kvm->arch.pv.set_aside;

if (cur->small)
[...]


> +			kfree(kvm->arch.pv.set_aside);
> +		else
> +			list_add(kvm->arch.pv.set_aside, &kvm->arch.pv.need_cleanup);
>   		kvm->arch.pv.set_aside = NULL;
>   	}
>   
> @@ -485,26 +494,22 @@ int kvm_s390_pv_deinit_aside_vm(struct kvm *kvm, u16 *rc, u16 *rrc)
>   	if (!p)
>   		return -EINVAL;
>   
> -	/* When a fatal signal is received, stop immediately */
> -	if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX))
> +	if (p->small)
>   		goto done;
> -	if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
> -		ret = -EIO;
> -	kfree(p);
> -	p = NULL;
> -done:
> -	/*
> -	 * p is not NULL if we aborted because of a fatal signal, in which
> -	 * case queue the leftover for later cleanup.
> -	 */
> -	if (p) {
> +	/* When a fatal signal is received, stop immediately */
> +	if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) {
>   		mutex_lock(&kvm->lock);
>   		list_add(&p->list, &kvm->arch.pv.need_cleanup);
>   		mutex_unlock(&kvm->lock);
>   		/* Did not finish, but pretend things went well */
>   		*rc = UVC_RC_EXECUTED;
>   		*rrc = 42;
> +		return 0;
>   	}
> +	if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc))
> +		ret = -EIO;
> +done:
> +	kfree(p);
>   	return ret;
>   }
>   
> diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
> index 5a716bdcba05..2267cf9819b2 100644
> --- a/arch/s390/mm/gmap.c
> +++ b/arch/s390/mm/gmap.c
> @@ -2833,6 +2833,9 @@ EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
>    * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
>    * @gmap: the gmap whose ASCE needs to be replaced
>    *
> + * If the ASCE is a SEGMENT type then this function will return -EINVAL,
> + * otherwise the pointers in the host_to_guest radix tree will keep pointing
> + * to the wrong pages, causing use-after-free and memory corruption.
>    * If the allocation of the new top level page table fails, the ASCE is not
>    * replaced.
>    * In any case, the old ASCE is always removed from the gmap CRST list.
> @@ -2847,6 +2850,10 @@ int s390_replace_asce(struct gmap *gmap)
>   
>   	s390_unlist_old_asce(gmap);
>   
> +	/* Replacing segment type ASCEs would cause serious issues */
> +	if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
> +		return -EINVAL;
> +
>   	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
>   	if (!page)
>   		return -ENOMEM;


  parent reply	other threads:[~2023-04-21  8:05 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-04-20 16:01 [PATCH v1 1/1] KVM: s390: pv: fix asynchronous teardown for small VMs Claudio Imbrenda
2023-04-20 16:15 ` Marc Hartmayer
2023-04-21  7:35   ` Claudio Imbrenda
2023-04-21  8:04 ` Janosch Frank [this message]
2023-04-21  8:17   ` Claudio Imbrenda
2023-04-21  8:07 ` Christian Borntraeger
2023-04-21  8:17   ` Claudio Imbrenda

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=a1ab46b3-da2b-f815-be15-1294f95d598f@linux.ibm.com \
    --to=frankja@linux.ibm.com \
    --cc=borntraeger@de.ibm.com \
    --cc=imbrenda@linux.ibm.com \
    --cc=kvm390-list@tuxmaker.boeblingen.de.ibm.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-s390@vger.kernel.org \
    --cc=mhartmay@linux.ibm.com \
    --cc=nrb@linux.ibm.com \
    --cc=nsg@linux.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox