Re: [PATCH v2 1/2] KVM: nVMX: Fix VMX preemption timer migration

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Vitaly Kuznetsov <vkuznets@redhat.com>
To: Makarand Sonare <makarandsonare@google.com>, kvm@vger.kernel.org
Cc: pshier@google.com, jmattson@google.com
Subject: Re: [PATCH  v2 1/2] KVM: nVMX: Fix VMX preemption timer migration
Date: Wed, 20 May 2020 19:08:58 +0200	[thread overview]
Message-ID: <87v9kqsfdh.fsf@vitty.brq.redhat.com> (raw)
In-Reply-To: <20200519222238.213574-2-makarandsonare@google.com>

Makarand Sonare <makarandsonare@google.com> writes:

> From: Peter Shier <pshier@google.com>
>
> Add new field to hold preemption timer expiration deadline
> appended to struct kvm_vmx_nested_state_data. This is to prevent
> the first VM-Enter after migration from incorrectly restarting the timer
> with the full timer value instead of partially decayed timer value.
> KVM_SET_NESTED_STATE restarts timer using migrated state regardless
> of whether L1 sets VM_EXIT_SAVE_VMX_PREEMPTION_TIMER.
>
> Fixes: cf8b84f48a593 ("kvm: nVMX: Prepare for checkpointing L2 state")
>
> Signed-off-by: Peter Shier <pshier@google.com>
> Signed-off-by: Makarand Sonare <makarandsonare@google.com>
> Change-Id: I6446aba5a547afa667f0ef4620b1b76115bf3753
> ---
>  Documentation/virt/kvm/api.rst  |  4 +++
>  arch/x86/include/uapi/asm/kvm.h |  3 ++
>  arch/x86/kvm/vmx/nested.c       | 56 +++++++++++++++++++++++++++++----
>  arch/x86/kvm/vmx/vmx.h          |  1 +
>  arch/x86/kvm/x86.c              |  3 +-
>  include/uapi/linux/kvm.h        |  1 +
>  6 files changed, 61 insertions(+), 7 deletions(-)
>
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index d871dacb984e9..b410815772970 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -4326,6 +4326,9 @@ Errors:
>    #define KVM_STATE_NESTED_RUN_PENDING		0x00000002
>    #define KVM_STATE_NESTED_EVMCS		0x00000004
>
> +  /* Available with KVM_CAP_NESTED_STATE_PREEMPTION_TIMER */
> +  #define KVM_STATE_NESTED_PREEMPTION_TIMER	0x00000010
> +
>    #define KVM_STATE_NESTED_FORMAT_VMX		0
>    #define KVM_STATE_NESTED_FORMAT_SVM		1
>
> @@ -4346,6 +4349,7 @@ Errors:
>    struct kvm_vmx_nested_state_data {
>  	__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
>  	__u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
> +	__u64 preemption_timer_deadline;
>    };
>
>  This ioctl copies the vcpu's nested virtualization state from the kernel to
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index 3f3f780c8c650..13dca545554dc 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -391,6 +391,8 @@ struct kvm_sync_regs {
>  #define KVM_STATE_NESTED_RUN_PENDING	0x00000002
>  #define KVM_STATE_NESTED_EVMCS		0x00000004
>  #define KVM_STATE_NESTED_MTF_PENDING	0x00000008
> +/* Available with KVM_CAP_NESTED_STATE_PREEMPTION_TIMER */
> +#define KVM_STATE_NESTED_PREEMPTION_TIMER	0x00000010
>
>  #define KVM_STATE_NESTED_SMM_GUEST_MODE	0x00000001
>  #define KVM_STATE_NESTED_SMM_VMXON	0x00000002
> @@ -400,6 +402,7 @@ struct kvm_sync_regs {
>  struct kvm_vmx_nested_state_data {
>  	__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
>  	__u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
> +	__u64 preemption_timer_deadline;
>  };
>
>  struct kvm_vmx_nested_state_hdr {
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 51ebb60e1533a..318b753743902 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -2092,9 +2092,9 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
>  	return HRTIMER_NORESTART;
>  }
>
> -static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
> +static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
> +					u64 preemption_timeout)
>  {
> -	u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
>
>  	/*
> @@ -3353,8 +3353,20 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
>  	 * the timer.
>  	 */
>  	vmx->nested.preemption_timer_expired = false;
> -	if (nested_cpu_has_preemption_timer(vmcs12))
> -		vmx_start_preemption_timer(vcpu);
> +	if (nested_cpu_has_preemption_timer(vmcs12)) {
> +		u64 timer_value = 0;
> +		u64 l1_scaled_tsc_value = (kvm_read_l1_tsc(vcpu, rdtsc())
> +					   >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE);
> +
> +		if (from_vmentry) {
> +			timer_value = vmcs12->vmx_preemption_timer_value;
> +			vmx->nested.preemption_timer_deadline = timer_value +
> +								l1_scaled_tsc_value;
> +		} else if (l1_scaled_tsc_value <= vmx->nested.preemption_timer_deadline)
> +			timer_value = vmx->nested.preemption_timer_deadline -
> +				      l1_scaled_tsc_value;
> +		vmx_start_preemption_timer(vcpu, timer_value);
> +	}
>
>  	/*
>  	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
> @@ -3962,9 +3974,11 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>  		vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
>
>  	if (nested_cpu_has_preemption_timer(vmcs12) &&
> -	    vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
> +	    !vmx->nested.nested_run_pending) {
> +		if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
>  			vmcs12->vmx_preemption_timer_value =
>  				vmx_get_preemption_timer_value(vcpu);
> +	}
>
>  	/*
>  	 * In some cases (usually, nested EPT), L2 is allowed to change its
> @@ -5939,6 +5953,12 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
>
>  			if (vmx->nested.mtf_pending)
>  				kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
> +
> +			if (nested_cpu_has_preemption_timer(vmcs12)) {
> +				kvm_state.flags |=
> +					KVM_STATE_NESTED_PREEMPTION_TIMER;
> +				kvm_state.size += sizeof(user_vmx_nested_state->preemption_timer_deadline);
> +			}
>  		}
>  	}
>
> @@ -5970,6 +5990,9 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
>
>  	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
>  	BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
> +	BUILD_BUG_ON(sizeof(user_vmx_nested_state->preemption_timer_deadline)
> +		    != sizeof(vmx->nested.preemption_timer_deadline));
> +
>
>  	/*
>  	 * Copy over the full allocated size of vmcs12 rather than just the size
> @@ -5985,6 +6008,12 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
>  			return -EFAULT;
>  	}
>
> +	if (kvm_state.flags & KVM_STATE_NESTED_PREEMPTION_TIMER) {
> +		if (put_user(vmx->nested.preemption_timer_deadline,
> +			     &user_vmx_nested_state->preemption_timer_deadline))
> +			return -EFAULT;
> +	}
> +
>  out:
>  	return kvm_state.size;
>  }
> @@ -6056,7 +6085,8 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
>  	 */
>  	if (is_smm(vcpu) ?
>  		(kvm_state->flags &
> -		 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
> +		 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING |
> +		  KVM_STATE_NESTED_PREEMPTION_TIMER))
>  		: kvm_state->hdr.vmx.smm.flags)
>  		return -EINVAL;
>
> @@ -6146,6 +6176,20 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
>  			goto error_guest_mode;
>  	}
>
> +	if (kvm_state->flags & KVM_STATE_NESTED_PREEMPTION_TIMER) {
> +
> +		if (kvm_state->size <
> +		    sizeof(*kvm_state) +
> +		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(vmx->nested.preemption_timer_deadline))
> +			goto error_guest_mode;

I'm not sure this check is entirely correct.

kvm_state->size can now be:

- (1) sizeof(*kvm_state) + sizeof(user_vmx_nested_state->vmcs12) when no
  shadow vmcs/preemtion timer flags
- (1) + sizeof(*shadow_vmcs12)) when shadow vmcs and no preemtion
  timer.
- (1) + sizeof(vmx->nested.preemption_timer_deadline) when preemtion
  timer and no shadow vmcs
- (1) + sizeof(*shadow_vmcs12)) +
  sizeof(vmx->nested.preemption_timer_deadline) when both shadow vmcs
  and preemption timer.

(I don't even want to think what happens when we add something else here
:-))

I.e. your check will pass when e.g. the total size is '(1) +
sizeof(*shadow_vmcs12))' but it is wrong, you need more!

I suggest we add a logic to calculate the required size:

  required_size = sizeof(*kvm_state) +  sizeof(user_vmx_nested_state->vmcs12)
  if (shadow_vmcs_present)
      required_size += sizeof(*shadow_vmcs12);
  if (preemption_timer_present)
      required_size += sizeof(vmx->nested.preemption_timer_deadline);
  ...

But ...

> +
> +		if (get_user(vmx->nested.preemption_timer_deadline,
> +			     &user_vmx_nested_state->preemption_timer_deadline)) {

... tt also seems that we expect user_vmx_nested_state to always have
all fields, e.g. here the offset of 'preemption_timer_deadline' is
static, we always expect it to be after shadow vmcs. I think we need a
way to calculate the offset dynamically and not require everything to be
present.

> +			ret = -EFAULT;
> +			goto error_guest_mode;
> +		}
> +	}
> +
>  	if (nested_vmx_check_controls(vcpu, vmcs12) ||
>  	    nested_vmx_check_host_state(vcpu, vmcs12) ||
>  	    nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
> diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
> index 298ddef79d009..db697400755fb 100644
> --- a/arch/x86/kvm/vmx/vmx.h
> +++ b/arch/x86/kvm/vmx/vmx.h
> @@ -169,6 +169,7 @@ struct nested_vmx {
>  	u16 posted_intr_nv;
>
>  	struct hrtimer preemption_timer;
> +	u64 preemption_timer_deadline;
>  	bool preemption_timer_expired;
>
>  	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 471fccf7f8501..ba9e62ffbb4cd 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3418,6 +3418,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  	case KVM_CAP_MSR_PLATFORM_INFO:
>  	case KVM_CAP_EXCEPTION_PAYLOAD:
>  	case KVM_CAP_SET_GUEST_DEBUG:
> +	case KVM_CAP_NESTED_STATE_PREEMPTION_TIMER:
>  		r = 1;
>  		break;
>  	case KVM_CAP_SYNC_REGS:
> @@ -4626,7 +4627,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>
>  		if (kvm_state.flags &
>  		    ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
> -		      | KVM_STATE_NESTED_EVMCS))
> +		      | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_PREEMPTION_TIMER))
>  			break;
>
>  		/* nested_run_pending implies guest_mode.  */
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index ac9eba0289d1b..0868dce12a715 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1018,6 +1018,7 @@ struct kvm_ppc_resize_hpt {
>  #define KVM_CAP_S390_PROTECTED 180
>  #define KVM_CAP_PPC_SECURE_GUEST 181
>  #define KVM_CAP_HALT_POLL 182
> +#define KVM_CAP_NESTED_STATE_PREEMPTION_TIMER 183
>
>  #ifdef KVM_CAP_IRQ_ROUTING
>
> --
> 2.26.2.761.g0e0b3e54be-goog
>

-- 
Vitaly

next prev parent reply	other threads:[~2020-05-20 17:09 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-05-19 22:22 [PATCH v2 0/2] Fix VMX preemption timer migration Makarand Sonare
2020-05-19 22:22 ` [PATCH v2 1/2] KVM: nVMX: " Makarand Sonare
2020-05-20 17:08   ` Vitaly Kuznetsov [this message]
2020-05-20 18:53     ` Makarand Sonare
2020-05-20 20:53       ` Paolo Bonzini
2020-05-21  8:47         ` Vitaly Kuznetsov
2020-05-21  9:06           ` Paolo Bonzini
2020-05-19 22:22 ` [PATCH v2 2/2] KVM: selftests: VMX preemption timer migration test Makarand Sonare

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87v9kqsfdh.fsf@vitty.brq.redhat.com \
    --to=vkuznets@redhat.com \
    --cc=jmattson@google.com \
    --cc=kvm@vger.kernel.org \
    --cc=makarandsonare@google.com \
    --cc=pshier@google.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.