public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Zhi Wang <zhi.wang.linux@gmail.com>
To: Sagi Shahar <sagis@google.com>
Cc: kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
	x86@kernel.org, Sean Christopherson <seanjc@google.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	Isaku Yamahata <isaku.yamahata@intel.com>,
	Erdem Aktas <erdemaktas@google.com>,
	David Matlack <dmatlack@google.com>,
	Kai Huang <kai.huang@intel.com>,
	Chao Peng <chao.p.peng@linux.intel.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Ingo Molnar <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
	Dave Hansen <dave.hansen@linux.intel.com>
Subject: Re: [RFC PATCH 5/5] KVM: TDX: Add core logic for TDX intra-host migration
Date: Wed, 19 Apr 2023 10:08:01 +0300	[thread overview]
Message-ID: <20230419100801.00007d20.zhi.wang.linux@gmail.com> (raw)
In-Reply-To: <20230407201921.2703758-6-sagis@google.com>

On Fri,  7 Apr 2023 20:19:21 +0000
Sagi Shahar <sagis@google.com> wrote:

> Adds the core logic for transferring state between source and
> destination TDs during intra-host migration.
> 
> Signed-off-by: Sagi Shahar <sagis@google.com>
> ---
>  arch/x86/kvm/vmx/tdx.c | 191 ++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 190 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index 0999a6d827c99..05b164a91437b 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -2834,9 +2834,198 @@ static __always_inline bool tdx_guest(struct kvm *kvm)
>  	return tdx_kvm->finalized;
>  }
>  
> +#define for_each_memslot_pair(memslots_1, memslots_2, memslot_iter_1, \
> +			      memslot_iter_2)                         \
> +	for (memslot_iter_1 = rb_first(&memslots_1->gfn_tree),        \
> +	    memslot_iter_2 = rb_first(&memslots_2->gfn_tree);         \
> +	     memslot_iter_1 && memslot_iter_2;                        \
> +	     memslot_iter_1 = rb_next(memslot_iter_1),                \
> +	    memslot_iter_2 = rb_next(memslot_iter_2))
> +

If it is a pair, using suffix *_a, *_b would be better.

>  static int tdx_migrate_from(struct kvm *dst, struct kvm *src)
>  {
> -	return -EINVAL;
> +	struct rb_node *src_memslot_iter, *dst_memslot_iter;
> +	struct vcpu_tdx *dst_tdx_vcpu, *src_tdx_vcpu;
> +	struct kvm_memslots *src_slots, *dst_slots;
> +	struct kvm_vcpu *dst_vcpu, *src_vcpu;
> +	struct kvm_tdx *src_tdx, *dst_tdx;
> +	unsigned long i, j;
> +	int ret;
> +
> +	src_tdx = to_kvm_tdx(src);
> +	dst_tdx = to_kvm_tdx(dst);
> +
> +	src_slots = __kvm_memslots(src, 0);
> +	dst_slots = __kvm_memslots(dst, 0);
> +
> +	ret = -EINVAL;
> +
> +	if (!src_tdx->finalized) {
> +		pr_warn("Cannot migrate from a non finalized VM\n");
> +		goto abort;
> +	}
> +

Let's use the existing inline function is_td_finalized().

> +	// Traverse both memslots in gfn order and compare them
> +	for_each_memslot_pair(src_slots, dst_slots, src_memslot_iter, dst_memslot_iter) {
> +		struct kvm_memory_slot *src_slot, *dst_slot;
> +
> +		src_slot =
> +			container_of(src_memslot_iter, struct kvm_memory_slot,
> +				     gfn_node[src_slots->node_idx]);
> +		dst_slot =
> +			container_of(src_memslot_iter, struct kvm_memory_slot,
> +				     gfn_node[dst_slots->node_idx]);
> +
                                       ^dst_memslot_iter? So does the other one below.
> +		if (src_slot->base_gfn != dst_slot->base_gfn ||
> +		    src_slot->npages != dst_slot->npages) {
> +			pr_warn("Cannot migrate between VMs with different memory slots configurations\n");
> +			goto abort;
> +		}
> +
> +		if (src_slot->flags != dst_slot->flags) {
> +			pr_warn("Cannot migrate between VMs with different memory slots configurations\n");
> +			goto abort;
> +		}
> +
> +		if (src_slot->flags & KVM_MEM_PRIVATE) {
> +			if (src_slot->restrictedmem.file->f_inode->i_ino !=
> +			    dst_slot->restrictedmem.file->f_inode->i_ino) {
> +				pr_warn("Private memslots points to different restricted files\n");
> +				goto abort;
> +			}
> +
> +			if (src_slot->restrictedmem.index != dst_slot->restrictedmem.index) {
> +				pr_warn("Private memslots points to the restricted file at different offsets\n");
> +				goto abort;
> +			}
> +		}
> +	}
> +
> +	if (src_memslot_iter || dst_memslot_iter) {
> +		pr_warn("Cannot migrate between VMs with different memory slots configurations\n");
> +		goto abort;
> +	}
> +
> +	dst_tdx->hkid = src_tdx->hkid;
> +	dst_tdx->tdr_pa = src_tdx->tdr_pa;
> +
> +	dst_tdx->tdcs_pa = kcalloc(tdx_info.nr_tdcs_pages, sizeof(*dst_tdx->tdcs_pa),
> +			  GFP_KERNEL_ACCOUNT | __GFP_ZERO);
> +	if (!dst_tdx->tdcs_pa) {
> +		ret = -ENOMEM;
> +		goto late_abort;
> +	}
> +	memcpy(dst_tdx->tdcs_pa, src_tdx->tdcs_pa,
> +	       tdx_info.nr_tdcs_pages * sizeof(*dst_tdx->tdcs_pa));
> +
> +	dst_tdx->tsc_offset = src_tdx->tsc_offset;
> +	dst_tdx->attributes = src_tdx->attributes;
> +	dst_tdx->xfam = src_tdx->xfam;
> +	dst_tdx->kvm.arch.gfn_shared_mask = src_tdx->kvm.arch.gfn_shared_mask;
> +
> +	kvm_for_each_vcpu(i, src_vcpu, src)
> +		tdx_flush_vp_on_cpu(src_vcpu);
> +
> +	/* Copy per-vCPU state */
> +	kvm_for_each_vcpu(i, src_vcpu, src) {
> +		src_tdx_vcpu = to_tdx(src_vcpu);
> +		dst_vcpu = kvm_get_vcpu(dst, i);
> +		dst_tdx_vcpu = to_tdx(dst_vcpu);
> +
> +		vcpu_load(dst_vcpu);
> +
> +		memcpy(dst_vcpu->arch.regs, src_vcpu->arch.regs,
> +		       NR_VCPU_REGS * sizeof(src_vcpu->arch.regs[0]));
> +		dst_vcpu->arch.regs_avail = src_vcpu->arch.regs_avail;
> +		dst_vcpu->arch.regs_dirty = src_vcpu->arch.regs_dirty;
> +
> +		dst_vcpu->arch.tsc_offset = dst_tdx->tsc_offset;
> +
> +		dst_tdx_vcpu->interrupt_disabled_hlt = src_tdx_vcpu->interrupt_disabled_hlt;
> +		dst_tdx_vcpu->buggy_hlt_workaround = src_tdx_vcpu->buggy_hlt_workaround;
> +
> +		dst_tdx_vcpu->tdvpr_pa = src_tdx_vcpu->tdvpr_pa;
> +		dst_tdx_vcpu->tdvpx_pa = kcalloc(tdx_info.nr_tdvpx_pages,
> +						 sizeof(*dst_tdx_vcpu->tdvpx_pa),
> +						 GFP_KERNEL_ACCOUNT);
> +		if (!dst_tdx_vcpu->tdvpx_pa) {
> +			ret = -ENOMEM;
> +			vcpu_put(dst_vcpu);
> +			goto late_abort;
> +		}
> +		memcpy(dst_tdx_vcpu->tdvpx_pa, src_tdx_vcpu->tdvpx_pa,
> +		       tdx_info.nr_tdvpx_pages * sizeof(*dst_tdx_vcpu->tdvpx_pa));
> +
> +		td_vmcs_write64(dst_tdx_vcpu, POSTED_INTR_DESC_ADDR, __pa(&dst_tdx_vcpu->pi_desc));
> +
> +		/* Copy private EPT tables */
> +		if (kvm_mmu_move_private_pages_from(dst_vcpu, src_vcpu)) {
> +			ret = -EINVAL;
> +			vcpu_put(dst_vcpu);
> +			goto late_abort;
> +		}
> +
> +		for (j = 0; j < tdx_info.nr_tdvpx_pages; j++)
> +			src_tdx_vcpu->tdvpx_pa[j] = 0;
> +
> +		src_tdx_vcpu->tdvpr_pa = 0;
> +
> +		vcpu_put(dst_vcpu);
> +	}
> +
> +	for_each_memslot_pair(src_slots, dst_slots, src_memslot_iter,
> +			      dst_memslot_iter) {
> +		struct kvm_memory_slot *src_slot, *dst_slot;
> +
> +		src_slot = container_of(src_memslot_iter,
> +					struct kvm_memory_slot,
> +					gfn_node[src_slots->node_idx]);
> +		dst_slot = container_of(src_memslot_iter,
> +					struct kvm_memory_slot,
> +					gfn_node[dst_slots->node_idx]);
> +
> +		for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
> +			unsigned long ugfn;
> +			int level = i + 1;
> +
> +			/*
> +			 * If the gfn and userspace address are not aligned wrt each other, then
> +			 * large page support should already be disabled at this level.
> +			 */
> +			ugfn = dst_slot->userspace_addr >> PAGE_SHIFT;
> +			if ((dst_slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1))
> +				continue;
> +
> +			dst_slot->arch.lpage_info[i - 1] =
> +				src_slot->arch.lpage_info[i - 1];
> +			src_slot->arch.lpage_info[i - 1] = NULL;
> +		}
> +	}
> +
> +	dst->mem_attr_array.xa_head = src->mem_attr_array.xa_head;
> +	src->mem_attr_array.xa_head = NULL;
> +
> +	dst_tdx->finalized = true;
> +
> +	/* Clear source VM to avoid freeing the hkid and pages on VM put */
> +	src_tdx->hkid = -1;
> +	src_tdx->tdr_pa = 0;
> +	for (i = 0; i < tdx_info.nr_tdcs_pages; i++)
> +		src_tdx->tdcs_pa[i] = 0;
> +
> +	return 0;
> +
> +late_abort:
> +	/* If we aborted after the state transfer already started, the src VM
> +	 * is no longer valid.
> +	 */
> +	kvm_vm_dead(src);
> +
> +abort:
> +	dst_tdx->hkid = -1;
> +	dst_tdx->tdr_pa = 0;
> +
> +	return ret;
>  }
>
This function is quite long. It would be better to split some parts into
separate functions.
>  int tdx_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd)


  reply	other threads:[~2023-04-19  7:08 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-04-07 20:19 [RFC PATCH 0/5] Add TDX intra host migration support Sagi Shahar
2023-04-07 20:19 ` [RFC PATCH 1/5] KVM: Split tdp_mmu_pages to private and shared lists Sagi Shahar
2023-04-17 19:36   ` Zhi Wang
2023-04-18 17:14     ` Sagi Shahar
2023-04-07 20:19 ` [RFC PATCH 2/5] KVM: SEV: Refactor common code out of sev_vm_move_enc_context_from Sagi Shahar
2023-04-17 19:45   ` Zhi Wang
2023-04-18 17:17     ` Sagi Shahar
2023-04-07 20:19 ` [RFC PATCH 3/5] KVM: TDX: Add base implementation for tdx_vm_move_enc_context_from Sagi Shahar
2023-04-18  6:28   ` Zhi Wang
2023-04-18 17:47     ` Sagi Shahar
2023-04-19  6:34       ` Zhi Wang
2023-04-27 21:25         ` Sagi Shahar
2023-04-28 16:08           ` Zhi Wang
2023-04-18 12:11   ` Zhi Wang
2023-04-18 17:51     ` Sagi Shahar
2023-04-07 20:19 ` [RFC PATCH 4/5] KVM: TDX: Implement moving private pages between 2 TDs Sagi Shahar
2023-06-02  7:00   ` Isaku Yamahata
2023-04-07 20:19 ` [RFC PATCH 5/5] KVM: TDX: Add core logic for TDX intra-host migration Sagi Shahar
2023-04-19  7:08   ` Zhi Wang [this message]
2023-04-14  7:03 ` [RFC PATCH 0/5] Add TDX intra host migration support Zhi Wang
2023-04-14 19:09   ` Sagi Shahar

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230419100801.00007d20.zhi.wang.linux@gmail.com \
    --to=zhi.wang.linux@gmail.com \
    --cc=bp@alien8.de \
    --cc=chao.p.peng@linux.intel.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=dmatlack@google.com \
    --cc=erdemaktas@google.com \
    --cc=isaku.yamahata@intel.com \
    --cc=kai.huang@intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=sagis@google.com \
    --cc=seanjc@google.com \
    --cc=tglx@linutronix.de \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox