* [PATCH 3/5] live migration support for VM dirty log management
@ 2014-04-17 1:34 Mario Smarduch
From: Mario Smarduch @ 2014-04-17 1:34 UTC (permalink / raw)
To: kvmarm@lists.cs.columbia.edu, Marc Zyngier,
christoffer.dall@linaro.org
Cc: kvm@vger.kernel.org, 이정석,
정성진
Add support for dirty bitmap management. The intent was to make it generic, but the
function does a couple of things differently than the x86 version.
Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
---
arch/arm/include/asm/kvm_host.h | 3 ++
arch/arm/kvm/arm.c | 71 ++++++++++++++++++++++++++++++++++++++-
arch/arm/kvm/mmu.c | 53 +++++++++++++++++++++++++++++
3 files changed, 126 insertions(+), 1 deletion(-)
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 7ac1fdc..16ed4e4 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -230,5 +230,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
void kvm_tlb_flush_vm(struct kvm *kvm);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
+void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask);
#endif /* __ARM_KVM_HOST_H__ */
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 7714cc6..7882343 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -785,9 +785,78 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
}
+
+/**
+ * kvm_vm_ioctl_get_dirty_log - retrieves the log of dirty pages for a memslot.
+ * It is called iteratively during migration to retrieve pages written since
+ * the last call. In the process it write protects ptes that are dirty for the
+ * next time, holding the mmu_lock while write protecting dirty pages.
+ *
+ * @kvm: The KVM pointer
+ * @log: Bitmap of dirty pages to return.
+ */
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
{
- return -EINVAL;
+ int r;
+ struct kvm_memory_slot *memslot;
+ unsigned long n, i;
+ unsigned long *dirty_bitmap;
+ unsigned long *dirty_bitmap_buffer;
+ bool is_dirty = false;
+ gfn_t offset;
+
+ mutex_lock(&kvm->slots_lock);
+ r = -EINVAL;
+
+ /* Returning an error code will cause migration to abort; this happens
+ * when the initial write protection of the VM to manage dirty pages fails
+ */
+ if (kvm->arch.migration_in_progress == -1)
+ goto out;
+
+ if (log->slot >= KVM_USER_MEM_SLOTS)
+ goto out;
+
+ memslot = id_to_memslot(kvm->memslots, log->slot);
+ dirty_bitmap = memslot->dirty_bitmap;
+
+ r = -ENOENT;
+ if (!dirty_bitmap)
+ goto out;
+
+ n = kvm_dirty_bitmap_bytes(memslot);
+ dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
+ memset(dirty_bitmap_buffer, 0, n);
+
+ spin_lock(&kvm->mmu_lock);
+ for (i = 0; i < n / sizeof(long); i++) {
+ unsigned long mask;
+
+ if (!dirty_bitmap[i])
+ continue;
+
+ is_dirty = true;
+ offset = i * BITS_PER_LONG;
+ kvm_mmu_write_protect_pt_masked(kvm, memslot, offset,
+ dirty_bitmap[i]);
+ mask = dirty_bitmap[i];
+ dirty_bitmap_buffer[i] = mask;
+ dirty_bitmap[i] = 0;
+ }
+
+ if (is_dirty)
+ kvm_tlb_flush_vm(kvm);
+
+ spin_unlock(&kvm->mmu_lock);
+ r = -EFAULT;
+
+ if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
+ goto out;
+
+ r = 0;
+out:
+ mutex_unlock(&kvm->slots_lock);
+ return r;
}
static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index b85ab56..47bec1c 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -773,6 +773,59 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
spin_unlock(&kvm->mmu_lock);
}
+
+/**
+ * kvm_mmu_write_protect_pt_masked - after the migration thread write protects
+ * the entire VM address space, iterative calls are made to get dirty pages
+ * as the VM pages are being migrated. New dirty pages may be a subset
+ * of the initially WPed VM or new writes faulted in. Here we write protect new
+ * dirty pages again in preparation for the next dirty log read. This function
+ * is called as a result of the KVM_GET_DIRTY_LOG ioctl, to determine what
+ * pages need to be migrated.
+ * 'kvm->mmu_lock' must be held to protect against concurrent modification
+ * of page tables (2nd stage fault, mmu modifiers, device writes)
+ *
+ * @kvm:        The KVM pointer
+ * @slot:       The memory slot the dirty log is retrieved for
+ * @gfn_offset: The gfn offset in the memory slot
+ * @mask:       The mask of dirty pages at offset 'gfn_offset' in this memory
+ *              slot to be write protected
+ */
+
+void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ phys_addr_t ipa;
+ pgd_t *pgdp = kvm->arch.pgd, *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte, new_pte;
+
+ /* walk set bits in the mask and write protect corresponding pages */
+ while (mask) {
+ ipa = (slot->base_gfn + gfn_offset + __ffs(mask)) << PAGE_SHIFT;
+ pgd = pgdp + pgd_index(ipa);
+ if (!pgd_present(*pgd))
+ goto update_mask;
+ pud = pud_offset(pgd, ipa);
+ if (!pud_present(*pud))
+ goto update_mask;
+ pmd = pmd_offset(pud, ipa);
+ if (!pmd_present(*pmd))
+ goto update_mask;
+ pte = pte_offset_kernel(pmd, ipa);
+ if (!pte_present(*pte))
+ goto update_mask;
+ if ((*pte & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY)
+ goto update_mask;
+ new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2);
+ *pte = new_pte;
+update_mask:
+ mask &= mask - 1;
+ }
+}
+
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_memory_slot *memslot,
unsigned long fault_status)
--
1.7.9.5
* Re: [PATCH 3/5] live migration support for VM dirty log management
@ 2014-04-17 8:00 ` Marc Zyngier
From: Marc Zyngier @ 2014-04-17 8:00 UTC (permalink / raw)
To: Mario Smarduch
Cc: kvmarm@lists.cs.columbia.edu, christoffer.dall@linaro.org,
kvm@vger.kernel.org, 이정석,
정성진
On Thu, Apr 17 2014 at 2:34:19 am BST, Mario Smarduch <m.smarduch@samsung.com> wrote:
> Add support for dirty bitmap management. The intent was to make it generic, but the
> function does a couple of things differently than the x86 version.
>
>
> Signed-off-by: Mario Smarduch <m.smarduch@samsung.com>
> ---
> arch/arm/include/asm/kvm_host.h | 3 ++
> arch/arm/kvm/arm.c | 71 ++++++++++++++++++++++++++++++++++++++-
> arch/arm/kvm/mmu.c | 53 +++++++++++++++++++++++++++++
> 3 files changed, 126 insertions(+), 1 deletion(-)
>
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index 7ac1fdc..16ed4e4 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -230,5 +230,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
>
> void kvm_tlb_flush_vm(struct kvm *kvm);
> void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
> +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
> + struct kvm_memory_slot *slot,
> + gfn_t gfn_offset, unsigned long mask);
>
> #endif /* __ARM_KVM_HOST_H__ */
> diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
> index 7714cc6..7882343 100644
> --- a/arch/arm/kvm/arm.c
> +++ b/arch/arm/kvm/arm.c
> @@ -785,9 +785,78 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> }
> }
>
> +
> +/**
> + * kvm_vm_ioctl_get_dirty_log - retrieves the log of dirty pages for a memslot.
> + * It is called iteratively during migration to retrieve pages written since
> + * the last call. In the process it write protects ptes that are dirty for the
> + * next time, holding the mmu_lock while write protecting dirty pages.
> + *
> + * @kvm: The KVM pointer
> + * @log: Bitmap of dirty pages to return.
> + */
So let's play the difference game with x86:
> int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
> {
> - return -EINVAL;
> + int r;
> + struct kvm_memory_slot *memslot;
> + unsigned long n, i;
> + unsigned long *dirty_bitmap;
> + unsigned long *dirty_bitmap_buffer;
> + bool is_dirty = false;
> + gfn_t offset;
You've moved offset out of the loop.
> + mutex_lock(&kvm->slots_lock);
> + r = -EINVAL;
> +
> + /* Returning an error code will cause migration to abort; this happens
> + * when the initial write protection of the VM to manage dirty pages fails
> + */
> + if (kvm->arch.migration_in_progress == -1)
> + goto out;
You've added this test. How does x86 cope with the same case?
> + if (log->slot >= KVM_USER_MEM_SLOTS)
> + goto out;
> +
> + memslot = id_to_memslot(kvm->memslots, log->slot);
> + dirty_bitmap = memslot->dirty_bitmap;
> +
> + r = -ENOENT;
> + if (!dirty_bitmap)
> + goto out;
> +
> + n = kvm_dirty_bitmap_bytes(memslot);
> + dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
> + memset(dirty_bitmap_buffer, 0, n);
> +
> + spin_lock(&kvm->mmu_lock);
> + for (i = 0; i < n / sizeof(long); i++) {
> + unsigned long mask;
> +
> + if (!dirty_bitmap[i])
> + continue;
> +
> + is_dirty = true;
> + offset = i * BITS_PER_LONG;
> + kvm_mmu_write_protect_pt_masked(kvm, memslot, offset,
> + dirty_bitmap[i]);
> + mask = dirty_bitmap[i];
> + dirty_bitmap_buffer[i] = mask;
> + dirty_bitmap[i] = 0;
You've expanded the xchg macro, and moved it around.
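For reference, the x86 loop body does roughly this (a sketch from memory, not a
verbatim copy of arch/x86/kvm/x86.c):

	for (i = 0; i < n / sizeof(long); i++) {
		unsigned long mask;
		gfn_t offset;

		if (!dirty_bitmap[i])
			continue;

		is_dirty = true;

		/* atomically snapshot-and-clear one word of the dirty bitmap */
		mask = xchg(&dirty_bitmap[i], 0);
		dirty_bitmap_buffer[i] = mask;

		offset = i * BITS_PER_LONG;
		kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
	}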
> + }
> +
> + if (is_dirty)
> + kvm_tlb_flush_vm(kvm);
This can be easily abstracted to be a kvm_flush_remote_tlbs on x86, and
a HW broadcast on ARM.
> +
> + spin_unlock(&kvm->mmu_lock);
> + r = -EFAULT;
> +
> + if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
> + goto out;
> +
> + r = 0;
> +out:
> + mutex_unlock(&kvm->slots_lock);
> + return r;
> }
So only two small differences that can be easily factored out. I stand
by my initial analysis, and ask again that you move this function to the
generic code as a weak symbol. There isn't any value in duplicating it.
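The idiom I'm asking for is roughly this (a sketch only, not the final patch):
the shared copy lives in virt/kvm/kvm_main.c as a weak symbol, and an
architecture that genuinely needs to diverge can still provide a strong
definition with the same prototype, which the linker picks instead.

	/* virt/kvm/kvm_main.c -- shared, overridable implementation */
	int __weak kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
					      struct kvm_dirty_log *log)
	{
		/* ... the common body shown above ... */
	}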
> static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index b85ab56..47bec1c 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -773,6 +773,59 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
> spin_unlock(&kvm->mmu_lock);
> }
>
> +
> +/**
> + * kvm_mmu_write_protect_pt_masked - after the migration thread write protects
> + * the entire VM address space, iterative calls are made to get dirty pages
> + * as the VM pages are being migrated. New dirty pages may be a subset
> + * of the initially WPed VM or new writes faulted in. Here we write protect new
> + * dirty pages again in preparation for the next dirty log read. This function
> + * is called as a result of the KVM_GET_DIRTY_LOG ioctl, to determine what
> + * pages need to be migrated.
> + * 'kvm->mmu_lock' must be held to protect against concurrent modification
> + * of page tables (2nd stage fault, mmu modifiers, device writes)
> + *
> + * @kvm:        The KVM pointer
> + * @slot:       The memory slot the dirty log is retrieved for
> + * @gfn_offset: The gfn offset in the memory slot
> + * @mask:       The mask of dirty pages at offset 'gfn_offset' in this memory
> + *              slot to be write protected
> + */
> +
> +void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
> + struct kvm_memory_slot *slot,
> + gfn_t gfn_offset, unsigned long mask)
> +{
> + phys_addr_t ipa;
> + pgd_t *pgdp = kvm->arch.pgd, *pgd;
> + pud_t *pud;
> + pmd_t *pmd;
> + pte_t *pte, new_pte;
> +
> + /* walk set bits in the mask and write protect corresponding pages */
> + while (mask) {
> + ipa = (slot->base_gfn + gfn_offset + __ffs(mask)) << PAGE_SHIFT;
> + pgd = pgdp + pgd_index(ipa);
> + if (!pgd_present(*pgd))
> + goto update_mask;
I think something is wrong in your logic. If there is no PGD, it means a
whole 1GB isn't present. Yet you're just clearing one bit from the mask
and doing it again. As you're only looking at BITS_PER_LONG contiguous pages at a
time, it is likely that the same thing will happen for the other pages,
and you're just wasting precious CPU cycles here.
> + pud = pud_offset(pgd, ipa);
> + if (!pud_present(*pud))
> + goto update_mask;
> + pmd = pmd_offset(pud, ipa);
> + if (!pmd_present(*pmd))
> + goto update_mask;
> + pte = pte_offset_kernel(pmd, ipa);
> + if (!pte_present(*pte))
> + goto update_mask;
> + if ((*pte & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY)
> + goto update_mask;
> + new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2);
> + *pte = new_pte;
I'd like to see these two lines in a separate function (something like
"stage2_mark_pte_ro")...
> +update_mask:
> + mask &= mask - 1;
> + }
> +}
> +
> static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> struct kvm_memory_slot *memslot,
> unsigned long fault_status)
--
Jazz is not dead. It just smells funny.
* RE: [PATCH 3/5] live migration support for VM dirty log management
@ 2014-04-18 3:10 ` Mario Smarduch
From: Mario Smarduch @ 2014-04-18 3:10 UTC (permalink / raw)
To: Marc Zyngier
Cc: kvmarm@lists.cs.columbia.edu, christoffer.dall@linaro.org,
kvm@vger.kernel.org, 이정석,
정성진
MZ> So let's play the difference game with x86:
> int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log
kvm_vm_ioctl_get_dirty_log() is now identical to the x86 version; I moved it to kvm_main.c
to make it generic, and it's declared weak. Do I go into x86 and remove that function?
Or leave it to the x86 folks to do it?
> + }
> +
> + if (is_dirty)
> + kvm_tlb_flush_vm(kvm);
MZ> This can be easily abstracted to be a kvm_flush_remote_tlbs on x86, and a HW broadcast on ARM.
kvm_tlb_flush_vm() is replaced with kvm_flush_remote_tlbs(); I made that
function weak and declared an ARM version in arm mmu.c.
The current generic version sends IPIs to the vCPUs running the guest;
ARMv7+ does not need that. Is that ok?
> + /* walk set bits in the mask and write protect corresponding pages */
> + while (mask) {
> + ipa = (slot->base_gfn + gfn_offset + __ffs(mask)) << PAGE_SHIFT;
> + pgd = pgdp + pgd_index(ipa);
> + if (!pgd_present(*pgd))
> + goto update_mask;
MZ> I think something is wrong in your logic. If there is no PGD, it means a whole 1GB isn't present. Yet you're just clearing one bit from the mask and doing it again. As you're only looking at BITS_PER_LONG contiguous pages at a time, it is likely that the same thing will happen for the other pages, and you're just wasting precious CPU cycles here.
Yes, this is grossly inefficient. I updated it to walk ptes only, after first determining whether the range straddles a pmd.
It should mostly be pte walks, with maybe one extra PMD walk in the unlikely straddle case. A rough sketch of the idea follows.
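Roughly along these lines (a sketch of the idea only, not the actual v2 code; it
assumes the straddle check has already split the mask so that the window covered
by one word fits inside a single PMD):

	/* inside the while (mask) loop: if nothing is mapped at the
	 * pgd/pud/pmd level, every page covered by this word is unmapped
	 * too, so drop the whole word instead of retrying bit by bit */
	ipa = (slot->base_gfn + gfn_offset + __ffs(mask)) << PAGE_SHIFT;
	pgd = pgdp + pgd_index(ipa);
	if (!pgd_present(*pgd))
		break;		/* nothing mapped under this PGD */
	pud = pud_offset(pgd, ipa);
	if (!pud_present(*pud))
		break;		/* nothing mapped under this PUD */
	pmd = pmd_offset(pud, ipa);
	if (!pmd_present(*pmd))
		break;		/* nothing mapped under this PMD */
	/* the pmd is present: walk only the ptes under it for the set bits */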
> + new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2);
> + *pte = new_pte;
MZ> I'd like to see these two lines in a separate function (something like "stage2_mark_pte_ro")...
Yes ok.
* Re: [PATCH 3/5] live migration support for VM dirty log management
@ 2014-04-18 8:23 ` Marc Zyngier
From: Marc Zyngier @ 2014-04-18 8:23 UTC (permalink / raw)
To: Mario Smarduch
Cc: kvmarm@lists.cs.columbia.edu, christoffer.dall@linaro.org,
kvm@vger.kernel.org, 이정석,
정성진
On Fri, Apr 18 2014 at 04:10:41 AM, Mario Smarduch <m.smarduch@samsung.com> wrote:
> MZ> So let's play the difference game with x86:
>
>> int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log
>
>
> kvm_vm_ioctl_get_dirty_log() is now identical to the x86 version; I moved it
> to kvm_main.c to make it generic, and it's declared weak. Do I go into
> x86 and remove that function?
>
> Or leave it to the x86 folks to do it?
No, this must be part of your patch series.
>> + }
>> +
>> + if (is_dirty)
>> + kvm_tlb_flush_vm(kvm);
>
> MZ> This can be easily abstracted to be a kvm_flush_remote_tlbs on
> MZ> x86, and a HW broadcast on ARM.
>
> kvm_tlb_flush_vm() is replaced with kvm_flush_remote_tlbs(); I made that
> function weak and declared an ARM version in arm mmu.c.
>
> The current generic version sends IPIs to the vCPUs running the guest;
> ARMv7+ does not need that. Is that ok?
We don't need any of that. You should have a kvm_flush_remote_tlbs that is
essentially a call to kvm_tlb_flush_vmid.
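In other words, something like this sketch (assuming the generic
kvm_flush_remote_tlbs in kvm_main.c is marked __weak, and reusing the VMID-based
flush helper this series already adds):

	/* arch/arm/kvm/mmu.c -- overrides the weak generic version.  The
	 * by-VMID TLB invalidation is broadcast in hardware on ARMv7+, so
	 * no IPIs to the guest's vCPUs are needed. */
	void kvm_flush_remote_tlbs(struct kvm *kvm)
	{
		kvm_tlb_flush_vm(kvm);	/* helper added earlier in this series */
	}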
>
>
>> + /* walk set bits in the mask and write protect corresponding pages */
>> + while (mask) {
>> + ipa = (slot->base_gfn + gfn_offset + __ffs(mask)) << PAGE_SHIFT;
>> + pgd = pgdp + pgd_index(ipa);
>> + if (!pgd_present(*pgd))
>> + goto update_mask;
>
> MZ> I think something is wrong in your logic. If there is no PGD, it
> MZ> means a whole 1GB isn't present. Yet you're just clearing one bit
> MZ> from the mask and doing it again. As you're only looking at
> MZ> BITS_PER_LONG contiguous pages at a time, it is likely that
> MZ> the same thing will happen for the other pages, and you're just
> MZ> wasting precious CPU cycles here.
>
> Yes, this is grossly inefficient. I updated it to walk ptes only, after
> first determining whether the range straddles a pmd.
> It should mostly be pte walks, with maybe one extra PMD walk in the
> unlikely straddle case.
Yes, that's what I thought.
>
>> + new_pte = pfn_pte(pte_pfn(*pte), PAGE_S2);
>> + *pte = new_pte;
>
> MZ> I'd like to see these two lines in a separate function (something
> MZ> like "stage2_mark_pte_ro")...
>
> Yes ok.
Thanks.
M.
--
Jazz is not dead. It just smells funny.