From: Andrea Arcangeli <andrea-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org
Subject: swapping with MMU Notifiers V2
Date: Tue, 29 Jan 2008 15:50:21 +0100 [thread overview]
Message-ID: <20080129145021.GJ7233@v2.random> (raw)
Hello,
I'm testing KVM swapping on top of Christoph's latest patch
series. However the host is hanging hard for me. Could others test it?
I changed test-hardware, kernel version and kvm kernel version at the
same time, so it might not be an issue with MMU Notifiers V2 but
something else with my new test-setup (previously I was developing and
testing on my workstation which was by far not ideal).
Signed-off-by: Andrea Arcangeli <andrea-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 4086080..c527d7d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -18,6 +18,7 @@ config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on ARCH_SUPPORTS_KVM && EXPERIMENTAL
select PREEMPT_NOTIFIERS
+ select MMU_NOTIFIER
select ANON_INODES
---help---
Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 635e70c..80ebc19 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -524,6 +524,110 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
kvm_flush_remote_tlbs(kvm);
}
+static void kvm_unmap_spte(struct kvm *kvm, u64 *spte)
+{
+ struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+ get_page(page);
+ rmap_remove(kvm, spte);
+ set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(kvm);
+ __free_page(page);
+}
+
+static void kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+ u64 *spte, *curr_spte;
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ BUG_ON(!(*spte & PT_PRESENT_MASK));
+ rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
+ curr_spte = spte;
+ spte = rmap_next(kvm, rmapp, spte);
+ kvm_unmap_spte(kvm, curr_spte);
+ }
+}
+
+void kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+ int i;
+
+ /*
+ * If mmap_sem isn't taken, we can look the memslots with only
+ * the mmu_lock by skipping over the slots with userspace_addr == 0.
+ */
+ spin_lock(&kvm->mmu_lock);
+ for (i = 0; i < kvm->nmemslots; i++) {
+ struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ unsigned long start = memslot->userspace_addr;
+ unsigned long end;
+
+ /* mmu_lock protects userspace_addr */
+ if (!start)
+ continue;
+
+ end = start + (memslot->npages << PAGE_SHIFT);
+ if (hva >= start && hva < end) {
+ gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+ kvm_unmap_rmapp(kvm, &memslot->rmap[gfn_offset]);
+ }
+ }
+ spin_unlock(&kvm->mmu_lock);
+}
+
+static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
+{
+ u64 *spte;
+ int young = 0;
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ int _young;
+ u64 _spte = *spte;
+ BUG_ON(!(_spte & PT_PRESENT_MASK));
+ _young = _spte & PT_ACCESSED_MASK;
+ if (_young) {
+ young = !!_young;
+ set_shadow_pte(spte, _spte & ~PT_ACCESSED_MASK);
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+ return young;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ int i;
+ int young = 0;
+
+ /*
+ * If mmap_sem isn't taken, we can look the memslots with only
+ * the mmu_lock by skipping over the slots with userspace_addr == 0.
+ */
+ spin_lock(&kvm->mmu_lock);
+ for (i = 0; i < kvm->nmemslots; i++) {
+ struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ unsigned long start = memslot->userspace_addr;
+ unsigned long end;
+
+ /* mmu_lock protects userspace_addr */
+ if (!start)
+ continue;
+
+ end = start + (memslot->npages << PAGE_SHIFT);
+ if (hva >= start && hva < end) {
+ gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+ young |= kvm_age_rmapp(kvm, &memslot->rmap[gfn_offset]);
+ }
+ }
+ spin_unlock(&kvm->mmu_lock);
+
+ if (young)
+ kvm_flush_remote_tlbs(kvm);
+
+ return young;
+}
+
#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8f94a0b..8954836 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3167,6 +3167,44 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
free_page((unsigned long)vcpu->arch.pio_data);
}
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
+{
+ return container_of(mn, struct kvm, mmu_notifier);
+}
+
+void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ BUG_ON(mm != kvm->mm);
+ kvm_unmap_hva(kvm, address);
+}
+
+void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ int lock)
+{
+ for (; start < end; start += PAGE_SIZE)
+ kvm_mmu_notifier_invalidate_page(mn, mm, start);
+}
+
+int kvm_mmu_notifier_age_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ BUG_ON(mm != kvm->mm);
+ return kvm_age_hva(kvm, address);
+}
+
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+ .invalidate_range = kvm_mmu_notifier_invalidate_range,
+ .invalidate_page = kvm_mmu_notifier_invalidate_page,
+ .age_page = kvm_mmu_notifier_age_page,
+};
+
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
@@ -3175,6 +3213,7 @@ struct kvm *kvm_arch_create_vm(void)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
return kvm;
}
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index 67ae307..14733f2 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -404,6 +404,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
int kvm_mmu_setup(struct kvm_vcpu *vcpu);
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
+void kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_age_hva(struct kvm *kvm, unsigned long hva);
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
void kvm_mmu_zap_all(struct kvm *kvm);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ea4764b..9349160 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -15,6 +15,7 @@
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/preempt.h>
+#include <linux/mmu_notifier.h>
#include <asm/signal.h>
#include <linux/kvm.h>
@@ -118,6 +119,7 @@ struct kvm {
struct kvm_io_bus pio_bus;
struct kvm_vm_stat stat;
struct kvm_arch arch;
+ struct mmu_notifier mmu_notifier;
};
/* The guest did something we don't support. */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8fc12dc..bb4747c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -165,6 +165,7 @@ static struct kvm *kvm_create_vm(void)
kvm->mm = current->mm;
atomic_inc(&kvm->mm->mm_count);
+ mmu_notifier_register(&kvm->mmu_notifier, kvm->mm);
spin_lock_init(&kvm->mmu_lock);
kvm_io_bus_init(&kvm->pio_bus);
mutex_init(&kvm->lock);
@@ -1265,7 +1266,11 @@ static int kvm_resume(struct sys_device *dev)
}
static struct sysdev_class kvm_sysdev_class = {
+#ifdef set_kset_name
set_kset_name("kvm"),
+#else
+ .name = "kvm",
+#endif
.suspend = kvm_suspend,
.resume = kvm_resume,
};
You also need the locking patch:
Signed-off-by: Andrea Arcangeli <andrea-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8a90403..35a2ee0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3219,14 +3249,20 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
*/
if (!user_alloc) {
if (npages && !old.rmap) {
- memslot->userspace_addr = do_mmap(NULL, 0,
- npages * PAGE_SIZE,
- PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_ANONYMOUS,
- 0);
-
- if (IS_ERR((void *)memslot->userspace_addr))
- return PTR_ERR((void *)memslot->userspace_addr);
+ unsigned long userspace_addr;
+
+ userspace_addr = do_mmap(NULL, 0,
+ npages * PAGE_SIZE,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS,
+ 0);
+ if (IS_ERR((void *)userspace_addr))
+ return PTR_ERR((void *)userspace_addr);
+
+ /* set userspace_addr atomically for kvm_hva_to_rmapp */
+ spin_lock(&kvm->mmu_lock);
+ memslot->userspace_addr = userspace_addr;
+ spin_unlock(&kvm->mmu_lock);
} else {
if (!old.user_alloc && old.rmap) {
int ret;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4295623..a67e38f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -298,7 +299,15 @@ int __kvm_set_memory_region(struct kvm *kvm,
memset(new.rmap, 0, npages * sizeof(*new.rmap));
new.user_alloc = user_alloc;
- new.userspace_addr = mem->userspace_addr;
+ /*
+ * hva_to_rmmap() serialzies with the mmu_lock and to be
+ * safe it has to ignore memslots with !user_alloc &&
+ * !userspace_addr.
+ */
+ if (user_alloc)
+ new.userspace_addr = mem->userspace_addr;
+ else
+ new.userspace_addr = 0;
}
/* Allocate page dirty bitmap if needed */
@@ -311,14 +320,18 @@ int __kvm_set_memory_region(struct kvm *kvm,
memset(new.dirty_bitmap, 0, dirty_bytes);
}
+ spin_lock(&kvm->mmu_lock);
if (mem->slot >= kvm->nmemslots)
kvm->nmemslots = mem->slot + 1;
*memslot = new;
+ spin_unlock(&kvm->mmu_lock);
r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
if (r) {
+ spin_lock(&kvm->mmu_lock);
*memslot = old;
+ spin_unlock(&kvm->mmu_lock);
goto out_free;
}
This is the updated compatibility code:
Signed-off-by: Andrea Arcangeli <andrea-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
diff --git a/kernel/external-module-compat.h b/kernel/external-module-compat.h
index 67b9cc4..6d2be6a 100644
--- a/kernel/external-module-compat.h
+++ b/kernel/external-module-compat.h
@@ -17,6 +17,45 @@
#include <linux/hrtimer.h>
#include <asm/bitops.h>
+#ifndef CONFIG_MMU_NOTIFIER
+struct mmu_notifier;
+
+struct mmu_notifier_ops {
+ /*
+ * Note: The mmu_notifier structure must be released with
+ * call_rcu() since other processors are only guaranteed to
+ * see the changes after a quiescent period.
+ */
+ void (*release)(struct mmu_notifier *mn,
+ struct mm_struct *mm);
+
+ int (*age_page)(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address);
+
+ void (*invalidate_page)(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address);
+
+ /* Dummy needed because the mmu_notifier() macro requires it */
+ void (*invalidate_all)(struct mmu_notifier *mn, struct mm_struct *mm,
+ int dummy);
+
+ /*
+ * lock indicates that the function is called under spinlock.
+ */
+ void (*invalidate_range)(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end,
+ int lock);
+};
+
+struct mmu_notifier {
+ const struct mmu_notifier_ops *ops;
+};
+#define mmu_notifier_register(mn, mm) do {} while(0)
+#endif
+
/*
* 2.6.16 does not have GFP_NOWAIT
*/
This is the patch to apply on top of V2 fix the aging:
Signed-off-by: Andrea Arcangeli <andrea-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -285,10 +285,8 @@ static int page_referenced_one(struct pa
if (!pte)
goto out;
- if (ptep_clear_flush_young(vma, address, pte))
- referenced++;
-
- if (mmu_notifier_age_page(mm, address))
+ if (ptep_clear_flush_young(vma, address, pte) |
+ mmu_notifier_age_page(mm, address))
referenced++;
/* Pretend the page is referenced if the task has the
@@ -684,7 +682,7 @@ static int try_to_unmap_one(struct page
* skipped over this mm) then we should reactivate it.
*/
if (!migration && ((vma->vm_flags & VM_LOCKED) ||
- (ptep_clear_flush_young(vma, address, pte) ||
+ (ptep_clear_flush_young(vma, address, pte) |
mmu_notifier_age_page(mm, address)))) {
ret = SWAP_FAIL;
goto out_unmap;
@@ -818,10 +816,8 @@ static void try_to_unmap_cluster(unsigne
page = vm_normal_page(vma, address, *pte);
BUG_ON(!page || PageAnon(page));
- if (ptep_clear_flush_young(vma, address, pte))
- continue;
-
- if (mmu_notifier_age_page(mm, address))
+ if (ptep_clear_flush_young(vma, address, pte) |
+ mmu_notifier_age_page(mm, address))
continue;
/* Nuke the page table entry. */
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
next reply other threads:[~2008-01-29 14:50 UTC|newest]
Thread overview: 36+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-01-29 14:50 Andrea Arcangeli [this message]
[not found] ` <20080129145021.GJ7233-lysg2Xt5kKMAvxtiuMwx3w@public.gmane.org>
2008-01-29 15:13 ` swapping with MMU Notifiers V2 Izik Eidus
2008-01-29 16:14 ` Carsten Otte
[not found] ` <479F50D6.4020005-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>
2008-01-29 16:24 ` Avi Kivity
[not found] ` <479F532C.1020503-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2008-01-29 16:31 ` Carsten Otte
2008-01-29 16:35 ` Carsten Otte
[not found] ` <479F55D6.1090807-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>
2008-01-29 17:02 ` Avi Kivity
[not found] ` <479F5C3C.7070501-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2008-01-29 18:13 ` Joerg Roedel
2008-01-29 17:54 ` Andrea Arcangeli
[not found] ` <20080129175420.GR7233-lysg2Xt5kKMAvxtiuMwx3w@public.gmane.org>
2008-01-29 18:05 ` Avi Kivity
[not found] ` <479F6AE0.3080702-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2008-01-29 18:34 ` Andrea Arcangeli
2008-01-30 11:26 ` Carsten Otte
[not found] ` <47A05EEF.3010701-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>
2008-01-30 11:42 ` Andrea Arcangeli
[not found] ` <20080130114206.GG7233-lysg2Xt5kKMAvxtiuMwx3w@public.gmane.org>
2008-01-30 15:01 ` Carsten Otte
[not found] ` <47A09142.4090307-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>
2008-01-30 15:09 ` Avi Kivity
[not found] ` <47A09342.1040708-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2008-01-30 15:14 ` Carsten Otte
2008-01-29 18:19 ` Joerg Roedel
[not found] ` <20080129181918.GA6344-zLv9SwRftAIdnm+yROfE0A@public.gmane.org>
2008-01-29 18:42 ` Andrea Arcangeli
2008-01-30 9:49 ` Carsten Otte
[not found] ` <47A04816.4090408-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>
2008-01-30 14:38 ` Joerg Roedel
2008-01-29 16:52 ` Andrea Arcangeli
[not found] ` <20080129165219.GN7233-lysg2Xt5kKMAvxtiuMwx3w@public.gmane.org>
2008-01-29 17:17 ` Carsten Otte
[not found] ` <479F5FBF.40203-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>
2008-01-29 17:39 ` Andrea Arcangeli
2008-01-29 16:49 ` Andrea Arcangeli
[not found] ` <20080129164954.GM7233-lysg2Xt5kKMAvxtiuMwx3w@public.gmane.org>
2008-01-29 17:04 ` Avi Kivity
[not found] ` <479F5CBB.5060702-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2008-01-29 17:49 ` Andrea Arcangeli
[not found] ` <20080129174955.GQ7233-lysg2Xt5kKMAvxtiuMwx3w@public.gmane.org>
2008-01-29 17:53 ` Avi Kivity
2008-01-29 17:17 ` Carsten Otte
2008-01-30 18:57 ` Andrea Arcangeli
[not found] ` <20080130185735.GS7233-lysg2Xt5kKMAvxtiuMwx3w@public.gmane.org>
2008-01-31 6:50 ` Avi Kivity
[not found] ` <47A16F99.8060502-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2008-01-31 10:15 ` Andrea Arcangeli
[not found] ` <20080131101519.GG7185-lysg2Xt5kKMAvxtiuMwx3w@public.gmane.org>
2008-01-31 10:34 ` Avi Kivity
[not found] ` <47A1A43D.6020203-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2008-01-31 12:58 ` Andrea Arcangeli
[not found] ` <20080131125842.GL7185-lysg2Xt5kKMAvxtiuMwx3w@public.gmane.org>
2008-01-31 18:56 ` Andrea Arcangeli
2008-02-11 8:20 ` Avi Kivity
2008-02-11 11:37 ` Andrea Arcangeli
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20080129145021.GJ7233@v2.random \
--to=andrea-atkuwr5tajbwk0htik3j/w@public.gmane.org \
--cc=kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.