* [PATCH] kvm swapping with mmu notifiers + age_page
@ 2008-01-21 12:41 Andrea Arcangeli
[not found] ` <20080121124124.GG6970-lysg2Xt5kKMAvxtiuMwx3w@public.gmane.org>
0 siblings, 1 reply; 5+ messages in thread
From: Andrea Arcangeli @ 2008-01-21 12:41 UTC (permalink / raw)
To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
This is the same as before, but it uses the age_page callback to
prevent the guest OS working set from being swapped out. It works well here
so far. This depends on the memslot locking with mmu lock patch and on
the mmu notifiers #v3 patch that I'll post in CC with linux-mm shortly
that implements the age_page callback and that changes follow_page to
set the young bit in the pte instead of setting the referenced bit (so
the age_page will be called again later when the VM clears the young
bit).
Signed-off-by: Andrea Arcangeli <andrea-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 4086080..c527d7d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -18,6 +18,7 @@ config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on ARCH_SUPPORTS_KVM && EXPERIMENTAL
select PREEMPT_NOTIFIERS
+ select MMU_NOTIFIER
select ANON_INODES
---help---
Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 324ff9a..189f3e1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -532,6 +532,38 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
kvm_flush_remote_tlbs(kvm);
}
+static void unmap_spte(struct kvm *kvm, u64 *spte)
+{
+ struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+ get_page(page);
+ rmap_remove(kvm, spte);
+ set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(kvm);
+ __free_page(page);
+}
+
+void kvm_rmap_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+ unsigned long *rmapp;
+ u64 *spte, *curr_spte;
+
+ spin_lock(&kvm->mmu_lock);
+ rmapp = kvm_hva_to_rmapp(kvm, hva);
+ if (!rmapp)
+ goto out_unlock;
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ BUG_ON(!(*spte & PT_PRESENT_MASK));
+ rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
+ curr_spte = spte;
+ spte = rmap_next(kvm, rmapp, spte);
+ unmap_spte(kvm, curr_spte);
+ }
+out_unlock:
+ spin_unlock(&kvm->mmu_lock);
+}
+
#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8a90403..35a2ee0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3159,6 +3159,35 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
free_page((unsigned long)vcpu->arch.pio_data);
}
+static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
+{
+ return container_of(mn, struct kvm, mmu_notifier);
+}
+
+void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ BUG_ON(mm != kvm->mm);
+ kvm_rmap_unmap_hva(kvm, address);
+}
+
+void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ for (; start < end; start += PAGE_SIZE)
+ kvm_mmu_notifier_invalidate_page(mn, mm, start);
+}
+
+static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+ .invalidate_range = kvm_mmu_notifier_invalidate_range,
+ .invalidate_page = kvm_mmu_notifier_invalidate_page,
+ /* age page will drop the spte so follow_page will set the young bit */
+ .age_page = kvm_mmu_notifier_invalidate_page,
+};
+
struct kvm *kvm_arch_create_vm(void)
{
struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
@@ -3167,6 +3196,7 @@ struct kvm *kvm_arch_create_vm(void)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
return kvm;
}
diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h
index d6db0de..522028b 100644
--- a/include/asm-x86/kvm_host.h
+++ b/include/asm-x86/kvm_host.h
@@ -404,6 +404,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu);
int kvm_mmu_setup(struct kvm_vcpu *vcpu);
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
+void kvm_rmap_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
void kvm_mmu_zap_all(struct kvm *kvm);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2714068..eae8734 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -117,6 +117,7 @@ struct kvm {
struct kvm_io_bus pio_bus;
struct kvm_vm_stat stat;
struct kvm_arch arch;
+ struct mmu_notifier mmu_notifier;
};
/* The guest did something we don't support. */
@@ -163,6 +164,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
struct kvm_memory_slot old,
int user_alloc);
gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
+unsigned long *kvm_hva_to_rmapp(struct kvm *kvm, unsigned long addr);
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
void kvm_release_page_clean(struct page *page);
void kvm_release_page_dirty(struct page *page);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4295623..a67e38f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -165,6 +165,7 @@ static struct kvm *kvm_create_vm(void)
kvm->mm = current->mm;
atomic_inc(&kvm->mm->mm_count);
+ mmu_notifier_register(&kvm->mmu_notifier, kvm->mm);
spin_lock_init(&kvm->mmu_lock);
kvm_io_bus_init(&kvm->pio_bus);
mutex_init(&kvm->lock);
@@ -454,6 +467,28 @@ static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
}
+/* if mmap_sem isn't taken, it can be safely called with only the mmu_lock */
+unsigned long *kvm_hva_to_rmapp(struct kvm *kvm, unsigned long addr)
+{
+ int i;
+
+ for (i = 0; i < kvm->nmemslots; i++) {
+ struct kvm_memory_slot *memslot = &kvm->memslots[i];
+ unsigned long start = memslot->userspace_addr;
+ unsigned long end = start + (memslot->npages << PAGE_SHIFT);
+
+ /* mmu_lock protects userspace_addr */
+ if (!start)
+ continue;
+
+ if (addr >= start && addr < end) {
+ gfn_t gfn_offset = (addr - start) >> PAGE_SHIFT;
+ return &memslot->rmap[gfn_offset];
+ }
+ }
+ return NULL;
+}
+
/*
* Requires current->mm->mmap_sem to be held
*/
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH] kvm swapping with mmu notifiers + age_page
[not found] ` <20080121124124.GG6970-lysg2Xt5kKMAvxtiuMwx3w@public.gmane.org>
@ 2008-01-22 14:08 ` Avi Kivity
[not found] ` <4795F8D0.30102-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
0 siblings, 1 reply; 5+ messages in thread
From: Avi Kivity @ 2008-01-22 14:08 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
Andrea Arcangeli wrote:
> This is the same as before but it uses the age_page callback to
> prevent the guest OS working set to be swapped out. It works well here
> so far. This depends on the memslot locking with mmu lock patch and on
> the mmu notifiers #v3 patch that I'll post in CC with linux-mm shortly
> that implements the age_page callback and that changes follow_page to
> set the young bit in the pte instead of setting the referenced bit (so
> the age_page will be called again later when the VM clears the young
> bit).
>
>
> +static void unmap_spte(struct kvm *kvm, u64 *spte)
> +{
> + struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
> + get_page(page);
> + rmap_remove(kvm, spte);
> + set_shadow_pte(spte, shadow_trap_nonpresent_pte);
> + kvm_flush_remote_tlbs(kvm);
> + __free_page(page);
> +}
>
Why is get_page()/__free_page() needed here? Isn't kvm_release_page_*()
sufficient?
--
error compiling committee.c: too many arguments to function
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] kvm swapping with mmu notifiers + age_page
[not found] ` <4795F8D0.30102-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2008-01-22 14:41 ` Andrea Arcangeli
[not found] ` <20080122144149.GD7331-lysg2Xt5kKMAvxtiuMwx3w@public.gmane.org>
0 siblings, 1 reply; 5+ messages in thread
From: Andrea Arcangeli @ 2008-01-22 14:41 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
On Tue, Jan 22, 2008 at 04:08:16PM +0200, Avi Kivity wrote:
> Andrea Arcangeli wrote:
>> This is the same as before but it uses the age_page callback to
>> prevent the guest OS working set to be swapped out. It works well here
>> so far. This depends on the memslot locking with mmu lock patch and on
>> the mmu notifiers #v3 patch that I'll post in CC with linux-mm shortly
>> that implements the age_page callback and that changes follow_page to
>> set the young bit in the pte instead of setting the referenced bit (so
>> the age_page will be called again later when the VM clears the young
>> bit).
>>
>> +static void unmap_spte(struct kvm *kvm, u64 *spte)
>> +{
>> + struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >>
>> PAGE_SHIFT);
>> + get_page(page);
>> + rmap_remove(kvm, spte);
>> + set_shadow_pte(spte, shadow_trap_nonpresent_pte);
>> + kvm_flush_remote_tlbs(kvm);
>> + __free_page(page);
>> +}
>>
>
> Why is get_page()/__free_page() needed here? Isn't kvm_release_page_*()
> sufficient?
The other-cpus-tlb have to be flushed _before_ the page is visible in
the host kernel freelist, otherwise other host-cpus with tlbs still
mapping the page with write-access would be able to modify the page
even after it's queued in the freelist. The mmu_notifier are called in
places like munmap where the __free_page will not be a put_page but a
real __free_page. Furthermore, the kvm_release_page_* functions aren't calling
__free_page but put_page, which would leak RAM in those paths (mostly
invalidate_range). I'd rather not depend on the mmu_notifiers always
being invoked with an additional reference count on the page (in
addition to the spte reference count). The ->invalidate_* methods
might be the ones that put the page in the freelist.
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] kvm swapping with mmu notifiers + age_page
[not found] ` <20080122144149.GD7331-lysg2Xt5kKMAvxtiuMwx3w@public.gmane.org>
@ 2008-01-22 14:53 ` Avi Kivity
[not found] ` <47960371.8020709-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
0 siblings, 1 reply; 5+ messages in thread
From: Avi Kivity @ 2008-01-22 14:53 UTC (permalink / raw)
To: Andrea Arcangeli; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
Andrea Arcangeli wrote:
> On Tue, Jan 22, 2008 at 04:08:16PM +0200, Avi Kivity wrote:
>
>> Andrea Arcangeli wrote:
>>
>>> This is the same as before but it uses the age_page callback to
>>> prevent the guest OS working set to be swapped out. It works well here
>>> so far. This depends on the memslot locking with mmu lock patch and on
>>> the mmu notifiers #v3 patch that I'll post in CC with linux-mm shortly
>>> that implements the age_page callback and that changes follow_page to
>>> set the young bit in the pte instead of setting the referenced bit (so
>>> the age_page will be called again later when the VM clears the young
>>> bit).
>>>
>>> +static void unmap_spte(struct kvm *kvm, u64 *spte)
>>> +{
>>> + struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >>
>>> PAGE_SHIFT);
>>> + get_page(page);
>>> + rmap_remove(kvm, spte);
>>> + set_shadow_pte(spte, shadow_trap_nonpresent_pte);
>>> + kvm_flush_remote_tlbs(kvm);
>>> + __free_page(page);
>>> +}
>>>
>>>
>> Why is get_page()/__free_page() needed here? Isn't kvm_release_page_*()
>> sufficient?
>>
>
> The other-cpus-tlb have to be flushed _before_ the page is visible in
> the host kernel freelist, otherwise other host-cpus with tlbs still
> mapping the page with write-access would be able to modify the page
> even after it's queued in the freelist.
Right. But doesn't this apply to other callers of rmap_remove()?
Perhaps we need to put the flush in set_spte() or rmap_remove() and
rmap_write_protect().
Oh, rmap_write_protect() already has the flush.
> The mmu_notifier are called in
> places like munmap where the __free_page will not be a put_page but a
> real __free_page. Furthermore kvm_release_page_ aren't calling
> __free_page but put_page that would leak ram in those paths (mostly
> invalidate_range). I'd rather not depend on the mmu_notifiers always
> being invoked with an additional reference count on the page (in
> addition to the spte reference count). The ->invalidate_* methods
> might be the ones that put the page in the freelist.
>
I'm afraid I don't really understand the difference in semantics between
put_page() and __free_page(). Maybe we need to switch
kvm_release_page_*() to __free_page()?
--
error compiling committee.c: too many arguments to function
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] kvm swapping with mmu notifiers + age_page
[not found] ` <47960371.8020709-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2008-01-22 17:41 ` Andrea Arcangeli
0 siblings, 0 replies; 5+ messages in thread
From: Andrea Arcangeli @ 2008-01-22 17:41 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
On Tue, Jan 22, 2008 at 04:53:37PM +0200, Avi Kivity wrote:
> Andrea Arcangeli wrote:
>> On Tue, Jan 22, 2008 at 04:08:16PM +0200, Avi Kivity wrote:
>>
>>> Andrea Arcangeli wrote:
>>>
>>>> This is the same as before but it uses the age_page callback to
>>>> prevent the guest OS working set to be swapped out. It works well here
>>>> so far. This depends on the memslot locking with mmu lock patch and on
>>>> the mmu notifiers #v3 patch that I'll post in CC with linux-mm shortly
>>>> that implements the age_page callback and that changes follow_page to
>>>> set the young bit in the pte instead of setting the referenced bit (so
>>>> the age_page will be called again later when the VM clears the young
>>>> bit).
>>>>
>>>> +static void unmap_spte(struct kvm *kvm, u64 *spte)
>>>> +{
>>>> + struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >>
>>>> PAGE_SHIFT);
>>>> + get_page(page);
>>>> + rmap_remove(kvm, spte);
>>>> + set_shadow_pte(spte, shadow_trap_nonpresent_pte);
>>>> + kvm_flush_remote_tlbs(kvm);
>>>> + __free_page(page);
>>>> +}
>>>>
>>> Why is get_page()/__free_page() needed here? Isn't kvm_release_page_*()
>>> sufficient?
>>>
>>
>> The other-cpus-tlb have to be flushed _before_ the page is visible in
>> the host kernel freelist, otherwise other host-cpus with tlbs still
>> mapping the page with write-access would be able to modify the page
>> even after it's queued in the freelist.
>
> Right. But doesn't this apply to other callers of rmap_remove()? Perhaps
> we need to put the flush in set_spte() or rmap_remove() and
> rmap_write_protect().
>
> Oh, rmap_write_protect() already has the flush.
rmap_write_protect is the only obviously safe one because it doesn't
decrease the reference count, it flushes the tlb only to flush any
write-enabled tlb entry.
The problem is only with all rmap_remove callers.
invalidate_page ironically I think is ok with flushing the tlb after
put_page because ptep_clear_flush is invoked with a pin on the page by
the caller of ptep_clear_flush.
invalidate_range is not ok with flushing the tlb _after_ put_page.
All other rmap_remove callers must take into account that when
rmap_remove returns, in between put_page and tlb-flush, another cpu
may be in the VM and free the page the moment after the pin on the
page is gone. This is especially true with readonly swapcache that
doesn't require swapout to be put in the freelist.
So yes, it may be a generic race for the rmap_remove callers.
I'm not exactly sure why I was getting crashes without doing
get_page/tlbflush/__free_page; the only logical explanation at this
point is invalidate_range.
> I'm afraid I don't really understand the difference in semantics between
> put_page() and __free_page(). Maybe we need to switch kvm_release_page_*()
> to __free_page()?
put_page/__free_page will work fine in practice for kvm; __free_page
is faster, so yes, I think kvm_release_page_ should be changed to use
__free_page but this is a microoptimization only. The only real issue
is with the tlb flush in smp. If it can happen after
put_page/__free_page or not.
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2008-01-22 17:41 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-01-21 12:41 [PATCH] kvm swapping with mmu notifiers + age_page Andrea Arcangeli
[not found] ` <20080121124124.GG6970-lysg2Xt5kKMAvxtiuMwx3w@public.gmane.org>
2008-01-22 14:08 ` Avi Kivity
[not found] ` <4795F8D0.30102-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2008-01-22 14:41 ` Andrea Arcangeli
[not found] ` <20080122144149.GD7331-lysg2Xt5kKMAvxtiuMwx3w@public.gmane.org>
2008-01-22 14:53 ` Avi Kivity
[not found] ` <47960371.8020709-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2008-01-22 17:41 ` Andrea Arcangeli
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox