From: Avi Kivity <avi@qumranet.com>
To: Marcelo Tosatti <marcelo@kvack.org>
Cc: kvm-devel <kvm-devel@lists.sourceforge.net>
Subject: Re: large page support for kvm
Date: Tue, 12 Feb 2008 13:55:54 +0200
Message-ID: <47B1894A.1030208@qumranet.com>
In-Reply-To: <20080211154901.GA11936@dmt>

Marcelo Tosatti wrote:
> Ok, how does the following look. Still need to plug in large page
> creation in the nonpaging case, but this should be enough for comments.
>   

Most of the comments are cosmetic, but a couple have some meat.

>  
> +#define HPAGE_ALIGN_OFFSET(x) ((((x)+HPAGE_SIZE-1)&HPAGE_MASK) - (x))
>   

A function, please.
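
Something like this, perhaps (an untested sketch; the name is made up):

/* distance from addr up to the next large page boundary */
static inline unsigned long hpage_align_offset(unsigned long addr)
{
        return ((addr + HPAGE_SIZE - 1) & HPAGE_MASK) - addr;
}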

> +/*
> + * Return the offset inside the memslot largepage integer map for a given
> + * gfn, handling slots that are not large page aligned.
> + */
> +int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
> +{
> +	unsigned long long idx;
> +	unsigned long memslot_align;
> +
> +	memslot_align = HPAGE_ALIGN_OFFSET(slot->base_gfn << PAGE_SHIFT);
> +	idx = ((gfn - slot->base_gfn) << PAGE_SHIFT) + memslot_align;
> +	idx /= HPAGE_SIZE;
>   

This can probably be reshuffled to avoid the long long; the compiler may
want to call helper routines for the 64-bit arithmetic on i386...

We also need our own HPAGE_SIZE since kvm uses pae on non-pae hosts.
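
Sketch of both points, staying in page frame units so everything fits in
an unsigned long (KVM_PAGES_PER_HPAGE is a made-up kvm-private constant
for the page ratio, since kvm uses pae paging even on non-pae hosts):

static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
{
        unsigned long idx;

        /* power-of-two divisor, so this is a shift, no libgcc helpers */
        idx = (gfn / KVM_PAGES_PER_HPAGE) -
              (slot->base_gfn / KVM_PAGES_PER_HPAGE);
        return &slot->largepage[idx];
}
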
> +
> +static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
> +{
> +	struct kvm_memory_slot *slot;
> +
> +	slot = gfn_to_memslot(kvm, gfn);
> +	if (slot) {
> +		int *largepage_idx;
> +		int end_gfn;
> +
> +		largepage_idx = slot_largepage_idx(gfn, slot);
> +		/* check if the largepage crosses a memslot */
> +		end_gfn = slot->base_gfn + slot->npages;
> +		if (gfn + (HPAGE_SIZE/PAGE_SIZE) >= end_gfn)
> +			return 1;
> +		else
> +			return *largepage_idx;
>   

We might initialize the boundary slots to 1 instead of zero and so avoid 
this check.
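
I.e., when the map is allocated (sketch, reusing the hypothetical
KVM_PAGES_PER_HPAGE, with largepages as in the allocation hunk further
down):

/* pre-mark unaligned slot boundaries, so the fault path never has to
 * test for a large page crossing the memslot */
if (new.base_gfn % KVM_PAGES_PER_HPAGE)
        new.largepage[0] = 1;
if ((new.base_gfn + npages) % KVM_PAGES_PER_HPAGE)
        new.largepage[largepages - 1] = 1;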

> +static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
> +{
> +	if (has_wrprotected_page(vcpu->kvm, large_gfn))
> +		return 0;
> +
> +	if (!host_largepage_backed(vcpu->kvm, large_gfn))
> +		return 0;
> +
> +	if ((large_gfn << PAGE_SHIFT) & (HPAGE_SIZE-1))
> +		return 0;
>   

I'd drop this check and simply ignore the least significant 9 bits of 
large_gfn.
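
i.e., something like:

large_gfn &= ~((gfn_t)KVM_PAGES_PER_HPAGE - 1);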

> +
> +	/* guest has 4M pages */
> +	if (!is_pae(vcpu))
> +		return 0;
>   

We can drop this check too.  If we treat a 4MB page as two contiguous 
2MB pages, and don't zero out the least significant bits of large_gfn, 
most of the code needn't know about it at all.

> @@ -362,6 +469,20 @@ static unsigned long *gfn_to_rmap(struct
>  	return &slot->rmap[gfn - slot->base_gfn];
>  }
>  /*
>   * Reverse mapping data structures:
>   *
> @@ -371,7 +492,7 @@ static unsigned long *gfn_to_rmap(struct
>   * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
>   * containing more mappings.
>   */
> -static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
> +static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int hpage)
>  {
>  	struct kvm_mmu_page *sp;
>  	struct kvm_rmap_desc *desc;
> @@ -383,7 +504,10 @@ static void rmap_add(struct kvm_vcpu *vc
>  	gfn = unalias_gfn(vcpu->kvm, gfn);
>  	sp = page_header(__pa(spte));
>  	sp->gfns[spte - sp->spt] = gfn;
> -	rmapp = gfn_to_rmap(vcpu->kvm, gfn);
> +	if (!hpage)
> +		rmapp = gfn_to_rmap(vcpu->kvm, gfn);
> +	else
> +		rmapp = gfn_to_rmap_pde(vcpu->kvm, gfn);
>   

This bit can go into a function ...

>  	if (!*rmapp) {
>  		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
>  		*rmapp = (unsigned long)spte;
> @@ -449,7 +573,10 @@ static void rmap_remove(struct kvm *kvm,
>  		kvm_release_page_dirty(page);
>  	else
>  		kvm_release_page_clean(page);
> -	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
> +	if (is_large_pte(*spte))
> +		rmapp = gfn_to_rmap_pde(kvm, sp->gfns[spte - sp->spt]);
> +	else
> +		rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
>   

... as it is reused here.
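
E.g. (sketch; pick a better name):

static unsigned long *gfn_to_rmap_any(struct kvm *kvm, gfn_t gfn, int hpage)
{
        if (hpage)
                return gfn_to_rmap_pde(kvm, gfn);
        return gfn_to_rmap(kvm, gfn);
}

rmap_add() would pass its hpage parameter, rmap_remove() would pass
is_large_pte(*spte).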

>  
>  #ifdef MMU_DEBUG
> @@ -749,11 +895,19 @@ static void kvm_mmu_page_unlink_children
>  	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
>  		ent = pt[i];
>  
> +		if (is_shadow_present_pte(pt[i]) && is_large_pte(pt[i])) {
> +			if (is_writeble_pte(pt[i]))
> +				--kvm->stat.lpages;
> +			rmap_remove(kvm, &pt[i]);
> +		}
> +
>   

pt[i] == ent here.  You can also move this code to be the 'else' part of 
the if () you introduce below.
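
i.e., roughly (untested sketch against the hunk quoted below; note
rmap_remove() looks at *spte, so it has to run before the clear):

ent = pt[i];
if (is_shadow_present_pte(ent)) {
        if (is_large_pte(ent)) {
                if (is_writeble_pte(ent))
                        --kvm->stat.lpages;
                rmap_remove(kvm, &pt[i]);
        } else {
                mmu_page_remove_parent_pte(
                        page_header(ent & PT64_BASE_ADDR_MASK), &pt[i]);
        }
}
pt[i] = shadow_trap_nonpresent_pte;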

>  		pt[i] = shadow_trap_nonpresent_pte;
>  		if (!is_shadow_present_pte(ent))
>  			continue;
> -		ent &= PT64_BASE_ADDR_MASK;
> -		mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
> +		if (!is_large_pte(ent)) {
> +			ent &= PT64_BASE_ADDR_MASK;
> +			mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
> +		}
>  	}
>  	kvm_flush_remote_tlbs(kvm);
>  }
>   

> @@ -886,12 +1042,21 @@ struct page *gva_to_page(struct kvm_vcpu
>  static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
>  			 unsigned pt_access, unsigned pte_access,
>  			 int user_fault, int write_fault, int dirty,
> -			 int *ptwrite, gfn_t gfn, struct page *page)
> +			 int *ptwrite, int largepage, gfn_t gfn,
> +			 struct page *page)
>  {
>  	u64 spte;
>  	int was_rmapped = is_rmap_pte(*shadow_pte);
>  	int was_writeble = is_writeble_pte(*shadow_pte);
>  
> +	/*
> + 	 * If its a largepage mapping, there could be a previous
> + 	 * pointer to a PTE page hanging there, which will falsely
> + 	 * set was_rmapped.
> + 	 */
> +	if (largepage)
> +		was_rmapped = is_large_pte(*shadow_pte);
> +
>   

But that pte page will have its parent_pte chain pointing to shadow_pte, 
no?  Either this can't happen, or we need to unlink that pte page first.

> @@ -951,10 +1119,17 @@ unshadowed:
>  		mark_page_dirty(vcpu->kvm, gfn);
>  
>  	pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
> +	pgprintk("instantiating %s PTE (%s) at %d (%llx)\n",
> +		 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
> +		 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte);
>  	set_shadow_pte(shadow_pte, spte);
> +	if (!was_rmapped && (spte & (PT_PAGE_SIZE_MASK|PT_WRITABLE_MASK))
> +		== (PT_PAGE_SIZE_MASK|PT_WRITABLE_MASK))
> +		++vcpu->kvm->stat.lpages;
> +
>   

Why do you only account for writable large pages?

> Index: linux-2.6-x86-kvm/arch/x86/kvm/paging_tmpl.h
> ===================================================================
> --- linux-2.6-x86-kvm.orig/arch/x86/kvm/paging_tmpl.h
> +++ linux-2.6-x86-kvm/arch/x86/kvm/paging_tmpl.h
> @@ -71,6 +71,7 @@ struct guest_walker {
>  	unsigned pte_access;
>  	gfn_t gfn;
>  	u32 error_code;
> +	int largepage_backed;
>  };
>  
>  static gfn_t gpte_to_gfn(pt_element_t gpte)
> @@ -120,7 +121,8 @@ static unsigned FNAME(gpte_access)(struc
>   */
>  static int FNAME(walk_addr)(struct guest_walker *walker,
>  			    struct kvm_vcpu *vcpu, gva_t addr,
> -			    int write_fault, int user_fault, int fetch_fault)
> +			    int write_fault, int user_fault, int fetch_fault,
> +			    int faulting)
>  {
>  	pt_element_t pte;
>  	gfn_t table_gfn;
> @@ -130,6 +132,7 @@ static int FNAME(walk_addr)(struct guest
>  	pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
>  walk:
>  	walker->level = vcpu->arch.mmu.root_level;
> +	walker->largepage_backed = 0;
>  	pte = vcpu->arch.cr3;
>  #if PTTYPE == 64
>  	if (!is_long_mode(vcpu)) {
> @@ -192,10 +195,19 @@ walk:
>  		if (walker->level == PT_DIRECTORY_LEVEL
>  		    && (pte & PT_PAGE_SIZE_MASK)
>  		    && (PTTYPE == 64 || is_pse(vcpu))) {
> -			walker->gfn = gpte_to_gfn_pde(pte);
> +			gfn_t gfn = gpte_to_gfn_pde(pte);
> +			walker->gfn = gfn;
> +
>  			walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
>  			if (PTTYPE == 32 && is_cpuid_PSE36())
>  				walker->gfn += pse36_gfn_delta(pte);
> +
> +			if (faulting
> +			    && is_largepage_backed(vcpu, gfn)
> +			    && is_physical_memory(vcpu->kvm, walker->gfn)) {
> +				walker->largepage_backed = 1;
> +				walker->gfn = gfn;
> +			}
>   

I don't like this bit.  So far the walker has been independent of host 
state and has depended only on guest data.  We can set largepage_backed 
unconditionally on a large guest pte, leave gfn containing the low-order 
bits, and leave the host checks to the actual mapping logic.
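
Roughly (a sketch; the field might want renaming, since it no longer
means host-backed):

if (walker->level == PT_DIRECTORY_LEVEL
    && (pte & PT_PAGE_SIZE_MASK)
    && (PTTYPE == 64 || is_pse(vcpu))) {
        walker->largepage_backed = 1;   /* guest pte is large; the
                                           is_largepage_backed() checks
                                           move to the mapping code */
        walker->gfn = gpte_to_gfn_pde(pte);
        walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
        if (PTTYPE == 32 && is_cpuid_PSE36())
                walker->gfn += pse36_gfn_delta(pte);
}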

>  
>  /*
> @@ -299,6 +313,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
>  		shadow_ent = ((u64 *)__va(shadow_addr)) + index;
>  		if (level == PT_PAGE_TABLE_LEVEL)
>  			break;
> +		if (level == PT_DIRECTORY_LEVEL && walker->largepage_backed)
> +			break;
> +
>   

(here)

> @@ -395,6 +412,13 @@ static int FNAME(page_fault)(struct kvm_
>  
>  	page = gfn_to_page(vcpu->kvm, walker.gfn);
>  
> +	/* shortcut non-RAM accesses to avoid walking over a 2MB PMD entry */
> +	if (is_error_page(page)) {
> +		kvm_release_page_clean(page);
> +		up_read(&current->mm->mmap_sem);
> +		return 1;
> +	}
> +
>   

This bit is already in kvm.git?

>  
>  struct kvm_vcpu_stat {
> Index: linux-2.6-x86-kvm/include/linux/kvm_host.h
> ===================================================================
> --- linux-2.6-x86-kvm.orig/include/linux/kvm_host.h
> +++ linux-2.6-x86-kvm/include/linux/kvm_host.h
> @@ -99,7 +99,9 @@ struct kvm_memory_slot {
>  	unsigned long npages;
>  	unsigned long flags;
>  	unsigned long *rmap;
> +	unsigned long *rmap_pde;
>  	unsigned long *dirty_bitmap;
> +	int *largepage;
>   

We can combine rmap_pde and largepage into an array of struct { int 
write_count; unsigned long rmap_pde; } and save some cacheline accesses.
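
Sketch (the struct name is made up):

struct kvm_lpage_info {
        unsigned long rmap_pde;
        int write_count;
};

/* in struct kvm_memory_slot, replacing rmap_pde and largepage: */
struct kvm_lpage_info *lpage_info;
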

>  	
>  
>  void kvm_free_physmem(struct kvm *kvm)
> @@ -300,6 +308,28 @@ int __kvm_set_memory_region(struct kvm *
>  		new.user_alloc = user_alloc;
>  		new.userspace_addr = mem->userspace_addr;
>  	}
> +	if (npages && !new.rmap_pde) {
> +		int largepages = npages / (HPAGE_SIZE/PAGE_SIZE);
> +		if (npages % (HPAGE_SIZE/PAGE_SIZE))
> +			largepages++;
> +		new.rmap_pde = vmalloc(largepages * sizeof(struct page *));
> +
> +		if (!new.rmap_pde)
> +			goto out_free;
> +
> +		memset(new.rmap_pde, 0, largepages * sizeof(struct page *));
> +	}
> +	if (npages && !new.largepage) {
> +		int largepages = npages / (HPAGE_SIZE/PAGE_SIZE);
> +		if (npages % (HPAGE_SIZE/PAGE_SIZE))
> +			largepages++;
> +		new.largepage = kmalloc(largepages * sizeof(int), GFP_KERNEL);
> +
> +	

vmalloc() here too, for very large guests.
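
With the combined struct suggested above, the two allocations collapse
into one (sketch, again assuming KVM_PAGES_PER_HPAGE):

if (npages && !new.lpage_info) {
        unsigned long largepages = DIV_ROUND_UP(npages, KVM_PAGES_PER_HPAGE);

        new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info));
        if (!new.lpage_info)
                goto out_free;
        memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info));
        /* the boundary marking from above would go here as well */
}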

Please test the changes with user/test/x86/access.flat, with both normal 
and large page backing.

-- 
error compiling committee.c: too many arguments to function


