Re: large page support for kvm

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Avi Kivity <avi@qumranet.com>
To: Marcelo Tosatti <marcelo@kvack.org>
Cc: kvm-devel <kvm-devel@lists.sourceforge.net>
Subject: Re: large page support for kvm
Date: Tue, 12 Feb 2008 13:55:54 +0200	[thread overview]
Message-ID: <47B1894A.1030208@qumranet.com> (raw)
In-Reply-To: <20080211154901.GA11936@dmt>

Marcelo Tosatti wrote:
> Ok, how does the following look. Still need to plug in large page
> creation in the nonpaging case, but this should be enough for comments.
>   

Most of the comments are cosmetic, but a couple have some meat.

>  
> +#define HPAGE_ALIGN_OFFSET(x) ((((x)+HPAGE_SIZE-1)&HPAGE_MASK) - (x))
>   

A function please.

> +/*
> + * Return the offset inside the memslot largepage integer map for a given
> + * gfn, handling slots that are not large page aligned.
> + */
> +int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
> +{
> +	unsigned long long idx;
> +	unsigned long memslot_align;
> +
> +	memslot_align = HPAGE_ALIGN_OFFSET(slot->base_gfn << PAGE_SHIFT);
> +	idx = ((gfn - slot->base_gfn) << PAGE_SHIFT) + memslot_align;
> +	idx /= HPAGE_SIZE;
>   

Can probably be reshuffled to avoid the long long.  The compiler may 
want to call helpers on i386...

We also need our own HPAGE_SIZE since kvm uses pae on non-pae hosts.
> +
> +static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
> +{
> +	struct kvm_memory_slot *slot;
> +
> +	slot = gfn_to_memslot(kvm, gfn);
> +	if (slot) {
> +		int *largepage_idx;
> +		int end_gfn;
> +
> +		largepage_idx = slot_largepage_idx(gfn, slot);
> +		/* check if the largepage crosses a memslot */
> +		end_gfn = slot->base_gfn + slot->npages;
> +		if (gfn + (HPAGE_SIZE/PAGE_SIZE) >= end_gfn)
> +			return 1;
> +		else
> +			return *largepage_idx;
>   

We might initialize the boundary slots to 1 instead of zero and so avoid 
this check.

> +static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
> +{
> +	if (has_wrprotected_page(vcpu->kvm, large_gfn))
> +		return 0;
> +
> +	if (!host_largepage_backed(vcpu->kvm, large_gfn))
> +		return 0;
> +
> +	if ((large_gfn << PAGE_SHIFT) & (HPAGE_SIZE-1))
> +		return 0;
>   

I'd drop this check and simply ignore the least significant 9 bits of 
large_gfn.

> +
> +	/* guest has 4M pages */
> +	if (!is_pae(vcpu))
> +		return 0;
>   

We can drop this check too.  If we treat a 4MB page as two contiguous 
2MB pages, and don't zero out the large_gfn least significant bits, most 
of the code needn't know about it at all.

> @@ -362,6 +469,20 @@ static unsigned long *gfn_to_rmap(struct
>  	return &slot->rmap[gfn - slot->base_gfn];
>  }
>  /*
>   * Reverse mapping data structures:
>   *
> @@ -371,7 +492,7 @@ static unsigned long *gfn_to_rmap(struct
>   * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
>   * containing more mappings.
>   */
> -static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
> +static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int hpage)
>  {
>  	struct kvm_mmu_page *sp;
>  	struct kvm_rmap_desc *desc;
> @@ -383,7 +504,10 @@ static void rmap_add(struct kvm_vcpu *vc
>  	gfn = unalias_gfn(vcpu->kvm, gfn);
>  	sp = page_header(__pa(spte));
>  	sp->gfns[spte - sp->spt] = gfn;
> -	rmapp = gfn_to_rmap(vcpu->kvm, gfn);
> +	if (!hpage)
> +		rmapp = gfn_to_rmap(vcpu->kvm, gfn);
> +	else
> +		rmapp = gfn_to_rmap_pde(vcpu->kvm, gfn);
>   

This bit can go into a function

>  	if (!*rmapp) {
>  		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
>  		*rmapp = (unsigned long)spte;
> @@ -449,7 +573,10 @@ static void rmap_remove(struct kvm *kvm,
>  		kvm_release_page_dirty(page);
>  	else
>  		kvm_release_page_clean(page);
> -	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
> +	if (is_large_pte(*spte))
> +		rmapp = gfn_to_rmap_pde(kvm, sp->gfns[spte - sp->spt]);
> +	else
> +		rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
>   

As it is reused here.

>  
>  #ifdef MMU_DEBUG
> @@ -749,11 +895,19 @@ static void kvm_mmu_page_unlink_children
>  	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
>  		ent = pt[i];
>  
> +		if (is_shadow_present_pte(pt[i]) && is_large_pte(pt[i])) {
> +			if (is_writeble_pte(pt[i]))
> +				--kvm->stat.lpages;
> +			rmap_remove(kvm, &pt[i]);
> +		}
> +
>   

pt[i] == ent here.  you can also move this code to be the 'else' part of 
the if () you introduce below.

>  		pt[i] = shadow_trap_nonpresent_pte;
>  		if (!is_shadow_present_pte(ent))
>  			continue;
> -		ent &= PT64_BASE_ADDR_MASK;
> -		mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
> +		if (!is_large_pte(ent)) {
> +			ent &= PT64_BASE_ADDR_MASK;
> +			mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
> +		}
>  	}
>  	kvm_flush_remote_tlbs(kvm);
>  }
>   

> @@ -886,12 +1042,21 @@ struct page *gva_to_page(struct kvm_vcpu
>  static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
>  			 unsigned pt_access, unsigned pte_access,
>  			 int user_fault, int write_fault, int dirty,
> -			 int *ptwrite, gfn_t gfn, struct page *page)
> +			 int *ptwrite, int largepage, gfn_t gfn,
> +			 struct page *page)
>  {
>  	u64 spte;
>  	int was_rmapped = is_rmap_pte(*shadow_pte);
>  	int was_writeble = is_writeble_pte(*shadow_pte);
>  
> +	/*
> + 	 * If its a largepage mapping, there could be a previous
> + 	 * pointer to a PTE page hanging there, which will falsely
> + 	 * set was_rmapped.
> + 	 */
> +	if (largepage)
> +		was_rmapped = is_large_pte(*shadow_pte);
> +
>   

But that pte page will have its parent_pte chain pointing to shadow_pte, 
no?  Either this can't happen, or we need to unlink that pte page first.

> @@ -951,10 +1119,17 @@ unshadowed:
>  		mark_page_dirty(vcpu->kvm, gfn);
>  
>  	pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
> +	pgprintk("instantiating %s PTE (%s) at %d (%llx)\n",
> +		 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
> +		 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte);
>  	set_shadow_pte(shadow_pte, spte);
> +	if (!was_rmapped && (spte & (PT_PAGE_SIZE_MASK|PT_WRITABLE_MASK))
> +		== (PT_PAGE_SIZE_MASK|PT_WRITABLE_MASK))
> +		++vcpu->kvm->stat.lpages;
> +
>   

Why do you only account for writable large pages?

> Index: linux-2.6-x86-kvm/arch/x86/kvm/paging_tmpl.h
> ===================================================================
> --- linux-2.6-x86-kvm.orig/arch/x86/kvm/paging_tmpl.h
> +++ linux-2.6-x86-kvm/arch/x86/kvm/paging_tmpl.h
> @@ -71,6 +71,7 @@ struct guest_walker {
>  	unsigned pte_access;
>  	gfn_t gfn;
>  	u32 error_code;
> +	int largepage_backed;
>  };
>  
>  static gfn_t gpte_to_gfn(pt_element_t gpte)
> @@ -120,7 +121,8 @@ static unsigned FNAME(gpte_access)(struc
>   */
>  static int FNAME(walk_addr)(struct guest_walker *walker,
>  			    struct kvm_vcpu *vcpu, gva_t addr,
> -			    int write_fault, int user_fault, int fetch_fault)
> +			    int write_fault, int user_fault, int fetch_fault,
> +			    int faulting)
>  {
>  	pt_element_t pte;
>  	gfn_t table_gfn;
> @@ -130,6 +132,7 @@ static int FNAME(walk_addr)(struct guest
>  	pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
>  walk:
>  	walker->level = vcpu->arch.mmu.root_level;
> +	walker->largepage_backed = 0;
>  	pte = vcpu->arch.cr3;
>  #if PTTYPE == 64
>  	if (!is_long_mode(vcpu)) {
> @@ -192,10 +195,19 @@ walk:
>  		if (walker->level == PT_DIRECTORY_LEVEL
>  		    && (pte & PT_PAGE_SIZE_MASK)
>  		    && (PTTYPE == 64 || is_pse(vcpu))) {
> -			walker->gfn = gpte_to_gfn_pde(pte);
> +			gfn_t gfn = gpte_to_gfn_pde(pte);
> +			walker->gfn = gfn;
> +
>  			walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
>  			if (PTTYPE == 32 && is_cpuid_PSE36())
>  				walker->gfn += pse36_gfn_delta(pte);
> +
> +			if (faulting
> +			    && is_largepage_backed(vcpu, gfn)
> +			    && is_physical_memory(vcpu->kvm, walker->gfn)) {
> +				walker->largepage_backed = 1;
> +				walker->gfn = gfn;
> +			}
>   

I don't like this bit.  So far the walker has been independent of the 
host state, and only depended on guest data.  We can set 
largepage_backed unconditionally on a large guest pte, leave gfn 
containing the low-order bits, and leave the host checks for the actual 
mapping logic.

>  
>  /*
> @@ -299,6 +313,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
>  		shadow_ent = ((u64 *)__va(shadow_addr)) + index;
>  		if (level == PT_PAGE_TABLE_LEVEL)
>  			break;
> +		if (level == PT_DIRECTORY_LEVEL && walker->largepage_backed)
> +			break;
> +
>   

(here)

> @@ -395,6 +412,13 @@ static int FNAME(page_fault)(struct kvm_
>  
>  	page = gfn_to_page(vcpu->kvm, walker.gfn);
>  
> +	/* shortcut non-RAM accesses to avoid walking over a 2MB PMD entry */
> +	if (is_error_page(page)) {
> +		kvm_release_page_clean(page);
> +		up_read(&current->mm->mmap_sem);
> +		return 1;
> +	}
> +
>   

This bit is already in kvm.git?

>  
>  struct kvm_vcpu_stat {
> Index: linux-2.6-x86-kvm/include/linux/kvm_host.h
> ===================================================================
> --- linux-2.6-x86-kvm.orig/include/linux/kvm_host.h
> +++ linux-2.6-x86-kvm/include/linux/kvm_host.h
> @@ -99,7 +99,9 @@ struct kvm_memory_slot {
>  	unsigned long npages;
>  	unsigned long flags;
>  	unsigned long *rmap;
> +	unsigned long *rmap_pde;
>  	unsigned long *dirty_bitmap;
> +	int *largepage;
>   

We can combine rmap_pde and largepage into an array of struct { int 
write_count, unsigned long rmap_pde; } and save some cacheline accesses.

>  	
>  
>  void kvm_free_physmem(struct kvm *kvm)
> @@ -300,6 +308,28 @@ int __kvm_set_memory_region(struct kvm *
>  		new.user_alloc = user_alloc;
>  		new.userspace_addr = mem->userspace_addr;
>  	}
> +	if (npages && !new.rmap_pde) {
> +		int largepages = npages / (HPAGE_SIZE/PAGE_SIZE);
> +		if (npages % (HPAGE_SIZE/PAGE_SIZE))
> +			largepages++;
> +		new.rmap_pde = vmalloc(largepages * sizeof(struct page *));
> +
> +		if (!new.rmap_pde)
> +			goto out_free;
> +
> +		memset(new.rmap_pde, 0, largepages * sizeof(struct page *));
> +	}
> +	if (npages && !new.largepage) {
> +		int largepages = npages / (HPAGE_SIZE/PAGE_SIZE);
> +		if (npages % (HPAGE_SIZE/PAGE_SIZE))
> +			largepages++;
> +		new.largepage = kmalloc(largepages * sizeof(int), GFP_KERNEL);
> +
> +	

vmalloc() here too, for very large guests.

Please test the changes with user/test/x86/access.flat, with both normal 
and large pages backing.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

next prev parent reply	other threads:[~2008-02-12 11:55 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-01-29 17:20 large page support for kvm Avi Kivity
     [not found] ` <479F604C.20107-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2008-01-30 18:40   ` Joerg Roedel
     [not found]     ` <20080130184035.GS6960-5C7GfCeVMHo@public.gmane.org>
2008-01-31  5:44       ` Avi Kivity
2008-02-11 15:49         ` Marcelo Tosatti
2008-02-12 11:55           ` Avi Kivity [this message]
2008-02-13  0:15             ` Marcelo Tosatti
2008-02-13  6:45               ` Avi Kivity
2008-02-14 23:17                 ` Marcelo Tosatti
2008-02-15  7:40                   ` Roedel, Joerg
2008-02-17  9:38                   ` Avi Kivity
2008-02-19 20:37                     ` Marcelo Tosatti
2008-02-20 14:25                       ` Avi Kivity
2008-02-22  2:01                         ` Marcelo Tosatti
2008-02-22  7:16                           ` Avi Kivity

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=47B1894A.1030208@qumranet.com \
    --to=avi@qumranet.com \
    --cc=kvm-devel@lists.sourceforge.net \
    --cc=marcelo@kvack.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.