linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Gleb Natapov <gleb@redhat.com>
To: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: avi.kivity@gmail.com, mtosatti@redhat.com, pbonzini@redhat.com,
	linux-kernel@vger.kernel.org, kvm@vger.kernel.org
Subject: Re: [PATCH 07/12] KVM: MMU: redesign the algorithm of pte_list
Date: Wed, 28 Aug 2013 11:12:05 +0300	[thread overview]
Message-ID: <20130828081205.GN22899@redhat.com> (raw)
In-Reply-To: <1375189330-24066-8-git-send-email-xiaoguangrong@linux.vnet.ibm.com>

On Tue, Jul 30, 2013 at 09:02:05PM +0800, Xiao Guangrong wrote:
> Change the algorithm to:
> 1) always add new desc to the first desc (pointed by parent_ptes/rmap)
>    that is good to implement rcu-nulls-list-like lockless rmap
>    walking
> 
> 2) always move the entry in first desc to the the position we want
>    to remove when remove a spte in the parent_ptes/rmap (backward-move).
>    It is good for us to implement lockless rmap walk since in the current
>    code, when a spte is deleted from the "desc", another spte in the last
>    "desc" will be moved to this position to replace the deleted one. If the
>    deleted one has been accessed and we do not access the replaced one, the
>    replaced one is missed when we do lockless walk.
>    To fix this case, we do not backward move the spte, instead, we forward
>    move the entry: when a spte is deleted, we move the entry in the first
>    desc to that position
> 
> Both of these also can reduce cache miss
> 
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
>  arch/x86/kvm/mmu.c | 182 ++++++++++++++++++++++++++++++++++++-----------------
>  1 file changed, 125 insertions(+), 57 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 5a40564..3013bb1 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -918,6 +918,50 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
>  	return level - 1;
>  }
>  
> +static int __find_first_free(struct pte_list_desc *desc)
> +{
> +	int i;
> +
> +	for (i = 0; i < PTE_LIST_EXT; i++)
> +		if (!desc->sptes[i])
> +			break;
> +	return i;
> +}
> +
> +static int find_first_free(struct pte_list_desc *desc)
> +{
> +	int free = __find_first_free(desc);
> +
> +	WARN_ON(free >= PTE_LIST_EXT);
> +	return free;
> +}
> +
> +static int find_last_used(struct pte_list_desc *desc)
> +{
> +	int used = __find_first_free(desc) - 1;
> +
> +	WARN_ON(used < 0 || used >= PTE_LIST_EXT);
> +	return used;
> +}
> +
> +/*
> + * TODO: we can encode the desc number into the rmap/parent_ptes
> + * since at least 10 physical/virtual address bits are reserved
> + * on x86. It is worthwhile if it shows that the desc walking is
> + * a performance issue.
> + */
> +static int count_spte_number(struct pte_list_desc *desc)
> +{
> +	int first_free, desc_num;
> +
> +	first_free = __find_first_free(desc);
> +
> +	for (desc_num = 0; desc->more; desc = desc->more)
> +		desc_num++;
> +
> +	return first_free + desc_num * PTE_LIST_EXT;
> +}
> +
>  /*
>   * Pte mapping structures:
>   *
> @@ -934,92 +978,116 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
>  			unsigned long *pte_list)
>  {
>  	struct pte_list_desc *desc;
> -	int i, count = 0;
> +	int free_pos;
>  
>  	if (!*pte_list) {
>  		rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
>  		*pte_list = (unsigned long)spte;
> -	} else if (!(*pte_list & 1)) {
> +		return 0;
> +	}
> +
> +	if (!(*pte_list & 1)) {
>  		rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
>  		desc = mmu_alloc_pte_list_desc(vcpu);
>  		desc->sptes[0] = (u64 *)*pte_list;
>  		desc->sptes[1] = spte;
>  		*pte_list = (unsigned long)desc | 1;
> -		++count;
> -	} else {
> -		rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
> -		desc = (struct pte_list_desc *)(*pte_list & ~1ul);
> -		while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
> -			desc = desc->more;
> -			count += PTE_LIST_EXT;
> -		}
> -		if (desc->sptes[PTE_LIST_EXT-1]) {
> -			desc->more = mmu_alloc_pte_list_desc(vcpu);
> -			desc = desc->more;
> -		}
> -		for (i = 0; desc->sptes[i]; ++i)
> -			++count;
> -		desc->sptes[i] = spte;
> +		return 1;
> +	}
> +
> +	rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
> +	desc = (struct pte_list_desc *)(*pte_list & ~1ul);
> +
> +	/* No empty position in the desc. */
> +	if (desc->sptes[PTE_LIST_EXT - 1]) {
> +		struct pte_list_desc *new_desc;
> +		new_desc = mmu_alloc_pte_list_desc(vcpu);
> +		new_desc->more = desc;
> +		desc = new_desc;
> +		*pte_list = (unsigned long)desc | 1;
>  	}
> -	return count;
> +
> +	free_pos = find_first_free(desc);
> +	desc->sptes[free_pos] = spte;
> +	return count_spte_number(desc);
Should it be count_spte_number(desc) - 1? The function should returns
the number of pte entries before the spte was added.

>  }
>  
>  static void
> -pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
> -			   int i, struct pte_list_desc *prev_desc)
> +pte_list_desc_remove_entry(unsigned long *pte_list,
> +			   struct pte_list_desc *desc, int i)
>  {
> -	int j;
> +	struct pte_list_desc *first_desc;
> +	int last_used;
> +
> +	first_desc = (struct pte_list_desc *)(*pte_list & ~1ul);
> +	last_used = find_last_used(first_desc);
>  
> -	for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
> -		;
> -	desc->sptes[i] = desc->sptes[j];
> -	desc->sptes[j] = NULL;
> -	if (j != 0)
> +	/*
> +	 * Move the entry from the first desc to this position we want
> +	 * to remove.
> +	 */
> +	desc->sptes[i] = first_desc->sptes[last_used];
> +	first_desc->sptes[last_used] = NULL;
> +
What if desc == first_desc and i < last_used. You still move spte
backwards so lockless walk may have already examined entry at i and
will miss spte that was moved there from last_used position, no?

> +	/* No valid entry in this desc, we can free this desc now. */
> +	if (!first_desc->sptes[0]) {
> +		struct pte_list_desc *next_desc = first_desc->more;
> +
> +		/*
> +		 * Only one entry existing but still use a desc to store it?
> +		 */
> +		WARN_ON(!next_desc);
> +
> +		mmu_free_pte_list_desc(first_desc);
> +		first_desc = next_desc;
> +		*pte_list = (unsigned long)first_desc | 1ul;
>  		return;
> -	if (!prev_desc && !desc->more)
> -		*pte_list = (unsigned long)desc->sptes[0];
> -	else
> -		if (prev_desc)
> -			prev_desc->more = desc->more;
> -		else
> -			*pte_list = (unsigned long)desc->more | 1;
> -	mmu_free_pte_list_desc(desc);
> +	}
> +
> +	WARN_ON(!first_desc->sptes[0]);
> +
> +	/*
> +	 * Only one entry in this desc, move the entry to the head
> +	 * then the desc can be freed.
> +	 */
> +	if (!first_desc->sptes[1] && !first_desc->more) {
> +		*pte_list = (unsigned long)first_desc->sptes[0];
> +		mmu_free_pte_list_desc(first_desc);
> +	}
>  }
>  
>  static void pte_list_remove(u64 *spte, unsigned long *pte_list)
>  {
>  	struct pte_list_desc *desc;
> -	struct pte_list_desc *prev_desc;
>  	int i;
>  
>  	if (!*pte_list) {
> -		printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
> -		BUG();
> -	} else if (!(*pte_list & 1)) {
> +		WARN(1, KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
Why change BUG() to WARN() here and below?

> +		return;
> +	}
> +
> +	if (!(*pte_list & 1)) {
>  		rmap_printk("pte_list_remove:  %p 1->0\n", spte);
>  		if ((u64 *)*pte_list != spte) {
> -			printk(KERN_ERR "pte_list_remove:  %p 1->BUG\n", spte);
> -			BUG();
> +			WARN(1, KERN_ERR "pte_list_remove:  %p 1->BUG\n", spte);
>  		}
Remove {} since only one statement left in the if(). Or better yet why
not:
  WARN ((u64 *)*pte_list != spte, ....)?
But again why not BUG()?

>  		*pte_list = 0;
> -	} else {
> -		rmap_printk("pte_list_remove:  %p many->many\n", spte);
> -		desc = (struct pte_list_desc *)(*pte_list & ~1ul);
> -		prev_desc = NULL;
> -		while (desc) {
> -			for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
> -				if (desc->sptes[i] == spte) {
> -					pte_list_desc_remove_entry(pte_list,
> -							       desc, i,
> -							       prev_desc);
> -					return;
> -				}
> -			prev_desc = desc;
> -			desc = desc->more;
> -		}
> -		pr_err("pte_list_remove: %p many->many\n", spte);
> -		BUG();
> +		return;
>  	}
> +
> +	rmap_printk("pte_list_remove:  %p many->many\n", spte);
> +	desc = (struct pte_list_desc *)(*pte_list & ~1ul);
> +	while (desc) {
> +		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
> +			if (desc->sptes[i] == spte) {
> +				pte_list_desc_remove_entry(pte_list,
> +							       desc, i);
> +				return;
> +			}
> +		desc = desc->more;
> +	}
> +
> +	WARN(1, "pte_list_remove: %p many->many\n", spte);
>  }
>  
>  typedef void (*pte_list_walk_fn) (u64 *spte);
> -- 
> 1.8.1.4

--
			Gleb.

  reply	other threads:[~2013-08-28  8:13 UTC|newest]

Thread overview: 69+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-07-30 13:01 [RFC PATCH 00/12] KVM: MMU: locklessly wirte-protect Xiao Guangrong
2013-07-30 13:01 ` [PATCH 01/12] KVM: MMU: remove unused parameter Xiao Guangrong
2013-08-29  7:22   ` Gleb Natapov
2013-07-30 13:02 ` [PATCH 02/12] KVM: MMU: properly check last spte in fast_page_fault() Xiao Guangrong
2013-07-30 13:02 ` [PATCH 03/12] KVM: MMU: lazily drop large spte Xiao Guangrong
2013-08-02 14:55   ` Marcelo Tosatti
2013-08-02 15:42     ` Xiao Guangrong
2013-08-02 20:27       ` Marcelo Tosatti
2013-08-02 22:56         ` Xiao Guangrong
2013-07-30 13:02 ` [PATCH 04/12] KVM: MMU: log dirty page after marking spte writable Xiao Guangrong
2013-07-30 13:26   ` Paolo Bonzini
2013-07-31  7:25     ` Xiao Guangrong
2013-08-07  1:48   ` Marcelo Tosatti
2013-08-07  4:06     ` Xiao Guangrong
2013-08-08 15:06       ` Marcelo Tosatti
2013-08-08 16:26         ` Xiao Guangrong
2013-11-20  0:29       ` Marcelo Tosatti
2013-11-20  0:35         ` Marcelo Tosatti
2013-11-20 14:20         ` Xiao Guangrong
2013-11-20 19:47           ` Marcelo Tosatti
2013-11-21  4:26             ` Xiao Guangrong
2013-07-30 13:02 ` [PATCH 05/12] KVM: MMU: add spte into rmap before logging dirty page Xiao Guangrong
2013-07-30 13:27   ` Paolo Bonzini
2013-07-31  7:33     ` Xiao Guangrong
2013-07-30 13:02 ` [PATCH 06/12] KVM: MMU: flush tlb if the spte can be locklessly modified Xiao Guangrong
2013-08-28  7:23   ` Gleb Natapov
2013-08-28  7:50     ` Xiao Guangrong
2013-07-30 13:02 ` [PATCH 07/12] KVM: MMU: redesign the algorithm of pte_list Xiao Guangrong
2013-08-28  8:12   ` Gleb Natapov [this message]
2013-08-28  8:37     ` Xiao Guangrong
2013-08-28  8:58       ` Gleb Natapov
2013-08-28  9:19         ` Xiao Guangrong
2013-07-30 13:02 ` [PATCH 08/12] KVM: MMU: introduce nulls desc Xiao Guangrong
2013-08-28  8:40   ` Gleb Natapov
2013-08-28  8:54     ` Xiao Guangrong
2013-07-30 13:02 ` [PATCH 09/12] KVM: MMU: introduce pte-list lockless walker Xiao Guangrong
2013-08-28  9:20   ` Gleb Natapov
2013-08-28  9:33     ` Xiao Guangrong
2013-08-28  9:46       ` Gleb Natapov
2013-08-28 10:13         ` Xiao Guangrong
2013-08-28 10:49           ` Gleb Natapov
2013-08-28 12:15             ` Xiao Guangrong
2013-08-28 13:36               ` Gleb Natapov
2013-08-29  6:50                 ` Xiao Guangrong
2013-08-29  9:08                   ` Gleb Natapov
2013-08-29  9:31                     ` Xiao Guangrong
2013-08-29  9:51                       ` Gleb Natapov
2013-08-29 11:26                         ` Xiao Guangrong
2013-08-30 11:38                           ` Gleb Natapov
2013-09-02  7:02                             ` Xiao Guangrong
2013-08-29  9:31                   ` Gleb Natapov
2013-08-29 11:33                     ` Xiao Guangrong
2013-08-29 12:02                       ` Xiao Guangrong
2013-08-30 11:44                         ` Gleb Natapov
2013-09-02  8:50                           ` Xiao Guangrong
2013-07-30 13:02 ` [PATCH 10/12] KVM: MMU: allow locklessly access shadow page table out of vcpu thread Xiao Guangrong
2013-08-07 13:09   ` Takuya Yoshikawa
2013-08-07 13:19     ` Xiao Guangrong
2013-08-29  9:10   ` Gleb Natapov
2013-08-29  9:25     ` Xiao Guangrong
2013-07-30 13:02 ` [PATCH 11/12] KVM: MMU: locklessly write-protect the page Xiao Guangrong
2013-07-30 13:02 ` [PATCH 12/12] KVM: MMU: clean up spte_write_protect Xiao Guangrong
2013-07-30 13:11 ` [RFC PATCH 00/12] KVM: MMU: locklessly wirte-protect Xiao Guangrong
2013-08-03  5:09 ` Takuya Yoshikawa
2013-08-04 14:15   ` Xiao Guangrong
2013-08-29  7:16   ` Gleb Natapov
2013-08-06 13:16 ` Xiao Guangrong
2013-08-08 17:38   ` Paolo Bonzini
2013-08-09  4:51     ` Xiao Guangrong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20130828081205.GN22899@redhat.com \
    --to=gleb@redhat.com \
    --cc=avi.kivity@gmail.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mtosatti@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=xiaoguangrong@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).