From: Gleb Natapov <gleb@redhat.com>
To: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: avi.kivity@gmail.com, mtosatti@redhat.com, pbonzini@redhat.com,
linux-kernel@vger.kernel.org, kvm@vger.kernel.org
Subject: Re: [PATCH 07/12] KVM: MMU: redesign the algorithm of pte_list
Date: Wed, 28 Aug 2013 11:12:05 +0300 [thread overview]
Message-ID: <20130828081205.GN22899@redhat.com> (raw)
In-Reply-To: <1375189330-24066-8-git-send-email-xiaoguangrong@linux.vnet.ibm.com>
On Tue, Jul 30, 2013 at 09:02:05PM +0800, Xiao Guangrong wrote:
> Change the algorithm to:
> 1) always add new desc to the first desc (pointed by parent_ptes/rmap)
> that is good to implement rcu-nulls-list-like lockless rmap
> walking
>
> 2) always move the entry in first desc to the the position we want
> to remove when remove a spte in the parent_ptes/rmap (backward-move).
> It is good for us to implement lockless rmap walk since in the current
> code, when a spte is deleted from the "desc", another spte in the last
> "desc" will be moved to this position to replace the deleted one. If the
> deleted one has been accessed and we do not access the replaced one, the
> replaced one is missed when we do lockless walk.
> To fix this case, we do not backward move the spte, instead, we forward
> move the entry: when a spte is deleted, we move the entry in the first
> desc to that position
>
> Both of these also can reduce cache miss
>
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
> arch/x86/kvm/mmu.c | 182 ++++++++++++++++++++++++++++++++++++-----------------
> 1 file changed, 125 insertions(+), 57 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 5a40564..3013bb1 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -918,6 +918,50 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
> return level - 1;
> }
>
> +static int __find_first_free(struct pte_list_desc *desc)
> +{
> + int i;
> +
> + for (i = 0; i < PTE_LIST_EXT; i++)
> + if (!desc->sptes[i])
> + break;
> + return i;
> +}
> +
> +static int find_first_free(struct pte_list_desc *desc)
> +{
> + int free = __find_first_free(desc);
> +
> + WARN_ON(free >= PTE_LIST_EXT);
> + return free;
> +}
> +
> +static int find_last_used(struct pte_list_desc *desc)
> +{
> + int used = __find_first_free(desc) - 1;
> +
> + WARN_ON(used < 0 || used >= PTE_LIST_EXT);
> + return used;
> +}
> +
> +/*
> + * TODO: we can encode the desc number into the rmap/parent_ptes
> + * since at least 10 physical/virtual address bits are reserved
> + * on x86. It is worthwhile if it shows that the desc walking is
> + * a performance issue.
> + */
> +static int count_spte_number(struct pte_list_desc *desc)
> +{
> + int first_free, desc_num;
> +
> + first_free = __find_first_free(desc);
> +
> + for (desc_num = 0; desc->more; desc = desc->more)
> + desc_num++;
> +
> + return first_free + desc_num * PTE_LIST_EXT;
> +}
> +
> /*
> * Pte mapping structures:
> *
> @@ -934,92 +978,116 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
> unsigned long *pte_list)
> {
> struct pte_list_desc *desc;
> - int i, count = 0;
> + int free_pos;
>
> if (!*pte_list) {
> rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
> *pte_list = (unsigned long)spte;
> - } else if (!(*pte_list & 1)) {
> + return 0;
> + }
> +
> + if (!(*pte_list & 1)) {
> rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
> desc = mmu_alloc_pte_list_desc(vcpu);
> desc->sptes[0] = (u64 *)*pte_list;
> desc->sptes[1] = spte;
> *pte_list = (unsigned long)desc | 1;
> - ++count;
> - } else {
> - rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
> - desc = (struct pte_list_desc *)(*pte_list & ~1ul);
> - while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
> - desc = desc->more;
> - count += PTE_LIST_EXT;
> - }
> - if (desc->sptes[PTE_LIST_EXT-1]) {
> - desc->more = mmu_alloc_pte_list_desc(vcpu);
> - desc = desc->more;
> - }
> - for (i = 0; desc->sptes[i]; ++i)
> - ++count;
> - desc->sptes[i] = spte;
> + return 1;
> + }
> +
> + rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
> + desc = (struct pte_list_desc *)(*pte_list & ~1ul);
> +
> + /* No empty position in the desc. */
> + if (desc->sptes[PTE_LIST_EXT - 1]) {
> + struct pte_list_desc *new_desc;
> + new_desc = mmu_alloc_pte_list_desc(vcpu);
> + new_desc->more = desc;
> + desc = new_desc;
> + *pte_list = (unsigned long)desc | 1;
> }
> - return count;
> +
> + free_pos = find_first_free(desc);
> + desc->sptes[free_pos] = spte;
> + return count_spte_number(desc);
Should it be count_spte_number(desc) - 1? The function should returns
the number of pte entries before the spte was added.
> }
>
> static void
> -pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
> - int i, struct pte_list_desc *prev_desc)
> +pte_list_desc_remove_entry(unsigned long *pte_list,
> + struct pte_list_desc *desc, int i)
> {
> - int j;
> + struct pte_list_desc *first_desc;
> + int last_used;
> +
> + first_desc = (struct pte_list_desc *)(*pte_list & ~1ul);
> + last_used = find_last_used(first_desc);
>
> - for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
> - ;
> - desc->sptes[i] = desc->sptes[j];
> - desc->sptes[j] = NULL;
> - if (j != 0)
> + /*
> + * Move the entry from the first desc to this position we want
> + * to remove.
> + */
> + desc->sptes[i] = first_desc->sptes[last_used];
> + first_desc->sptes[last_used] = NULL;
> +
What if desc == first_desc and i < last_used. You still move spte
backwards so lockless walk may have already examined entry at i and
will miss spte that was moved there from last_used position, no?
> + /* No valid entry in this desc, we can free this desc now. */
> + if (!first_desc->sptes[0]) {
> + struct pte_list_desc *next_desc = first_desc->more;
> +
> + /*
> + * Only one entry existing but still use a desc to store it?
> + */
> + WARN_ON(!next_desc);
> +
> + mmu_free_pte_list_desc(first_desc);
> + first_desc = next_desc;
> + *pte_list = (unsigned long)first_desc | 1ul;
> return;
> - if (!prev_desc && !desc->more)
> - *pte_list = (unsigned long)desc->sptes[0];
> - else
> - if (prev_desc)
> - prev_desc->more = desc->more;
> - else
> - *pte_list = (unsigned long)desc->more | 1;
> - mmu_free_pte_list_desc(desc);
> + }
> +
> + WARN_ON(!first_desc->sptes[0]);
> +
> + /*
> + * Only one entry in this desc, move the entry to the head
> + * then the desc can be freed.
> + */
> + if (!first_desc->sptes[1] && !first_desc->more) {
> + *pte_list = (unsigned long)first_desc->sptes[0];
> + mmu_free_pte_list_desc(first_desc);
> + }
> }
>
> static void pte_list_remove(u64 *spte, unsigned long *pte_list)
> {
> struct pte_list_desc *desc;
> - struct pte_list_desc *prev_desc;
> int i;
>
> if (!*pte_list) {
> - printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
> - BUG();
> - } else if (!(*pte_list & 1)) {
> + WARN(1, KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
Why change BUG() to WARN() here and below?
> + return;
> + }
> +
> + if (!(*pte_list & 1)) {
> rmap_printk("pte_list_remove: %p 1->0\n", spte);
> if ((u64 *)*pte_list != spte) {
> - printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
> - BUG();
> + WARN(1, KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
> }
Remove {} since only one statement left in the if(). Or better yet why
not:
WARN ((u64 *)*pte_list != spte, ....)?
But again why not BUG()?
> *pte_list = 0;
> - } else {
> - rmap_printk("pte_list_remove: %p many->many\n", spte);
> - desc = (struct pte_list_desc *)(*pte_list & ~1ul);
> - prev_desc = NULL;
> - while (desc) {
> - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
> - if (desc->sptes[i] == spte) {
> - pte_list_desc_remove_entry(pte_list,
> - desc, i,
> - prev_desc);
> - return;
> - }
> - prev_desc = desc;
> - desc = desc->more;
> - }
> - pr_err("pte_list_remove: %p many->many\n", spte);
> - BUG();
> + return;
> }
> +
> + rmap_printk("pte_list_remove: %p many->many\n", spte);
> + desc = (struct pte_list_desc *)(*pte_list & ~1ul);
> + while (desc) {
> + for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
> + if (desc->sptes[i] == spte) {
> + pte_list_desc_remove_entry(pte_list,
> + desc, i);
> + return;
> + }
> + desc = desc->more;
> + }
> +
> + WARN(1, "pte_list_remove: %p many->many\n", spte);
> }
>
> typedef void (*pte_list_walk_fn) (u64 *spte);
> --
> 1.8.1.4
--
Gleb.
next prev parent reply other threads:[~2013-08-28 8:12 UTC|newest]
Thread overview: 69+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-07-30 13:01 [RFC PATCH 00/12] KVM: MMU: locklessly wirte-protect Xiao Guangrong
2013-07-30 13:01 ` [PATCH 01/12] KVM: MMU: remove unused parameter Xiao Guangrong
2013-08-29 7:22 ` Gleb Natapov
2013-07-30 13:02 ` [PATCH 02/12] KVM: MMU: properly check last spte in fast_page_fault() Xiao Guangrong
2013-07-30 13:02 ` [PATCH 03/12] KVM: MMU: lazily drop large spte Xiao Guangrong
2013-08-02 14:55 ` Marcelo Tosatti
2013-08-02 15:42 ` Xiao Guangrong
2013-08-02 20:27 ` Marcelo Tosatti
2013-08-02 22:56 ` Xiao Guangrong
2013-07-30 13:02 ` [PATCH 04/12] KVM: MMU: log dirty page after marking spte writable Xiao Guangrong
2013-07-30 13:26 ` Paolo Bonzini
2013-07-31 7:25 ` Xiao Guangrong
2013-08-07 1:48 ` Marcelo Tosatti
2013-08-07 4:06 ` Xiao Guangrong
2013-08-08 15:06 ` Marcelo Tosatti
2013-08-08 16:26 ` Xiao Guangrong
2013-11-20 0:29 ` Marcelo Tosatti
2013-11-20 0:35 ` Marcelo Tosatti
2013-11-20 14:20 ` Xiao Guangrong
2013-11-20 19:47 ` Marcelo Tosatti
2013-11-21 4:26 ` Xiao Guangrong
2013-07-30 13:02 ` [PATCH 05/12] KVM: MMU: add spte into rmap before logging dirty page Xiao Guangrong
2013-07-30 13:27 ` Paolo Bonzini
2013-07-31 7:33 ` Xiao Guangrong
2013-07-30 13:02 ` [PATCH 06/12] KVM: MMU: flush tlb if the spte can be locklessly modified Xiao Guangrong
2013-08-28 7:23 ` Gleb Natapov
2013-08-28 7:50 ` Xiao Guangrong
2013-07-30 13:02 ` [PATCH 07/12] KVM: MMU: redesign the algorithm of pte_list Xiao Guangrong
2013-08-28 8:12 ` Gleb Natapov [this message]
2013-08-28 8:37 ` Xiao Guangrong
2013-08-28 8:58 ` Gleb Natapov
2013-08-28 9:19 ` Xiao Guangrong
2013-07-30 13:02 ` [PATCH 08/12] KVM: MMU: introduce nulls desc Xiao Guangrong
2013-08-28 8:40 ` Gleb Natapov
2013-08-28 8:54 ` Xiao Guangrong
2013-07-30 13:02 ` [PATCH 09/12] KVM: MMU: introduce pte-list lockless walker Xiao Guangrong
2013-08-28 9:20 ` Gleb Natapov
2013-08-28 9:33 ` Xiao Guangrong
2013-08-28 9:46 ` Gleb Natapov
2013-08-28 10:13 ` Xiao Guangrong
2013-08-28 10:49 ` Gleb Natapov
2013-08-28 12:15 ` Xiao Guangrong
2013-08-28 13:36 ` Gleb Natapov
2013-08-29 6:50 ` Xiao Guangrong
2013-08-29 9:08 ` Gleb Natapov
2013-08-29 9:31 ` Xiao Guangrong
2013-08-29 9:51 ` Gleb Natapov
2013-08-29 11:26 ` Xiao Guangrong
2013-08-30 11:38 ` Gleb Natapov
2013-09-02 7:02 ` Xiao Guangrong
2013-08-29 9:31 ` Gleb Natapov
2013-08-29 11:33 ` Xiao Guangrong
2013-08-29 12:02 ` Xiao Guangrong
2013-08-30 11:44 ` Gleb Natapov
2013-09-02 8:50 ` Xiao Guangrong
2013-07-30 13:02 ` [PATCH 10/12] KVM: MMU: allow locklessly access shadow page table out of vcpu thread Xiao Guangrong
2013-08-07 13:09 ` Takuya Yoshikawa
2013-08-07 13:19 ` Xiao Guangrong
2013-08-29 9:10 ` Gleb Natapov
2013-08-29 9:25 ` Xiao Guangrong
2013-07-30 13:02 ` [PATCH 11/12] KVM: MMU: locklessly write-protect the page Xiao Guangrong
2013-07-30 13:02 ` [PATCH 12/12] KVM: MMU: clean up spte_write_protect Xiao Guangrong
2013-07-30 13:11 ` [RFC PATCH 00/12] KVM: MMU: locklessly wirte-protect Xiao Guangrong
2013-08-03 5:09 ` Takuya Yoshikawa
2013-08-04 14:15 ` Xiao Guangrong
2013-08-29 7:16 ` Gleb Natapov
2013-08-06 13:16 ` Xiao Guangrong
2013-08-08 17:38 ` Paolo Bonzini
2013-08-09 4:51 ` Xiao Guangrong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20130828081205.GN22899@redhat.com \
--to=gleb@redhat.com \
--cc=avi.kivity@gmail.com \
--cc=kvm@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mtosatti@redhat.com \
--cc=pbonzini@redhat.com \
--cc=xiaoguangrong@linux.vnet.ibm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.