From: Andrew Cooper <andrew.cooper3@citrix.com>
To: Xen-devel <xen-devel@lists.xen.org>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: [PATCH RFC 26/44] x86/pt-shadow: Maintain a small cache of shadowed frames
Date: Thu, 4 Jan 2018 20:21:51 +0000
Message-ID: <1515097329-31902-27-git-send-email-andrew.cooper3@citrix.com>
In-Reply-To: <1515097329-31902-1-git-send-email-andrew.cooper3@citrix.com>

This improves shadowing performance substantially.  In particular, system
calls for 64bit PV guests (which switch between the user and kernel
pagetables) no longer suffer a 4K pagetable copy in each direction.

See the code comments for reasoning and the algorithm description.
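
As an illustration only (not code from this patch), the cache behaves like a
small array kept in most-recently-used order.  A minimal standalone sketch of
the promote-on-hit / recycle-LRU-on-miss flow, with made-up names and the
per-entry shadow frame index omitted:

  #include <stdbool.h>
  #include <stdint.h>
  #include <string.h>

  #define NR_ENTRIES 4

  struct mru_cache {
      uint64_t key[NR_ENTRIES]; /* 0 == free; key[0] is most recently used. */
  };

  /* Make @key the most recently used entry; returns true on a cache hit. */
  static bool mru_touch(struct mru_cache *c, uint64_t key)
  {
      unsigned int i;
      bool hit;

      for ( i = 0; i < NR_ENTRIES; ++i )
          if ( c->key[i] == key )
              break;

      hit = (i < NR_ENTRIES);
      if ( !hit )
          i = NR_ENTRIES - 1; /* Miss: recycle the least recently used slot. */

      /* Shuffle entries [0, i) down one place and install @key at the front. */
      memmove(&c->key[1], &c->key[0], i * sizeof(c->key[0]));
      c->key[0] = key;

      return hit;
  }

In the patch below, a hit leaves the existing shadow intact, while a miss is
followed by a full 4K copy of the guest L4 into the recycled shadow frame.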

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
 xen/arch/x86/mm.c                  |   2 +
 xen/arch/x86/mm/shadow/multi.c     |   2 +
 xen/arch/x86/pv/pt-shadow.c        | 196 ++++++++++++++++++++++++++++++++-----
 xen/include/asm-x86/pv/pt-shadow.h |   9 ++
 4 files changed, 186 insertions(+), 23 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index d5c69c0..f8f15e9 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2413,6 +2413,8 @@ int free_page_type(struct page_info *page, unsigned long type,
     case PGT_l4_page_table:
         ASSERT(preemptible);
         rc = free_l4_table(page);
+        if ( !rc )
+            pt_shadow_l4_invlpg(owner, page);
         break;
     default:
         gdprintk(XENLOG_WARNING, "type %" PRtype_info " mfn %" PRI_mfn "\n",
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 9c929ed..f9ec5aa 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -1895,6 +1895,8 @@ void sh_destroy_l4_shadow(struct domain *d, mfn_t smfn)
         }
     });
 
+    pt_shadow_l4_invlpg(d, sp);
+
     /* Put the memory back in the pool */
     shadow_free(d, smfn);
 }
diff --git a/xen/arch/x86/pv/pt-shadow.c b/xen/arch/x86/pv/pt-shadow.c
index 33cb303..b4f2b86 100644
--- a/xen/arch/x86/pv/pt-shadow.c
+++ b/xen/arch/x86/pv/pt-shadow.c
@@ -24,6 +24,10 @@
 
 #include <asm/pv/pt-shadow.h>
 
+/* Override macros from asm/mm.h to make them work with mfn_t */
+#undef page_to_mfn
+#define page_to_mfn(pg) _mfn(__page_to_mfn(pg))
+
 /*
  * To use percpu linear ranges, we require that no two pcpus have %cr3
  * pointing at the same L4 pagetable at the same time.
@@ -38,19 +42,44 @@
  *
  * The algorithm is fairly simple.
  *
+ *   - A small cache of shadowed L4s from the same guest is maintained.
  *   - When a pcpu is switching to a new vcpu cr3 and shadowing is necessary,
- *     perform a full 4K copy of the guests frame into a percpu frame, and run
- *     on that.
+ *     the cache is searched.
+ *     - If the new cr3 is already cached, use our existing shadow.
+ *     - If not, recycle the LRU entry and shadow the new frame (full 4K copy).
  *   - When a write to a guests L4 pagetable occurs, the update must be
  *     propagated to all existing shadows.  An IPI is sent to the domains
  *     dirty mask indicating which frame/slot was updated, and each pcpu
  *     checks to see whether it needs to sync the update into its shadow.
+ *   - When a guest L4 pagetable is freed, it must be dropped from any caches,
+ *     as Xen will allow it to become writeable to the guest again, and its
+ *     contents will go stale.  It uses the same IPI mechanism as for writes.
+ */
+
+#define L4_SHADOW_ORDER 2
+#define NR_L4_SHADOWS   (1ul << L4_SHADOW_ORDER)
+
+/*
+ * An individual cache entry.  Contains a %cr3 which has been cached, and the
+ * index of the shadow frame backing this entry.
+ *
+ * The layout relies on %cr3 being page aligned, with the index stored in the
+ * lower bits.  idx could be a smaller bitfield, but there is no other
+ * information to store, and having it as an 8bit field results in better
+ * compiled code.
  */
+typedef union pt_cache_entry {
+    unsigned long raw;
+    struct {
+        uint8_t idx;
+        unsigned long :4, cr3_mfn:52;
+    };
+} pt_cache_entry_t;
 
 struct pt_shadow {
     /*
-     * A frame used to shadow a vcpus intended pagetable.  When shadowing,
-     * this frame is the one actually referenced by %cr3.
+     * A cache of frames used to shadow a vcpu's intended pagetables.  When
+     * shadowing, one of these frames is the one actually referenced by %cr3.
      */
     paddr_t shadow_l4;
     l4_pgentry_t *shadow_l4_va;
@@ -63,29 +92,60 @@ struct pt_shadow {
      */
     const struct domain *domain;
 
-    /* If nonzero, a guests pagetable which we are shadowing. */
-    paddr_t shadowing;
+    /*
+     * A collection of %cr3's, belonging to @p domain, which are shadowed
+     * locally.
+     *
+     * A cache entry is in use if cr3_mfn != 0, and free otherwise.  The cache
+     * is maintained in most-recently-used order; as a result, cache[0].cr3_mfn
+     * should always correspond to v->arch.cr3.
+     *
+     * The cache[].idx fields will always be unique, each in the range
+     * [0, NR_L4_SHADOWS).  Their order, however, will vary as the
+     * most-recently-used ordering is maintained.
+     */
+    pt_cache_entry_t cache[NR_L4_SHADOWS];
 };
 
 static DEFINE_PER_CPU(struct pt_shadow, ptsh);
 
+static l4_pgentry_t *shadow_l4_va(struct pt_shadow *ptsh, unsigned int idx)
+{
+    return _p(ptsh->shadow_l4_va) + idx * PAGE_SIZE;
+}
+
+static paddr_t shadow_l4(struct pt_shadow *ptsh, unsigned int idx)
+{
+    return ptsh->shadow_l4 + idx * PAGE_SIZE;
+}
+
 int pt_shadow_alloc(unsigned int cpu)
 {
     struct pt_shadow *ptsh = &per_cpu(ptsh, cpu);
-    unsigned int memflags = 0;
+    unsigned int memflags = 0, i;
     nodeid_t node = cpu_to_node(cpu);
     struct page_info *pg;
+    mfn_t mfns[NR_L4_SHADOWS];
 
     if ( node != NUMA_NO_NODE )
         memflags = MEMF_node(node);
 
-    pg = alloc_domheap_page(NULL, memflags);
+    pg = alloc_domheap_pages(NULL, L4_SHADOW_ORDER, memflags);
     if ( !pg )
         return -ENOMEM;
 
     ptsh->shadow_l4 = page_to_maddr(pg);
 
-    ptsh->shadow_l4_va = __map_domain_page_global(pg);
+    for ( i = 0; i < ARRAY_SIZE(mfns); ++i )
+    {
+        /* Initialise the cache (ascending idx fields). */
+        ptsh->cache[i] = (pt_cache_entry_t){ i };
+
+        /* Collect MFNs to vmap(). */
+        mfns[i] = mfn_add(maddr_to_mfn(ptsh->shadow_l4), i);
+    }
+
+    ptsh->shadow_l4_va = vmap(mfns, ARRAY_SIZE(mfns));
     if ( !ptsh->shadow_l4_va )
         return -ENOMEM;
 
@@ -98,17 +158,35 @@ void pt_shadow_free(unsigned int cpu)
 
     if ( ptsh->shadow_l4_va )
     {
-        unmap_domain_page_global(ptsh->shadow_l4_va);
+        vunmap(ptsh->shadow_l4_va);
         ptsh->shadow_l4_va = NULL;
     }
 
     if ( ptsh->shadow_l4 )
     {
-        free_domheap_page(maddr_to_page(ptsh->shadow_l4));
+        free_domheap_pages(maddr_to_page(ptsh->shadow_l4), L4_SHADOW_ORDER);
         ptsh->shadow_l4 = 0;
     }
 }
 
+static pt_cache_entry_t *pt_cache_lookup(
+    struct pt_shadow *ptsh, unsigned long maddr)
+{
+    unsigned int i;
+
+    ASSERT(!local_irq_is_enabled());
+
+    for ( i = 0; i < ARRAY_SIZE(ptsh->cache); ++i )
+    {
+        pt_cache_entry_t *ent = &ptsh->cache[i];
+
+        if ( ent->cr3_mfn == (maddr >> PAGE_SHIFT) )
+            return ent;
+    }
+
+    return NULL;
+}
+
 /*
  * We only need to shadow 4-level PV guests.  All other guests have per-vcpu
  * monitor tables which are never scheduled on concurrent pcpus.  Care needs
@@ -126,6 +204,7 @@ unsigned long pt_maybe_shadow(struct vcpu *v)
     unsigned int cpu = smp_processor_id();
     struct pt_shadow *ptsh = &per_cpu(ptsh, cpu);
     unsigned long flags, new_cr3 = v->arch.cr3;
+    pt_cache_entry_t *ent;
 
     /*
      * IPIs for updates are based on the domain dirty mask.  If we ever switch
@@ -135,8 +214,12 @@ unsigned long pt_maybe_shadow(struct vcpu *v)
     if ( ptsh->domain &&
          ptsh->domain != v->domain )
     {
+        unsigned int i;
+
         ptsh->domain = NULL;
-        ptsh->shadowing = 0;
+
+        for ( i = 0; i < ARRAY_SIZE(ptsh->cache); ++i )
+            ptsh->cache[i].cr3_mfn = 0;
     }
 
     /* No shadowing necessary? Run on the intended pagetable. */
@@ -145,10 +228,6 @@ unsigned long pt_maybe_shadow(struct vcpu *v)
 
     ptsh->domain = v->domain;
 
-    /* Fastpath, if we are already shadowing the intended pagetable. */
-    if ( ptsh->shadowing == new_cr3 )
-        return ptsh->shadow_l4;
-
     /*
      * We may be called with interrupts disabled (e.g. context switch), or
      * interrupts enabled (e.g. new_guest_cr3()).
@@ -158,14 +237,46 @@ unsigned long pt_maybe_shadow(struct vcpu *v)
      */
     local_irq_save(flags);
 
+    ent = pt_cache_lookup(ptsh, new_cr3);
+    if ( ent )
+    {
+        /*
+         * Cache hit.  Promote this entry to being most recently used (if it
+         * isn't already).
+         */
+        unsigned int cache_idx = ent - ptsh->cache;
+
+        if ( cache_idx )
+        {
+            pt_cache_entry_t tmp = *ent;
+
+            switch ( cache_idx )
+            {
+            case 3: ptsh->cache[3] = ptsh->cache[2];
+            case 2: ptsh->cache[2] = ptsh->cache[1];
+            case 1: ptsh->cache[1] = ptsh->cache[0];
+                    ptsh->cache[0] = tmp;
+            }
+        }
+        local_irq_restore(flags);
+    }
+    else
     {
+        /*
+         * Cache miss.  Recycle whatever was in the last slot, promote it to
+         * being most recently used, and copy the entire pagetable.
+         */
         unsigned int slot = l4_table_offset(PERCPU_LINEAR_START);
+        unsigned int idx = ptsh->cache[3].idx;
         l4_pgentry_t *l4t, *vcpu_l4t;
 
-        ptsh->shadowing = new_cr3;
+        ptsh->cache[3] = ptsh->cache[2];
+        ptsh->cache[2] = ptsh->cache[1];
+        ptsh->cache[1] = ptsh->cache[0];
+        ptsh->cache[0] = (pt_cache_entry_t){ new_cr3 | idx };
         local_irq_restore(flags);
 
-        l4t = ptsh->shadow_l4_va;
+        l4t = shadow_l4_va(ptsh, idx);
         vcpu_l4t = map_domain_page(maddr_to_mfn(new_cr3));
 
         /*
@@ -184,7 +295,9 @@ unsigned long pt_maybe_shadow(struct vcpu *v)
         unmap_domain_page(vcpu_l4t);
     }
 
-    return ptsh->shadow_l4;
+    ASSERT(ptsh->cache[0].cr3_mfn == (new_cr3 >> PAGE_SHIFT));
+
+    return shadow_l4(ptsh, ptsh->cache[0].idx);
 }
 
 struct ptsh_ipi_info
@@ -193,6 +306,7 @@ struct ptsh_ipi_info
     const struct page_info *pg;
     enum {
         PTSH_IPI_WRITE,
+        PTSH_IPI_INVLPG,
     } op;
     unsigned int slot;
 };
@@ -202,29 +316,49 @@ static void _pt_shadow_ipi(void *arg)
     unsigned int cpu = smp_processor_id();
     struct pt_shadow *ptsh = &per_cpu(ptsh, cpu);
     const struct ptsh_ipi_info *info = arg;
-    unsigned long maddr = page_to_maddr(info->pg);
+    pt_cache_entry_t *ent;
 
     /* No longer shadowing state from this domain?  Nothing to do. */
     if ( info->d != ptsh->domain )
         return;
 
+    ent = pt_cache_lookup(ptsh, page_to_maddr(info->pg));
+
     /* Not shadowing this frame?  Nothing to do. */
-    if ( ptsh->shadowing != maddr )
+    if ( ent == NULL )
         return;
 
     switch ( info->op )
     {
         l4_pgentry_t *l4t, *vcpu_l4t;
+        unsigned int cache_idx, shadow_idx;
 
     case PTSH_IPI_WRITE:
-        l4t = ptsh->shadow_l4_va;
-        vcpu_l4t = map_domain_page(maddr_to_mfn(maddr));
+        l4t = shadow_l4_va(ptsh, ent->idx);
+        vcpu_l4t = map_domain_page(page_to_mfn(info->pg));
 
         l4t[info->slot] = vcpu_l4t[info->slot];
 
         unmap_domain_page(vcpu_l4t);
         break;
 
+    case PTSH_IPI_INVLPG:
+        cache_idx = ent - ptsh->cache;
+        shadow_idx = ent->idx;
+
+        /*
+         * Demote the dropped entry to least-recently-used, so it is the next
+         * entry to be reused.
+         */
+        switch ( cache_idx )
+        {
+        case 0: BUG(); /* ??? Freeing the L4 which current is running on! */
+        case 1: ptsh->cache[1] = ptsh->cache[2];
+        case 2: ptsh->cache[2] = ptsh->cache[3];
+        case 3: ptsh->cache[3] = (pt_cache_entry_t){ shadow_idx };
+        }
+        break;
+
     default:
         ASSERT_UNREACHABLE();
     }
@@ -248,6 +382,22 @@ void pt_shadow_l4_write(const struct domain *d, const struct page_info *pg,
     on_selected_cpus(d->domain_dirty_cpumask, _pt_shadow_ipi, &info, 1);
 }
 
+void pt_shadow_l4_invlpg(const struct domain *d, const struct page_info *pg)
+{
+    struct ptsh_ipi_info info;
+
+    if ( !pt_need_shadow(d) )
+        return;
+
+    info = (struct ptsh_ipi_info){
+        .d = d,
+        .pg = pg,
+        .op = PTSH_IPI_INVLPG,
+    };
+
+    on_selected_cpus(d->domain_dirty_cpumask, _pt_shadow_ipi, &info, 1);
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/include/asm-x86/pv/pt-shadow.h b/xen/include/asm-x86/pv/pt-shadow.h
index 6e71e99..d5576f4 100644
--- a/xen/include/asm-x86/pv/pt-shadow.h
+++ b/xen/include/asm-x86/pv/pt-shadow.h
@@ -47,6 +47,13 @@ unsigned long pt_maybe_shadow(struct vcpu *v);
 void pt_shadow_l4_write(
     const struct domain *d, const struct page_info *pg, unsigned int slot);
 
+/*
+ * Called when an L4 pagetable is freed.  The PT shadow logic ensures that it
+ * is purged from any caches.
+ */
+void pt_shadow_l4_invlpg(
+    const struct domain *d, const struct page_info *pg);
+
 #else /* !CONFIG_PV */
 
 static inline int pt_shadow_alloc(unsigned int cpu) { return 0; }
@@ -58,6 +65,8 @@ static inline unsigned long pt_maybe_shadow(struct vcpu *v)
 }
 static inline void pt_shadow_l4_write(
     const struct domain *d, const struct page_info *pg, unsigned int slot) { }
+static inline void pt_shadow_l4_invlpg(
+    const struct domain *d, const struct page_info *pg) { }
 
 #endif /* CONFIG_PV */
 
-- 
2.1.4


Thread overview: 61+ messages
2018-01-04 20:21 [PATCH FAIRLY-RFC 00/44] x86: Prerequisite work for a Xen KAISER solution Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 01/44] passthrough/vtd: Don't DMA to the stack in queue_invalidate_wait() Andrew Cooper
2018-01-05  9:21   ` Jan Beulich
2018-01-05  9:33     ` Andrew Cooper
2018-01-16  6:41   ` Tian, Kevin
2018-01-04 20:21 ` [PATCH RFC 02/44] x86/idt: Factor out enabling and disabling of ISTs Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 03/44] x86/pv: Rename invalidate_shadow_ldt() to pv_destroy_ldt() Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 04/44] x86/boot: Introduce cpu_smpboot_bsp() to dynamically allocate BSP state Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 05/44] x86/boot: Move arch_init_memory() earlier in the boot sequence Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 06/44] x86/boot: Allocate percpu pagetables for the idle vcpus Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 07/44] x86/boot: Use " Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 08/44] x86/pv: Avoid an opencoded mov to %cr3 in toggle_guest_mode() Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 09/44] x86/mm: Track the current %cr3 in a per_cpu variable Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 10/44] x86/pt-shadow: Initial infrastructure for L4 PV pagetable shadowing Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 11/44] x86/pt-shadow: Always set _PAGE_ACCESSED on L4e updates Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 12/44] x86/fixmap: Temporarily add a percpu fixmap range Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 13/44] x86/pt-shadow: Shadow L4 tables from 64bit PV guests Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 14/44] x86/mm: Added safety checks that pagetables aren't shared Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 15/44] x86: Rearrange the virtual layout to introduce a PERCPU linear slot Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 16/44] xen/ipi: Introduce arch_ipi_param_ok() to check IPI parameters Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 17/44] x86/smp: Infrastructure for allocating and freeing percpu pagetables Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 18/44] x86/mm: Maintain the correct percpu mappings on context switch Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 19/44] x86/boot: Defer TSS/IST setup until later during boot on the BSP Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 20/44] x86/smp: Allocate a percpu linear range for the IDT Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 21/44] x86/smp: Switch to using the percpu IDT mappings Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 22/44] x86/mm: Track whether the current cr3 has a short or extended directmap Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 23/44] x86/smp: Allocate percpu resources for map_domain_page() to use Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 24/44] x86/mapcache: Reimplement map_domain_page() from scratch Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 25/44] x86/fixmap: Drop percpu fixmap range Andrew Cooper
2018-01-04 20:21 ` Andrew Cooper [this message]
2018-01-04 20:21 ` [PATCH RFC 27/44] x86/smp: Allocate a percpu linear range for the compat translation area Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 28/44] x86/xlat: Use the percpu " Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 29/44] x86/smp: Allocate percpu resources for the GDT and LDT Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 30/44] x86/pv: Break handle_ldt_mapping_fault() out of handle_gdt_ldt_mapping_fault() Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 31/44] x86/pv: Drop support for paging out the LDT Andrew Cooper
2018-01-24 11:04   ` Jan Beulich
2018-01-04 20:21 ` [PATCH RFC 32/44] x86: Always reload the LDT on vcpu context switch Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 33/44] x86/smp: Use the percpu GDT/LDT mappings Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 34/44] x86: Drop the PERDOMAIN mappings Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 35/44] x86/smp: Allocate the stack in the percpu range Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 36/44] x86/monitor: Capture Xen's intent to use monitor at boot time Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 37/44] x86/misc: Move some IPI parameters off the stack Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 38/44] x86/mca: Move __HYPERVISOR_mca " Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 39/44] x86/smp: Introduce get_smp_ipi_buf() and take more " Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 40/44] x86/boot: Switch the APs to the percpu pagetables before entering C Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 41/44] x86/smp: Switch to using the percpu stacks Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 42/44] x86/smp: Allocate a percpu linear range for the TSS Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 43/44] x86/smp: Use the percpu TSS mapping Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 44/44] misc debugging Andrew Cooper
2018-01-05  7:48 ` [PATCH FAIRLY-RFC 00/44] x86: Prerequisite work for a Xen KAISER solution Juergen Gross
2018-01-05  9:26   ` Andrew Cooper
2018-01-05  9:39     ` Juergen Gross
2018-01-05  9:56       ` Andrew Cooper
2018-01-05 14:11       ` George Dunlap
2018-01-05 14:17         ` Juergen Gross
2018-01-05 14:21           ` George Dunlap
2018-01-05 14:28             ` Jan Beulich
2018-01-05 14:27         ` Jan Beulich
2018-01-05 14:35           ` Andrew Cooper
2018-01-08 11:41             ` George Dunlap
2018-01-09 23:14   ` Stefano Stabellini
