xen-devel.lists.xenproject.org archive mirror
 help / color / mirror / Atom feed
From: Andrew Cooper <andrew.cooper3@citrix.com>
To: Xen-devel <xen-devel@lists.xen.org>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Subject: [PATCH RFC 13/44] x86/pt-shadow: Shadow L4 tables from 64bit PV guests
Date: Thu, 4 Jan 2018 20:21:38 +0000	[thread overview]
Message-ID: <1515097329-31902-14-git-send-email-andrew.cooper3@citrix.com> (raw)
In-Reply-To: <1515097329-31902-1-git-send-email-andrew.cooper3@citrix.com>

See the code comments for reasoning and the algorithm description.

This is a very simplistic algorithm, which comes with a substantial
performance overhead.  The algorithm will be improved in a later patch, once
more infrastructure is in place.

Some of the code (particularly in pt_maybe_shadow()) is structured oddly.
This is deliberate to simplify the patch for the later algorithm improvement,
to avoid unnecessary code motion getting in the way of the logical change.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v3:
 * Rebase over change to using ptsh
 * Rework, in terms of being as close to the eventual algorithm as possible,
   before we get map_domain_page() which is usable in context switch context.
---
 xen/arch/x86/mm.c                  |   5 +-
 xen/arch/x86/mm/shadow/multi.c     |   2 +
 xen/arch/x86/pv/mm.h               |  16 +++-
 xen/arch/x86/pv/pt-shadow.c        | 164 +++++++++++++++++++++++++++++++++++++
 xen/include/asm-x86/fixmap.h       |   1 +
 xen/include/asm-x86/pv/pt-shadow.h |  24 ++++++
 6 files changed, 209 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index f85ef6c..375565f 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -126,6 +126,7 @@
 #include <asm/hvm/grant_table.h>
 #include <asm/pv/grant_table.h>
 #include <asm/pv/mm.h>
+#include <asm/pv/pt-shadow.h>
 
 #include "pv/mm.h"
 
@@ -501,13 +502,15 @@ DEFINE_PER_CPU(unsigned long, curr_ptbase);
 
 void do_write_ptbase(struct vcpu *v, bool tlb_maintenance)
 {
-    unsigned long new_cr3 = v->arch.cr3;
+    unsigned long new_cr3;
     unsigned int cpu = smp_processor_id();
     unsigned long *this_curr_ptbase = &per_cpu(curr_ptbase, cpu);
 
     /* Check that %cr3 isn't being shuffled under our feet. */
     ASSERT(*this_curr_ptbase == read_cr3());
 
+    new_cr3 = pt_maybe_shadow(v);
+
     if ( tlb_maintenance )
         write_cr3(new_cr3);
     else
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index c4e954e..9c929ed 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -39,6 +39,7 @@ asm(".file \"" __OBJECT_FILE__ "\"");
 #include <asm/hvm/cacheattr.h>
 #include <asm/mtrr.h>
 #include <asm/guest_pt.h>
+#include <asm/pv/pt-shadow.h>
 #include <public/sched.h>
 #include "private.h"
 #include "types.h"
@@ -952,6 +953,7 @@ static int shadow_set_l4e(struct domain *d,
 
     /* Write the new entry */
     shadow_write_entries(sl4e, &new_sl4e, 1, sl4mfn);
+    pt_shadow_l4_write(d, mfn_to_page(sl4mfn), pgentry_ptr_to_slot(sl4e));
     flags |= SHADOW_SET_CHANGED;
 
     if ( shadow_l4e_get_flags(old_sl4e) & _PAGE_PRESENT )
diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h
index a10b09a..7c66ca7 100644
--- a/xen/arch/x86/pv/mm.h
+++ b/xen/arch/x86/pv/mm.h
@@ -1,6 +1,8 @@
 #ifndef __PV_MM_H__
 #define __PV_MM_H__
 
+#include <asm/pv/pt-shadow.h>
+
 l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn);
 
 int new_guest_cr3(mfn_t mfn);
@@ -38,7 +40,7 @@ static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear)
  */
 static inline bool update_intpte(intpte_t *p, intpte_t old, intpte_t new,
                                  unsigned long mfn, struct vcpu *v,
-                                 bool preserve_ad)
+                                 bool preserve_ad, unsigned int level)
 {
     bool rv = true;
 
@@ -77,6 +79,11 @@ static inline bool update_intpte(intpte_t *p, intpte_t old, intpte_t new,
             old = t;
         }
     }
+
+    if ( level == 4 )
+        pt_shadow_l4_write(v->domain, mfn_to_page(mfn),
+                           pgentry_ptr_to_slot(p));
+
     return rv;
 }
 
@@ -87,7 +94,12 @@ static inline bool update_intpte(intpte_t *p, intpte_t old, intpte_t new,
 #define UPDATE_ENTRY(_t,_p,_o,_n,_m,_v,_ad)                         \
     update_intpte(&_t ## e_get_intpte(*(_p)),                       \
                   _t ## e_get_intpte(_o), _t ## e_get_intpte(_n),   \
-                  (_m), (_v), (_ad))
+                  (_m), (_v), (_ad), _t ## _LEVEL)
+
+#define l1_LEVEL 1
+#define l2_LEVEL 2
+#define l3_LEVEL 3
+#define l4_LEVEL 4
 
 static inline l1_pgentry_t adjust_guest_l1e(l1_pgentry_t l1e,
                                             const struct domain *d)
diff --git a/xen/arch/x86/pv/pt-shadow.c b/xen/arch/x86/pv/pt-shadow.c
index 7db8efb..46a0251 100644
--- a/xen/arch/x86/pv/pt-shadow.c
+++ b/xen/arch/x86/pv/pt-shadow.c
@@ -22,8 +22,32 @@
 #include <xen/mm.h>
 #include <xen/numa.h>
 
+#include <asm/fixmap.h>
 #include <asm/pv/pt-shadow.h>
 
+/*
+ * To use percpu linear ranges, we require that no two pcpus have %cr3
+ * pointing at the same L4 pagetable at the same time.
+ *
+ * Guests however might choose to use the same L4 pagetable on multiple vcpus
+ * at once, e.g. concurrently scheduling two threads from the same process.
+ * In practice, all HVM guests, and 32bit PV guests run on Xen-provided
+ * per-vcpu monitor tables, so it is only 64bit PV guests which are an issue.
+ *
+ * To resolve the issue, we shadow L4 pagetables from 64bit PV guests when
+ * they are in context.
+ *
+ * The algorithm is fairly simple.
+ *
+ *   - When a pcpu is switching to a new vcpu cr3 and shadowing is necessary,
+ *     perform a full 4K copy of the guest's frame into a percpu frame, and run
+ *     on that.
+ *   - When a write to a guest's L4 pagetable occurs, the update must be
+ *     propagated to all existing shadows.  An IPI is sent to the domain's
+ *     dirty mask indicating which frame/slot was updated, and each pcpu
+ *     checks to see whether it needs to sync the update into its shadow.
+ */
+
 struct pt_shadow {
     /*
      * A frame used to shadow a vcpus intended pagetable.  When shadowing,
@@ -31,6 +55,17 @@ struct pt_shadow {
      */
     paddr_t shadow_l4;
     l4_pgentry_t *shadow_l4_va;
+
+    /*
+     * Domain to which the shadowed state belongs, or NULL if no state is
+     * being cached.  IPIs for updates to cached information are based on the
+     * domain dirty mask, which can race with the target of the IPI switching
+     * to a different context.
+     */
+    const struct domain *domain;
+
+    /* If nonzero, a guest's pagetable which we are shadowing. */
+    paddr_t shadowing;
 };
 
 static DEFINE_PER_CPU(struct pt_shadow, ptsh);
@@ -76,6 +111,135 @@ void pt_shadow_free(unsigned int cpu)
 }
 
 /*
+ * We only need to shadow 4-level PV guests.  All other guests have per-vcpu
+ * monitor tables which are never scheduled on concurrent pcpus.  Care needs
+ * to be taken not to shadow d0v0 during construction, as it writes its L4
+ * directly.
+ */
+static bool pt_need_shadow(const struct domain *d)
+{
+    return (system_state >= SYS_STATE_active && is_pv_domain(d) &&
+            !is_idle_domain(d) && !is_pv_32bit_domain(d) && d->max_vcpus > 1);
+}
+
+unsigned long pt_maybe_shadow(struct vcpu *v)
+{
+    unsigned int cpu = smp_processor_id();
+    struct pt_shadow *ptsh = &per_cpu(ptsh, cpu);
+    unsigned long flags, new_cr3 = v->arch.cr3;
+
+    /*
+     * IPIs for updates are based on the domain dirty mask.  If we ever switch
+     * out of the currently shadowed context (even to idle), the cache will
+     * become stale.
+     */
+    if ( ptsh->domain &&
+         ptsh->domain != v->domain )
+    {
+        ptsh->domain = NULL;
+        ptsh->shadowing = 0;
+    }
+
+    /* No shadowing necessary? Run on the intended pagetable. */
+    if ( !pt_need_shadow(v->domain) )
+        return new_cr3;
+
+    ptsh->domain = v->domain;
+
+    /* Fastpath, if we are already shadowing the intended pagetable. */
+    if ( ptsh->shadowing == new_cr3 )
+        return ptsh->shadow_l4;
+
+    /*
+     * We may be called with interrupts disabled (e.g. context switch), or
+     * interrupts enabled (e.g. new_guest_cr3()).
+     *
+     * Reads and modifications of ptsh-> are only on the local cpu, but must
+     * be excluded against reads and modifications in _pt_shadow_ipi().
+     */
+    local_irq_save(flags);
+
+    {
+        l4_pgentry_t *l4t, *vcpu_l4t;
+
+        set_percpu_fixmap(cpu, PERCPU_FIXSLOT_SHADOW,
+                          l1e_from_paddr(new_cr3, __PAGE_HYPERVISOR_RO));
+        ptsh->shadowing = new_cr3;
+        local_irq_restore(flags);
+
+        l4t = ptsh->shadow_l4_va;
+        vcpu_l4t = percpu_fix_to_virt(cpu, PERCPU_FIXSLOT_SHADOW);
+
+        copy_page(l4t, vcpu_l4t);
+    }
+
+    return ptsh->shadow_l4;
+}
+
+struct ptsh_ipi_info
+{
+    const struct domain *d;
+    const struct page_info *pg;
+    enum {
+        PTSH_IPI_WRITE,
+    } op;
+    unsigned int slot;
+};
+
+static void _pt_shadow_ipi(void *arg)
+{
+    unsigned int cpu = smp_processor_id();
+    struct pt_shadow *ptsh = &per_cpu(ptsh, cpu);
+    const struct ptsh_ipi_info *info = arg;
+    unsigned long maddr = page_to_maddr(info->pg);
+
+    /* No longer shadowing state from this domain?  Nothing to do. */
+    if ( info->d != ptsh->domain )
+        return;
+
+    /* Not shadowing this frame?  Nothing to do. */
+    if ( ptsh->shadowing != maddr )
+        return;
+
+    switch ( info->op )
+    {
+        l4_pgentry_t *l4t, *vcpu_l4t;
+
+    case PTSH_IPI_WRITE:
+        l4t = ptsh->shadow_l4_va;
+
+        /* Reuse the mapping established in pt_maybe_shadow(). */
+        ASSERT(l1e_get_paddr(*percpu_fixmap_l1e(cpu, PERCPU_FIXSLOT_SHADOW)) ==
+               maddr);
+        vcpu_l4t = percpu_fix_to_virt(cpu, PERCPU_FIXSLOT_SHADOW);
+
+        l4t[info->slot] = vcpu_l4t[info->slot];
+        break;
+
+    default:
+        ASSERT_UNREACHABLE();
+    }
+}
+
+void pt_shadow_l4_write(const struct domain *d, const struct page_info *pg,
+                        unsigned int slot)
+{
+    struct ptsh_ipi_info info;
+
+    if ( !pt_need_shadow(d) )
+        return;
+
+    info = (struct ptsh_ipi_info){
+        .d = d,
+        .pg = pg,
+        .op = PTSH_IPI_WRITE,
+        .slot = slot,
+    };
+
+    on_selected_cpus(d->domain_dirty_cpumask, _pt_shadow_ipi, &info, 1);
+}
+
+/*
  * Local variables:
  * mode: C
  * c-file-style: "BSD"
diff --git a/xen/include/asm-x86/fixmap.h b/xen/include/asm-x86/fixmap.h
index d46939a..748219f 100644
--- a/xen/include/asm-x86/fixmap.h
+++ b/xen/include/asm-x86/fixmap.h
@@ -28,6 +28,7 @@
 #include <acpi/apei.h>
 
 #define NR_PERCPU_SLOTS 1
+#define PERCPU_FIXSLOT_SHADOW 0
 
 /*
  * Here we define all the compile-time 'special' virtual
diff --git a/xen/include/asm-x86/pv/pt-shadow.h b/xen/include/asm-x86/pv/pt-shadow.h
index ff99c85..6e71e99 100644
--- a/xen/include/asm-x86/pv/pt-shadow.h
+++ b/xen/include/asm-x86/pv/pt-shadow.h
@@ -21,6 +21,8 @@
 #ifndef __X86_PV_PT_SHADOW_H__
 #define __X86_PV_PT_SHADOW_H__
 
+#include <xen/sched.h>
+
 #ifdef CONFIG_PV
 
 /*
@@ -30,11 +32,33 @@
 int pt_shadow_alloc(unsigned int cpu);
 void pt_shadow_free(unsigned int cpu);
 
+/*
+ * Called for context switches, and when a vcpu explicitly changes cr3.  The
+ * PT shadow logic returns the cr3 hardware should run on, which is either
+ * v->arch.cr3 (no shadowing necessary), or a local frame (which is a suitable
+ * shadow of v->arch.cr3).
+ */
+unsigned long pt_maybe_shadow(struct vcpu *v);
+
+/*
+ * Called when a write occurs to an L4 pagetable.  The PT shadow logic brings
+ * any shadows of this page up-to-date.
+ */
+void pt_shadow_l4_write(
+    const struct domain *d, const struct page_info *pg, unsigned int slot);
+
 #else /* !CONFIG_PV */
 
 static inline int pt_shadow_alloc(unsigned int cpu) { return 0; }
 static inline void pt_shadow_free(unsigned int cpu) { }
 
+static inline unsigned long pt_maybe_shadow(struct vcpu *v)
+{
+    return v->arch.cr3;
+}
+static inline void pt_shadow_l4_write(
+    const struct domain *d, const struct page_info *pg, unsigned int slot) { }
+
 #endif /* CONFIG_PV */
 
 #endif /* __X86_PV_PT_SHADOW_H__ */
-- 
2.1.4


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

  parent reply	other threads:[~2018-01-04 20:21 UTC|newest]

Thread overview: 61+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-01-04 20:21 [PATCH FAIRLY-RFC 00/44] x86: Prerequisite work for a Xen KAISER solution Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 01/44] passthrough/vtd: Don't DMA to the stack in queue_invalidate_wait() Andrew Cooper
2018-01-05  9:21   ` Jan Beulich
2018-01-05  9:33     ` Andrew Cooper
2018-01-16  6:41   ` Tian, Kevin
2018-01-04 20:21 ` [PATCH RFC 02/44] x86/idt: Factor out enabling and disabling of ISTs Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 03/44] x86/pv: Rename invalidate_shadow_ldt() to pv_destroy_ldt() Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 04/44] x86/boot: Introduce cpu_smpboot_bsp() to dynamically allocate BSP state Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 05/44] x86/boot: Move arch_init_memory() earlier in the boot sequence Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 06/44] x86/boot: Allocate percpu pagetables for the idle vcpus Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 07/44] x86/boot: Use " Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 08/44] x86/pv: Avoid an opencoded mov to %cr3 in toggle_guest_mode() Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 09/44] x86/mm: Track the current %cr3 in a per_cpu variable Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 10/44] x86/pt-shadow: Initial infrastructure for L4 PV pagetable shadowing Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 11/44] x86/pt-shadow: Always set _PAGE_ACCESSED on L4e updates Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 12/44] x86/fixmap: Temporarily add a percpu fixmap range Andrew Cooper
2018-01-04 20:21 ` Andrew Cooper [this message]
2018-01-04 20:21 ` [PATCH RFC 14/44] x86/mm: Added safety checks that pagetables aren't shared Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 15/44] x86: Rearrange the virtual layout to introduce a PERCPU linear slot Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 16/44] xen/ipi: Introduce arch_ipi_param_ok() to check IPI parameters Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 17/44] x86/smp: Infrastructure for allocating and freeing percpu pagetables Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 18/44] x86/mm: Maintain the correct percpu mappings on context switch Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 19/44] x86/boot: Defer TSS/IST setup until later during boot on the BSP Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 20/44] x86/smp: Allocate a percpu linear range for the IDT Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 21/44] x86/smp: Switch to using the percpu IDT mappings Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 22/44] x86/mm: Track whether the current cr3 has a short or extended directmap Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 23/44] x86/smp: Allocate percpu resources for map_domain_page() to use Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 24/44] x86/mapcache: Reimplement map_domain_page() from scratch Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 25/44] x86/fixmap: Drop percpu fixmap range Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 26/44] x86/pt-shadow: Maintain a small cache of shadowed frames Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 27/44] x86/smp: Allocate a percpu linear range for the compat translation area Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 28/44] x86/xlat: Use the percpu " Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 29/44] x86/smp: Allocate percpu resources for the GDT and LDT Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 30/44] x86/pv: Break handle_ldt_mapping_fault() out of handle_gdt_ldt_mapping_fault() Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 31/44] x86/pv: Drop support for paging out the LDT Andrew Cooper
2018-01-24 11:04   ` Jan Beulich
2018-01-04 20:21 ` [PATCH RFC 32/44] x86: Always reload the LDT on vcpu context switch Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 33/44] x86/smp: Use the percpu GDT/LDT mappings Andrew Cooper
2018-01-04 20:21 ` [PATCH RFC 34/44] x86: Drop the PERDOMAIN mappings Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 35/44] x86/smp: Allocate the stack in the percpu range Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 36/44] x86/monitor: Capture Xen's intent to use monitor at boot time Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 37/44] x86/misc: Move some IPI parameters off the stack Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 38/44] x86/mca: Move __HYPERVISOR_mca " Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 39/44] x86/smp: Introduce get_smp_ipi_buf() and take more " Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 40/44] x86/boot: Switch the APs to the percpu pagetables before entering C Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 41/44] x86/smp: Switch to using the percpu stacks Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 42/44] x86/smp: Allocate a percpu linear range for the TSS Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 43/44] x86/smp: Use the percpu TSS mapping Andrew Cooper
2018-01-04 20:22 ` [PATCH RFC 44/44] misc debugging Andrew Cooper
2018-01-05  7:48 ` [PATCH FAIRLY-RFC 00/44] x86: Prerequisite work for a Xen KAISER solution Juergen Gross
2018-01-05  9:26   ` Andrew Cooper
2018-01-05  9:39     ` Juergen Gross
2018-01-05  9:56       ` Andrew Cooper
2018-01-05 14:11       ` George Dunlap
2018-01-05 14:17         ` Juergen Gross
2018-01-05 14:21           ` George Dunlap
2018-01-05 14:28             ` Jan Beulich
2018-01-05 14:27         ` Jan Beulich
2018-01-05 14:35           ` Andrew Cooper
2018-01-08 11:41             ` George Dunlap
2018-01-09 23:14   ` Stefano Stabellini

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1515097329-31902-14-git-send-email-andrew.cooper3@citrix.com \
    --to=andrew.cooper3@citrix.com \
    --cc=xen-devel@lists.xen.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).