All of lore.kernel.org
 help / color / mirror / Atom feed
From: Olaf Hering <olaf@aepfle.de>
To: Andrew Cooper <andrew.cooper3@citrix.com>,
	Jan Beulich <JBeulich@suse.com>
Cc: xen-devel@lists.xen.org
Subject: Re: superpages lost after migration of HVM domU
Date: Wed, 26 Apr 2017 17:43:33 +0200	[thread overview]
Message-ID: <20170426154333.GA21598@aepfle.de> (raw)
In-Reply-To: <58F8FE8D0200007800152928@prv-mh.provo.novell.com>


[-- Attachment #1.1: Type: text/plain, Size: 13003 bytes --]

On Thu, Apr 20, Jan Beulich wrote:

> >>> On 20.04.17 at 18:04, <olaf@aepfle.de> wrote:
> > On Thu, Apr 20, Andrew Cooper wrote:
> > 
> >> As it currently stands, the sending side iterates from 0 to p2m_size,
> >> and sends every frame on the first pass.  This means we get PAGE_DATA
> >> records linearly, in batches of 1024, or two aligned 2M superpages.
> > Is there a way to preserve 1G pages? This 380G domU I'm looking at is
> > built with 4k:461390 2M:2341 1G:365 pages.
> I think we've hashed out a possible way to deal with this, by
> speculatively allocating 1G pages as long as the allocation cap for
> the domain allows, subsequently punching holes into those pages
> if we can't allocate any new pages anymore (due to otherwise
> overrunning the cap).

The result is not pretty. This HVM-only approach appears to work for a
domU with "memory=3024" and localhost migration.
Holes have to be punched as soon as possible to avoid errors in
xenforeignmemory_map due to "Over-allocation". It would be nice if the
receiver got a memory map upfront, to avoid all of these stunts...

Olaf

diff --git a/tools/libxc/xc_sr_common.h b/tools/libxc/xc_sr_common.h
index a83f22af4e..36e7891dde 100644
--- a/tools/libxc/xc_sr_common.h
+++ b/tools/libxc/xc_sr_common.h
@@ -107,6 +107,9 @@ struct xc_sr_save_ops
  */
 struct xc_sr_restore_ops
 {
+    /* Allocate a MFN for the given PFN */
+    int (*allocate_pfn)(struct xc_sr_context *ctx, xen_pfn_t pfn);
+
     /* Convert a PFN to GFN.  May return ~0UL for an invalid mapping. */
     xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn);
 
@@ -331,6 +334,14 @@ struct xc_sr_context
                     /* HVM context blob. */
                     void *context;
                     size_t contextsz;
+
+                    /* Superpage allocation tracking during restore. */
+                    xen_pfn_t *sp_extents;
+                    unsigned long *attempted_1g;
+                    unsigned long *attempted_2m;
+                    unsigned long *allocated_pfns;
+                    xen_pfn_t max_allocated_pfn;
+                    unsigned long alloc_cnt;
                 } restore;
             };
         } x86_hvm;
diff --git a/tools/libxc/xc_sr_restore.c b/tools/libxc/xc_sr_restore.c
index 3549f0a1ae..2e8d15307f 100644
--- a/tools/libxc/xc_sr_restore.c
+++ b/tools/libxc/xc_sr_restore.c
@@ -135,6 +135,7 @@ int populate_pfns(struct xc_sr_context *ctx, unsigned count,
                   const xen_pfn_t *original_pfns, const uint32_t *types)
 {
     xc_interface *xch = ctx->xch;
+    xen_pfn_t min_pfn = original_pfns[0], max_pfn = original_pfns[0];
     xen_pfn_t *mfns = malloc(count * sizeof(*mfns)),
         *pfns = malloc(count * sizeof(*pfns));
     unsigned i, nr_pfns = 0;
@@ -149,11 +150,18 @@ int populate_pfns(struct xc_sr_context *ctx, unsigned count,
 
     for ( i = 0; i < count; ++i )
     {
+        if (original_pfns[i] < min_pfn)
+            min_pfn = original_pfns[i];
+        if (original_pfns[i] > max_pfn)
+            max_pfn = original_pfns[i];
         if ( (!types || (types &&
                          (types[i] != XEN_DOMCTL_PFINFO_XTAB &&
                           types[i] != XEN_DOMCTL_PFINFO_BROKEN))) &&
              !pfn_is_populated(ctx, original_pfns[i]) )
         {
+            rc = ctx->restore.ops.allocate_pfn(ctx, original_pfns[i]);
+            if ( rc )
+                goto err;
             rc = pfn_set_populated(ctx, original_pfns[i]);
             if ( rc )
                 goto err;
@@ -161,6 +169,16 @@ int populate_pfns(struct xc_sr_context *ctx, unsigned count,
             ++nr_pfns;
         }
     }
+    IPRINTF("checking range %lx %lx\n", min_pfn, max_pfn);
+    while (min_pfn < max_pfn) {
+        if (!pfn_is_populated(ctx, min_pfn) && test_and_clear_bit(min_pfn, ctx->x86_hvm.restore.allocated_pfns)) {
+            xen_pfn_t pfn = min_pfn;
+            rc = xc_domain_decrease_reservation_exact(xch, ctx->domid, 1, 0, &pfn);
+            IPRINTF("free %lx %lx %d\n", min_pfn, pfn, rc);
+        }
+        min_pfn++;
+    }
+    nr_pfns = 0;
 
     if ( nr_pfns )
     {
@@ -723,6 +741,10 @@ static void cleanup(struct xc_sr_context *ctx)
                                    NRPAGES(bitmap_size(ctx->restore.p2m_size)));
     free(ctx->restore.buffered_records);
     free(ctx->restore.populated_pfns);
+    free(ctx->x86_hvm.restore.sp_extents);
+    free(ctx->x86_hvm.restore.attempted_1g);
+    free(ctx->x86_hvm.restore.attempted_2m);
+    free(ctx->x86_hvm.restore.allocated_pfns);
     if ( ctx->restore.ops.cleanup(ctx) )
         PERROR("Failed to clean up");
 }
@@ -810,6 +832,17 @@ static int restore(struct xc_sr_context *ctx)
     saved_errno = errno;
     saved_rc = rc;
     PERROR("Restore failed");
+    {
+        unsigned long i;
+        bool a, p;
+        IPRINTF("alloc_cnt %lu\n", ctx->x86_hvm.restore.alloc_cnt);
+        for (i = 0; i < ctx->restore.p2m_size; i++) {
+            p = test_bit(i, ctx->restore.populated_pfns);
+            a = test_bit(i, ctx->x86_hvm.restore.allocated_pfns);
+            if (p != a)
+                IPRINTF("%lx a %x p %x\n", i, a, p);
+        }
+    }
 
  done:
     cleanup(ctx);
@@ -888,6 +921,7 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
     }
 
     ctx.restore.p2m_size = nr_pfns;
+    IPRINTF("p2m_size %lx\n", ctx.restore.p2m_size);
 
     if ( ctx.dominfo.hvm )
     {
diff --git a/tools/libxc/xc_sr_restore_x86_hvm.c b/tools/libxc/xc_sr_restore_x86_hvm.c
index 1dca85354a..fc441d2a6d 100644
--- a/tools/libxc/xc_sr_restore_x86_hvm.c
+++ b/tools/libxc/xc_sr_restore_x86_hvm.c
@@ -3,6 +3,10 @@
 
 #include "xc_sr_common_x86.h"
 
+#define SUPERPAGE_2MB_SHIFT   9 /* pfn >> 9 is the index of the 2MB-sized range holding pfn */
+#define SUPERPAGE_2MB_NR_PFNS (1UL << SUPERPAGE_2MB_SHIFT) /* pfns per 2MB range */
+#define SUPERPAGE_1GB_SHIFT   18 /* pfn >> 18 is the index of the 1GB-sized range holding pfn */
+#define SUPERPAGE_1GB_NR_PFNS (1UL << SUPERPAGE_1GB_SHIFT) /* pfns per 1GB range */
 /*
  * Process an HVM_CONTEXT record from the stream.
  */
@@ -149,6 +153,20 @@ static int x86_hvm_setup(struct xc_sr_context *ctx)
         return -1;
     }
 
+    ctx->x86_hvm.restore.sp_extents = calloc(1UL << SUPERPAGE_1GB_SHIFT, sizeof(*ctx->x86_hvm.restore.sp_extents));
+    ctx->x86_hvm.restore.attempted_1g = bitmap_alloc((ctx->restore.p2m_size >> SUPERPAGE_1GB_SHIFT) + 1);
+    ctx->x86_hvm.restore.attempted_2m = bitmap_alloc((ctx->restore.p2m_size >> SUPERPAGE_2MB_SHIFT) + 1);
+    ctx->x86_hvm.restore.max_allocated_pfn = ctx->restore.p2m_size;
+    ctx->x86_hvm.restore.allocated_pfns = bitmap_alloc(ctx->x86_hvm.restore.max_allocated_pfn + 1);
+    if (!ctx->x86_hvm.restore.sp_extents || !ctx->x86_hvm.restore.allocated_pfns || !ctx->x86_hvm.restore.attempted_2m || !ctx->x86_hvm.restore.attempted_1g)
+    {
+        ERROR("Unable to allocate memory for allocated_pfns bitmaps");
+        return -1;
+    }
+    /* No superpage in 1st 2MB due to VGA hole */
+    set_bit(0, ctx->x86_hvm.restore.attempted_1g);
+    set_bit(0, ctx->x86_hvm.restore.attempted_2m);
+
     return 0;
 }
 
@@ -228,8 +246,139 @@ static int x86_hvm_cleanup(struct xc_sr_context *ctx)
     return 0;
 }
 
+static bool pfn_is_allocated(const struct xc_sr_context *ctx, xen_pfn_t pfn)
+{ /* Report whether pfn is marked in the allocated_pfns bitmap. */
+    if ( pfn > ctx->x86_hvm.restore.max_allocated_pfn ) /* beyond the tracked range: never marked */
+        return false;
+    return test_bit(pfn, ctx->x86_hvm.restore.allocated_pfns);
+}
+
+/*
+ * Mark pfn in the allocated_pfns bitmap, growing the bitmap (to the next
+ * power of two above pfn, to limit realloc() calls) when pfn exceeds
+ * max_allocated_pfn.  Returns 0, or -1 with errno = ENOMEM if realloc fails.
+ */
+static int pfn_set_allocated(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+    xc_interface *xch = ctx->xch; /* presumably consumed by the ERROR() macro — confirm */
+
+    if ( pfn > ctx->x86_hvm.restore.max_allocated_pfn )
+    {
+        xen_pfn_t new_max;
+        size_t old_sz, new_sz;
+        unsigned long *p;
+
+        /* Smear the top set bit of pfn downwards: yields (power of two) - 1. */
+        new_max = pfn;
+        new_max |= new_max >> 1;
+        new_max |= new_max >> 2;
+        new_max |= new_max >> 4;
+        new_max |= new_max >> 8;
+        new_max |= new_max >> 16;
+#ifdef __x86_64__
+        new_max |= new_max >> 32; /* extra step for the upper half of a 64-bit value */
+#endif
+
+        old_sz = bitmap_size(ctx->x86_hvm.restore.max_allocated_pfn + 1);
+        new_sz = bitmap_size(new_max + 1);
+        p = realloc(ctx->x86_hvm.restore.allocated_pfns, new_sz);
+        if ( !p )
+        {
+            ERROR("Failed to realloc allocated bitmap");
+            errno = ENOMEM;
+            return -1;
+        }
+
+        memset((uint8_t *)p + old_sz, 0x00, new_sz - old_sz); /* zero only the newly added tail */
+
+        ctx->x86_hvm.restore.allocated_pfns    = p;
+        ctx->x86_hvm.restore.max_allocated_pfn = new_max;
+    }
+
+    assert(!test_bit(pfn, ctx->x86_hvm.restore.allocated_pfns)); /* callers must not mark a pfn twice */
+    set_bit(pfn, ctx->x86_hvm.restore.allocated_pfns);
+
+    return 0;
+}
+
+static int x86_hvm_allocate_pfn(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{ /* Populate guest memory for pfn: 1GB extent first, then 2MB, then a single page.  Returns 0 on success, -1 otherwise. */
+    xc_interface *xch = ctx->xch;
+    bool success = false;
+    int rc = -1; /* stays -1 when every populate attempt below fails */
+    long done;
+    unsigned long i, nr_extents;
+    unsigned long stat_1g = 0, stat_2m = 0, stat_4k = 0; /* only reported by the IPRINTF near the end */
+    unsigned long idx_1g, idx_2m;
+    unsigned long count; /* number of pfns covered by the most recent attempt */
+    xen_pfn_t base_pfn = 0, *sp_extents = ctx->x86_hvm.restore.sp_extents;
+
+    IPRINTF("pfn %lx\n", (long)pfn);
+    if (pfn_is_allocated(ctx, pfn)) /* already marked by an earlier call: nothing to do */
+        return 0;
+
+    idx_1g = pfn >> SUPERPAGE_1GB_SHIFT; /* NOTE(review): used as a bitmap index with no bounds check — confirm pfn cannot exceed what x86_hvm_setup sized attempted_1g for */
+    idx_2m = pfn >> SUPERPAGE_2MB_SHIFT; /* NOTE(review): same question for attempted_2m */
+    IPRINTF("idx_1g %lu idx_2m %lu\n", idx_1g, idx_2m);
+    if (!test_and_set_bit(idx_1g, ctx->x86_hvm.restore.attempted_1g)) { /* a 1GB extent is tried at most once per 1GB range */
+        count = 1UL << SUPERPAGE_1GB_SHIFT;
+        base_pfn = (pfn >> SUPERPAGE_1GB_SHIFT) << SUPERPAGE_1GB_SHIFT; /* round pfn down to the start of its 1GB range */
+        nr_extents = count >> SUPERPAGE_1GB_SHIFT; /* evaluates to 1 */
+        IPRINTF("base_pfn %lx count %lu nr_extents %lu\n", (long)base_pfn, count, nr_extents);
+        for ( i = 0; i < nr_extents; i++ )
+            sp_extents[i] = base_pfn + (i<<SUPERPAGE_1GB_SHIFT);
+        done = xc_domain_populate_physmap(xch, ctx->domid, nr_extents, SUPERPAGE_1GB_SHIFT, 0, sp_extents);
+        IPRINTF("1G %lu -> %ld\n", nr_extents, done);
+        if (done > 0) {
+            success = true;
+            ctx->x86_hvm.restore.alloc_cnt += count;
+            stat_1g = done;
+            for (i = 0; i < (count >> SUPERPAGE_2MB_SHIFT); i++) /* flag every 2MB range inside this extent so the 2MB path below is skipped for them */
+                set_bit((base_pfn >> SUPERPAGE_2MB_SHIFT) + i, ctx->x86_hvm.restore.attempted_2m); /* NOTE(review): may index past the end of attempted_2m for the last 1GB range — confirm sizing */
+        }
+    }
+
+    if (!test_and_set_bit(idx_2m, ctx->x86_hvm.restore.attempted_2m)) { /* not taken if the 1GB attempt above just succeeded (bit already set) */
+        count = 1UL << SUPERPAGE_2MB_SHIFT;
+        base_pfn = (pfn >> SUPERPAGE_2MB_SHIFT) << SUPERPAGE_2MB_SHIFT; /* round pfn down to the start of its 2MB range */
+        nr_extents = count >> SUPERPAGE_2MB_SHIFT; /* evaluates to 1 */
+        IPRINTF("base_pfn %lx count %lu nr_extents %lu\n", (long)base_pfn, count, nr_extents);
+        for ( i = 0; i < nr_extents; i++ )
+            sp_extents[i] = base_pfn + (i<<SUPERPAGE_2MB_SHIFT);
+        done = xc_domain_populate_physmap(xch, ctx->domid, nr_extents, SUPERPAGE_2MB_SHIFT, 0, sp_extents);
+        IPRINTF("2M %lu -> %ld\n", nr_extents, done);
+        if (done > 0) {
+            success = true;
+            ctx->x86_hvm.restore.alloc_cnt += count;
+            stat_2m = done;
+        }
+    }
+    if (success == false) { /* no superpage obtained in this call: populate just this one pfn */
+        count = 1;
+        sp_extents[0] = base_pfn = pfn;
+        done = xc_domain_populate_physmap(xch, ctx->domid, count, 0, 0, sp_extents);
+        if (done > 0) {
+            success = true;
+            ctx->x86_hvm.restore.alloc_cnt += count;
+            stat_4k = count;
+        }
+    }
+    IPRINTF("count %lu\n", count);
+    IPRINTF("1G %lu 2M %lu 4k %lu\n", stat_1g, stat_2m, stat_4k);
+    if (success == true) { /* mark every pfn of the extent just populated, highest pfn first */
+        do {
+            count--;
+            rc = pfn_set_allocated(ctx, base_pfn + count);
+            if (rc) /* stop at the first failure; pfns marked so far stay marked */
+                break;
+        } while (count);
+    }
+    return rc;
+}
+
 struct xc_sr_restore_ops restore_ops_x86_hvm =
 {
+    .allocate_pfn    = x86_hvm_allocate_pfn,
     .pfn_is_valid    = x86_hvm_pfn_is_valid,
     .pfn_to_gfn      = x86_hvm_pfn_to_gfn,
     .set_gfn         = x86_hvm_set_gfn,
diff --git a/tools/libxc/xc_sr_restore_x86_pv.c b/tools/libxc/xc_sr_restore_x86_pv.c
index 50e25c162c..c426f14c73 100644
--- a/tools/libxc/xc_sr_restore_x86_pv.c
+++ b/tools/libxc/xc_sr_restore_x86_pv.c
@@ -1152,8 +1152,15 @@ static int x86_pv_cleanup(struct xc_sr_context *ctx)
     return 0;
 }
 
+static int x86_pv_allocate_pfn(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{ /* PV stub for the allocate_pfn hook: ignores its arguments and always fails. */
+    errno = ENOMEM; /* NOTE(review): populate_pfns calls this hook unconditionally, so a PV restore reaching it fails — presumably intentional for this HVM-only draft; confirm */
+    return -1;
+}
+
 struct xc_sr_restore_ops restore_ops_x86_pv =
 {
+    .allocate_pfn    = x86_pv_allocate_pfn,
     .pfn_is_valid    = x86_pv_pfn_is_valid,
     .pfn_to_gfn      = pfn_to_mfn,
     .set_page_type   = x86_pv_set_page_type,

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 195 bytes --]

[-- Attachment #2: Type: text/plain, Size: 127 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

  reply	other threads:[~2017-04-26 15:43 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-04-20 15:35 superpages lost after migration of HVM domU Olaf Hering
2017-04-20 15:53 ` Andrew Cooper
2017-04-20 16:04   ` Olaf Hering
2017-04-20 16:31     ` Jan Beulich
2017-04-26 15:43       ` Olaf Hering [this message]
2017-04-26 15:52         ` Andrew Cooper
2017-04-28 10:15           ` Olaf Hering

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170426154333.GA21598@aepfle.de \
    --to=olaf@aepfle.de \
    --cc=JBeulich@suse.com \
    --cc=andrew.cooper3@citrix.com \
    --cc=xen-devel@lists.xen.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.