From mboxrd@z Thu Jan 1 00:00:00 1970 From: Olaf Hering Subject: Re: superpages lost after migration of HVM domU Date: Wed, 26 Apr 2017 17:43:33 +0200 Message-ID: <20170426154333.GA21598@aepfle.de> References: <20170420153523.GG4645@aepfle.de> <104ce757-2105-774e-f421-5ee07f5acb0b@citrix.com> <20170420160401.GH4645@aepfle.de> <58F8FE8D0200007800152928@prv-mh.provo.novell.com> Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="===============4062526452078370138==" Return-path: In-Reply-To: <58F8FE8D0200007800152928@prv-mh.provo.novell.com> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Errors-To: xen-devel-bounces@lists.xen.org Sender: "Xen-devel" To: Andrew Cooper , Jan Beulich Cc: xen-devel@lists.xen.org List-Id: xen-devel@lists.xenproject.org --===============4062526452078370138== Content-Type: multipart/signed; micalg=pgp-sha1; protocol="application/pgp-signature"; boundary="9amGYk9869ThD9tj" Content-Disposition: inline --9amGYk9869ThD9tj Content-Type: text/plain; charset=utf-8 Content-Disposition: inline Content-Transfer-Encoding: quoted-printable On Thu, Apr 20, Jan Beulich wrote: > >>> On 20.04.17 at 18:04, wrote: > > On Thu, Apr 20, Andrew Cooper wrote: > >=20 > >> As it currently stands, the sending side iterates from 0 to p2m_size, > >> and sends every frame on the first pass. This means we get PAGE_DATA > >> records linearly, in batches of 1024, or two aligned 2M superpages. > > Is there a way to preserve 1G pages? This 380G domU I'm looking at is > > built with 4k:461390 2M:2341 1G:365 pages. > I think we've hashed out a possible way to deal with this, by > speculatively allocating 1G pages as long as the allocation cap for > the domain allows, subsequently punching holes into those pages > if we can't allocate any new pages anymore (due to otherwise > overrunning the cap). The result is not pretty. This HVM-only approach appears to work for a domU with "memory=3D3024" and localhost migration. 
It is required to punch holes as soon as possible to avoid errors in xenforeignmemory_map due to "Over-allocation". Would be nice if the receiver gets a memory map upfront to avoid all stunts... Olaf diff --git a/tools/libxc/xc_sr_common.h b/tools/libxc/xc_sr_common.h index a83f22af4e..36e7891dde 100644 --- a/tools/libxc/xc_sr_common.h +++ b/tools/libxc/xc_sr_common.h @@ -107,6 +107,9 @@ struct xc_sr_save_ops */ struct xc_sr_restore_ops { + /* Allocate a MFN for the given PFN */ + int (*allocate_pfn)(struct xc_sr_context *ctx, xen_pfn_t pfn); + /* Convert a PFN to GFN. May return ~0UL for an invalid mapping. */ xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn= ); =20 @@ -331,6 +334,14 @@ struct xc_sr_context /* HVM context blob. */ void *context; size_t contextsz; + + /* Bitmap of currently allocated PFNs during restore. = */ + xen_pfn_t *sp_extents; + unsigned long *attempted_1g; + unsigned long *attempted_2m; + unsigned long *allocated_pfns; + xen_pfn_t max_allocated_pfn; + unsigned long alloc_cnt; } restore; }; } x86_hvm; diff --git a/tools/libxc/xc_sr_restore.c b/tools/libxc/xc_sr_restore.c index 3549f0a1ae..2e8d15307f 100644 --- a/tools/libxc/xc_sr_restore.c +++ b/tools/libxc/xc_sr_restore.c @@ -135,6 +135,7 @@ int populate_pfns(struct xc_sr_context *ctx, unsigned c= ount, const xen_pfn_t *original_pfns, const uint32_t *types) { xc_interface *xch =3D ctx->xch; + xen_pfn_t min_pfn =3D original_pfns[0], max_pfn =3D original_pfns[0]; xen_pfn_t *mfns =3D malloc(count * sizeof(*mfns)), *pfns =3D malloc(count * sizeof(*pfns)); unsigned i, nr_pfns =3D 0; @@ -149,11 +150,18 @@ int populate_pfns(struct xc_sr_context *ctx, unsigned= count, =20 for ( i =3D 0; i < count; ++i ) { + if (original_pfns[i] < min_pfn) + min_pfn =3D original_pfns[i]; + if (original_pfns[i] > max_pfn) + max_pfn =3D original_pfns[i]; if ( (!types || (types && (types[i] !=3D XEN_DOMCTL_PFINFO_XTAB && types[i] !=3D XEN_DOMCTL_PFINFO_BROKEN))) && !pfn_is_populated(ctx, 
original_pfns[i]) ) { + rc =3D ctx->restore.ops.allocate_pfn(ctx, original_pfns[i]); + if ( rc ) + goto err; rc =3D pfn_set_populated(ctx, original_pfns[i]); if ( rc ) goto err; @@ -161,6 +169,16 @@ int populate_pfns(struct xc_sr_context *ctx, unsigned = count, ++nr_pfns; } } + IPRINTF("checking range %lx %lx\n", min_pfn, max_pfn); + while (min_pfn < max_pfn) { + if (!pfn_is_populated(ctx, min_pfn) && test_and_clear_bit(min_pfn,= ctx->x86_hvm.restore.allocated_pfns)) { + xen_pfn_t pfn =3D min_pfn; + rc =3D xc_domain_decrease_reservation_exact(xch, ctx->domid, 1= , 0, &pfn); + IPRINTF("free %lx %lx %d\n", min_pfn, pfn, rc); + } + min_pfn++; + } + nr_pfns =3D 0; =20 if ( nr_pfns ) { @@ -723,6 +741,10 @@ static void cleanup(struct xc_sr_context *ctx) NRPAGES(bitmap_size(ctx->restore.p2m_si= ze))); free(ctx->restore.buffered_records); free(ctx->restore.populated_pfns); + free(ctx->x86_hvm.restore.sp_extents); + free(ctx->x86_hvm.restore.attempted_1g); + free(ctx->x86_hvm.restore.attempted_2m); + free(ctx->x86_hvm.restore.allocated_pfns); if ( ctx->restore.ops.cleanup(ctx) ) PERROR("Failed to clean up"); } @@ -810,6 +832,17 @@ static int restore(struct xc_sr_context *ctx) saved_errno =3D errno; saved_rc =3D rc; PERROR("Restore failed"); + { + unsigned long i; + bool a, p; + IPRINTF("alloc_cnt %lu\n", ctx->x86_hvm.restore.alloc_cnt); + for (i =3D 0; i < ctx->restore.p2m_size; i++) { + p =3D test_bit(i, ctx->restore.populated_pfns); + a =3D test_bit(i, ctx->x86_hvm.restore.allocated_pfns); + if (p !=3D a) + IPRINTF("%lx a %x p %x\n", i, a, p); + } + } =20 done: cleanup(ctx); @@ -888,6 +921,7 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uin= t32_t dom, } =20 ctx.restore.p2m_size =3D nr_pfns; + IPRINTF("p2m_size %lx\n", ctx.restore.p2m_size); =20 if ( ctx.dominfo.hvm ) { diff --git a/tools/libxc/xc_sr_restore_x86_hvm.c b/tools/libxc/xc_sr_restor= e_x86_hvm.c index 1dca85354a..fc441d2a6d 100644 --- a/tools/libxc/xc_sr_restore_x86_hvm.c +++ 
b/tools/libxc/xc_sr_restore_x86_hvm.c @@ -3,6 +3,10 @@ =20 #include "xc_sr_common_x86.h" =20 +#define SUPERPAGE_2MB_SHIFT 9 +#define SUPERPAGE_2MB_NR_PFNS (1UL << SUPERPAGE_2MB_SHIFT) +#define SUPERPAGE_1GB_SHIFT 18 +#define SUPERPAGE_1GB_NR_PFNS (1UL << SUPERPAGE_1GB_SHIFT) /* * Process an HVM_CONTEXT record from the stream. */ @@ -149,6 +153,20 @@ static int x86_hvm_setup(struct xc_sr_context *ctx) return -1; } =20 + ctx->x86_hvm.restore.sp_extents =3D calloc(1UL << SUPERPAGE_1GB_SHIFT,= sizeof(*ctx->x86_hvm.restore.sp_extents)); + ctx->x86_hvm.restore.attempted_1g =3D bitmap_alloc((ctx->restore.p2m_s= ize >> SUPERPAGE_1GB_SHIFT) + 1); + ctx->x86_hvm.restore.attempted_2m =3D bitmap_alloc((ctx->restore.p2m_s= ize >> SUPERPAGE_2MB_SHIFT) + 1); + ctx->x86_hvm.restore.max_allocated_pfn =3D ctx->restore.p2m_size; + ctx->x86_hvm.restore.allocated_pfns =3D bitmap_alloc(ctx->x86_hvm.rest= ore.max_allocated_pfn + 1); + if (!ctx->x86_hvm.restore.sp_extents || !ctx->x86_hvm.restore.allocate= d_pfns || !ctx->x86_hvm.restore.attempted_2m || !ctx->x86_hvm.restore.attem= pted_1g) + { + ERROR("Unable to allocate memory for allocated_pfns bitmaps"); + return -1; + } + /* No superpage in 1st 2MB due to VGA hole */ + set_bit(0, ctx->x86_hvm.restore.attempted_1g); + set_bit(0, ctx->x86_hvm.restore.attempted_2m); + return 0; } =20 @@ -228,8 +246,139 @@ static int x86_hvm_cleanup(struct xc_sr_context *ctx) return 0; } =20 +static bool pfn_is_allocated(const struct xc_sr_context *ctx, xen_pfn_t pf= n) +{ + if ( pfn > ctx->x86_hvm.restore.max_allocated_pfn ) + return false; + return test_bit(pfn, ctx->x86_hvm.restore.allocated_pfns); +} + +/* + * Set a pfn as allocated, expanding the tracking structures if needed. To + * avoid realloc()ing too excessively, the size increased to the nearest p= ower + * of two large enough to contain the required pfn. 
+ */ +static int pfn_set_allocated(struct xc_sr_context *ctx, xen_pfn_t pfn) +{ + xc_interface *xch =3D ctx->xch; + + if ( pfn > ctx->x86_hvm.restore.max_allocated_pfn ) + { + xen_pfn_t new_max; + size_t old_sz, new_sz; + unsigned long *p; + + /* Round up to the nearest power of two larger than pfn, less 1. */ + new_max =3D pfn; + new_max |=3D new_max >> 1; + new_max |=3D new_max >> 2; + new_max |=3D new_max >> 4; + new_max |=3D new_max >> 8; + new_max |=3D new_max >> 16; +#ifdef __x86_64__ + new_max |=3D new_max >> 32; +#endif + + old_sz =3D bitmap_size(ctx->x86_hvm.restore.max_allocated_pfn + 1); + new_sz =3D bitmap_size(new_max + 1); + p =3D realloc(ctx->x86_hvm.restore.allocated_pfns, new_sz); + if ( !p ) + { + ERROR("Failed to realloc allocated bitmap"); + errno =3D ENOMEM; + return -1; + } + + memset((uint8_t *)p + old_sz, 0x00, new_sz - old_sz); + + ctx->x86_hvm.restore.allocated_pfns =3D p; + ctx->x86_hvm.restore.max_allocated_pfn =3D new_max; + } + + assert(!test_bit(pfn, ctx->x86_hvm.restore.allocated_pfns)); + set_bit(pfn, ctx->x86_hvm.restore.allocated_pfns); + + return 0; +} + +static int x86_hvm_allocate_pfn(struct xc_sr_context *ctx, xen_pfn_t pfn) +{ + xc_interface *xch =3D ctx->xch; + bool success =3D false; + int rc =3D -1; + long done; + unsigned long i, nr_extents; + unsigned long stat_1g =3D 0, stat_2m =3D 0, stat_4k =3D 0; + unsigned long idx_1g, idx_2m; + unsigned long count; + xen_pfn_t base_pfn =3D 0, *sp_extents =3D ctx->x86_hvm.restore.sp_exte= nts; + + IPRINTF("pfn %lx\n", (long)pfn); + if (pfn_is_allocated(ctx, pfn)) + return 0; + + idx_1g =3D pfn >> SUPERPAGE_1GB_SHIFT; + idx_2m =3D pfn >> SUPERPAGE_2MB_SHIFT; + IPRINTF("idx_1g %lu idx_2m %lu\n", idx_1g, idx_2m); + if (!test_and_set_bit(idx_1g, ctx->x86_hvm.restore.attempted_1g)) { + count =3D 1UL << SUPERPAGE_1GB_SHIFT; + base_pfn =3D (pfn >> SUPERPAGE_1GB_SHIFT) << SUPERPAGE_1GB_SHIFT; + nr_extents =3D count >> SUPERPAGE_1GB_SHIFT; + IPRINTF("base_pfn %lx count %lu nr_extents %lu\n", 
(long)base_pfn,= count, nr_extents); + for ( i =3D 0; i < nr_extents; i++ ) + sp_extents[i] =3D base_pfn + (i<<SUPERPAGE_1GB_SHIFT); + done =3D xc_domain_populate_physmap(xch, ctx->domid, nr_extents, S= UPERPAGE_1GB_SHIFT, 0, sp_extents); + IPRINTF("1G %lu -> %ld\n", nr_extents, done); + if (done > 0) { + success =3D true; + ctx->x86_hvm.restore.alloc_cnt +=3D count; + stat_1g =3D done; + for (i =3D 0; i < (count >> SUPERPAGE_2MB_SHIFT); i++) + set_bit((base_pfn >> SUPERPAGE_2MB_SHIFT) + i, ctx->x86_hv= m.restore.attempted_2m); + } + } + + if (!test_and_set_bit(idx_2m, ctx->x86_hvm.restore.attempted_2m)) { + count =3D 1UL << SUPERPAGE_2MB_SHIFT; + base_pfn =3D (pfn >> SUPERPAGE_2MB_SHIFT) << SUPERPAGE_2MB_SHIFT; + nr_extents =3D count >> SUPERPAGE_2MB_SHIFT; + IPRINTF("base_pfn %lx count %lu nr_extents %lu\n", (long)base_pfn,= count, nr_extents); + for ( i =3D 0; i < nr_extents; i++ ) + sp_extents[i] =3D base_pfn + (i<<SUPERPAGE_2MB_SHIFT); + done =3D xc_domain_populate_physmap(xch, ctx->domid, nr_extents, S= UPERPAGE_2MB_SHIFT, 0, sp_extents); + IPRINTF("2M %lu -> %ld\n", nr_extents, done); + if (done > 0) { + success =3D true; + ctx->x86_hvm.restore.alloc_cnt +=3D count; + stat_2m =3D done; + } + } + if (success =3D=3D false) { + count =3D 1; + sp_extents[0] =3D base_pfn =3D pfn; + done =3D xc_domain_populate_physmap(xch, ctx->domid, count, 0, 0, = sp_extents); + if (done > 0) { + success =3D true; + ctx->x86_hvm.restore.alloc_cnt +=3D count; + stat_4k =3D count; + } + } + IPRINTF("count %lu\n", count); + IPRINTF("1G %lu 2M %lu 4k %lu\n", stat_1g, stat_2m, stat_4k); + if (success =3D=3D true) { + do { + count--; + rc =3D pfn_set_allocated(ctx, base_pfn + count); + if (rc) + break; + } while (count); + } + return rc; +} + struct xc_sr_restore_ops restore_ops_x86_hvm =3D { + .allocate_pfn =3D x86_hvm_allocate_pfn, .pfn_is_valid =3D x86_hvm_pfn_is_valid, .pfn_to_gfn =3D x86_hvm_pfn_to_gfn, .set_gfn =3D x86_hvm_set_gfn, diff --git a/tools/libxc/xc_sr_restore_x86_pv.c b/tools/libxc/xc_sr_restore= _x86_pv.c index 50e25c162c..c426f14c73 100644 --- a/tools/libxc/xc_sr_restore_x86_pv.c +++ 
b/tools/libxc/xc_sr_restore_x86_pv.c @@ -1152,8 +1152,15 @@ static int x86_pv_cleanup(struct xc_sr_context *ctx) return 0; } =20 +static int x86_pv_allocate_pfn(struct xc_sr_context *ctx, xen_pfn_t pfn) +{ + errno =3D ENOMEM; + return -1; +} + struct xc_sr_restore_ops restore_ops_x86_pv =3D { + .allocate_pfn =3D x86_pv_allocate_pfn, .pfn_is_valid =3D x86_pv_pfn_is_valid, .pfn_to_gfn =3D pfn_to_mfn, .set_page_type =3D x86_pv_set_page_type, --9amGYk9869ThD9tj Content-Type: application/pgp-signature; name="signature.asc" -----BEGIN PGP SIGNATURE----- iF0EARECAB0WIQSkRyP6Rn//f03pRUBdQqD6ppg2fgUCWQDAIAAKCRBdQqD6ppg2 fh1RAJ9A18QcRvGwIS9hczMgKvu7bsTDwACeOJ++Cy0fL7nA1og0RzWMXygYIpU= =OmCF -----END PGP SIGNATURE----- --9amGYk9869ThD9tj-- --===============4062526452078370138== Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: base64 Content-Disposition: inline X19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX18KWGVuLWRldmVs IG1haWxpbmcgbGlzdApYZW4tZGV2ZWxAbGlzdHMueGVuLm9yZwpodHRwczovL2xpc3RzLnhlbi5v cmcveGVuLWRldmVsCg== --===============4062526452078370138==--