From mboxrd@z Thu Jan 1 00:00:00 1970 From: Olaf Hering Subject: Re: superpages lost after migration of HVM domU Date: Fri, 28 Apr 2017 12:15:21 +0200 Message-ID: <20170428101521.GA15020@aepfle.de> References: <20170420153523.GG4645@aepfle.de> <104ce757-2105-774e-f421-5ee07f5acb0b@citrix.com> <20170420160401.GH4645@aepfle.de> <58F8FE8D0200007800152928@prv-mh.provo.novell.com> <20170426154333.GA21598@aepfle.de> Mime-Version: 1.0 Content-Type: multipart/mixed; boundary="===============9073118524411200435==" Return-path: In-Reply-To: List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Errors-To: xen-devel-bounces@lists.xen.org Sender: "Xen-devel" To: Andrew Cooper Cc: Jan Beulich , xen-devel@lists.xen.org List-Id: xen-devel@lists.xenproject.org --===============9073118524411200435== Content-Type: multipart/signed; micalg=pgp-sha1; protocol="application/pgp-signature"; boundary="82I3+IH0IqGh5yIs" Content-Disposition: inline --82I3+IH0IqGh5yIs Content-Type: text/plain; charset=utf-8 Content-Disposition: inline Content-Transfer-Encoding: quoted-printable On Wed, Apr 26, Andrew Cooper wrote: > On 26/04/17 16:43, Olaf Hering wrote: > > On Thu, Apr 20, Jan Beulich wrote: > > > >>>>> On 20.04.17 at 18:04, wrote: > >>> On Thu, Apr 20, Andrew Cooper wrote: > >>> > >>>> As it currently stands, the sending side iterates from 0 to p2m_size, > >>>> and sends every frame on the first pass. This means we get PAGE_DATA > >>>> records linearly, in batches of 1024, or two aligned 2M superpages. > >>> Is there a way to preserve 1G pages? This 380G domU I'm looking at is > >>> built with 4k:461390 2M:2341 1G:365 pages. > >> I think we've hashed out a possible way to deal with this, by > >> speculatively allocating 1G pages as long as the allocation cap for > >> the domain allows, subsequently punching holes into those pages > >> if we can't allocate any new pages anymore (due to otherwise > >> overrunning the cap). > > The result is not pretty. This HVM-only approach appears to work for a > > domU with "memory=3D3024" and localhost migration. > > It is required to punch holes as soon as possible to avoid errors in > > xenforeignmemory_map due to "Over-allocation". Would be nice if the > > receiver gets a memory map upfront to avoid all stunts... >=20 > Oh - I was about to start working on this. This is a pleasant surprise. = :) Here is a variant that actually works for migration between two dom0s. --- a/tools/libxc/xc_sr_common.h +++ b/tools/libxc/xc_sr_common.h @@ -107,6 +107,9 @@ struct xc_sr_save_ops */ struct xc_sr_restore_ops { + /* Allocate a MFN for the given PFN */ + int (*allocate_pfn)(struct xc_sr_context *ctx, xen_pfn_t pfn); + /* Convert a PFN to GFN. May return ~0UL for an invalid mapping. */ xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn= ); =20 @@ -172,6 +175,52 @@ struct xc_sr_x86_pv_restore_vcpu size_t basicsz, extdsz, xsavesz, msrsz; }; =20 +struct xc_sr_bitmap +{ + void *p; + unsigned long bits; +}; + +extern bool _xc_sr_bitmap_resize(struct xc_sr_bitmap *bm, unsigned long bi= ts); +static inline bool xc_sr_bitmap_resize(struct xc_sr_bitmap *bm, unsigned l= ong bits) +{ + if (bits > bm->bits) + return _xc_sr_bitmap_resize(bm, bits); + return true; +} + +static inline void xc_sr_bitmap_free(struct xc_sr_bitmap *bm) +{ + free(bm->p); +} + +static inline bool xc_sr_set_bit(unsigned long bit, struct xc_sr_bitmap *b= m) +{ + if (!xc_sr_bitmap_resize(bm, bit)) + return false; + + set_bit(bit, bm->p); + return true; +} + +static inline bool xc_sr_test_bit(unsigned long bit, struct xc_sr_bitmap *= bm) +{ + if (bit > bm->bits) + return false; + return !!test_bit(bit, bm->p); +} + +static inline int xc_sr_test_and_clear_bit(unsigned long bit, struct xc_sr= _bitmap *bm) +{ + return test_and_clear_bit(bit, bm->p); +} + +static inline int xc_sr_test_and_set_bit(unsigned long bit, struct xc_sr_b= itmap *bm) +{ + return test_and_set_bit(bit, bm->p); +} + + struct xc_sr_context { xc_interface *xch; @@ -256,8 +305,7 @@ struct xc_sr_context domid_t xenstore_domid, console_domid; =20 /* Bitmap of currently populated PFNs during restore. */ - unsigned long *populated_pfns; - xen_pfn_t max_populated_pfn; + struct xc_sr_bitmap populated_pfns; =20 /* Sender has invoked verify mode on the stream. */ bool verify; @@ -332,6 +380,12 @@ struct xc_sr_context /* HVM context blob. */ void *context; size_t contextsz; + + /* Bitmap of currently allocated PFNs during restore. = */ + struct xc_sr_bitmap attempted_1g; + struct xc_sr_bitmap attempted_2m; + struct xc_sr_bitmap allocated_pfns; + unsigned long alloc_cnt; } restore; }; } x86_hvm; --- a/tools/libxc/xc_sr_restore.c +++ b/tools/libxc/xc_sr_restore.c @@ -71,11 +71,9 @@ static int read_headers(struct xc_sr_con /* * Is a pfn populated? */ -static bool pfn_is_populated(const struct xc_sr_context *ctx, xen_pfn_t pf= n) +static bool pfn_is_populated(struct xc_sr_context *ctx, xen_pfn_t pfn) { - if ( pfn > ctx->restore.max_populated_pfn ) - return false; - return test_bit(pfn, ctx->restore.populated_pfns); + return xc_sr_test_bit(pfn, &ctx->restore.populated_pfns); } =20 /* @@ -87,42 +85,12 @@ static int pfn_set_populated(struct xc_s { xc_interface *xch =3D ctx->xch; =20 - if ( pfn > ctx->restore.max_populated_pfn ) + if ( !xc_sr_set_bit(pfn, &ctx->restore.populated_pfns) ) { - xen_pfn_t new_max; - size_t old_sz, new_sz; - unsigned long *p; - - /* Round up to the nearest power of two larger than pfn, less 1. */ - new_max =3D pfn; - new_max |=3D new_max >> 1; - new_max |=3D new_max >> 2; - new_max |=3D new_max >> 4; - new_max |=3D new_max >> 8; - new_max |=3D new_max >> 16; -#ifdef __x86_64__ - new_max |=3D new_max >> 32; -#endif - - old_sz =3D bitmap_size(ctx->restore.max_populated_pfn + 1); - new_sz =3D bitmap_size(new_max + 1); - p =3D realloc(ctx->restore.populated_pfns, new_sz); - if ( !p ) - { - ERROR("Failed to realloc populated bitmap"); - errno =3D ENOMEM; - return -1; - } - - memset((uint8_t *)p + old_sz, 0x00, new_sz - old_sz); - - ctx->restore.populated_pfns =3D p; - ctx->restore.max_populated_pfn =3D new_max; + ERROR("Failed to realloc populated bitmap"); + errno =3D ENOMEM; + return -1; } - - assert(!test_bit(pfn, ctx->restore.populated_pfns)); - set_bit(pfn, ctx->restore.populated_pfns); - return 0; } =20 @@ -135,6 +103,7 @@ int populate_pfns(struct xc_sr_context * const xen_pfn_t *original_pfns, const uint32_t *types) { xc_interface *xch =3D ctx->xch; + xen_pfn_t min_pfn =3D original_pfns[0], max_pfn =3D original_pfns[0]; xen_pfn_t *mfns =3D malloc(count * sizeof(*mfns)), *pfns =3D malloc(count * sizeof(*pfns)); unsigned i, nr_pfns =3D 0; @@ -149,11 +118,18 @@ int populate_pfns(struct xc_sr_context * =20 for ( i =3D 0; i < count; ++i ) { + if (original_pfns[i] < min_pfn) + min_pfn =3D original_pfns[i]; + if (original_pfns[i] > max_pfn) + max_pfn =3D original_pfns[i]; if ( (!types || (types && (types[i] !=3D XEN_DOMCTL_PFINFO_XTAB && types[i] !=3D XEN_DOMCTL_PFINFO_BROKEN))) && !pfn_is_populated(ctx, original_pfns[i]) ) { + rc =3D ctx->restore.ops.allocate_pfn(ctx, original_pfns[i]); + if ( rc ) + goto err; rc =3D pfn_set_populated(ctx, original_pfns[i]); if ( rc ) goto err; @@ -161,6 +137,21 @@ int populate_pfns(struct xc_sr_context * ++nr_pfns; } } + IPRINTF("checking range %lx %lx\n", min_pfn, max_pfn); + while (min_pfn < max_pfn) { + if (!xc_sr_bitmap_resize(&ctx->x86_hvm.restore.allocated_pfns, min= _pfn)) + { + PERROR("Failed to realloc allocated_pfns %" PRI_xen_pfn, min_p= fn); + goto err; + } + if (!pfn_is_populated(ctx, min_pfn) && xc_sr_test_and_clear_bit(mi= n_pfn, &ctx->x86_hvm.restore.allocated_pfns)) { + xen_pfn_t pfn =3D min_pfn; + rc =3D xc_domain_decrease_reservation_exact(xch, ctx->domid, 1= , 0, &pfn); + IPRINTF("free %lx %lx %d\n", min_pfn, pfn, rc); + } + min_pfn++; + } + nr_pfns =3D 0; =20 if ( nr_pfns ) { @@ -684,10 +675,8 @@ static int setup(struct xc_sr_context *c if ( rc ) goto err; =20 - ctx->restore.max_populated_pfn =3D (32 * 1024 / 4) - 1; - ctx->restore.populated_pfns =3D bitmap_alloc( - ctx->restore.max_populated_pfn + 1); - if ( !ctx->restore.populated_pfns ) + rc =3D !xc_sr_bitmap_resize(&ctx->restore.populated_pfns, 32 * 1024 / = 4); + if ( rc ) { ERROR("Unable to allocate memory for populated_pfns bitmap"); rc =3D -1; @@ -722,7 +711,10 @@ static void cleanup(struct xc_sr_context xc_hypercall_buffer_free_pages(xch, dirty_bitmap, NRPAGES(bitmap_size(ctx->restore.p2m_si= ze))); free(ctx->restore.buffered_records); - free(ctx->restore.populated_pfns); + xc_sr_bitmap_free(&ctx->restore.populated_pfns); + xc_sr_bitmap_free(&ctx->x86_hvm.restore.attempted_1g); + xc_sr_bitmap_free(&ctx->x86_hvm.restore.attempted_2m); + xc_sr_bitmap_free(&ctx->x86_hvm.restore.allocated_pfns); if ( ctx->restore.ops.cleanup(ctx) ) PERROR("Failed to clean up"); } @@ -810,6 +802,17 @@ static int restore(struct xc_sr_context saved_errno =3D errno; saved_rc =3D rc; PERROR("Restore failed"); + { + unsigned long i; + bool a, p; + IPRINTF("alloc_cnt %lu\n", ctx->x86_hvm.restore.alloc_cnt); + for (i =3D 0; i < ctx->restore.p2m_size; i++) { + p =3D xc_sr_test_bit(i, &ctx->restore.populated_pfns); + a =3D xc_sr_test_bit(i, &ctx->x86_hvm.restore.allocated_pfns); + if (p !=3D a) + IPRINTF("%lx a %x p %x\n", i, a, p); + } + } =20 done: cleanup(ctx); @@ -888,6 +891,7 @@ int xc_domain_restore(xc_interface *xch, } =20 ctx.restore.p2m_size =3D nr_pfns; + IPRINTF("p2m_size %lx\n", ctx.restore.p2m_size); =20 if ( ctx.dominfo.hvm ) { --- a/tools/libxc/xc_sr_restore_x86_hvm.c +++ b/tools/libxc/xc_sr_restore_x86_hvm.c @@ -3,6 +3,10 @@ =20 #include "xc_sr_common_x86.h" =20 +#define SUPERPAGE_2MB_SHIFT 9 +#define SUPERPAGE_2MB_NR_PFNS (1UL << SUPERPAGE_2MB_SHIFT) +#define SUPERPAGE_1GB_SHIFT 18 +#define SUPERPAGE_1GB_NR_PFNS (1UL << SUPERPAGE_1GB_SHIFT) /* * Process an HVM_CONTEXT record from the stream. */ @@ -130,6 +134,17 @@ static int x86_hvm_setup(struct xc_sr_co return -1; } =20 + if (!xc_sr_bitmap_resize(&ctx->x86_hvm.restore.attempted_1g, (ctx->res= tore.p2m_size >> SUPERPAGE_1GB_SHIFT) + 1) || + !xc_sr_bitmap_resize(&ctx->x86_hvm.restore.attempted_2m, (ctx->res= tore.p2m_size >> SUPERPAGE_2MB_SHIFT) + 1) || + !xc_sr_bitmap_resize(&ctx->x86_hvm.restore.allocated_pfns, ctx->re= store.p2m_size + 1)) + { + ERROR("Unable to allocate memory for allocated_pfns bitmaps"); + return -1; + } + /* No superpage in 1st 2MB due to VGA hole */ + xc_sr_set_bit(0, &ctx->x86_hvm.restore.attempted_1g); + xc_sr_set_bit(0, &ctx->x86_hvm.restore.attempted_2m); + return 0; } =20 @@ -209,8 +224,110 @@ static int x86_hvm_cleanup(struct xc_sr_ return 0; } =20 +static bool pfn_is_allocated(struct xc_sr_context *ctx, xen_pfn_t pfn) +{ + return xc_sr_test_bit(pfn, &ctx->x86_hvm.restore.allocated_pfns); +} + +/* + * Set a pfn as allocated, expanding the tracking structures if needed. To + * avoid realloc()ing too excessively, the size increased to the nearest p= ower + * of two large enough to contain the required pfn. + */ +static int pfn_set_allocated(struct xc_sr_context *ctx, xen_pfn_t pfn) +{ + xc_interface *xch =3D ctx->xch; + + if ( !xc_sr_set_bit(pfn, &ctx->x86_hvm.restore.allocated_pfns) ) + { + ERROR("Failed to realloc allocated_pfns bitmap"); + errno =3D ENOMEM; + return -1; + } + return 0; +} + +static int x86_hvm_allocate_pfn(struct xc_sr_context *ctx, xen_pfn_t pfn) +{ + xc_interface *xch =3D ctx->xch; + bool success =3D false; + int rc =3D -1; + long done; + unsigned long i; + unsigned long stat_1g =3D 0, stat_2m =3D 0, stat_4k =3D 0; + unsigned long idx_1g, idx_2m; + unsigned long count; + xen_pfn_t base_pfn =3D 0, sp_extent; + + IPRINTF("pfn %lx\n", (long)pfn); + if (pfn_is_allocated(ctx, pfn)) + return 0; + + idx_1g =3D pfn >> SUPERPAGE_1GB_SHIFT; + idx_2m =3D pfn >> SUPERPAGE_2MB_SHIFT; + if (!xc_sr_bitmap_resize(&ctx->x86_hvm.restore.attempted_1g, idx_1g)) + { + PERROR("Failed to realloc attempted_1g"); + return -1; + } + if (!xc_sr_bitmap_resize(&ctx->x86_hvm.restore.attempted_2m, idx_2m)) + { + PERROR("Failed to realloc attempted_2m"); + return -1; + } + IPRINTF("idx_1g %lu idx_2m %lu\n", idx_1g, idx_2m); + if (!xc_sr_test_and_set_bit(idx_1g, &ctx->x86_hvm.restore.attempted_1g= )) { + count =3D 1UL << SUPERPAGE_1GB_SHIFT; + base_pfn =3D (pfn >> SUPERPAGE_1GB_SHIFT) << SUPERPAGE_1GB_SHIFT; + sp_extent =3D base_pfn; + done =3D xc_domain_populate_physmap(xch, ctx->domid, 1, SUPERPAGE_= 1GB_SHIFT, 0, &sp_extent); + IPRINTF("1G base_pfn %lx count %lu done %ld\n", (long)base_pfn, co= unt, done); + if (done > 0) { + success =3D true; + ctx->x86_hvm.restore.alloc_cnt +=3D count; + stat_1g =3D done; + for (i =3D 0; i < (count >> SUPERPAGE_2MB_SHIFT); i++) + xc_sr_set_bit((base_pfn >> SUPERPAGE_2MB_SHIFT) + i, &ctx-= >x86_hvm.restore.attempted_2m); + } + } + + if (!xc_sr_test_and_set_bit(idx_2m, &ctx->x86_hvm.restore.attempted_2m= )) { + count =3D 1UL << SUPERPAGE_2MB_SHIFT; + base_pfn =3D (pfn >> SUPERPAGE_2MB_SHIFT) << SUPERPAGE_2MB_SHIFT; + sp_extent =3D base_pfn; + done =3D xc_domain_populate_physmap(xch, ctx->domid, 1, SUPERPAGE_= 2MB_SHIFT, 0, &sp_extent); + IPRINTF("2M base_pfn %lx count %lu done %ld\n", (long)base_pfn, co= unt, done); + if (done > 0) { + success =3D true; + ctx->x86_hvm.restore.alloc_cnt +=3D count; + stat_2m =3D done; + } + } + if (success =3D=3D false) { + count =3D 1; + sp_extent =3D base_pfn =3D pfn; + done =3D xc_domain_populate_physmap(xch, ctx->domid, count, 0, 0, = &sp_extent); + if (done > 0) { + success =3D true; + ctx->x86_hvm.restore.alloc_cnt +=3D count; + stat_4k =3D count; + } + } + IPRINTF("count %lu 1G %lu 2M %lu 4k %lu\n", count, stat_1g, stat_2m, s= tat_4k); + if (success =3D=3D true) { + do { + count--; + rc =3D pfn_set_allocated(ctx, base_pfn + count); + if (rc) + break; + } while (count); + } + return rc; +} + struct xc_sr_restore_ops restore_ops_x86_hvm =3D { + .allocate_pfn =3D x86_hvm_allocate_pfn, .pfn_is_valid =3D x86_hvm_pfn_is_valid, .pfn_to_gfn =3D x86_hvm_pfn_to_gfn, .set_gfn =3D x86_hvm_set_gfn, --- a/tools/libxc/xc_sr_restore_x86_pv.c +++ b/tools/libxc/xc_sr_restore_x86_pv.c @@ -1141,8 +1141,15 @@ static int x86_pv_cleanup(struct xc_sr_c return 0; } =20 +static int x86_pv_allocate_pfn(struct xc_sr_context *ctx, xen_pfn_t pfn) +{ + errno =3D ENOMEM; + return -1; +} + struct xc_sr_restore_ops restore_ops_x86_pv =3D { + .allocate_pfn =3D x86_pv_allocate_pfn, .pfn_is_valid =3D x86_pv_pfn_is_valid, .pfn_to_gfn =3D pfn_to_mfn, .set_page_type =3D x86_pv_set_page_type, --- a/tools/libxc/xc_sr_common.c +++ b/tools/libxc/xc_sr_common.c @@ -153,6 +153,42 @@ static void __attribute__((unused)) buil XC_BUILD_BUG_ON(sizeof(struct xc_sr_rec_hvm_params) !=3D 8); } =20 +bool _xc_sr_bitmap_resize(struct xc_sr_bitmap *bm, unsigned long bits) +{ + if (bits > bm->bits) + { + size_t new_max; + size_t old_sz, new_sz; + void *p; + + /* Round up to the nearest power of two larger than bit, less 1. */ + new_max =3D bits; + new_max |=3D new_max >> 1; + new_max |=3D new_max >> 2; + new_max |=3D new_max >> 4; + new_max |=3D new_max >> 8; + new_max |=3D new_max >> 16; +#ifdef __x86_64__ + new_max |=3D new_max >> 32; +#endif + + old_sz =3D bitmap_size(bm->bits + 1); + new_sz =3D bitmap_size(new_max + 1); + p =3D realloc(bm->p, new_sz); + if (!p) + return false; + + if (bm->p) + memset(p + old_sz, 0, new_sz - old_sz); + else + memset(p, 0, new_sz); + + bm->p =3D p; + bm->bits =3D new_max; + } + return true; +} + /* * Local variables: * mode: C --82I3+IH0IqGh5yIs Content-Type: application/pgp-signature; name="signature.asc" -----BEGIN PGP SIGNATURE----- iF0EARECAB0WIQSkRyP6Rn//f03pRUBdQqD6ppg2fgUCWQMWOQAKCRBdQqD6ppg2 frIRAKCLXJeTkpzAaIjwXgu3VdRRifcIVQCguI2bApbKaldvMvdhVO3Wz5tfJA0= =qVNq -----END PGP SIGNATURE----- --82I3+IH0IqGh5yIs-- --===============9073118524411200435== Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: base64 Content-Disposition: inline X19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX19fX18KWGVuLWRldmVs IG1haWxpbmcgbGlzdApYZW4tZGV2ZWxAbGlzdHMueGVuLm9yZwpodHRwczovL2xpc3RzLnhlbi5v cmcveGVuLWRldmVsCg== --===============9073118524411200435==--