* [PATCH] Make restore work properly with PV superpage flag
@ 2012-10-12 17:32 Dave McCracken
2012-10-18 10:05 ` Ian Campbell
2012-10-18 14:41 ` George Dunlap
0 siblings, 2 replies; 6+ messages in thread
From: Dave McCracken @ 2012-10-12 17:32 UTC (permalink / raw)
To: George Dunlap, Keir Fraser; +Cc: Xen Developers List
For PV guests, the superpage flag means to unconditionally allocate all
pages as superpages, which is required for Linux hugepages to work. Support
for this on restore was not supported. This patch adds proper support for
the superpage flag on restore.
For HVM guests, the superpage flag has been used to mean attempt to allocate
superpages if possible on restore. This functionality has been preserved.
Signed-off-by: Dave McCracken <dave.mccracken@oracle.com>
--------
xc_domain_restore.c | 130 ++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 101 insertions(+), 29 deletions(-)
--- xen-unstable/tools/libxc/xc_domain_restore.c 2012-08-18 10:04:53.000000000 -0500
+++ xen-unstable-restore//tools/libxc/xc_domain_restore.c 2012-08-20 07:24:22.000000000 -0500
@@ -23,6 +23,19 @@
*
*/
+/*
+ * The superpages flag in restore has two different meanings depending on
+ * the type of domain.
+ *
+ * For an HVM domain, the flag means to look for properly aligned contiguous
+ * pages and try to allocate a superpage to satisfy it. If that fails,
+ * fall back to small pages.
+ *
+ * For a PV domain, the flag means allocate all memory as superpages. If that
+ * fails, the restore fails. This behavior is required for PV guests who
+ * want to use superpages.
+ */
+
#include <stdlib.h>
#include <unistd.h>
@@ -41,6 +54,9 @@ struct restore_ctx {
xen_pfn_t *live_p2m; /* Live mapping of the table mapping each PFN to its current MFN. */
xen_pfn_t *p2m; /* A table mapping each PFN to its new MFN. */
xen_pfn_t *p2m_batch; /* A table of P2M mappings in the current region. */
+ xen_pfn_t *p2m_saved_batch; /* Copy of p2m_batch array for pv superpage alloc */
+ int superpages; /* Superpage allocation has been requested */
+ int hvm; /* This is an hvm domain */
int completed; /* Set when a consistent image is available */
int last_checkpoint; /* Set when we should commit to the current checkpoint when it completes. */
int compressing; /* Set when sender signals that pages would be sent compressed (for Remus) */
@@ -49,11 +65,6 @@ struct restore_ctx {
#define HEARTBEAT_MS 1000
-#define SUPERPAGE_PFN_SHIFT 9
-#define SUPERPAGE_NR_PFNS (1UL << SUPERPAGE_PFN_SHIFT)
-
-#define SUPER_PAGE_START(pfn) (((pfn) & (SUPERPAGE_NR_PFNS-1)) == 0 )
-
#ifndef __MINIOS__
static ssize_t rdexact(xc_interface *xch, struct restore_ctx *ctx,
int fd, void* buf, size_t size)
@@ -103,6 +114,49 @@ static ssize_t rdexact(xc_interface *xch
#else
#define RDEXACT read_exact
#endif
+
+#define SUPERPAGE_PFN_SHIFT 9
+#define SUPERPAGE_NR_PFNS (1UL << SUPERPAGE_PFN_SHIFT)
+#define SUPERPAGE(_pfn) ((_pfn) & (~(SUPERPAGE_NR_PFNS-1)))
+#define SUPER_PAGE_START(pfn) (((pfn) & (SUPERPAGE_NR_PFNS-1)) == 0 )
+
+/*
+** When we're restoring into a pv superpage-allocated guest, we take
+** a copy of the p2m_batch array to preserve the pfn, then allocate the
+** corresponding superpages. We then fill in the p2m array using the saved
+** pfns.
+*/
+static int alloc_superpage_mfns(
+ xc_interface *xch, uint32_t dom, struct restore_ctx *ctx, int nr_mfns)
+{
+ int i, j, max = 0;
+ unsigned long pfn, base_pfn, mfn;
+
+ for (i = 0; i < nr_mfns; i++)
+ {
+ pfn = ctx->p2m_batch[i];
+ base_pfn = SUPERPAGE(pfn);
+ if (ctx->p2m[base_pfn] != (INVALID_P2M_ENTRY-2))
+ {
+ ctx->p2m_saved_batch[max] = base_pfn;
+ ctx->p2m_batch[max] = base_pfn;
+ max++;
+ ctx->p2m[base_pfn] = INVALID_P2M_ENTRY-2;
+ }
+ }
+ if (xc_domain_populate_physmap_exact(xch, dom, max, SUPERPAGE_PFN_SHIFT,
+ 0, ctx->p2m_batch) != 0)
+ return 1;
+
+ for (i = 0; i < max; i++)
+ {
+ mfn = ctx->p2m_batch[i];
+ pfn = ctx->p2m_saved_batch[i];
+ for (j = 0; j < SUPERPAGE_NR_PFNS; j++)
+ ctx->p2m[pfn++] = mfn++;
+ }
+ return 0;
+}
/*
** In the state file (or during transfer), all page-table pages are
** converted into a 'canonical' form where references to actual mfns
@@ -113,7 +167,7 @@ static ssize_t rdexact(xc_interface *xch
static int uncanonicalize_pagetable(
xc_interface *xch, uint32_t dom, struct restore_ctx *ctx, void *page)
{
- int i, pte_last, nr_mfns = 0;
+ int i, rc, pte_last, nr_mfns = 0;
unsigned long pfn;
uint64_t pte;
struct domain_info_context *dinfo = &ctx->dinfo;
@@ -152,13 +206,20 @@ static int uncanonicalize_pagetable(
}
/* Allocate the requisite number of mfns. */
- if ( nr_mfns &&
- (xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0, 0,
- ctx->p2m_batch) != 0) )
- {
- ERROR("Failed to allocate memory for batch.!\n");
- errno = ENOMEM;
- return 0;
+ if (nr_mfns)
+ {
+ if (!ctx->hvm && ctx->superpages)
+ rc = alloc_superpage_mfns(xch, dom, ctx, nr_mfns);
+ else
+ rc = xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0, 0,
+ ctx->p2m_batch);
+
+ if (rc)
+ {
+ ERROR("Failed to allocate memory for batch.!\n");
+ errno = ENOMEM;
+ return 0;
+ }
}
/* Second pass: uncanonicalize each present PTE */
@@ -1018,8 +1079,8 @@ static int pagebuf_get(xc_interface *xch
static int apply_batch(xc_interface *xch, uint32_t dom, struct restore_ctx *ctx,
xen_pfn_t* region_mfn, unsigned long* pfn_type, int pae_extended_cr3,
- unsigned int hvm, struct xc_mmu* mmu,
- pagebuf_t* pagebuf, int curbatch, int superpages)
+ struct xc_mmu* mmu,
+ pagebuf_t* pagebuf, int curbatch)
{
int i, j, curpage, nr_mfns;
int k, scount;
@@ -1061,7 +1122,7 @@ static int apply_batch(xc_interface *xch
/* Is this the next expected continuation? */
if ( pfn == superpage_start + scount )
{
- if ( !superpages )
+ if ( !ctx->superpages )
{
ERROR("Unexpexted codepath with no superpages");
return -1;
@@ -1114,16 +1175,17 @@ static int apply_batch(xc_interface *xch
}
/* Are we ready to start a new superpage candidate? */
- if ( superpages && SUPER_PAGE_START(pfn) )
+ if ( ctx->hvm && ctx->superpages && SUPER_PAGE_START(pfn) )
{
superpage_start=pfn;
scount++;
- continue;
}
-
- /* Add the current pfn to pfn_batch */
- ctx->p2m_batch[nr_mfns++] = pfn;
- ctx->p2m[pfn]--;
+ else
+ {
+ /* Add the current pfn to pfn_batch */
+ ctx->p2m_batch[nr_mfns++] = pfn;
+ ctx->p2m[pfn]--;
+ }
}
}
@@ -1144,9 +1206,14 @@ static int apply_batch(xc_interface *xch
{
DPRINTF("Mapping order 0, %d; first pfn %lx\n", nr_mfns, ctx->p2m_batch[0]);
- if(xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0,
- 0, ctx->p2m_batch) != 0)
- {
+ if (!ctx->hvm && ctx->superpages)
+ rc = alloc_superpage_mfns(xch, dom, ctx, nr_mfns);
+ else
+ rc = xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0, 0,
+ ctx->p2m_batch);
+
+ if (rc)
+ {
ERROR("Failed to allocate memory for batch.!\n");
errno = ENOMEM;
return -1;
@@ -1175,7 +1242,7 @@ static int apply_batch(xc_interface *xch
|| pagetype == XEN_DOMCTL_PFINFO_XALLOC )
region_mfn[i] = ~0UL; /* map will fail but we don't care */
else
- region_mfn[i] = hvm ? pfn : ctx->p2m[pfn];
+ region_mfn[i] = ctx->hvm ? pfn : ctx->p2m[pfn];
}
/* Map relevant mfns */
@@ -1298,7 +1365,7 @@ static int apply_batch(xc_interface *xch
}
}
- if ( !hvm &&
+ if ( !ctx->hvm &&
xc_add_mmu_update(xch, mmu,
(((unsigned long long)mfn) << PAGE_SHIFT)
| MMU_MACHPHYS_UPDATE, pfn) )
@@ -1389,6 +1456,9 @@ int xc_domain_restore(xc_interface *xch,
memset(ctx, 0, sizeof(*ctx));
+ ctx->superpages = superpages;
+ ctx->hvm = hvm;
+
ctxt = xc_hypercall_buffer_alloc(xch, ctxt, sizeof(*ctxt));
if ( ctxt == NULL )
@@ -1452,6 +1522,9 @@ int xc_domain_restore(xc_interface *xch,
region_mfn = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
ctx->p2m_batch = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
+ if (!ctx->hvm && ctx->superpages)
+ ctx->p2m_saved_batch =
+ malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
if ( (ctx->p2m == NULL) || (pfn_type == NULL) ||
(region_mfn == NULL) || (ctx->p2m_batch == NULL) )
@@ -1575,8 +1648,7 @@ int xc_domain_restore(xc_interface *xch,
int brc;
brc = apply_batch(xch, dom, ctx, region_mfn, pfn_type,
- pae_extended_cr3, hvm, mmu, &pagebuf, curbatch,
- superpages);
+ pae_extended_cr3, mmu, &pagebuf, curbatch);
if ( brc < 0 )
goto out;
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] Make restore work properly with PV superpage flag
2012-10-18 10:05 ` Ian Campbell
@ 2012-10-18 10:01 ` George Dunlap
0 siblings, 0 replies; 6+ messages in thread
From: George Dunlap @ 2012-10-18 10:01 UTC (permalink / raw)
To: Ian Campbell; +Cc: Dave McCracken, Keir (Xen.org), Xen Developers List
On 18/10/12 11:05, Ian Campbell wrote:
> George, Keir, any thoughts or opinions on this patch?
It's on my to-review short-list. I should be able to get to it by the
end of the week.
-George
>
> On Fri, 2012-10-12 at 18:32 +0100, Dave McCracken wrote:
>> For PV guests, the superpage flag means to unconditionally allocate all
>> pages as superpages, which is required for Linux hugepages to work. Support
>> for this on restore was not supported. This patch adds proper support for
>> the superpage flag on restore.
>>
>> For HVM guests, the superpage flag has been used to mean attempt to allocate
>> superpages if possible on restore. This functionality has been preserved.
>>
>> Signed-off-by: Dave McCracken <dave.mccracken@oracle.com>
>>
>> --------
>>
>> xc_domain_restore.c | 130 ++++++++++++++++++++++++++++++++++++++++------------
>> 1 file changed, 101 insertions(+), 29 deletions(-)
>>
>>
>> --- xen-unstable/tools/libxc/xc_domain_restore.c 2012-08-18 10:04:53.000000000 -0500
>> +++ xen-unstable-restore//tools/libxc/xc_domain_restore.c 2012-08-20 07:24:22.000000000 -0500
>> @@ -23,6 +23,19 @@
>> *
>> */
>>
>> +/*
>> + * The superpages flag in restore has two different meanings depending on
>> + * the type of domain.
>> + *
>> + * For an HVM domain, the flag means to look for properly aligned contiguous
>> + * pages and try to allocate a superpage to satisfy it. If that fails,
>> + * fall back to small pages.
>> + *
>> + * For a PV domain, the flag means allocate all memory as superpages. If that
>> + * fails, the restore fails. This behavior is required for PV guests who
>> + * want to use superpages.
>> + */
>> +
>> #include <stdlib.h>
>> #include <unistd.h>
>>
>> @@ -41,6 +54,9 @@ struct restore_ctx {
>> xen_pfn_t *live_p2m; /* Live mapping of the table mapping each PFN to its current MFN. */
>> xen_pfn_t *p2m; /* A table mapping each PFN to its new MFN. */
>> xen_pfn_t *p2m_batch; /* A table of P2M mappings in the current region. */
>> + xen_pfn_t *p2m_saved_batch; /* Copy of p2m_batch array for pv superpage alloc */
>> + int superpages; /* Superpage allocation has been requested */
>> + int hvm; /* This is an hvm domain */
>> int completed; /* Set when a consistent image is available */
>> int last_checkpoint; /* Set when we should commit to the current checkpoint when it completes. */
>> int compressing; /* Set when sender signals that pages would be sent compressed (for Remus) */
>> @@ -49,11 +65,6 @@ struct restore_ctx {
>>
>> #define HEARTBEAT_MS 1000
>>
>> -#define SUPERPAGE_PFN_SHIFT 9
>> -#define SUPERPAGE_NR_PFNS (1UL << SUPERPAGE_PFN_SHIFT)
>> -
>> -#define SUPER_PAGE_START(pfn) (((pfn) & (SUPERPAGE_NR_PFNS-1)) == 0 )
>> -
>> #ifndef __MINIOS__
>> static ssize_t rdexact(xc_interface *xch, struct restore_ctx *ctx,
>> int fd, void* buf, size_t size)
>> @@ -103,6 +114,49 @@ static ssize_t rdexact(xc_interface *xch
>> #else
>> #define RDEXACT read_exact
>> #endif
>> +
>> +#define SUPERPAGE_PFN_SHIFT 9
>> +#define SUPERPAGE_NR_PFNS (1UL << SUPERPAGE_PFN_SHIFT)
>> +#define SUPERPAGE(_pfn) ((_pfn) & (~(SUPERPAGE_NR_PFNS-1)))
>> +#define SUPER_PAGE_START(pfn) (((pfn) & (SUPERPAGE_NR_PFNS-1)) == 0 )
> Why this code motion?
>
>> +
>> +/*
>> +** When we're restoring into a pv superpage-allocated guest, we take
>> +** a copy of the p2m_batch array to preserve the pfn, then allocate the
>> +** corresponding superpages. We then fill in the p2m array using the saved
>> +** pfns.
> So the pfns are not contiguous, but we back them with a super page? IOW
> they are super pages in machine address space but not physical address
> space?
>
> If they are contig in p-space you just need to save the first one?
>
>> +*/
>> +static int alloc_superpage_mfns(
>> + xc_interface *xch, uint32_t dom, struct restore_ctx *ctx, int nr_mfns)
>> +{
>> + int i, j, max = 0;
>> + unsigned long pfn, base_pfn, mfn;
>> +
>> + for (i = 0; i < nr_mfns; i++)
>> + {
>> + pfn = ctx->p2m_batch[i];
>> + base_pfn = SUPERPAGE(pfn);
>> + if (ctx->p2m[base_pfn] != (INVALID_P2M_ENTRY-2))
>> + {
>> + ctx->p2m_saved_batch[max] = base_pfn;
> Is p2m_saved_batch used anywhere other than in this function? Could it
> be a local variable?
>
>> + ctx->p2m_batch[max] = base_pfn;
>> + max++;
>> + ctx->p2m[base_pfn] = INVALID_P2M_ENTRY-2;
> What is this magic -2? Can we get a symbolic constant (probably for the
> whole expression)?
>
>> + }
>> + }
>> + if (xc_domain_populate_physmap_exact(xch, dom, max, SUPERPAGE_PFN_SHIFT,
>> + 0, ctx->p2m_batch) != 0)
>> + return 1;
>> +
>> + for (i = 0; i < max; i++)
>> + {
>> + mfn = ctx->p2m_batch[i];
>> + pfn = ctx->p2m_saved_batch[i];
> If the original was != (INVALID_P2M_ENTRY-2) then you didn't initialise
> p2m_saved_batch for this index?
>
>> + for (j = 0; j < SUPERPAGE_NR_PFNS; j++)
>> + ctx->p2m[pfn++] = mfn++;
>> + }
>> + return 0;
>> +}
>> /*
>> ** In the state file (or during transfer), all page-table pages are
>> ** converted into a 'canonical' form where references to actual mfns
>> @@ -113,7 +167,7 @@ static ssize_t rdexact(xc_interface *xch
>> static int uncanonicalize_pagetable(
>> xc_interface *xch, uint32_t dom, struct restore_ctx *ctx, void *page)
>> {
>> - int i, pte_last, nr_mfns = 0;
>> + int i, rc, pte_last, nr_mfns = 0;
>> unsigned long pfn;
>> uint64_t pte;
>> struct domain_info_context *dinfo = &ctx->dinfo;
>> @@ -152,13 +206,20 @@ static int uncanonicalize_pagetable(
>> }
>>
>> /* Allocate the requisite number of mfns. */
>> - if ( nr_mfns &&
>> - (xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0, 0,
>> - ctx->p2m_batch) != 0) )
>> - {
>> - ERROR("Failed to allocate memory for batch.!\n");
>> - errno = ENOMEM;
>> - return 0;
>> + if (nr_mfns)
>> + {
>> + if (!ctx->hvm && ctx->superpages)
>> + rc = alloc_superpage_mfns(xch, dom, ctx, nr_mfns);
>> + else
>> + rc = xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0, 0,
>> + ctx->p2m_batch);
> Would it be clearer to rename alloc_superpage_mfns to (e.g.)
> alloc_guest_mfns and have it start:
> if ( ctx->hvm || !ctx->superpages )
> return xc_domain...
>
> Keeps the allocation logic in one place, since it is repeated again
> below.
>
>> +
>> + if (rc)
>> + {
>> + ERROR("Failed to allocate memory for batch.!\n");
>> + errno = ENOMEM;
>> + return 0;
>> + }
>> }
>>
>> /* Second pass: uncanonicalize each present PTE */
>> @@ -1018,8 +1079,8 @@ static int pagebuf_get(xc_interface *xch
>>
>> static int apply_batch(xc_interface *xch, uint32_t dom, struct restore_ctx *ctx,
>> xen_pfn_t* region_mfn, unsigned long* pfn_type, int pae_extended_cr3,
>> - unsigned int hvm, struct xc_mmu* mmu,
>> - pagebuf_t* pagebuf, int curbatch, int superpages)
>> + struct xc_mmu* mmu,
>> + pagebuf_t* pagebuf, int curbatch)
>> {
>> int i, j, curpage, nr_mfns;
>> int k, scount;
>> @@ -1061,7 +1122,7 @@ static int apply_batch(xc_interface *xch
>> /* Is this the next expected continuation? */
>> if ( pfn == superpage_start + scount )
>> {
>> - if ( !superpages )
>> + if ( !ctx->superpages )
>> {
>> ERROR("Unexpexted codepath with no superpages");
>> return -1;
>> @@ -1114,16 +1175,17 @@ static int apply_batch(xc_interface *xch
>> }
>>
>> /* Are we ready to start a new superpage candidate? */
>> - if ( superpages && SUPER_PAGE_START(pfn) )
>> + if ( ctx->hvm && ctx->superpages && SUPER_PAGE_START(pfn) )
> Can we push the movement of these things into the ctx struct into a
> separate preparatory patch so it's easier to see the actual code
> changes.
>
>> {
>> superpage_start=pfn;
>> scount++;
>> - continue;
>> }
>> -
>> - /* Add the current pfn to pfn_batch */
>> - ctx->p2m_batch[nr_mfns++] = pfn;
>> - ctx->p2m[pfn]--;
>> + else
>> + {
>> + /* Add the current pfn to pfn_batch */
>> + ctx->p2m_batch[nr_mfns++] = pfn;
>> + ctx->p2m[pfn]--;
>> + }
>> }
>> }
>>
>> @@ -1144,9 +1206,14 @@ static int apply_batch(xc_interface *xch
>> {
>> DPRINTF("Mapping order 0, %d; first pfn %lx\n", nr_mfns, ctx->p2m_batch[0]);
>>
>> - if(xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0,
>> - 0, ctx->p2m_batch) != 0)
>> - {
>> + if (!ctx->hvm && ctx->superpages)
>> + rc = alloc_superpage_mfns(xch, dom, ctx, nr_mfns);
>> + else
>> + rc = xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0, 0,
>> + ctx->p2m_batch);
>> +
>> + if (rc)
>> + {
>> ERROR("Failed to allocate memory for batch.!\n");
>> errno = ENOMEM;
>> return -1;
>> @@ -1175,7 +1242,7 @@ static int apply_batch(xc_interface *xch
>> || pagetype == XEN_DOMCTL_PFINFO_XALLOC )
>> region_mfn[i] = ~0UL; /* map will fail but we don't care */
>> else
>> - region_mfn[i] = hvm ? pfn : ctx->p2m[pfn];
>> + region_mfn[i] = ctx->hvm ? pfn : ctx->p2m[pfn];
>> }
>>
>> /* Map relevant mfns */
>> @@ -1298,7 +1365,7 @@ static int apply_batch(xc_interface *xch
>> }
>> }
>>
>> - if ( !hvm &&
>> + if ( !ctx->hvm &&
>> xc_add_mmu_update(xch, mmu,
>> (((unsigned long long)mfn) << PAGE_SHIFT)
>> | MMU_MACHPHYS_UPDATE, pfn) )
>> @@ -1389,6 +1456,9 @@ int xc_domain_restore(xc_interface *xch,
>>
>> memset(ctx, 0, sizeof(*ctx));
>>
>> + ctx->superpages = superpages;
>> + ctx->hvm = hvm;
>> +
>> ctxt = xc_hypercall_buffer_alloc(xch, ctxt, sizeof(*ctxt));
>>
>> if ( ctxt == NULL )
>> @@ -1452,6 +1522,9 @@ int xc_domain_restore(xc_interface *xch,
>>
>> region_mfn = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
>> ctx->p2m_batch = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
>> + if (!ctx->hvm && ctx->superpages)
>> + ctx->p2m_saved_batch =
>> + malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
>>
>> if ( (ctx->p2m == NULL) || (pfn_type == NULL) ||
>> (region_mfn == NULL) || (ctx->p2m_batch == NULL) )
>> @@ -1575,8 +1648,7 @@ int xc_domain_restore(xc_interface *xch,
>> int brc;
>>
>> brc = apply_batch(xch, dom, ctx, region_mfn, pfn_type,
>> - pae_extended_cr3, hvm, mmu, &pagebuf, curbatch,
>> - superpages);
>> + pae_extended_cr3, mmu, &pagebuf, curbatch);
>> if ( brc < 0 )
>> goto out;
>>
>>
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xen.org
>> http://lists.xen.org/xen-devel
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] Make restore work properly with PV superpage flag
2012-10-12 17:32 [PATCH] Make restore work properly with PV superpage flag Dave McCracken
@ 2012-10-18 10:05 ` Ian Campbell
2012-10-18 10:01 ` George Dunlap
2012-10-18 14:41 ` George Dunlap
1 sibling, 1 reply; 6+ messages in thread
From: Ian Campbell @ 2012-10-18 10:05 UTC (permalink / raw)
To: Dave McCracken; +Cc: George Dunlap, Xen Developers List, Keir (Xen.org)
George, Keir, any thoughts or opinions on this patch?
On Fri, 2012-10-12 at 18:32 +0100, Dave McCracken wrote:
> For PV guests, the superpage flag means to unconditionally allocate all
> pages as superpages, which is required for Linux hugepages to work. Support
> for this on restore was not supported. This patch adds proper support for
> the superpage flag on restore.
>
> For HVM guests, the superpage flag has been used to mean attempt to allocate
> superpages if possible on restore. This functionality has been preserved.
>
> Signed-off-by: Dave McCracken <dave.mccracken@oracle.com>
>
> --------
>
> xc_domain_restore.c | 130 ++++++++++++++++++++++++++++++++++++++++------------
> 1 file changed, 101 insertions(+), 29 deletions(-)
>
>
> --- xen-unstable/tools/libxc/xc_domain_restore.c 2012-08-18 10:04:53.000000000 -0500
> +++ xen-unstable-restore//tools/libxc/xc_domain_restore.c 2012-08-20 07:24:22.000000000 -0500
> @@ -23,6 +23,19 @@
> *
> */
>
> +/*
> + * The superpages flag in restore has two different meanings depending on
> + * the type of domain.
> + *
> + * For an HVM domain, the flag means to look for properly aligned contiguous
> + * pages and try to allocate a superpage to satisfy it. If that fails,
> + * fall back to small pages.
> + *
> + * For a PV domain, the flag means allocate all memory as superpages. If that
> + * fails, the restore fails. This behavior is required for PV guests who
> + * want to use superpages.
> + */
> +
> #include <stdlib.h>
> #include <unistd.h>
>
> @@ -41,6 +54,9 @@ struct restore_ctx {
> xen_pfn_t *live_p2m; /* Live mapping of the table mapping each PFN to its current MFN. */
> xen_pfn_t *p2m; /* A table mapping each PFN to its new MFN. */
> xen_pfn_t *p2m_batch; /* A table of P2M mappings in the current region. */
> + xen_pfn_t *p2m_saved_batch; /* Copy of p2m_batch array for pv superpage alloc */
> + int superpages; /* Superpage allocation has been requested */
> + int hvm; /* This is an hvm domain */
> int completed; /* Set when a consistent image is available */
> int last_checkpoint; /* Set when we should commit to the current checkpoint when it completes. */
> int compressing; /* Set when sender signals that pages would be sent compressed (for Remus) */
> @@ -49,11 +65,6 @@ struct restore_ctx {
>
> #define HEARTBEAT_MS 1000
>
> -#define SUPERPAGE_PFN_SHIFT 9
> -#define SUPERPAGE_NR_PFNS (1UL << SUPERPAGE_PFN_SHIFT)
> -
> -#define SUPER_PAGE_START(pfn) (((pfn) & (SUPERPAGE_NR_PFNS-1)) == 0 )
> -
> #ifndef __MINIOS__
> static ssize_t rdexact(xc_interface *xch, struct restore_ctx *ctx,
> int fd, void* buf, size_t size)
> @@ -103,6 +114,49 @@ static ssize_t rdexact(xc_interface *xch
> #else
> #define RDEXACT read_exact
> #endif
> +
> +#define SUPERPAGE_PFN_SHIFT 9
> +#define SUPERPAGE_NR_PFNS (1UL << SUPERPAGE_PFN_SHIFT)
> +#define SUPERPAGE(_pfn) ((_pfn) & (~(SUPERPAGE_NR_PFNS-1)))
> +#define SUPER_PAGE_START(pfn) (((pfn) & (SUPERPAGE_NR_PFNS-1)) == 0 )
Why this code motion?
> +
> +/*
> +** When we're restoring into a pv superpage-allocated guest, we take
> +** a copy of the p2m_batch array to preserve the pfn, then allocate the
> +** corresponding superpages. We then fill in the p2m array using the saved
> +** pfns.
So the pfns are not contiguous, but we back them with a super page? IOW
they are super pages in machine address space but not physical address
space?
If they are contig in p-space you just need to save the first one?
> +*/
> +static int alloc_superpage_mfns(
> + xc_interface *xch, uint32_t dom, struct restore_ctx *ctx, int nr_mfns)
> +{
> + int i, j, max = 0;
> + unsigned long pfn, base_pfn, mfn;
> +
> + for (i = 0; i < nr_mfns; i++)
> + {
> + pfn = ctx->p2m_batch[i];
> + base_pfn = SUPERPAGE(pfn);
> + if (ctx->p2m[base_pfn] != (INVALID_P2M_ENTRY-2))
> + {
> + ctx->p2m_saved_batch[max] = base_pfn;
Is p2m_saved_batch used anywhere other than in this function? Could it
be a local variable?
> + ctx->p2m_batch[max] = base_pfn;
> + max++;
> + ctx->p2m[base_pfn] = INVALID_P2M_ENTRY-2;
What is this magic -2? Can we get a symbolic constant (probably for the
whole expression)?
> + }
> + }
> + if (xc_domain_populate_physmap_exact(xch, dom, max, SUPERPAGE_PFN_SHIFT,
> + 0, ctx->p2m_batch) != 0)
> + return 1;
> +
> + for (i = 0; i < max; i++)
> + {
> + mfn = ctx->p2m_batch[i];
> + pfn = ctx->p2m_saved_batch[i];
If the original was != (INVALID_P2M_ENTRY-2) then you didn't initialise
p2m_saved_batch for this index?
> + for (j = 0; j < SUPERPAGE_NR_PFNS; j++)
> + ctx->p2m[pfn++] = mfn++;
> + }
> + return 0;
> +}
> /*
> ** In the state file (or during transfer), all page-table pages are
> ** converted into a 'canonical' form where references to actual mfns
> @@ -113,7 +167,7 @@ static ssize_t rdexact(xc_interface *xch
> static int uncanonicalize_pagetable(
> xc_interface *xch, uint32_t dom, struct restore_ctx *ctx, void *page)
> {
> - int i, pte_last, nr_mfns = 0;
> + int i, rc, pte_last, nr_mfns = 0;
> unsigned long pfn;
> uint64_t pte;
> struct domain_info_context *dinfo = &ctx->dinfo;
> @@ -152,13 +206,20 @@ static int uncanonicalize_pagetable(
> }
>
> /* Allocate the requisite number of mfns. */
> - if ( nr_mfns &&
> - (xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0, 0,
> - ctx->p2m_batch) != 0) )
> - {
> - ERROR("Failed to allocate memory for batch.!\n");
> - errno = ENOMEM;
> - return 0;
> + if (nr_mfns)
> + {
> + if (!ctx->hvm && ctx->superpages)
> + rc = alloc_superpage_mfns(xch, dom, ctx, nr_mfns);
> + else
> + rc = xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0, 0,
> + ctx->p2m_batch);
Would it be clearer to rename alloc_superpage_mfns to (e.g.)
alloc_guest_mfns and have it start:
if ( ctx->hvm || !ctx->superpages )
return xc_domain...
Keeps the allocation logic in one place, since it is repeated again
below.
> +
> + if (rc)
> + {
> + ERROR("Failed to allocate memory for batch.!\n");
> + errno = ENOMEM;
> + return 0;
> + }
> }
>
> /* Second pass: uncanonicalize each present PTE */
> @@ -1018,8 +1079,8 @@ static int pagebuf_get(xc_interface *xch
>
> static int apply_batch(xc_interface *xch, uint32_t dom, struct restore_ctx *ctx,
> xen_pfn_t* region_mfn, unsigned long* pfn_type, int pae_extended_cr3,
> - unsigned int hvm, struct xc_mmu* mmu,
> - pagebuf_t* pagebuf, int curbatch, int superpages)
> + struct xc_mmu* mmu,
> + pagebuf_t* pagebuf, int curbatch)
> {
> int i, j, curpage, nr_mfns;
> int k, scount;
> @@ -1061,7 +1122,7 @@ static int apply_batch(xc_interface *xch
> /* Is this the next expected continuation? */
> if ( pfn == superpage_start + scount )
> {
> - if ( !superpages )
> + if ( !ctx->superpages )
> {
> ERROR("Unexpexted codepath with no superpages");
> return -1;
> @@ -1114,16 +1175,17 @@ static int apply_batch(xc_interface *xch
> }
>
> /* Are we ready to start a new superpage candidate? */
> - if ( superpages && SUPER_PAGE_START(pfn) )
> + if ( ctx->hvm && ctx->superpages && SUPER_PAGE_START(pfn) )
Can we push the movement of these things into the ctx struct into a
separate preparatory patch so it's easier to see the actual code
changes.
> {
> superpage_start=pfn;
> scount++;
> - continue;
> }
> -
> - /* Add the current pfn to pfn_batch */
> - ctx->p2m_batch[nr_mfns++] = pfn;
> - ctx->p2m[pfn]--;
> + else
> + {
> + /* Add the current pfn to pfn_batch */
> + ctx->p2m_batch[nr_mfns++] = pfn;
> + ctx->p2m[pfn]--;
> + }
> }
> }
>
> @@ -1144,9 +1206,14 @@ static int apply_batch(xc_interface *xch
> {
> DPRINTF("Mapping order 0, %d; first pfn %lx\n", nr_mfns, ctx->p2m_batch[0]);
>
> - if(xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0,
> - 0, ctx->p2m_batch) != 0)
> - {
> + if (!ctx->hvm && ctx->superpages)
> + rc = alloc_superpage_mfns(xch, dom, ctx, nr_mfns);
> + else
> + rc = xc_domain_populate_physmap_exact(xch, dom, nr_mfns, 0, 0,
> + ctx->p2m_batch);
> +
> + if (rc)
> + {
> ERROR("Failed to allocate memory for batch.!\n");
> errno = ENOMEM;
> return -1;
> @@ -1175,7 +1242,7 @@ static int apply_batch(xc_interface *xch
> || pagetype == XEN_DOMCTL_PFINFO_XALLOC )
> region_mfn[i] = ~0UL; /* map will fail but we don't care */
> else
> - region_mfn[i] = hvm ? pfn : ctx->p2m[pfn];
> + region_mfn[i] = ctx->hvm ? pfn : ctx->p2m[pfn];
> }
>
> /* Map relevant mfns */
> @@ -1298,7 +1365,7 @@ static int apply_batch(xc_interface *xch
> }
> }
>
> - if ( !hvm &&
> + if ( !ctx->hvm &&
> xc_add_mmu_update(xch, mmu,
> (((unsigned long long)mfn) << PAGE_SHIFT)
> | MMU_MACHPHYS_UPDATE, pfn) )
> @@ -1389,6 +1456,9 @@ int xc_domain_restore(xc_interface *xch,
>
> memset(ctx, 0, sizeof(*ctx));
>
> + ctx->superpages = superpages;
> + ctx->hvm = hvm;
> +
> ctxt = xc_hypercall_buffer_alloc(xch, ctxt, sizeof(*ctxt));
>
> if ( ctxt == NULL )
> @@ -1452,6 +1522,9 @@ int xc_domain_restore(xc_interface *xch,
>
> region_mfn = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
> ctx->p2m_batch = malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
> + if (!ctx->hvm && ctx->superpages)
> + ctx->p2m_saved_batch =
> + malloc(ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
>
> if ( (ctx->p2m == NULL) || (pfn_type == NULL) ||
> (region_mfn == NULL) || (ctx->p2m_batch == NULL) )
> @@ -1575,8 +1648,7 @@ int xc_domain_restore(xc_interface *xch,
> int brc;
>
> brc = apply_batch(xch, dom, ctx, region_mfn, pfn_type,
> - pae_extended_cr3, hvm, mmu, &pagebuf, curbatch,
> - superpages);
> + pae_extended_cr3, mmu, &pagebuf, curbatch);
> if ( brc < 0 )
> goto out;
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] Make restore work properly with PV superpage flag
2012-10-12 17:32 [PATCH] Make restore work properly with PV superpage flag Dave McCracken
2012-10-18 10:05 ` Ian Campbell
@ 2012-10-18 14:41 ` George Dunlap
2012-10-18 14:46 ` Dave McCracken
2012-11-12 17:04 ` Ian Campbell
1 sibling, 2 replies; 6+ messages in thread
From: George Dunlap @ 2012-10-18 14:41 UTC (permalink / raw)
To: Dave McCracken; +Cc: Xen Developers List, Keir Fraser
On Fri, Oct 12, 2012 at 6:32 PM, Dave McCracken <dcm@mccr.org> wrote:
> For PV guests, the superpage flag means to unconditionally allocate all
> pages as superpages, which is required for Linux hugepages to work. Support
> for this on restore was not supported. This patch adds proper support for
> the superpage flag on restore.
>
> For HVM guests, the superpage flag has been used to mean attempt to allocate
> superpages if possible on restore. This functionality has been preserved.
>
> Signed-off-by: Dave McCracken <dave.mccracken@oracle.com>
Looks good -- boy, when you know that all of your pages have to be
superpages, it makes the logic a lot easier. :-) Unfortunately that's
not the case for HVM guests, so we need to keep the complicated
detection logic.
I was trying to think of a way to make the "hvm" and "superpage" based
on the desired behavior (e.g, "detect_superpages",
"always_superpages", or something like that), rather than having to
encode "!hvm && superpage" all over the place, but I think in the end
the way you've written it is fine.
Acked-by: George Dunlap <george.dunlap@eu.citrix.com>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] Make restore work properly with PV superpage flag
2012-10-18 14:41 ` George Dunlap
@ 2012-10-18 14:46 ` Dave McCracken
2012-11-12 17:04 ` Ian Campbell
1 sibling, 0 replies; 6+ messages in thread
From: Dave McCracken @ 2012-10-18 14:46 UTC (permalink / raw)
To: George Dunlap; +Cc: Xen Developers List, Keir Fraser
On Thursday, October 18, 2012, George Dunlap wrote:
> Looks good -- boy, when you know that all of your pages have to be
> superpages, it makes the logic a lot easier. :-) Unfortunately that's
> not the case for HVM guests, so we need to keep the complicated
> detection logic.
>
> I was trying to think of a way to make the "hvm" and "superpage" based
> on the desired behavior (e.g, "detect_superpages",
> "always_superpages", or something like that), rather than having to
> encode "!hvm && superpage" all over the place, but I think in the end
> the way you've written it is fine.
Yeah, I spent a lot of time trying to make some integrated way to do both
types of allocation, but finally gave up and kept the two code paths separate.
In the end I think it's cleaner that way.
Dave
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] Make restore work properly with PV superpage flag
2012-10-18 14:41 ` George Dunlap
2012-10-18 14:46 ` Dave McCracken
@ 2012-11-12 17:04 ` Ian Campbell
1 sibling, 0 replies; 6+ messages in thread
From: Ian Campbell @ 2012-11-12 17:04 UTC (permalink / raw)
To: George Dunlap; +Cc: Dave McCracken, Keir (Xen.org), Xen Developers List
On Thu, 2012-10-18 at 15:41 +0100, George Dunlap wrote:
> On Fri, Oct 12, 2012 at 6:32 PM, Dave McCracken <dcm@mccr.org> wrote:
> > For PV guests, the superpage flag means to unconditionally allocate all
> > pages as superpages, which is required for Linux hugepages to work. Support
> > for this on restore was not supported. This patch adds proper support for
> > the superpage flag on restore.
> >
> > For HVM guests, the superpage flag has been used to mean attempt to allocate
> > superpages if possible on restore. This functionality has been preserved.
> >
> > Signed-off-by: Dave McCracken <dave.mccracken@oracle.com>
[...]
>
> Acked-by: George Dunlap <george.dunlap@eu.citrix.com>
Applied, sorry for the delay.
Ian.
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2012-11-12 17:04 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2012-10-12 17:32 [PATCH] Make restore work properly with PV superpage flag Dave McCracken
2012-10-18 10:05 ` Ian Campbell
2012-10-18 10:01 ` George Dunlap
2012-10-18 14:41 ` George Dunlap
2012-10-18 14:46 ` Dave McCracken
2012-11-12 17:04 ` Ian Campbell
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).