* [PATCH 1 of 9] Add callbacks for suspend, postcopy and preresume in xc_domain_save
2009-05-14 0:19 [PATCH 0 of 9] Xen support for Remus Brendan Cully
@ 2009-05-14 0:19 ` Brendan Cully
2009-05-14 0:19 ` [PATCH 2 of 9] Make xc_domain_restore loop until the fd is closed Brendan Cully
` (7 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Brendan Cully @ 2009-05-14 0:19 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Brendan Cully <brendan@cs.ubc.ca>
# Date 1240355494 25200
# Node ID 904729ffa2692482c77e7da5828c4b218a3a51c2
# Parent 7d552e56d105786838ac027f3625486c9c2ea449
Add callbacks for suspend, postcopy and preresume in xc_domain_save.
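This makes it possible to perform repeated checkpoints: after each round
of pages is written the postcopy callback is invoked, and if the checkpoint
callback returns a positive value the domain is suspended again and another
round is sent.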
Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
diff --git a/tools/libxc/xc_domain_save.c b/tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c
+++ b/tools/libxc/xc_domain_save.c
@@ -337,11 +337,11 @@
return -1;
}
-
-static int suspend_and_state(int (*suspend)(void), int xc_handle, int io_fd,
- int dom, xc_dominfo_t *info)
+static int suspend_and_state(int (*suspend)(void*), void* data,
+ int xc_handle, int io_fd, int dom,
+ xc_dominfo_t *info)
{
- if ( !(*suspend)() )
+ if ( !(*suspend)(data) )
{
ERROR("Suspend request failed");
return -1;
@@ -745,14 +745,15 @@
}
int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
- uint32_t max_factor, uint32_t flags, int (*suspend)(void),
+ uint32_t max_factor, uint32_t flags,
+ struct save_callbacks* callbacks,
int hvm, void *(*init_qemu_maps)(int, unsigned),
void (*qemu_flip_buffer)(int, int))
{
xc_dominfo_t info;
DECLARE_DOMCTL;
- int rc = 1, frc, i, j, last_iter, iter = 0;
+ int rc = 1, frc, i, j, last_iter = 0, iter = 0;
int live = (flags & XCFLAGS_LIVE);
int debug = (flags & XCFLAGS_DEBUG);
int race = 0, sent_last_iter, skip_this_iter;
@@ -873,7 +874,8 @@
else
{
/* This is a non-live suspend. Suspend the domain .*/
- if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info) )
+ if ( suspend_and_state(callbacks->suspend, callbacks->data, xc_handle,
+ io_fd, dom, &info) )
{
ERROR("Domain appears not to have suspended");
goto out;
@@ -994,6 +996,7 @@
print_stats(xc_handle, dom, 0, &stats, 0);
+ copypages:
/* Now write out each data page, canonicalising page tables as we go... */
for ( ; ; )
{
@@ -1307,7 +1310,8 @@
DPRINTF("Start last iteration\n");
last_iter = 1;
- if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info) )
+ if ( suspend_and_state(callbacks->suspend, callbacks->data,
+ xc_handle, io_fd, dom, &info) )
{
ERROR("Domain appears not to have suspended");
goto out;
@@ -1602,6 +1606,40 @@
rc = 0;
out:
+ if ( !rc && callbacks->postcopy )
+ callbacks->postcopy(callbacks->data);
+
+ /* Flush last write and discard cache for file. */
+ discard_file_cache(io_fd, 1 /* flush */);
+
+ /* checkpoint_cb can spend arbitrarily long in between rounds */
+ if (!rc && callbacks->checkpoint &&
+ callbacks->checkpoint(callbacks->data) > 0)
+ {
+ /* reset stats timer */
+ print_stats(xc_handle, dom, 0, &stats, 0);
+
+ rc = 1;
+ /* last_iter = 1; */
+ if ( suspend_and_state(callbacks->suspend, callbacks->data, xc_handle,
+ io_fd, dom, &info) )
+ {
+ ERROR("Domain appears not to have suspended");
+ goto out;
+ }
+ DPRINTF("SUSPEND shinfo %08lx\n", info.shared_info_frame);
+ print_stats(xc_handle, dom, 0, &stats, 1);
+
+ if ( xc_shadow_control(xc_handle, dom,
+ XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
+ p2m_size, NULL, 0, &stats) != p2m_size )
+ {
+ ERROR("Error flushing shadow PT");
+ }
+
+ goto copypages;
+ }
+
if ( live )
{
@@ -1611,9 +1649,6 @@
DPRINTF("Warning - couldn't disable shadow mode");
}
- /* Flush last write and discard cache for file. */
- discard_file_cache(io_fd, 1 /* flush */);
-
if ( live_shinfo )
munmap(live_shinfo, PAGE_SIZE);
diff --git a/tools/libxc/xenguest.h b/tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h
+++ b/tools/libxc/xenguest.h
@@ -14,6 +14,19 @@
#define XCFLAGS_HVM 4
#define XCFLAGS_STDVGA 8
+/* callbacks provided to xc_domain_save by its caller */
+struct save_callbacks {
+ int (*suspend)(void* data);
+ /* callback to rendezvous with external checkpoint functions */
+ int (*postcopy)(void* data);
+ /* returns:
+ * 0: terminate checkpointing gracefully
+ * 1: take another checkpoint */
+ int (*checkpoint)(void* data);
+
+ /* to be provided as the first argument to each callback function */
+ void* data;
+};
/**
* This function will save a running domain.
@@ -25,7 +38,8 @@
*/
int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
- int (*suspend)(void), int hvm,
+ struct save_callbacks* callbacks,
+ int hvm,
void *(*init_qemu_maps)(int, unsigned), /* HVM only */
void (*qemu_flip_buffer)(int, int)); /* HVM only */
diff --git a/tools/python/xen/xend/server/XMLRPCServer.py b/tools/python/xen/xend/server/XMLRPCServer.py
--- a/tools/python/xen/xend/server/XMLRPCServer.py
+++ b/tools/python/xen/xend/server/XMLRPCServer.py
@@ -94,7 +94,8 @@
'destroyDevice','getDeviceSxprs',
'setMemoryTarget', 'setName', 'setVCpuCount', 'shutdown',
'send_sysrq', 'getVCPUInfo', 'waitForDevices',
- 'getRestartCount', 'getBlockDeviceClass']
+ 'getRestartCount', 'getBlockDeviceClass',
+ 'waitForShutdown', 'resumeDomain']
exclude = ['domain_create', 'domain_restore']
diff --git a/tools/xcutils/xc_save.c b/tools/xcutils/xc_save.c
--- a/tools/xcutils/xc_save.c
+++ b/tools/xcutils/xc_save.c
@@ -71,7 +71,7 @@
return 1;
}
-static int suspend(void)
+static int suspend(void* data)
{
unsigned long sx_state = 0;
@@ -217,6 +217,7 @@
{
unsigned int maxit, max_f;
int io_fd, ret, port;
+ struct save_callbacks callbacks;
if (argc != 6)
errx(1, "usage: %s iofd domid maxit maxf flags", argv[0]);
@@ -253,8 +254,10 @@
"using slow path");
}
}
+ memset(&callbacks, 0, sizeof(callbacks));
+ callbacks.suspend = suspend;
ret = xc_domain_save(si.xc_fd, io_fd, si.domid, maxit, max_f, si.flags,
- &suspend, !!(si.flags & XCFLAGS_HVM),
+ &callbacks, !!(si.flags & XCFLAGS_HVM),
&init_qemu_maps, &qemu_flip_buffer);
if (si.suspend_evtchn > 0)
^ permalink raw reply [flat|nested] 10+ messages in thread* [PATCH 2 of 9] Make xc_domain_restore loop until the fd is closed
2009-05-14 0:19 [PATCH 0 of 9] Xen support for Remus Brendan Cully
2009-05-14 0:19 ` [PATCH 1 of 9] Add callbacks for suspend, postcopy and preresume in xc_domain_save Brendan Cully
@ 2009-05-14 0:19 ` Brendan Cully
2009-05-14 0:19 ` [PATCH 3 of 9] Initiate failover if a packet is not received every 500ms Brendan Cully
` (6 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Brendan Cully @ 2009-05-14 0:19 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Brendan Cully <brendan@cs.ubc.ca>
# Date 1240355507 25200
# Node ID f5c0d3208d8ae9183391398d52c9be5969da24ec
# Parent 904729ffa2692482c77e7da5828c4b218a3a51c2
Make xc_domain_restore loop until the fd is closed.
The tail containing the final PFN table, VCPU contexts and
shared_info_page is buffered, then the read loop is restarted.
After the first pass, incoming pages are buffered until the next tail
is read, completing a new consistent checkpoint. At this point, the
memory changes are applied and the loop begins again. When the fd read
fails, the tail buffer is processed.
Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
diff --git a/tools/libxc/xc_domain_restore.c b/tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c
+++ b/tools/libxc/xc_domain_restore.c
@@ -269,6 +269,438 @@
return p2m_frame_list;
}
+typedef struct {
+ unsigned int pfncount;
+ unsigned long* pfntab;
+ unsigned int vcpucount;
+ unsigned char* vcpubuf;
+ unsigned char shared_info_page[PAGE_SIZE];
+} tailbuf_t;
+
+static int buffer_tail(tailbuf_t* buf, int fd, unsigned int max_vcpu_id,
+ uint64_t vcpumap, int ext_vcpucontext)
+{
+ unsigned int i;
+ size_t pfnlen, vcpulen;
+
+ /* TODO: handle changing pfntab and vcpu counts */
+ /* PFN tab */
+ if ( read_exact(fd, &buf->pfncount, sizeof(buf->pfncount)) ||
+ (buf->pfncount > (1U << 28)) ) /* up to 1TB of address space */
+ {
+ ERROR("Error when reading pfn count");
+ return -1;
+ }
+ pfnlen = sizeof(unsigned long) * buf->pfncount;
+ if ( !(buf->pfntab) ) {
+ if ( !(buf->pfntab = malloc(pfnlen)) ) {
+ ERROR("Error allocating PFN tail buffer");
+ return -1;
+ }
+ }
+ // DPRINTF("Reading PFN tab: %d bytes\n", pfnlen);
+ if ( read_exact(fd, buf->pfntab, pfnlen) ) {
+ ERROR("Error when reading pfntab");
+ goto free_pfntab;
+ }
+
+ /* VCPU contexts */
+ buf->vcpucount = 0;
+ for (i = 0; i <= max_vcpu_id; i++) {
+ // DPRINTF("vcpumap: %llx, cpu: %d, bit: %llu\n", vcpumap, i, (vcpumap % (1ULL << i)));
+ if ( (!(vcpumap & (1ULL << i))) )
+ continue;
+ buf->vcpucount++;
+ }
+ // DPRINTF("VCPU count: %d\n", buf->vcpucount);
+ vcpulen = ((guest_width == 8) ? sizeof(vcpu_guest_context_x86_64_t)
+ : sizeof(vcpu_guest_context_x86_32_t)) * buf->vcpucount;
+ if ( ext_vcpucontext )
+ vcpulen += 128 * buf->vcpucount;
+
+ if ( !(buf->vcpubuf) ) {
+ if ( !(buf->vcpubuf = malloc(vcpulen)) ) {
+ ERROR("Error allocating VCPU ctxt tail buffer");
+ goto free_pfntab;
+ }
+ }
+ // DPRINTF("Reading VCPUS: %d bytes\n", vcpulen);
+ if ( read_exact(fd, buf->vcpubuf, vcpulen) ) {
+ ERROR("Error when reading ctxt");
+ goto free_vcpus;
+ }
+
+ /* load shared_info_page */
+ // DPRINTF("Reading shared info: %lu bytes\n", PAGE_SIZE);
+ if ( read_exact(fd, buf->shared_info_page, PAGE_SIZE) ) {
+ ERROR("Error when reading shared info page");
+ goto free_vcpus;
+ }
+
+ return 0;
+
+ free_vcpus:
+ if (buf->vcpubuf) {
+ free (buf->vcpubuf);
+ buf->vcpubuf = NULL;
+ }
+ free_pfntab:
+ if (buf->pfntab) {
+ free (buf->pfntab);
+ buf->pfntab = NULL;
+ }
+
+ return -1;
+}
+
+static void tailbuf_free(tailbuf_t* buf)
+{
+ if (buf->vcpubuf) {
+ free(buf->vcpubuf);
+ buf->vcpubuf = NULL;
+ }
+ if (buf->pfntab) {
+ free(buf->pfntab);
+ buf->pfntab = NULL;
+ }
+}
+
+typedef struct {
+ void* pages;
+ unsigned int nr_physpages, nr_pages; /* pages is of length nr_physpages, pfn_types is of length nr_pages */
+
+ /* Types of the pfns in the current region */
+ unsigned long* pfn_types;
+
+ int verify;
+
+ int new_ctxt_format;
+ int max_vcpu_id;
+ uint64_t vcpumap;
+ uint64_t identpt;
+ uint64_t vm86_tss;
+} pagebuf_t;
+
+static int pagebuf_init(pagebuf_t* buf)
+{
+ memset(buf, 0, sizeof(*buf));
+ return 0;
+}
+
+static void pagebuf_free(pagebuf_t* buf)
+{
+ if (buf->pages) {
+ free(buf->pages);
+ buf->pages = NULL;
+ }
+ if(buf->pfn_types) {
+ free(buf->pfn_types);
+ buf->pfn_types = NULL;
+ }
+}
+
+static int pagebuf_get_one(pagebuf_t* buf, int fd)
+{
+ int count, countpages, oldcount, i;
+ void* ptmp;
+
+ if ( read_exact(fd, &count, sizeof(count)) )
+ {
+ ERROR("Error when reading batch size");
+ return -1;
+ }
+
+ // DPRINTF("reading batch of %d pages\n", count);
+
+ if (!count) {
+ DPRINTF("Last batch read\n");
+ return 0;
+ } else if (count == -1) {
+ DPRINTF("Entering page verify mode\n");
+ buf->verify = 1;
+ return pagebuf_get_one(buf, fd);
+ } else if (count == -2) {
+ buf->new_ctxt_format = 1;
+ if ( read_exact(fd, &buf->max_vcpu_id, sizeof(buf->max_vcpu_id)) ||
+ buf->max_vcpu_id >= 64 || read_exact(fd, &buf->vcpumap,
+ sizeof(uint64_t)) ) {
+ ERROR("Error when reading max_vcpu_id");
+ return -1;
+ }
+ // DPRINTF("Max VCPU ID: %d, vcpumap: %llx\n", buf->max_vcpu_id, buf->vcpumap);
+ return pagebuf_get_one(buf, fd);
+ } else if (count == -3) {
+ /* Skip padding 4 bytes then read the EPT identity PT location. */
+ if ( read_exact(fd, &buf->identpt, sizeof(uint32_t)) ||
+ read_exact(fd, &buf->identpt, sizeof(uint64_t)) )
+ {
+ ERROR("error read the address of the EPT identity map");
+ return -1;
+ }
+ // DPRINTF("EPT identity map address: %llx\n", buf->identpt);
+ return pagebuf_get_one(buf, fd);
+ } else if ( count == -4 ) {
+ /* Skip padding 4 bytes then read the vm86 TSS location. */
+ if ( read_exact(fd, &buf->vm86_tss, sizeof(uint32_t)) ||
+ read_exact(fd, &buf->vm86_tss, sizeof(uint64_t)) )
+ {
+ ERROR("error read the address of the vm86 TSS");
+ return -1;
+ }
+ // DPRINTF("VM86 TSS location: %llx\n", buf->vm86_tss);
+ return pagebuf_get_one(buf, fd);
+ } else if ( (count > MAX_BATCH_SIZE) || (count < 0) ) {
+ ERROR("Max batch size exceeded (%d). Giving up.", count);
+ return -1;
+ }
+
+ oldcount = buf->nr_pages;
+ buf->nr_pages += count;
+ if (!buf->pfn_types) {
+ if (!(buf->pfn_types = malloc(buf->nr_pages * sizeof(*(buf->pfn_types))))) {
+ ERROR("Could not allocate PFN type buffer");
+ return -1;
+ }
+ } else {
+ if (!(ptmp = realloc(buf->pfn_types, buf->nr_pages * sizeof(*(buf->pfn_types))))) {
+ ERROR("Could not reallocate PFN type buffer");
+ return -1;
+ }
+ buf->pfn_types = ptmp;
+ }
+ if ( read_exact(fd, buf->pfn_types + oldcount, count * sizeof(*(buf->pfn_types)))) {
+ ERROR("Error when reading region pfn types");
+ return -1;
+ }
+
+ countpages = count;
+ for (i = oldcount; i < buf->nr_pages; ++i)
+ if ((buf->pfn_types[i] & XEN_DOMCTL_PFINFO_LTAB_MASK) == XEN_DOMCTL_PFINFO_XTAB)
+ --countpages;
+
+ if (!countpages)
+ return count;
+
+ oldcount = buf->nr_physpages;
+ buf->nr_physpages += countpages;
+ if (!buf->pages) {
+ if (!(buf->pages = malloc(buf->nr_physpages * PAGE_SIZE))) {
+ ERROR("Could not allocate page buffer");
+ return -1;
+ }
+ } else {
+ if (!(ptmp = realloc(buf->pages, buf->nr_physpages * PAGE_SIZE))) {
+ ERROR("Could not reallocate page buffer");
+ return -1;
+ }
+ buf->pages = ptmp;
+ }
+ if ( read_exact(fd, buf->pages + oldcount * PAGE_SIZE, countpages * PAGE_SIZE) ) {
+ ERROR("Error when reading pages");
+ return -1;
+ }
+
+ return count;
+}
+
+static int pagebuf_get(pagebuf_t* buf, int fd)
+{
+ int rc;
+
+ buf->nr_physpages = buf->nr_pages = 0;
+
+ do {
+ rc = pagebuf_get_one(buf, fd);
+ } while (rc > 0);
+
+ if (rc < 0)
+ pagebuf_free(buf);
+
+ return rc;
+}
+
+static int apply_batch(int xc_handle, uint32_t dom, xen_pfn_t* region_mfn,
+ unsigned long* pfn_type, int pae_extended_cr3,
+ unsigned int hvm, struct xc_mmu* mmu,
+ pagebuf_t* pagebuf, int curbatch)
+{
+ int nr_mfns = 0;
+ int i, j, curpage;
+ /* used by debug verify code */
+ unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
+ /* Our mapping of the current region (batch) */
+ char *region_base;
+ /* A temporary mapping, and a copy, of one frame of guest memory. */
+ unsigned long *page = NULL;
+ int nraces = 0;
+
+ unsigned long mfn, pfn, pagetype;
+
+ j = pagebuf->nr_pages - curbatch;
+ if (j > MAX_BATCH_SIZE)
+ j = MAX_BATCH_SIZE;
+
+ /* First pass for this batch: work out how much memory to alloc */
+ for ( i = 0; i < j; i++ )
+ {
+ pfn = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+ pagetype = pagebuf->pfn_types[i + curbatch] & XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+ if ( (pagetype != XEN_DOMCTL_PFINFO_XTAB) &&
+ (p2m[pfn] == INVALID_P2M_ENTRY) )
+ {
+ /* Have a live PFN which hasn't had an MFN allocated */
+ p2m_batch[nr_mfns++] = pfn;
+ p2m[pfn]--;
+ }
+ }
+
+ /* Now allocate a bunch of mfns for this batch */
+ if ( nr_mfns &&
+ (xc_domain_memory_populate_physmap(xc_handle, dom, nr_mfns, 0,
+ 0, p2m_batch) != 0) )
+ {
+ ERROR("Failed to allocate memory for batch.!\n");
+ errno = ENOMEM;
+ return -1;
+ }
+
+ /* Second pass for this batch: update p2m[] and region_mfn[] */
+ nr_mfns = 0;
+ for ( i = 0; i < j; i++ )
+ {
+ pfn = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+ pagetype = pagebuf->pfn_types[i + curbatch] & XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+ if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+ region_mfn[i] = ~0UL; /* map will fail but we don't care */
+ else
+ {
+ if ( p2m[pfn] == (INVALID_P2M_ENTRY-1) )
+ {
+ /* We just allocated a new mfn above; update p2m */
+ p2m[pfn] = p2m_batch[nr_mfns++];
+ nr_pfns++;
+ }
+
+ /* setup region_mfn[] for batch map.
+ * For HVM guests, this interface takes PFNs, not MFNs */
+ region_mfn[i] = hvm ? pfn : p2m[pfn];
+ }
+ }
+
+ /* Map relevant mfns */
+ region_base = xc_map_foreign_batch(
+ xc_handle, dom, PROT_WRITE, region_mfn, j);
+
+ if ( region_base == NULL )
+ {
+ ERROR("map batch failed");
+ return -1;
+ }
+
+ for ( i = 0, curpage = -1; i < j; i++ )
+ {
+ pfn = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+ pagetype = pagebuf->pfn_types[i + curbatch] & XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+ if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+ /* a bogus/unmapped page: skip it */
+ continue;
+
+ ++curpage;
+
+ if ( pfn > p2m_size )
+ {
+ ERROR("pfn out of range");
+ return -1;
+ }
+
+ pfn_type[pfn] = pagetype;
+
+ mfn = p2m[pfn];
+
+ /* In verify mode, we use a copy; otherwise we work in place */
+ page = pagebuf->verify ? (void *)buf : (region_base + i*PAGE_SIZE);
+
+ memcpy(page, pagebuf->pages + (curpage + curbatch) * PAGE_SIZE, PAGE_SIZE);
+
+ pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+ if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
+ (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
+ {
+ /*
+ ** A page table page - need to 'uncanonicalize' it, i.e.
+ ** replace all the references to pfns with the corresponding
+ ** mfns for the new domain.
+ **
+ ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
+ ** so we may need to update the p2m after the main loop.
+ ** Hence we defer canonicalization of L1s until then.
+ */
+ if ((pt_levels != 3) ||
+ pae_extended_cr3 ||
+ (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) {
+
+ if (!uncanonicalize_pagetable(xc_handle, dom,
+ pagetype, page)) {
+ /*
+ ** Failing to uncanonicalize a page table can be ok
+ ** under live migration since the pages type may have
+ ** changed by now (and we'll get an update later).
+ */
+ DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
+ pagetype >> 28, pfn, mfn);
+ nraces++;
+ continue;
+ }
+ }
+ }
+ else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
+ {
+ ERROR("Bogus page type %lx page table is out of range: "
+ "i=%d p2m_size=%lu", pagetype, i, p2m_size);
+ return -1;
+ }
+
+ if ( pagebuf->verify )
+ {
+ int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
+ if ( res )
+ {
+ int v;
+
+ DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
+ "actualcs=%08lx\n", pfn, pagebuf->pfn_types[pfn],
+ csum_page(region_base + (i + curbatch)*PAGE_SIZE),
+ csum_page(buf));
+
+ for ( v = 0; v < 4; v++ )
+ {
+ unsigned long *p = (unsigned long *)
+ (region_base + i*PAGE_SIZE);
+ if ( buf[v] != p[v] )
+ DPRINTF(" %d: %08lx %08lx\n", v, buf[v], p[v]);
+ }
+ }
+ }
+
+ if ( !hvm &&
+ xc_add_mmu_update(xc_handle, mmu,
+ (((unsigned long long)mfn) << PAGE_SHIFT)
+ | MMU_MACHPHYS_UPDATE, pfn) )
+ {
+ ERROR("failed machpys update mfn=%lx pfn=%lx", mfn, pfn);
+ return -1;
+ }
+ } /* end of 'batch' for loop */
+
+ munmap(region_base, j*PAGE_SIZE);
+
+ return nraces;
+}
+
int xc_domain_restore(int xc_handle, int io_fd, uint32_t dom,
unsigned int store_evtchn, unsigned long *store_mfn,
unsigned int console_evtchn, unsigned long *console_mfn,
@@ -278,7 +710,6 @@
int rc = 1, frc, i, j, n, m, pae_extended_cr3 = 0, ext_vcpucontext = 0;
unsigned long mfn, pfn;
unsigned int prev_pc, this_pc;
- int verify = 0;
int nraces = 0;
/* The new domain's shared-info frame number. */
@@ -297,9 +728,6 @@
/* A table of MFNs to map in the current region */
xen_pfn_t *region_mfn = NULL;
- /* Types of the pfns in the current region */
- unsigned long region_pfn_type[MAX_BATCH_SIZE];
-
/* A copy of the pfn-to-mfn table frame list. */
xen_pfn_t *p2m_frame_list = NULL;
@@ -311,9 +739,6 @@
struct xc_mmu *mmu = NULL;
- /* used by debug verify code */
- unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
-
struct mmuext_op pin[MAX_PIN_BATCH];
unsigned int nr_pins;
@@ -327,6 +752,14 @@
/* Buffer for holding HVM context */
uint8_t *hvm_buf = NULL;
+ int completed = 0;
+ pagebuf_t pagebuf;
+ tailbuf_t tailbuf, tmptail;
+ void* vcpup;
+
+ pagebuf_init(&pagebuf);
+ memset(&tailbuf, 0, sizeof(tailbuf));
+
/* For info only */
nr_pfns = 0;
@@ -435,9 +868,10 @@
prev_pc = 0;
n = m = 0;
+ loadpages:
for ( ; ; )
{
- int j, nr_mfns = 0;
+ int j, curbatch;
this_pc = (n * 100) / p2m_size;
if ( (this_pc - prev_pc) >= 5 )
@@ -446,248 +880,49 @@
prev_pc = this_pc;
}
- if ( read_exact(io_fd, &j, sizeof(int)) )
- {
- ERROR("Error when reading batch size");
- goto out;
- }
+ if ( !completed ) {
+ pagebuf.nr_physpages = pagebuf.nr_pages = 0;
+ if ( pagebuf_get_one(&pagebuf, io_fd) < 0 ) {
+ ERROR("Error when reading batch\n");
+ goto out;
+ }
+ }
+ j = pagebuf.nr_pages;
PPRINTF("batch %d\n",j);
- if ( j == -1 )
- {
- verify = 1;
- DPRINTF("Entering page verify mode\n");
- continue;
- }
+ if ( j == 0 ) {
+ /* catch vcpu updates */
+ if (pagebuf.new_ctxt_format) {
+ vcpumap = pagebuf.vcpumap;
+ max_vcpu_id = pagebuf.max_vcpu_id;
+ }
+ /* should this be deferred? does it change? */
+ if (pagebuf.identpt)
+ xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT, pagebuf.identpt);
+ if (pagebuf.vm86_tss)
+ xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, pagebuf.vm86_tss);
+ break; /* our work here is done */
+ }
- if ( j == -2 )
- {
- new_ctxt_format = 1;
- if ( read_exact(io_fd, &max_vcpu_id, sizeof(int)) ||
- (max_vcpu_id >= 64) ||
- read_exact(io_fd, &vcpumap, sizeof(uint64_t)) )
- {
- ERROR("Error when reading max_vcpu_id");
- goto out;
- }
- continue;
- }
+ /* break pagebuf into batches */
+ curbatch = 0;
+ while ( curbatch < j ) {
+ int brc;
- if ( j == -3 )
- {
- uint64_t ident_pt;
+ brc = apply_batch(xc_handle, dom, region_mfn, pfn_type,
+ pae_extended_cr3, hvm, mmu, &pagebuf, curbatch);
+ if ( brc < 0 )
+ goto out;
- /* Skip padding 4 bytes then read the EPT identity PT location. */
- if ( read_exact(io_fd, &ident_pt, sizeof(uint32_t)) ||
- read_exact(io_fd, &ident_pt, sizeof(uint64_t)) )
- {
- ERROR("error read the address of the EPT identity map");
- goto out;
- }
+ nraces += brc;
- xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT, ident_pt);
- continue;
- }
+ curbatch += MAX_BATCH_SIZE;
+ }
- if ( j == -4 )
- {
- uint64_t vm86_tss;
+ pagebuf.nr_physpages = pagebuf.nr_pages = 0;
- /* Skip padding 4 bytes then read the vm86 TSS location. */
- if ( read_exact(io_fd, &vm86_tss, sizeof(uint32_t)) ||
- read_exact(io_fd, &vm86_tss, sizeof(uint64_t)) )
- {
- ERROR("error read the address of the vm86 TSS");
- goto out;
- }
-
- xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, vm86_tss);
- continue;
- }
-
- if ( j == 0 )
- break; /* our work here is done */
-
- if ( (j > MAX_BATCH_SIZE) || (j < 0) )
- {
- ERROR("Max batch size exceeded. Giving up.");
- goto out;
- }
-
- if ( read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) )
- {
- ERROR("Error when reading region pfn types");
- goto out;
- }
-
- /* First pass for this batch: work out how much memory to alloc */
- nr_mfns = 0;
- for ( i = 0; i < j; i++ )
- {
- unsigned long pfn, pagetype;
- pfn = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
- pagetype = region_pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK;
-
- if ( (pagetype != XEN_DOMCTL_PFINFO_XTAB) &&
- (p2m[pfn] == INVALID_P2M_ENTRY) )
- {
- /* Have a live PFN which hasn't had an MFN allocated */
- p2m_batch[nr_mfns++] = pfn;
- p2m[pfn]--;
- }
- }
-
- /* Now allocate a bunch of mfns for this batch */
- if ( nr_mfns &&
- (xc_domain_memory_populate_physmap(xc_handle, dom, nr_mfns, 0,
- 0, p2m_batch) != 0) )
- {
- ERROR("Failed to allocate memory for batch.!\n");
- errno = ENOMEM;
- goto out;
- }
-
- /* Second pass for this batch: update p2m[] and region_mfn[] */
- nr_mfns = 0;
- for ( i = 0; i < j; i++ )
- {
- unsigned long pfn, pagetype;
- pfn = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
- pagetype = region_pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK;
-
- if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
- region_mfn[i] = ~0UL; /* map will fail but we don't care */
- else
- {
- if ( p2m[pfn] == (INVALID_P2M_ENTRY-1) )
- {
- /* We just allocated a new mfn above; update p2m */
- p2m[pfn] = p2m_batch[nr_mfns++];
- nr_pfns++;
- }
-
- /* setup region_mfn[] for batch map.
- * For HVM guests, this interface takes PFNs, not MFNs */
- region_mfn[i] = hvm ? pfn : p2m[pfn];
- }
- }
-
- /* Map relevant mfns */
- region_base = xc_map_foreign_batch(
- xc_handle, dom, PROT_WRITE, region_mfn, j);
-
- if ( region_base == NULL )
- {
- ERROR("map batch failed");
- goto out;
- }
-
- for ( i = 0; i < j; i++ )
- {
- void *page;
- unsigned long pagetype;
-
- pfn = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
- pagetype = region_pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK;
-
- if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
- /* a bogus/unmapped page: skip it */
- continue;
-
- if ( pfn > p2m_size )
- {
- ERROR("pfn out of range");
- goto out;
- }
-
- pfn_type[pfn] = pagetype;
-
- mfn = p2m[pfn];
-
- /* In verify mode, we use a copy; otherwise we work in place */
- page = verify ? (void *)buf : (region_base + i*PAGE_SIZE);
-
- if ( read_exact(io_fd, page, PAGE_SIZE) )
- {
- ERROR("Error when reading page (type was %lx)", pagetype);
- goto out;
- }
-
- pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
-
- if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
- (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
- {
- /*
- ** A page table page - need to 'uncanonicalize' it, i.e.
- ** replace all the references to pfns with the corresponding
- ** mfns for the new domain.
- **
- ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
- ** so we may need to update the p2m after the main loop.
- ** Hence we defer canonicalization of L1s until then.
- */
- if ((pt_levels != 3) ||
- pae_extended_cr3 ||
- (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) {
-
- if (!uncanonicalize_pagetable(xc_handle, dom,
- pagetype, page)) {
- /*
- ** Failing to uncanonicalize a page table can be ok
- ** under live migration since the pages type may have
- ** changed by now (and we'll get an update later).
- */
- DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
- pagetype >> 28, pfn, mfn);
- nraces++;
- continue;
- }
- }
- }
- else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
- {
- ERROR("Bogus page type %lx page table is out of range: "
- "i=%d p2m_size=%lu", pagetype, i, p2m_size);
- goto out;
-
- }
-
- if ( verify )
- {
- int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
- if ( res )
- {
- int v;
-
- DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
- "actualcs=%08lx\n", pfn, pfn_type[pfn],
- csum_page(region_base + i*PAGE_SIZE),
- csum_page(buf));
-
- for ( v = 0; v < 4; v++ )
- {
- unsigned long *p = (unsigned long *)
- (region_base + i*PAGE_SIZE);
- if ( buf[v] != p[v] )
- DPRINTF(" %d: %08lx %08lx\n", v, buf[v], p[v]);
- }
- }
- }
-
- if ( !hvm &&
- xc_add_mmu_update(xc_handle, mmu,
- (((unsigned long long)mfn) << PAGE_SHIFT)
- | MMU_MACHPHYS_UPDATE, pfn) )
- {
- ERROR("failed machpys update mfn=%lx pfn=%lx", mfn, pfn);
- goto out;
- }
- } /* end of 'batch' for loop */
-
- munmap(region_base, j*PAGE_SIZE);
- n+= j; /* crude stats */
+ n += j; /* crude stats */
/*
* Discard cache for portion of file read so far up to last
@@ -785,6 +1020,32 @@
/* Non-HVM guests only from here on */
+ if (!completed) {
+ if ( buffer_tail(&tailbuf, io_fd, max_vcpu_id, vcpumap,
+ ext_vcpucontext) < 0 ) {
+ ERROR ("error buffering image tail");
+ goto out;
+ }
+ completed = 1;
+ }
+
+ DPRINTF("Buffered checkpoint\n");
+ if (pagebuf_get(&pagebuf, io_fd)) {
+ ERROR("error when buffering batch, finishing\n");
+ goto finish;
+ }
+ if ( buffer_tail(&tmptail, io_fd, max_vcpu_id, vcpumap,
+ ext_vcpucontext) < 0 ) {
+ ERROR ("error buffering image tail, finishing");
+ goto finish;
+ }
+ tailbuf_free(&tailbuf);
+ memcpy(&tailbuf, &tmptail, sizeof(tailbuf));
+
+ goto loadpages;
+
+ finish:
+
if ( (pt_levels == 3) && !pae_extended_cr3 )
{
/*
@@ -953,39 +1214,17 @@
/* Get the list of PFNs that are not in the psuedo-phys map */
{
- unsigned int count = 0;
- unsigned long *pfntab;
- int nr_frees;
+ int nr_frees = 0;
- if ( read_exact(io_fd, &count, sizeof(count)) ||
- (count > (1U << 28)) ) /* up to 1TB of address space */
+ for ( i = 0; i < tailbuf.pfncount; i++ )
{
- ERROR("Error when reading pfn count (= %u)", count);
- goto out;
- }
-
- if ( !(pfntab = malloc(sizeof(unsigned long) * count)) )
- {
- ERROR("Out of memory");
- goto out;
- }
-
- if ( read_exact(io_fd, pfntab, sizeof(unsigned long)*count) )
- {
- ERROR("Error when reading pfntab");
- goto out;
- }
-
- nr_frees = 0;
- for ( i = 0; i < count; i++ )
- {
- unsigned long pfn = pfntab[i];
+ unsigned long pfn = tailbuf.pfntab[i];
if ( p2m[pfn] != INVALID_P2M_ENTRY )
{
/* pfn is not in physmap now, but was at some point during
the save/migration process - need to free it */
- pfntab[nr_frees++] = p2m[pfn];
+ tailbuf.pfntab[nr_frees++] = p2m[pfn];
p2m[pfn] = INVALID_P2M_ENTRY; /* not in pseudo-physical map */
}
}
@@ -997,7 +1236,7 @@
.extent_order = 0,
.domid = dom
};
- set_xen_guest_handle(reservation.extent_start, pfntab);
+ set_xen_guest_handle(reservation.extent_start, tailbuf.pfntab);
if ( (frc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
&reservation)) != nr_frees )
@@ -1006,7 +1245,7 @@
goto out;
}
else
- DPRINTF("Decreased reservation by %d pages\n", count);
+ DPRINTF("Decreased reservation by %d pages\n", tailbuf.pfncount);
}
}
@@ -1016,18 +1255,17 @@
return 1;
}
+ vcpup = tailbuf.vcpubuf;
for ( i = 0; i <= max_vcpu_id; i++ )
{
if ( !(vcpumap & (1ULL << i)) )
continue;
- if ( read_exact(io_fd, &ctxt, ((guest_width == 8)
- ? sizeof(ctxt.x64)
- : sizeof(ctxt.x32))) )
- {
- ERROR("Error when reading ctxt %d", i);
- goto out;
- }
+ memcpy(&ctxt, vcpup, ((guest_width == 8) ? sizeof(ctxt.x64)
+ : sizeof(ctxt.x32)));
+ vcpup += (guest_width == 8) ? sizeof(ctxt.x64) : sizeof(ctxt.x32);
+
+ DPRINTF("read VCPU %d\n", i);
if ( !new_ctxt_format )
SET_FIELD(&ctxt, flags, GET_FIELD(&ctxt, flags) | VGCF_online);
@@ -1132,12 +1370,8 @@
if ( !ext_vcpucontext )
continue;
- if ( read_exact(io_fd, &domctl.u.ext_vcpucontext, 128) ||
- (domctl.u.ext_vcpucontext.vcpu != i) )
- {
- ERROR("Error when reading extended ctxt %d", i);
- goto out;
- }
+ memcpy(&domctl.u.ext_vcpucontext, vcpup, 128);
+ vcpup += 128;
domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
domctl.domain = dom;
frc = xc_domctl(xc_handle, &domctl);
@@ -1148,11 +1382,9 @@
}
}
- if ( read_exact(io_fd, shared_info_page, PAGE_SIZE) )
- {
- ERROR("Error when reading shared info page");
- goto out;
- }
+ memcpy(shared_info_page, tailbuf.shared_info_page, PAGE_SIZE);
+
+ DPRINTF("Completed checkpoint load\n");
/* Restore contents of shared-info page. No checking needed. */
new_shared_info = xc_map_foreign_range(
^ permalink raw reply [flat|nested] 10+ messages in thread* [PATCH 3 of 9] Initiate failover if a packet is not received every 500ms
2009-05-14 0:19 [PATCH 0 of 9] Xen support for Remus Brendan Cully
2009-05-14 0:19 ` [PATCH 1 of 9] Add callbacks for suspend, postcopy and preresume in xc_domain_save Brendan Cully
2009-05-14 0:19 ` [PATCH 2 of 9] Make xc_domain_restore loop until the fd is closed Brendan Cully
@ 2009-05-14 0:19 ` Brendan Cully
2009-05-14 0:19 ` [PATCH 4 of 9] Support more than 2 FDs per tapdisk Brendan Cully
` (5 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Brendan Cully @ 2009-05-14 0:19 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Brendan Cully <brendan@cs.ubc.ca>
# Date 1240355510 25200
# Node ID b51238ea926948383500b94cd227321eb40a82dd
# Parent f5c0d3208d8ae9183391398d52c9be5969da24ec
Initiate failover if a packet is not received every 500ms.
The 500ms timeout is hard-coded (HEARTBEAT_MS), so this breaks checkpointing
at intervals longer than that; the timeout should be made configurable.
diff --git a/tools/libxc/xc_domain_restore.c b/tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c
+++ b/tools/libxc/xc_domain_restore.c
@@ -59,6 +59,51 @@
/* Address size of the guest, in bytes */
unsigned int guest_width;
+/* set when a consistent image is available */
+static int completed = 0;
+
+#define HEARTBEAT_MS 500
+
+# ifndef __MINIOS__
+/*
+ * Read exactly `size` bytes from fd into buf, retrying on EINTR/EAGAIN.
+ *
+ * Once a consistent image is available (`completed` is set), every read is
+ * preceded by a select() that waits at most HEARTBEAT_MS for the descriptor
+ * to become readable; if it does not, the read is abandoned.
+ *
+ * Returns 0 on success, -1 on heartbeat timeout, end-of-file or read error.
+ */
+static ssize_t read_exact_timed(int fd, void* buf, size_t size)
+{
+    char *dst = buf;    /* arithmetic on void* is a GNU extension */
+    size_t offset = 0;
+    ssize_t len;
+    int ready;
+    struct timeval tv;
+    fd_set rfds;
+
+    while ( offset < size )
+    {
+        if ( completed )
+        {
+            /* expect a heartbeat every HEARTBEAT_MS ms maximum;
+             * select() may modify tv, so it is rebuilt on every pass */
+            tv.tv_sec = 0;
+            tv.tv_usec = HEARTBEAT_MS * 1000;
+
+            FD_ZERO(&rfds);
+            FD_SET(fd, &rfds);
+            ready = select(fd + 1, &rfds, NULL, NULL, &tv);
+            if ( (ready == -1) && (errno == EINTR) )
+                continue;
+            /* the fd set is only meaningful when select() succeeded */
+            if ( (ready <= 0) || !FD_ISSET(fd, &rfds) )
+            {
+                fprintf(stderr, "read_exact_timed failed (select returned %d)\n", ready);
+                return -1;
+            }
+        }
+
+        len = read(fd, dst + offset, size - offset);
+        if ( (len == -1) && ((errno == EINTR) || (errno == EAGAIN)) )
+            continue;
+        if ( len <= 0 )
+            return -1;
+        offset += len;
+    }
+
+    return 0;
+}
+
+#define read_exact read_exact_timed
+
+#else
+#define read_exact_timed read_exact
+#endif
/*
** In the state file (or during transfer), all page-table pages are
** converted into a 'canonical' form where references to actual mfns
@@ -413,7 +458,9 @@
// DPRINTF("reading batch of %d pages\n", count);
if (!count) {
+ /*
DPRINTF("Last batch read\n");
+ */
return 0;
} else if (count == -1) {
DPRINTF("Entering page verify mode\n");
@@ -704,7 +751,8 @@
int xc_domain_restore(int xc_handle, int io_fd, uint32_t dom,
unsigned int store_evtchn, unsigned long *store_mfn,
unsigned int console_evtchn, unsigned long *console_mfn,
- unsigned int hvm, unsigned int pae)
+ unsigned int hvm, unsigned int pae,
+ int (*resume)(void*), void* resumedata)
{
DECLARE_DOMCTL;
int rc = 1, frc, i, j, n, m, pae_extended_cr3 = 0, ext_vcpucontext = 0;
@@ -752,7 +800,6 @@
/* Buffer for holding HVM context */
uint8_t *hvm_buf = NULL;
- int completed = 0;
pagebuf_t pagebuf;
tailbuf_t tailbuf, tmptail;
void* vcpup;
@@ -946,7 +993,9 @@
goto out;
}
+ /*
DPRINTF("Received all pages (%d races)\n", nraces);
+ */
if ( hvm )
{
@@ -1021,28 +1070,40 @@
/* Non-HVM guests only from here on */
if (!completed) {
+ int flags = 0;
+
if ( buffer_tail(&tailbuf, io_fd, max_vcpu_id, vcpumap,
ext_vcpucontext) < 0 ) {
ERROR ("error buffering image tail");
goto out;
}
+
completed = 1;
+ /* shift into nonblocking mode for the remainder */
+ if ((flags = fcntl(io_fd, F_GETFL, 0)) < 0)
+ flags = 0;
+ fcntl(io_fd, F_SETFL, flags | O_NONBLOCK);
}
-
+
+ /*
DPRINTF("Buffered checkpoint\n");
- if (pagebuf_get(&pagebuf, io_fd)) {
- ERROR("error when buffering batch, finishing\n");
- goto finish;
+ */
+ if (!resume(resumedata)) {
+ if (pagebuf_get(&pagebuf, io_fd)) {
+ ERROR("error when buffering batch, finishing\n");
+ goto finish;
+ }
+ memset(&tmptail, 0, sizeof(tmptail));
+ if ( buffer_tail(&tmptail, io_fd, max_vcpu_id, vcpumap,
+ ext_vcpucontext) < 0 ) {
+ ERROR ("error buffering image tail, finishing");
+ goto finish;
+ }
+ tailbuf_free(&tailbuf);
+ memcpy(&tailbuf, &tmptail, sizeof(tailbuf));
+
+ goto loadpages;
}
- if ( buffer_tail(&tmptail, io_fd, max_vcpu_id, vcpumap,
- ext_vcpucontext) < 0 ) {
- ERROR ("error buffering image tail, finishing");
- goto finish;
- }
- tailbuf_free(&tailbuf);
- memcpy(&tailbuf, &tmptail, sizeof(tailbuf));
-
- goto loadpages;
finish:
diff --git a/tools/libxc/xenguest.h b/tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h
+++ b/tools/libxc/xenguest.h
@@ -54,12 +54,16 @@
* @parm store_mfn returned with the mfn of the store page
* @parm hvm non-zero if this is a HVM restore
* @parm pae non-zero if this HVM domain has PAE support enabled
+ * @parm resume a function returning 1 to resume or 0 to expect
+ * another checkpoint
+ * @parm resumedata a void pointer to pass back to the resume function
* @return 0 on success, -1 on failure
*/
int xc_domain_restore(int xc_handle, int io_fd, uint32_t dom,
unsigned int store_evtchn, unsigned long *store_mfn,
unsigned int console_evtchn, unsigned long *console_mfn,
- unsigned int hvm, unsigned int pae);
+ unsigned int hvm, unsigned int pae,
+ int (*resume)(void*), void* resumedata);
/**
* This function will create a domain for a paravirtualized Linux
diff --git a/tools/python/xen/xend/XendCheckpoint.py b/tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py
+++ b/tools/python/xen/xend/XendCheckpoint.py
@@ -318,7 +318,8 @@
restore_image.setCpuid()
- os.read(fd, 1) # Wait for source to close connection
+ #os.read(fd, 1) # Wait for source to close connection
+ # ^^ breaks failover, and I don't know why it's needed.
dominfo.completeRestore(handler.store_mfn, handler.console_mfn)
diff --git a/tools/xcutils/xc_restore.c b/tools/xcutils/xc_restore.c
--- a/tools/xcutils/xc_restore.c
+++ b/tools/xcutils/xc_restore.c
@@ -11,10 +11,59 @@
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
+#include <string.h>
+#include <sys/select.h>
#include <xenctrl.h>
#include <xenguest.h>
+typedef struct {
+ int fd;
+} resume_t;
+
+/*
+ * Heartbeat check passed to xc_domain_restore() as its resume callback.
+ *
+ * Waits up to 500ms for the checkpoint stream (rd->fd) to become readable.
+ * Returns 0 when the descriptor is flagged readable, -1 otherwise.
+ *
+ * An explicit "done"/"wait" handshake over the fd was prototyped here but
+ * was compiled out; only the select()-based heartbeat is in effect.
+ */
+static int resume(void* resumedata)
+{
+    resume_t* rd = resumedata;
+    fd_set rfds;
+    struct timeval tv;
+    int rc;
+
+    /* expect a heartbeat every 500ms maximum */
+    tv.tv_sec = 0;
+    tv.tv_usec = 500000;
+
+    FD_ZERO(&rfds);
+    FD_SET(rd->fd, &rfds);
+    rc = select(rd->fd + 1, &rfds, NULL, NULL, &tv);
+    if (!FD_ISSET(rd->fd, &rfds)) {
+        fprintf(stderr, "resume: heartbeat failed (select returned %d)\n", rc);
+        return -1;
+    }
+
+    return 0;
+}
+
int
main(int argc, char **argv)
{
@@ -22,6 +71,7 @@
unsigned int hvm, pae, apic;
int xc_fd, io_fd, ret;
unsigned long store_mfn, console_mfn;
+ resume_t rdata;
if ( argc != 8 )
errx(1, "usage: %s iofd domid store_evtchn "
@@ -39,8 +89,11 @@
pae = atoi(argv[6]);
apic = atoi(argv[7]);
+ rdata.fd = io_fd;
+
ret = xc_domain_restore(xc_fd, io_fd, domid, store_evtchn, &store_mfn,
- console_evtchn, &console_mfn, hvm, pae);
+ console_evtchn, &console_mfn, hvm, pae,
+ resume, &rdata);
if ( ret == 0 )
{
^ permalink raw reply [flat|nested] 10+ messages in thread* [PATCH 4 of 9] Support more than 2 FDs per tapdisk
2009-05-14 0:19 [PATCH 0 of 9] Xen support for Remus Brendan Cully
` (2 preceding siblings ...)
2009-05-14 0:19 ` [PATCH 3 of 9] Initiate failover if a packet is not received every 500ms Brendan Cully
@ 2009-05-14 0:19 ` Brendan Cully
2009-05-14 0:19 ` [PATCH 5 of 9] Check tapdisk paths from the last colon instead of the first Brendan Cully
` (4 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Brendan Cully @ 2009-05-14 0:19 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Brendan Cully <brendan@cs.ubc.ca>
# Date 1240355510 25200
# Node ID 99c01882edb65e702875afc76319e92b88966402
# Parent b51238ea926948383500b94cd227321eb40a82dd
Support more than 2 FDs per tapdisk.
The Remus disk replication module needs this to be able to poll the
backup server as well as the underlying disk module.
Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
diff --git a/tools/blktap/drivers/tapdisk.c b/tools/blktap/drivers/tapdisk.c
--- a/tools/blktap/drivers/tapdisk.c
+++ b/tools/blktap/drivers/tapdisk.c
@@ -132,16 +132,19 @@
{
fd_list_entry_t *ptr;
struct disk_driver *dd;
+ int i;
ptr = fd_start;
while (ptr != NULL) {
if (ptr->tap_fd) {
FD_SET(ptr->tap_fd, readfds);
td_for_each_disk(ptr->s, dd) {
- if (dd->io_fd[READ])
- FD_SET(dd->io_fd[READ], readfds);
- maxfds = (dd->io_fd[READ] > maxfds ?
- dd->io_fd[READ] : maxfds);
+ for (i = 0; i < MAX_IOFD; i++) {
+ if (dd->io_fd[i])
+ FD_SET(dd->io_fd[i], readfds);
+ maxfds = (dd->io_fd[i] > maxfds ?
+ dd->io_fd[i] : maxfds);
+ }
}
maxfds = (ptr->tap_fd > maxfds ? ptr->tap_fd : maxfds);
}
@@ -820,13 +823,15 @@
int progress_made = 0;
struct disk_driver *dd;
tapdev_info_t *info = ptr->s->ring_info;
+ int i;
td_for_each_disk(ptr->s, dd) {
- if (dd->io_fd[READ] &&
- FD_ISSET(dd->io_fd[READ],
- &readfds)) {
- io_done(dd, READ);
- progress_made = 1;
+ for (i = 0; i < MAX_IOFD; i++) {
+ if (dd->io_fd[i] &&
+ FD_ISSET(dd->io_fd[i], &readfds)) {
+ io_done(dd, i);
+ progress_made = 1;
+ }
}
}
diff --git a/tools/blktap/drivers/tapdisk.h b/tools/blktap/drivers/tapdisk.h
--- a/tools/blktap/drivers/tapdisk.h
+++ b/tools/blktap/drivers/tapdisk.h
@@ -74,7 +74,7 @@
#define SECTOR_SHIFT 9
#define DEFAULT_SECTOR_SIZE 512
-#define MAX_IOFD 2
+#define MAX_IOFD 4
#define BLK_NOT_ALLOCATED 99
#define TD_NO_PARENT 1
^ permalink raw reply [flat|nested] 10+ messages in thread* [PATCH 5 of 9] Check tapdisk paths from the last colon instead of the first
2009-05-14 0:19 [PATCH 0 of 9] Xen support for Remus Brendan Cully
` (3 preceding siblings ...)
2009-05-14 0:19 ` [PATCH 4 of 9] Support more than 2 FDs per tapdisk Brendan Cully
@ 2009-05-14 0:19 ` Brendan Cully
2009-05-14 0:19 ` [PATCH 6 of 9] Remus tapdisk proxy Brendan Cully
` (3 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Brendan Cully @ 2009-05-14 0:19 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Brendan Cully
# Date 1242260367 25200
# Node ID 42fddb3a8edeb80339618b7f758dc2959cf97115
# Parent 99c01882edb65e702875afc76319e92b88966402
Check tapdisk paths from the last colon instead of the first.
This allows the Remus tapdisk proxy to configure itself using the form
tap:remus:host:port:aio:....
Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
diff --git a/tools/blktap/lib/xenbus.c b/tools/blktap/lib/xenbus.c
--- a/tools/blktap/lib/xenbus.c
+++ b/tools/blktap/lib/xenbus.c
@@ -256,7 +256,7 @@
path += strlen("ioemu:");
}
- tmp = strchr(path, ':');
+ tmp = strrchr(path, ':');
if (tmp != NULL)
path = tmp + 1;
diff --git a/tools/python/xen/util/blkif.py b/tools/python/xen/util/blkif.py
--- a/tools/python/xen/util/blkif.py
+++ b/tools/python/xen/util/blkif.py
@@ -78,7 +78,8 @@
if typ in ("phy", "drbd") and not fn.startswith("/"):
fn = "/dev/%s" %(fn,)
if typ == "tap":
- (taptype, fn) = fn.split(":", 1)
+ flds = fn.split(':')
+ (taptype, fn) = flds[0], flds[-1]
return (fn, taptype)
def blkdev_uname_to_file(uname):
^ permalink raw reply [flat|nested] 10+ messages in thread* [PATCH 6 of 9] Remus tapdisk proxy
2009-05-14 0:19 [PATCH 0 of 9] Xen support for Remus Brendan Cully
` (4 preceding siblings ...)
2009-05-14 0:19 ` [PATCH 5 of 9] Check tapdisk paths from the last colon instead of the first Brendan Cully
@ 2009-05-14 0:19 ` Brendan Cully
2009-05-14 0:19 ` [PATCH 7 of 9] Buffer checkpoint data locally until domain has resumed execution Brendan Cully
` (2 subsequent siblings)
8 siblings, 0 replies; 10+ messages in thread
From: Brendan Cully @ 2009-05-14 0:19 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Brendan Cully <brendan@cs.ubc.ca>
# Date 1240355510 25200
# Node ID 03fd0c9729f3d87e7803afb170dfc3cdff184998
# Parent 42fddb3a8edeb80339618b7f758dc2959cf97115
Remus tapdisk proxy.
This proxy forwards local disk writes to a backup server, where they are
buffered until the local disk receives a checkpoint signal from the remus
control tools, which it forwards to the backup to cause buffered writes
to be flushed.
Configuration is of the form
tap:remus:<backup host>:<backup port>:<underlying tapdisk string>
The first write to a disk protected by this proxy will block until the
Remus control tools have been activated, which create the backup
disk proxy.
Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
diff --git a/tools/blktap/drivers/Makefile b/tools/blktap/drivers/Makefile
--- a/tools/blktap/drivers/Makefile
+++ b/tools/blktap/drivers/Makefile
@@ -11,6 +11,8 @@
CFLAGS += $(CFLAGS_libxenctrl)
CFLAGS += $(CFLAGS_libxenstore)
CFLAGS += -I $(LIBAIO_DIR)
+# for hashtable_itr.h in gcc 4.2
+CFLAGS += -fgnu89-inline
CFLAGS += -D_GNU_SOURCE
ifeq ($(shell . ./check_gcrypt $(CC)),yes)
@@ -22,7 +24,8 @@
endif
LDFLAGS_blktapctrl := $(LDFLAGS_libxenctrl) $(LDFLAGS_libxenstore) -L../lib -lblktap
-LDFLAGS_img := $(LIBAIO_DIR)/libaio.a $(CRYPT_LIB) -lpthread -lz
+# hashtable.c uses ceilf from libm
+LDFLAGS_img := $(LIBAIO_DIR)/libaio.a $(CRYPT_LIB) -lpthread -lm -lz
BLK-OBJS-y := block-aio.o
BLK-OBJS-y += block-sync.o
@@ -30,6 +33,7 @@
BLK-OBJS-y += block-ram.o
BLK-OBJS-y += block-qcow.o
BLK-OBJS-y += block-qcow2.o
+BLK-OBJS-y += block-remus.o hashtable.o hashtable_itr.o hashtable_utility.o
BLK-OBJS-y += aes.o
BLK-OBJS-y += tapaio.o
BLK-OBJS-$(CONFIG_Linux) += blk_linux.o
diff --git a/tools/blktap/drivers/block-remus.c b/tools/blktap/drivers/block-remus.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap/drivers/block-remus.c
@@ -0,0 +1,1686 @@
+/* block-remus.c
+ *
+ * This disk sends all writes to a backup via a network interface before
+ * passing them to an underlying device.
+ * The backup is a bit more complicated:
+ * 1. It applies all incoming writes to a ramdisk.
+ * 2. When a checkpoint request arrives, it moves the ramdisk to
+ * a committing state and uses a new ramdisk for subsequent writes.
+ * It also acknowledges the request, to let the sender know it can
+ * release output.
+ * 3. The ramdisk flushes its contents to the underlying driver.
+ * 4. At failover, the backup waits for the in-flight ramdisk (if any) to
+ * drain before letting the domain be activated.
+ *
+ * The driver determines whether it is the client or server by attempting
+ * to bind to the replication address. If the address is not local,
+ * the driver acts as client.
+ *
+ * The following messages are defined for the replication stream:
+ * 1. write request
+ * "wreq" 4
+ * num_sectors 4
+ * sector 8
+ * buffer (num_sectors * sector_size)
+ * 2. submit request (may be used as a barrier)
+ * "sreq" 4
+ * 3. commit request
+ * "creq" 4
+ * After a commit request, the client must wait for a completion message:
+ * 4. completion
+ * "done" 4
+ */
+
+/* due to architectural choices in tapdisk, block-buffer is forced to
+ * reimplement some code which is meant to be private */
+#define TAPDISK
+#include "tapdisk.h"
+#include "hashtable.h"
+#include "hashtable_itr.h"
+#include "hashtable_utility.h"
+
+#include <errno.h>
+#include <inttypes.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <unistd.h>
+
+/* timeout for reads and writes in ms */
+#define NET_TIMEOUT 500
+#define RAMDISK_HASHSIZE 128
+
+#define RPRINTF(_f, _a...) syslog (LOG_DEBUG, "remus: " _f, ## _a)
+
+enum tdremus_mode {
+ mode_pass = 0,
+ mode_client,
+ mode_server
+};
+
+struct tdremus_req {
+ uint64_t sector;
+ int nb_sectors;
+ char buf[4096];
+};
+
+struct req_ring {
+ /* waste one slot to distinguish between empty and full */
+ struct tdremus_req requests[MAX_REQUESTS * 2 + 1];
+ unsigned int head;
+ unsigned int tail;
+};
+
+struct ramdisk {
+ size_t sector_size;
+ struct hashtable* h;
+ /* when a ramdisk is flushed, h is given a new empty hash for writes
+ * while the old ramdisk (prev) is drained asynchronously. To avoid
+ * a race where a read request points to a sector in prev which has
+ * not yet been flushed, check prev on a miss in h */
+ struct hashtable* prev;
+ /* count of outstanding requests to the base driver */
+ size_t inflight;
+};
+
+/* the ramdisk intercepts the original callback for reads and writes.
+ * This holds the original data. */
+/* Might be worth making this a static array in struct ramdisk to avoid
+ * a malloc per request */
+
+struct tdremus_state;
+
+struct ramdisk_cbdata {
+ td_callback_t cb;
+ void* private;
+ char* buf;
+ struct tdremus_state* state;
+};
+
+struct ramdisk_write_cbdata {
+ struct tdremus_state* state;
+ char* buf;
+};
+
+typedef int (*queue_rw_t) (struct disk_driver *dd, uint64_t sector,
+ int nb_sectors, char *buf, td_callback_t cb,
+ int id, void *prv);
+
+struct tdremus_state {
+ struct tap_disk* driver;
+ void* driver_data;
+ char* path;
+
+ char* ctlfifo; /* receive flush instruction here */
+ int ctl_fd_idx; /* io_fd slot for control FIFO */
+ char* msgfifo; /* output completion message here */
+ int msgfd;
+
+ /* replication host */
+ struct in_addr addr;
+ unsigned short port;
+ int sfd; /* server listen port */
+ int rfd; /* replication channel */
+ int rfd_idx; /* io_fd slot for replication channel */
+
+ /* queue write requests, batch-replicate at submit */
+ struct req_ring write_ring;
+
+ /* ramdisk data*/
+ struct ramdisk ramdisk;
+
+ /* mode methods */
+ enum tdremus_mode mode;
+ queue_rw_t queue_read;
+ queue_rw_t queue_write;
+ int (*queue_flush)(struct disk_driver* dd);
+ int (*submit)(struct disk_driver* dd);
+ int (*queue_close)(struct disk_driver *dd);
+};
+
+static int tdremus_queue_read(struct disk_driver *dd, uint64_t sector,
+ int nb_sectors, char *buf, td_callback_t cb,
+ int id, void *private);
+static int tdremus_queue_write(struct disk_driver *dd, uint64_t sector,
+ int nb_sectors, char *buf, td_callback_t cb,
+ int id, void *private);
+static int tdremus_close(struct disk_driver *dd);
+
+/* ring functions */
+/* Index of the slot after pos, wrapping to 0 past the last ring slot. */
+static inline unsigned int ring_next(struct req_ring* ring, unsigned int pos)
+{
+    unsigned int succ = pos + 1;
+
+    return (succ < MAX_REQUESTS * 2 + 1) ? succ : 0;
+}
+
+/* The ring holds no requests when tail has caught up with head. */
+static inline int ring_isempty(struct req_ring* ring)
+{
+    return ring->tail == ring->head;
+}
+
+/* The ring is full when advancing tail would land on head (one slot is
+ * sacrificed to tell full from empty). */
+static inline int ring_isfull(struct req_ring* ring)
+{
+    unsigned int after_tail = ring_next(ring, ring->tail);
+
+    return after_tail == ring->head;
+}
+
+/*
+ * Copy a write request into the next free slot of the write ring.
+ *
+ * Returns 0 on success, -EBUSY if the ring is full, or -EINVAL if the
+ * payload would not fit in a ring slot's fixed-size buffer.
+ */
+static int buf_queue_write(struct disk_driver *dd, uint64_t sector,
+                           int nb_sectors, char *buf)
+{
+    struct tdremus_state *s = (struct tdremus_state *)dd->private;
+    struct tdremus_req* req;
+    size_t len;
+
+    /* queue requests until flush */
+    if (ring_isfull(&s->write_ring))
+        return -EBUSY;
+
+    /* refuse anything that would overrun the slot's payload buffer */
+    if (nb_sectors < 0)
+        return -EINVAL;
+    len = (size_t)nb_sectors * dd->td_state->sector_size;
+    if (len > sizeof(s->write_ring.requests[0].buf))
+        return -EINVAL;
+
+    req = s->write_ring.requests + s->write_ring.tail;
+    s->write_ring.tail = ring_next(&s->write_ring, s->write_ring.tail);
+
+    req->sector = sector;
+    req->nb_sectors = nb_sectors;
+    memcpy(req->buf, buf, len);
+
+    return 0;
+}
+
+/* passthrough functions */
+
+/* Enter passthrough mode by clearing every mode-specific method pointer.
+ * Always returns 0. */
+static int pass_start(struct disk_driver* dd)
+{
+    struct tdremus_state *state = (struct tdremus_state *)dd->private;
+
+    state->queue_close = NULL;
+    state->queue_flush = NULL;
+    state->submit = NULL;
+    state->queue_write = NULL;
+    state->queue_read = NULL;
+
+    return 0;
+}
+
+/* ramdisk methods */
+static int ramdisk_flush(struct disk_driver* dd, struct tdremus_state *s);
+
+/* 64-bit to 32-bit integer mix, after
+ * http://www.concentric.net/~Ttwang/tech/inthash.htm
+ * k points at the uint64_t key; the mixed value is truncated to unsigned int. */
+static unsigned int uint64_hash(void* k)
+{
+    uint64_t mixed = *(uint64_t*)k;
+
+    mixed = ~mixed + (mixed << 18);
+    mixed ^= mixed >> 31;
+    mixed *= 21;
+    mixed ^= mixed >> 11;
+    mixed += mixed << 6;
+    mixed ^= mixed >> 22;
+
+    return (unsigned int)mixed;
+}
+
+/* Key equality for the ramdisk hash: both arguments point at uint64_t
+ * sector numbers. Returns nonzero when they match. */
+static int rd_hash_equal(void* k1, void* k2)
+{
+    const uint64_t* lhs = k1;
+    const uint64_t* rhs = k2;
+
+    return *lhs == *rhs;
+}
+
+/*
+ * Copy nb_sectors sectors starting at `sector` out of the ramdisk into buf.
+ * Each sector is looked up in the live table first, then in the table that
+ * is still being flushed (prev), if any.
+ *
+ * Returns 0 if every sector was found, -1 as soon as one is missing (buf
+ * may then be partially filled).
+ */
+static int ramdisk_read(struct ramdisk* ramdisk, uint64_t sector,
+                        int nb_sectors, char* buf)
+{
+    char* dst = buf;
+    uint64_t key = sector;
+    int left;
+
+    for (left = nb_sectors; left > 0; left--, key++) {
+        char* data = hashtable_search(ramdisk->h, &key);
+
+        /* check whether it is queued in a previous flush request */
+        if (!data && ramdisk->prev)
+            data = hashtable_search(ramdisk->prev, &key);
+        if (!data)
+            return -1;
+
+        memcpy(dst, data, ramdisk->sector_size);
+        dst += ramdisk->sector_size;
+    }
+
+    return 0;
+}
+
+/*
+ * Store one sector of `len` bytes from buf in hash table h under `sector`.
+ * An existing entry is overwritten in place; otherwise a private copy of
+ * the data and a heap-allocated key are inserted into the table.
+ *
+ * Returns 0 on success, -1 if an allocation or the insert fails.
+ */
+static int ramdisk_write_hash(struct hashtable* h, uint64_t sector, char* buf,
+                              size_t len)
+{
+    char* v;
+    uint64_t* key;
+
+    if ((v = hashtable_search(h, &sector))) {
+        memcpy(v, buf, len);
+        return 0;
+    }
+
+    if (!(v = malloc(len))) {
+        DPRINTF("ramdisk_write_hash: malloc failed\n");
+        return -1;
+    }
+    memcpy(v, buf, len);
+    if (!(key = malloc(sizeof(*key)))) {
+        DPRINTF("ramdisk_write_hash: error allocating key\n");
+        free(v);
+        return -1;
+    }
+    *key = sector;
+    if (!hashtable_insert(h, key, v)) {
+        DPRINTF("ramdisk_write_hash failed on sector %" PRIu64 "\n", sector);
+        /* the table did not take ownership, so release both here */
+        free(key);
+        free(v);
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Write nb_sectors consecutive sectors from buf into the live ramdisk
+ * table, one sector at a time. Stops at, and returns, the first nonzero
+ * result from ramdisk_write_hash(); returns 0 if all sectors were stored.
+ */
+static inline int ramdisk_write(struct ramdisk* ramdisk, uint64_t sector,
+                                int nb_sectors, char* buf)
+{
+    char* src = buf;
+    int done, err;
+
+    for (done = 0; done < nb_sectors; done++) {
+        err = ramdisk_write_hash(ramdisk->h, sector + done, src,
+                                 ramdisk->sector_size);
+        if (err != 0)
+            return err;
+        src += ramdisk->sector_size;
+    }
+
+    return 0;
+}
+
+/*
+ * Completion callback installed in place of the caller's read callback.
+ * On a successful read, any sector that is also present in the ramdisk is
+ * overlaid onto the buffer; the original callback is then invoked and the
+ * interception record freed. Returns the original callback's result.
+ */
+static int ramdisk_read_cb(struct disk_driver* dd, int res, uint64_t sector,
+                           int nb_sectors, int id, void* private)
+{
+    struct ramdisk_cbdata *saved = (struct ramdisk_cbdata*)private;
+    struct tdremus_state *state = saved->state;
+    int idx, ret;
+
+    if (res == 0) {
+        char* dst = saved->buf;
+
+        /* a miss (nonzero return) just leaves the disk data in place */
+        for (idx = 0; idx < nb_sectors; idx++) {
+            ramdisk_read(&state->ramdisk, sector + idx, 1, dst);
+            dst += state->ramdisk.sector_size;
+        }
+    }
+
+    ret = saved->cb(dd, res, sector, nb_sectors, id, saved->private);
+    free(saved);
+
+    return ret;
+}
+
+/*
+ * Completion callback for writes queued by ramdisk_flush().
+ * Releases the merged buffer and callback record, drops the in-flight
+ * count, and if sectors are still waiting in the draining table (prev)
+ * tries to queue more of them.
+ *
+ * A failed write is logged; no retry is attempted here.
+ */
+static int ramdisk_write_cb(struct disk_driver* dd, int res, uint64_t sector,
+                            int nb_sectors, int id, void* private)
+{
+    struct ramdisk_write_cbdata *cbdata = (struct ramdisk_write_cbdata*)private;
+    struct tdremus_state *s = cbdata->state;
+
+    if (res)
+        RPRINTF("ramdisk write failed: rc %d, %d sectors @ %" PRIu64 "\n",
+                res, nb_sectors, sector);
+
+    free(cbdata->buf);
+    free(cbdata);
+
+    /* when this reaches 0 and prev is empty, the disk is flushed. */
+    s->ramdisk.inflight--;
+
+    /* resubmit as much as possible in the remaining disk */
+    if (s->ramdisk.prev)
+        return ramdisk_flush(dd, s);
+
+    return 0;
+}
+
+/*
+ * Serve a read from the ramdisk when every requested sector is present;
+ * otherwise forward it to the underlying driver with ramdisk_read_cb
+ * interposed so ramdisk contents can be overlaid on completion.
+ *
+ * Returns the callback's result on a ramdisk hit, -1 if the interception
+ * record cannot be allocated, else the underlying driver's result.
+ */
+static int ramdisk_queue_read(struct disk_driver *dd, uint64_t sector,
+                              int nb_sectors, char *buf, td_callback_t cb,
+                              int id, void *private)
+{
+    struct tdremus_state *state = (struct tdremus_state *)dd->private;
+    struct ramdisk_cbdata *saved;
+    int ret;
+
+    /* full hit: answer straight from the ramdisk */
+    if (ramdisk_read(&state->ramdisk, sector, nb_sectors, buf) == 0)
+        return cb(dd, 0, sector, nb_sectors, id, private);
+
+    /* partial or total miss: go to disk and overlay in the callback */
+    saved = malloc(sizeof(*saved));
+    if (saved == NULL) {
+        DPRINTF("ramdisk_queue_read: Error allocating callback\n");
+        return -1;
+    }
+    /* callback may only have driver_data available */
+    saved->state = state;
+    saved->buf = buf;
+    saved->private = private;
+    saved->cb = cb;
+
+    /* the underlying driver expects its own data in dd->private */
+    dd->private = state->driver_data;
+    ret = state->driver->td_queue_read(dd, sector, nb_sectors, buf,
+                                       ramdisk_read_cb, id, saved);
+    dd->private = state;
+
+    return ret;
+}
+
+/*
+ * Apply a write to the ramdisk only; nothing reaches the disk until an
+ * explicit flush. On success the caller's callback is invoked at once with
+ * result 0 and its return value is passed back; on failure the ramdisk
+ * error code is returned and the callback is not called.
+ * Later we should probably trickle writes to the backup from here (not
+ * every call though).
+ */
+static int ramdisk_queue_write(struct disk_driver *dd, uint64_t sector,
+                               int nb_sectors, char *buf, td_callback_t cb,
+                               int id, void *private)
+{
+    struct tdremus_state *state = (struct tdremus_state *)dd->private;
+    int err = ramdisk_write(&state->ramdisk, sector, nb_sectors, buf);
+
+    if (err == 0)
+        return cb(dd, 0, sector, nb_sectors, id, private);
+
+    DPRINTF("Ramdisk write failed: %d\n", err);
+    return err;
+}
+
+/* qsort() comparator for uint64_t values: -1, 0 or 1. Compares rather than
+ * subtracts because the difference of two unsigned values cannot go
+ * negative. */
+static int uint64_compare(const void* k1, const void* k2)
+{
+    const uint64_t lhs = *(const uint64_t*)k1;
+    const uint64_t rhs = *(const uint64_t*)k2;
+
+    if (lhs < rhs)
+        return -1;
+
+    return lhs > rhs;
+}
+
+/*
+ * Set *psectors to a malloc()ed array of the sector numbers (keys) held in
+ * h and return how many there are. The caller frees the array.
+ *
+ * Returns 0 without touching *psectors when the table is empty, and -1
+ * (with *psectors set to NULL) when an allocation fails.
+ */
+static int ramdisk_get_sectors(struct hashtable* h, uint64_t** psectors)
+{
+    struct hashtable_itr* itr;
+    uint64_t* sectors;
+    int count;
+
+    if (!(count = hashtable_count(h)))
+        return 0;
+
+    if (!(*psectors = malloc(count * sizeof(uint64_t)))) {
+        DPRINTF("ramdisk_get_sectors: error allocating sector map\n");
+        return -1;
+    }
+    sectors = *psectors;
+
+    if (!(itr = hashtable_iterator(h))) {
+        DPRINTF("ramdisk_get_sectors: error allocating iterator\n");
+        free(sectors);
+        *psectors = NULL;
+        return -1;
+    }
+
+    /* the table is known to be non-empty, so the first key is valid */
+    count = 0;
+    do {
+        sectors[count++] = *(uint64_t*)hashtable_iterator_key(itr);
+    } while (hashtable_iterator_advance(itr));
+    free(itr);
+
+    return count;
+}
+
+/*
+ * Build one contiguous valloc()ed buffer holding `count` sectors starting
+ * at `start`, taken from the draining table (ramdisk->prev). Each sector's
+ * stored copy is freed once it has been copied out.
+ *
+ * Returns the buffer (caller frees) or NULL on allocation/lookup failure.
+ *
+ * NOTE(review): the freed sector buffers stay referenced by ramdisk->prev
+ * until the caller removes the entries; a caller that bails out before
+ * doing so leaves dangling value pointers in prev. Confirm and rework.
+ */
+static char* merge_requests(struct ramdisk* ramdisk, uint64_t start,
+                            size_t count)
+{
+    char* buf;
+    char* sector;
+    size_t i;
+
+    if (!(buf = valloc(count * ramdisk->sector_size))) {
+        DPRINTF("merge_request: allocation failed\n");
+        return NULL;
+    }
+
+    for (i = 0; i < count; i++) {
+        if (!(sector = hashtable_search(ramdisk->prev, &start))) {
+            DPRINTF("merge_request: lookup failed on %"PRIu64"\n", start);
+            /* do not leak the partially filled buffer */
+            free(buf);
+            return NULL;
+        }
+
+        memcpy(buf + i * ramdisk->sector_size, sector, ramdisk->sector_size);
+        free(sector);
+
+        start++;
+    }
+
+    return buf;
+}
+
+/*
+ * Queue as much of the draining table (ramdisk.prev) as the underlying
+ * driver accepts, merging runs of consecutive sectors into single writes.
+ * The driver may not handle having the whole ramdisk queued at once, so
+ * ramdisk_write_cb calls back in here to queue more.
+ *
+ * NOTE: may be called from a callback, while dd->private still belongs to
+ * the underlying driver.
+ *
+ * Returns 0 when nothing needed flushing or requests were submitted (even
+ * if the driver stopped accepting writes part-way), and -1 on allocation
+ * or merge failure.
+ */
+static int ramdisk_flush(struct disk_driver* dd, struct tdremus_state* s)
+{
+    struct ramdisk_write_cbdata* cbdata;
+    uint64_t* sectors;
+    char* buf;
+    uint64_t base, batchlen, k;
+    int i, rc, count = 0;
+
+    if ((count = ramdisk_get_sectors(s->ramdisk.prev, &sectors)) <= 0)
+        return count;
+
+    /* sort and merge sectors to improve disk performance */
+    qsort(sectors, count, sizeof(*sectors), uint64_compare);
+
+    for (i = 0; i < count;) {
+        if (!(cbdata = malloc(sizeof(*cbdata)))) {
+            RPRINTF("ramdisk_flush: error allocating cbdata\n");
+            free(sectors);
+            return -1;
+        }
+
+        /* extend the batch across consecutive sector numbers */
+        base = sectors[i++];
+        while (i < count && sectors[i] == sectors[i-1] + 1)
+            i++;
+        batchlen = sectors[i-1] - base + 1;
+
+        if (!(buf = merge_requests(&s->ramdisk, base, batchlen))) {
+            RPRINTF("ramdisk_flush: merge_requests failed\n");
+            free(cbdata);
+            free(sectors);
+            return -1;
+        }
+
+        /* we probably want to record an ID for in-flight requests in ramdisk */
+        cbdata->state = s;
+        cbdata->buf = buf;
+
+        /* the underlying driver expects its own data in dd->private */
+        dd->private = s->driver_data;
+        rc = s->driver->td_queue_write(dd, base, batchlen, buf, ramdisk_write_cb,
+                                       0, cbdata);
+        dd->private = s;
+
+        if (rc) {
+            /* driver is saturated; the rest is retried from the callback */
+            RPRINTF("ramdisk queue returned %d\n", rc);
+            free(cbdata);
+            free(buf);
+            break;
+        }
+
+        s->ramdisk.inflight++;
+
+        /* the batch is now owned by the in-flight request */
+        for (k = 0; k < batchlen; k++) {
+            hashtable_remove(s->ramdisk.prev, &base);
+            base++;
+        }
+    }
+
+    if (!hashtable_count(s->ramdisk.prev)) {
+        /* everything is in flight */
+        hashtable_destroy(s->ramdisk.prev, 0);
+        s->ramdisk.prev = NULL;
+    }
+
+    free(sectors);
+
+    /* kick the underlying driver; its return value is not acted upon */
+    dd->private = s->driver_data;
+    s->driver->td_submit(dd);
+    dd->private = s;
+
+    return 0;
+}
+
+/*
+ * Begin flushing ramdisk contents to disk.
+ *
+ * The live table becomes the draining table (prev) and a new empty table
+ * takes its place, so new writes can be accepted before the old contents
+ * are completely drained. If a previous flush is still in progress, the
+ * live sectors are merged into it instead; wait for the previous flush to
+ * complete first if it must stay consistent.
+ *
+ * Returns 0 if there was nothing to flush, -1 on allocation failure,
+ * otherwise the result of ramdisk_flush().
+ */
+static int ramdisk_start_flush(struct disk_driver* dd)
+{
+    struct tdremus_state *s = (struct tdremus_state *)dd->private;
+    struct hashtable* fresh;
+    uint64_t* sectors;
+    char* buf;
+    int i, count;
+
+    if (!hashtable_count(s->ramdisk.h))
+        return 0;
+
+    /* allocate the replacement first so a failure leaves state untouched */
+    fresh = create_hashtable(RAMDISK_HASHSIZE, uint64_hash, rd_hash_equal);
+    if (!fresh) {
+        RPRINTF("ramdisk_start_flush: error allocating hashtable\n");
+        return -1;
+    }
+
+    if (s->ramdisk.prev) {
+        if ((count = ramdisk_get_sectors(s->ramdisk.h, &sectors)) < 0) {
+            hashtable_destroy(fresh, 1);
+            return count;
+        }
+
+        for (i = 0; i < count; i++) {
+            buf = hashtable_search(s->ramdisk.h, sectors + i);
+            if (ramdisk_write_hash(s->ramdisk.prev, sectors[i], buf,
+                                   s->ramdisk.sector_size))
+                RPRINTF("ramdisk_start_flush: lost sector %" PRIu64 "\n",
+                        sectors[i]);
+        }
+        free(sectors);
+
+        /* ramdisk_write_hash() copied each sector into prev, so the
+         * originals must be freed along with the old table */
+        hashtable_destroy (s->ramdisk.h, 1);
+    } else
+        s->ramdisk.prev = s->ramdisk.h;
+
+    s->ramdisk.h = fresh;
+
+    return ramdisk_flush(dd, s);
+}
+
+/* Tear down the live ramdisk table together with its stored values.
+ * Always returns 0. */
+static int ramdisk_queue_close(struct disk_driver* dd)
+{
+    struct tdremus_state *state = (struct tdremus_state *)dd->private;
+    struct hashtable* live = state->ramdisk.h;
+
+    hashtable_destroy(live, 1);
+
+    return 0;
+}
+
+/*
+ * Allocate the ramdisk's live hash table and record the sector size.
+ * Does nothing if the ramdisk already exists.
+ *
+ * Returns 0 on success (or if already allocated), -1 if the hash table
+ * cannot be created.
+ */
+static int ramdisk_start(struct disk_driver* dd)
+{
+    struct tdremus_state *s = (struct tdremus_state *)dd->private;
+
+    if (s->ramdisk.h) {
+        RPRINTF("ramdisk already allocated\n");
+        return 0;
+    }
+
+    s->ramdisk.sector_size = dd->td_state->sector_size;
+    s->ramdisk.h = create_hashtable(RAMDISK_HASHSIZE, uint64_hash,
+                                    rd_hash_equal);
+    if (!s->ramdisk.h) {
+        RPRINTF("ramdisk_start: error allocating hashtable\n");
+        return -1;
+    }
+
+    DPRINTF("Ramdisk started, %zu bytes/sector\n", s->ramdisk.sector_size);
+
+    return 0;
+}
+
+/* common client/server functions */
+/*
+ * mayberead: read exactly len bytes from fd into buf, giving up if the
+ * descriptor stays unreadable for NET_TIMEOUT ms.
+ *
+ * The first read is attempted immediately; select() is only used once a
+ * read comes back short or would block. EINTR is retried.
+ *
+ * Returns 0 once len bytes have been read, -1 on end-of-file, timeout or
+ * error.
+ */
+static int mread(int fd, void* buf, size_t len)
+{
+    char* dst = buf;    /* arithmetic on void* is a GNU extension */
+    size_t cur = 0;
+    ssize_t n;
+    int rc;
+    fd_set rfds;
+    struct timeval tv;
+
+    if (!len)
+        return 0;
+
+    /* read first. Only select if read is incomplete. */
+    for (;;) {
+        n = read(fd, dst + cur, len - cur);
+        if (!n) {
+            RPRINTF("end-of-file");
+            return -1;
+        }
+        if (n < 0 && errno != EAGAIN && errno != EINTR) {
+            RPRINTF("error during read: %d\n", errno);
+            return -1;
+        }
+        if (n > 0 && (cur += n) == len)
+            break;
+
+        /* select() may update the timeout in place, so rebuild it for
+         * every wait instead of reusing a possibly exhausted one */
+        tv.tv_sec = 0;
+        tv.tv_usec = NET_TIMEOUT * 1000;
+        FD_ZERO(&rfds);
+        FD_SET(fd, &rfds);
+        rc = select(fd + 1, &rfds, NULL, NULL, &tv);
+        if (!rc) {
+            RPRINTF("time out during read\n");
+            return -1;
+        }
+        if (rc < 0 && errno != EINTR) {
+            RPRINTF("error during select: %d\n", errno);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * maybewrite: write exactly len bytes from buf to fd, giving up if the
+ * descriptor stays unwritable for NET_TIMEOUT ms.
+ *
+ * The first write is attempted immediately; select() is only used once a
+ * write comes back short or would block. EINTR is retried.
+ *
+ * Returns 0 once len bytes have been written, -1 on timeout or error.
+ */
+static int mwrite(int fd, void* buf, size_t len)
+{
+    char* src = buf;    /* arithmetic on void* is a GNU extension */
+    size_t cur = 0;
+    ssize_t n;
+    int rc;
+    fd_set wfds;
+    struct timeval tv;
+
+    if (!len)
+        return 0;
+
+    /* write first. Only select if write is incomplete. */
+    for (;;) {
+        n = write(fd, src + cur, len - cur);
+        if (!n) {
+            RPRINTF("end-of-file");
+            return -1;
+        }
+        if (n < 0 && errno != EAGAIN && errno != EINTR) {
+            RPRINTF("error during write: %d\n", errno);
+            return -1;
+        }
+        if (n > 0 && (cur += n) == len)
+            break;
+
+        /* select() may update the timeout in place, so rebuild it for
+         * every wait instead of reusing a possibly exhausted one */
+        tv.tv_sec = 0;
+        tv.tv_usec = NET_TIMEOUT * 1000;
+        FD_ZERO(&wfds);
+        FD_SET(fd, &wfds);
+        rc = select(fd + 1, NULL, &wfds, NULL, &tv);
+        if (!rc) {
+            RPRINTF("time out during write\n");
+            return -1;
+        }
+        if (rc < 0 && errno != EINTR) {
+            RPRINTF("error during select: %d\n", errno);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Add a new FD to the tapdisk selector: store fd in the first unused
+ * io_fd slot of dd and report the slot through *pidx.
+ * Returns 0 on success, -1 if all MAX_IOFD slots are taken.
+ */
+static int install_tdfd(struct disk_driver* dd, int* pidx, int fd) {
+    int slot = 0;
+
+    /* a zero entry marks an unused slot */
+    while (slot < MAX_IOFD && dd->io_fd[slot])
+        slot++;
+
+    if (slot == MAX_IOFD) {
+        RPRINTF("no free FD for replication channel\n");
+        return -1;
+    }
+
+    *pidx = slot;
+    dd->io_fd[slot] = fd;
+
+    return 0;
+}
+
+/*
+ * Shut down the replication channel: release its io_fd slot, close the
+ * socket and mark rfd as -2. Always returns 0.
+ */
+static int close_rfd(struct disk_driver* dd)
+{
+    struct tdremus_state *state = (struct tdremus_state *)dd->private;
+
+    RPRINTF("closing replication channel\n");
+
+    /* unregister from the tapdisk selector */
+    dd->io_fd[state->rfd_idx] = 0;
+    state->rfd_idx = -1;
+
+    close(state->rfd);
+    state->rfd = -2;
+
+    return 0;
+}
+
+/* remus client */
+
+/*
+ * Connect to the replication server at s->addr:s->port, retrying once a
+ * second for as long as the connection is refused (the remote end may not
+ * have started yet). On success the socket is made non-blocking,
+ * registered in a free io_fd slot and stored in s->rfd.
+ *
+ * Returns 0 on success, -1 on any other failure (the socket is closed).
+ */
+static int client_connect(struct disk_driver* dd)
+{
+    struct tdremus_state *s = (struct tdremus_state *)dd->private;
+    struct sockaddr_in sa;
+    int fd;
+    int flags;
+
+    RPRINTF("client connecting to %s:%d...\n", inet_ntoa(s->addr), s->port);
+
+    memset(&sa, 0, sizeof(sa));
+    sa.sin_family = AF_INET;
+    sa.sin_addr.s_addr = s->addr.s_addr;
+    sa.sin_port = htons(s->port);
+
+    /* wait for remote end to start up */
+    for (;;) {
+        if ((fd = socket(PF_INET, SOCK_STREAM, 0)) < 0) {
+            RPRINTF("could not create client socket: %d\n", errno);
+            return -1;
+        }
+
+        if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) == 0)
+            break;
+
+        if (errno != ECONNREFUSED) {
+            RPRINTF("connection failed: %d\n", errno);
+            goto err_sock;
+        }
+
+        RPRINTF("connection refused -- retrying in 1 second\n");
+        /* a socket's state is unspecified after a failed connect(), so
+         * each attempt gets a fresh one */
+        close(fd);
+        sleep(1);
+    }
+
+    RPRINTF("client connected\n");
+
+    /* go non-blocking before registering, so a failure here cannot leave
+     * a closed descriptor behind in dd->io_fd; keep the other flags */
+    if ((flags = fcntl(fd, F_GETFL, 0)) < 0 ||
+        fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
+        RPRINTF("error making socket non-blocking\n");
+        goto err_sock;
+    }
+
+    if (install_tdfd(dd, &s->rfd_idx, fd) < 0)
+        goto err_sock;
+    RPRINTF("replication channel in slot %d\n", s->rfd_idx);
+
+    s->rfd = fd;
+
+    return 0;
+
+ err_sock:
+    close(fd);
+    return -1;
+}
+
+/*
+ * Send every request queued in s->write_ring over the replication
+ * channel, connecting first if no channel exists yet.
+ *
+ * Wire format per request: 4 bytes "wreq", the sector count as a 32-bit
+ * value, the start sector as a 64-bit value (16 header bytes in total),
+ * followed by nb_sectors * sector_size bytes of data.
+ *
+ * If the channel has ended (s->rfd == -2) replication is abandoned and the
+ * driver switches to passthrough mode.
+ *
+ * Returns 0 on success or when there is nothing to do, -1 if connecting
+ * or writing failed (on a write failure the channel is closed).
+ */
+static int client_replicate(struct disk_driver* dd)
+{
+  struct tdremus_state *s = (struct tdremus_state *)dd->private;
+  struct tdremus_req* req;
+  uint32_t nb_sectors;
+  uint64_t header[2];
+
+  /* -2 means connection ended -- give up replication */
+  if (s->rfd == -2) {
+    RPRINTF("engaging passthrough mode\n");
+    pass_start(dd);
+    /* record the switch; the original compared instead of assigning */
+    s->mode = mode_pass;
+    return 0;
+  }
+
+  if (ring_isempty(&s->write_ring))
+    return 0;
+
+  if (s->rfd < 0 && client_connect(dd) < 0) {
+    RPRINTF("replication failed\n");
+    return -1;
+  }
+  /* TODO: it would probably make sense to send the header for all
+   * requests in this batch in one write. */
+  memcpy(header, "wreq", 4);
+  while (!ring_isempty(&s->write_ring)) {
+    req = s->write_ring.requests + s->write_ring.head;
+    /* bytes 4..7 of the header carry the sector count */
+    nb_sectors = req->nb_sectors;
+    memcpy((char *)header + 4, &nb_sectors, sizeof(nb_sectors));
+    header[1] = req->sector;
+
+    if (mwrite(s->rfd, header, sizeof(header)) < 0)
+      goto err_write;
+    if (mwrite(s->rfd, req->buf,
+               req->nb_sectors * dd->td_state->sector_size) < 0)
+      goto err_write;
+
+    s->write_ring.head = ring_next(&s->write_ring, s->write_ring.head);
+  }
+
+  return 0;
+
+ err_write:
+  close_rfd(dd);
+  return -1;
+}
+
+/* Queue a guest write on the client side: while still in client mode the
+ * request is first handed to buf_queue_write (defined elsewhere; per the
+ * original note it keeps a copy for replication), then it is passed on to
+ * the underlying driver.  Returns the underlying driver's result, or the
+ * negative value from buf_queue_write if that step failed. */
+static int client_queue_write(struct disk_driver *dd, uint64_t sector,
+                              int nb_sectors, char *buf, td_callback_t cb,
+                              int id, void *private)
+{
+  struct tdremus_state *state = (struct tdremus_state *)dd->private;
+  int ret;
+
+  /* make room: a full ring is pushed out before queueing more */
+  if (ring_isfull(&state->write_ring)) {
+    RPRINTF("ring full, flushing\n");
+    client_replicate(dd);
+  }
+
+  /* client_replicate may have disabled replication on error */
+  if (state->mode == mode_client) {
+    ret = buf_queue_write(dd, sector, nb_sectors, buf);
+    if (ret < 0) {
+      RPRINTF("buf_queue_write returned %d\n", ret);
+      return ret;
+    }
+  }
+
+  /* the underlying driver expects its own state in dd->private */
+  dd->private = state->driver_data;
+  ret = state->driver->td_queue_write(dd, sector, nb_sectors, buf, cb, id,
+                                      private);
+  dd->private = state;
+
+  return ret;
+}
+
+/* Submit the queued requests to the underlying driver, then push the
+ * buffered writes to the replication channel.  The value returned is the
+ * underlying driver's; the outcome of client_replicate is not reported. */
+static int client_submit(struct disk_driver* dd)
+{
+  struct tdremus_state *state = (struct tdremus_state *)dd->private;
+  int submitted;
+
+  /* the underlying driver expects its own state in dd->private */
+  dd->private = state->driver_data;
+  submitted = state->driver->td_submit(dd);
+  dd->private = state;
+
+  client_replicate(dd);
+
+  return submitted;
+}
+
+/* Send the 4-byte "creq" message on the replication channel.  If it
+ * cannot be written the channel is closed.  Always returns 0. */
+static int client_flush(struct disk_driver* dd)
+{
+  struct tdremus_state *state = (struct tdremus_state *)dd->private;
+  int sent;
+
+  sent = mwrite(state->rfd, "creq", 4);
+  if (sent < 0)
+    close_rfd(dd);
+
+  return 0;
+}
+
+/* Install the client-mode handlers for queue_write, submit and
+ * queue_flush in the driver state.  Always returns 0. */
+static int client_start(struct disk_driver* dd)
+{
+  struct tdremus_state *state = (struct tdremus_state *)dd->private;
+
+  RPRINTF("activating client mode\n");
+
+  state->queue_flush = client_flush;
+  state->submit = client_submit;
+  state->queue_write = client_queue_write;
+
+  return 0;
+}
+
+/* Read a 4-byte message from the replication channel.  A "done" message
+ * is forwarded to the message FIFO (s->msgfd); if that write fails the
+ * FIFO is closed and reopened.  Anything else is only logged.
+ * Returns -1 if the channel read fails (the channel is then closed) or if
+ * the FIFO cannot be reopened, 0 otherwise. */
+static int client_do_replicate(struct disk_driver* dd)
+{
+  struct tdremus_state *state = (struct tdremus_state *)dd->private;
+  char msg[5];
+
+  msg[4] = '\0';
+  if (mread(state->rfd, msg, 4) < 0) {
+    close_rfd(dd);
+    return -1;
+  }
+
+  if (strncmp(msg, "done", 4)) {
+    RPRINTF("received unknown message: %s\n", msg);
+    return 0;
+  }
+
+  if (write(state->msgfd, msg, 4) >= 0)
+    return 0;
+
+  RPRINTF("error writing notification: %d\n", errno);
+
+  close(state->msgfd);
+  state->msgfd = open(state->msgfifo, O_RDWR);
+  if (state->msgfd < 0) {
+    RPRINTF("error reopening FIFO: %d\n", errno);
+    return -1;
+  }
+
+  return 0;
+}
+
+/* remus server */
+
+/* Accept one connection on the listening socket s->sfd.
+ * Returns the connected socket, or -1 if accept() fails. */
+static int remus_accept(struct tdremus_state* s)
+{
+  struct sockaddr_in peer;
+  socklen_t peer_len = sizeof(peer);
+  int conn;
+
+  RPRINTF("server waiting for connection\n");
+
+  conn = accept(s->sfd, &peer, &peer_len);
+  if (conn < 0) {
+    RPRINTF("error accepting connection: %d\n", errno);
+    return -1;
+  }
+
+  RPRINTF("server accepted connection\n");
+
+  return conn;
+}
+
+/*
+ * Create the listening socket s->sfd bound to s->addr:s->port.
+ *
+ * Returns 0 on success.  On failure the socket is closed, s->sfd is reset
+ * to -1, and the result is:
+ *   -2  bind() failed with anything other than EADDRINUSE (for example
+ *       EADDRNOTAVAIL); tdremus_open reacts to this by selecting client
+ *       mode,
+ *   -1  socket() failed, bind() failed with EADDRINUSE, or listen()
+ *       failed.
+ */
+static int remus_bind(struct tdremus_state* s)
+{
+  struct sockaddr_in sa;
+  int opt;
+  int rc = -1;
+
+  if ((s->sfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+    RPRINTF("could not create server socket: %d\n", errno);
+    return rc;
+  }
+  opt = 1;
+  /* failing to set SO_REUSEADDR is logged but not fatal */
+  if (setsockopt(s->sfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0)
+    RPRINTF("Error setting REUSEADDR on %d: %d\n", s->sfd, errno);
+
+  memset(&sa, 0, sizeof(sa));
+  sa.sin_family = AF_INET;
+  sa.sin_addr.s_addr = s->addr.s_addr;
+  sa.sin_port = htons(s->port);
+  if (bind(s->sfd, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+    /* capture errno first: logging may overwrite it */
+    int bind_errno = errno;
+
+    RPRINTF("could not bind server socket %d to %s:%d: %d %s\n", s->sfd,
+            inet_ntoa(sa.sin_addr), s->port, bind_errno,
+            strerror(bind_errno));
+    if (bind_errno != EADDRINUSE)
+      rc = -2;
+    goto err_sfd;
+  }
+  if (listen(s->sfd, 10)) {
+    RPRINTF("could not listen on socket: %d\n", errno);
+    goto err_sfd;
+  }
+
+  return 0;
+
+ err_sfd:
+  close(s->sfd);
+  s->sfd = -1;
+
+  return rc;
+}
+
+/* Report whether the ramdisk still has pending work: returns 1 if either
+ * ramdisk.inflight or ramdisk.prev is set, 0 if both are clear.  This
+ * only tests the state; it does not wait. */
+static inline int server_writes_inflight(struct disk_driver* dd)
+{
+  struct tdremus_state *state = (struct tdremus_state *)dd->private;
+
+  return (state->ramdisk.inflight || state->ramdisk.prev) ? 1 : 0;
+}
+
+/* Read handler installed in server mode.  Per the original note it is
+ * not expected to run until the domain has failed over.  While the
+ * ramdisk still has pending writes the request is refused with -EBUSY;
+ * otherwise the driver switches to passthrough via pass_start and the
+ * read is re-issued through tdremus_queue_read (its result is ignored).
+ * Returns 0 in that case. */
+static int server_queue_read(struct disk_driver *dd, uint64_t sector,
+                             int nb_sectors, char *buf, td_callback_t cb,
+                             int id, void *private)
+{
+  if (!server_writes_inflight(dd)) {
+    RPRINTF("queue_read: activating backup");
+    pass_start(dd);
+    tdremus_queue_read(dd, sector, nb_sectors, buf, cb, id, private);
+
+    return 0;
+  }
+
+  RPRINTF("queue_read: waiting for queue to drain");
+  return -EBUSY;
+}
+
+/* Write handler installed in server mode.  While the ramdisk still has
+ * pending writes the request is refused with -EBUSY; otherwise the driver
+ * switches to passthrough via pass_start and the write is re-issued
+ * through tdremus_queue_write (its result is ignored).  Returns 0 in
+ * that case. */
+static int server_queue_write(struct disk_driver *dd, uint64_t sector,
+                              int nb_sectors, char *buf, td_callback_t cb,
+                              int id, void *private)
+{
+  if (!server_writes_inflight(dd)) {
+    RPRINTF("queue_write: activating backup");
+    pass_start(dd);
+    tdremus_queue_write(dd, sector, nb_sectors, buf, cb, id, private);
+
+    return 0;
+  }
+
+  RPRINTF("queue_write: waiting for queue to drain");
+  return -EBUSY;
+}
+
+/* Enter server mode: register the listening socket s->sfd in an io_fd
+ * slot, start the ramdisk and install the server read/write handlers.
+ * Returns 0 on success, -1 if either setup step fails. */
+static int server_start(struct disk_driver* dd)
+{
+  struct tdremus_state *state = (struct tdremus_state *)dd->private;
+
+  /* ramdisk_start is only attempted once the slot has been obtained */
+  if (install_tdfd(dd, &state->rfd_idx, state->sfd) < 0
+      || ramdisk_start(dd) < 0)
+    return -1;
+
+  RPRINTF("server listening in slot %d\n", state->rfd_idx);
+
+  state->queue_write = server_queue_write;
+  state->queue_read = server_queue_read;
+
+  return 0;
+}
+
+/*
+ * Handle the body of a "wreq" message: read the 12 remaining header
+ * bytes (32-bit sector count followed by 64-bit start sector), read the
+ * sector data and hand it to ramdisk_write.
+ *
+ * The sector count comes from the network, so it is validated against
+ * the local buffer before any size arithmetic is done with it.
+ *
+ * Returns 0 on success, -1 if the request is too large or a read fails
+ * (a failed read also closes the replication channel).
+ */
+static int server_do_wreq(struct disk_driver* dd)
+{
+  struct tdremus_state *s = (struct tdremus_state *)dd->private;
+  char buf[4096];
+  uint64_t sector;
+  uint32_t count;
+  size_t sector_size = dd->td_state->sector_size;
+  size_t len;
+
+  if (mread(s->rfd, buf, 12) < 0)
+    goto err_read;
+
+  /* copy out of the byte buffer instead of casting it */
+  memcpy(&count, buf, sizeof(count));
+  memcpy(&sector, buf + 4, sizeof(sector));
+
+  /* compare in sectors so that count * sector_size cannot overflow */
+  if (!sector_size || count > sizeof(buf) / sector_size) {
+    /* freak out! */
+    RPRINTF("write request too large: %u sectors/%u bytes\n",
+            (unsigned)count, (unsigned)sizeof(buf));
+    return -1;
+  }
+  len = (size_t)count * sector_size;
+
+  if (mread(s->rfd, buf, len) < 0)
+    goto err_read;
+
+  ramdisk_write(&s->ramdisk, sector, (int)count, buf);
+
+  return 0;
+
+ err_read:
+  /* should start failover */
+  close_rfd(dd);
+
+  return -1;
+}
+
+/* Handle an "sreq" message.  Nothing is done for it at present; the
+ * handler only reports success. */
+static int server_do_sreq(struct disk_driver* dd)
+{
+  (void)dd;
+
+  return 0;
+}
+
+/* Handle a "creq" message: start flushing the ramdisk, then acknowledge
+ * with the 4-byte "done" message on the replication channel.
+ * Returns 0 if all 4 bytes were written, -1 otherwise. */
+static int server_do_creq(struct disk_driver* dd)
+{
+  struct tdremus_state *state = (struct tdremus_state *)dd->private;
+  int acked;
+
+  ramdisk_start_flush(dd);
+
+  acked = (write(state->rfd, "done", 4) == 4);
+
+  return acked ? 0 : -1;
+}
+
+/*
+ * Called when the replication slot (s->rfd_idx) is readable.
+ *
+ * While the slot still holds the listening socket, the event is an
+ * incoming connection: accept it, make it non-blocking and put it in the
+ * slot in place of the listening socket.  Otherwise read a 4-byte
+ * request tag and dispatch to the matching handler.
+ *
+ * Returns the handler's result, 0 after accepting a connection or on an
+ * unknown tag, -1 if accepting or reading fails.
+ */
+static int server_do_replicate(struct disk_driver* dd)
+{
+  struct tdremus_state *s = (struct tdremus_state *)dd->private;
+  char req[5];
+  int fd;
+
+  if (dd->io_fd[s->rfd_idx] == s->sfd) {
+    /* connection not yet established. Bring it up. */
+    if ((fd = remus_accept(s)) < 0)
+      return -1;
+
+    /* Configure the socket before installing it: on failure the
+     * listening socket simply stays in the slot.  (Calling close_rfd
+     * here would reset s->rfd_idx to -1 before the slot is restored.) */
+    if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
+      RPRINTF("error making socket non-blocking\n");
+      close(fd);
+      return -1;
+    }
+
+    s->rfd = fd;
+    dd->io_fd[s->rfd_idx] = s->rfd;
+
+    RPRINTF("replication channel in slot %d\n", s->rfd_idx);
+
+    return 0;
+  }
+
+  if (mread(s->rfd, req, 4) < 0) {
+    close_rfd(dd);
+    /* TODO: initiate failover recovery */
+    return -1;
+  }
+  /* terminate after the 4 tag bytes; index 5 would be out of bounds */
+  req[4] = '\0';
+
+  if (!strncmp(req, "wreq", 4))
+    return server_do_wreq(dd);
+  else if (!strncmp(req, "sreq", 4))
+    return server_do_sreq(dd);
+  else if (!strncmp(req, "creq", 4))
+    return server_do_creq(dd);
+  else
+    RPRINTF("unknown request received: %s\n", req);
+
+  return 0;
+}
+
+/* control */
+
+/* Resolve addr with gethostbyname and store the first address of the
+ * result in ia (left in network byte order, as returned).
+ * Returns 0 on success, -1 if the lookup fails or yields no address. */
+static inline int resolve_address(const char* addr, struct in_addr* ia)
+{
+  struct hostent* host = gethostbyname(addr);
+
+  if (host == NULL) {
+    RPRINTF("error resolving %s: %d\n", addr, h_errno);
+    return -1;
+  }
+
+  if (host->h_addr_list[0] == NULL) {
+    RPRINTF("no address found for %s\n", addr);
+    return -1;
+  }
+
+  /* the first list entry is read as one 32-bit value */
+  ia->s_addr = *(uint32_t *)host->h_addr_list[0];
+
+  return 0;
+}
+
+/*
+ * Parse a device name of the form "host:port:driver:parent".
+ *
+ * Fills in s->addr (resolved host), s->port, s->path (a heap copy of
+ * "parent") and s->driver (taken from the dtypes entry whose handle is a
+ * prefix of the driver field; if several match, the last one wins).
+ *
+ * Returns 0 on success, -ENOMEM if a string copy cannot be allocated,
+ * -ENOENT for any malformed field or unknown driver.  On failure s->path
+ * is left NULL.
+ */
+static int get_driver(struct disk_driver* dd, const char* name)
+{
+  struct tdremus_state *s = (struct tdremus_state *)dd->private;
+  char* host;
+  char* ports;
+  char* driver;
+  char* parent;
+  unsigned long ulport;
+  disk_info_t* disk_info = NULL;
+  int nr_drivers = sizeof(dtypes) / sizeof(disk_info_t*);
+  int i;
+
+  ports = strchr(name, ':');
+  if (!ports) {
+    RPRINTF("missing host in %s\n", name);
+    return -ENOENT;
+  }
+  if (!(host = strndup(name, ports - name))) {
+    RPRINTF("unable to allocate host\n");
+    return -ENOMEM;
+  }
+  ports++;
+  if (resolve_address(host, &s->addr) < 0) {
+    RPRINTF("unable to resolve host: %s\n", host);
+    free(host);
+    return -ENOENT;
+  }
+  free(host);
+
+  /* a port of 0 is treated the same as a missing port */
+  if (!(ulport = strtoul(ports, &driver, 10))) {
+    RPRINTF("missing port in %s\n", name);
+    return -ENOENT;
+  }
+  if (ulport > 65535) {
+    RPRINTF("port out of range: %lu\n", ulport);
+    return -ENOENT;
+  }
+  s->port = (unsigned short)ulport;
+  if (driver[0] != ':') {
+    RPRINTF("missing driver in %s\n", name);
+    return -ENOENT;
+  }
+  driver++;
+
+  parent = strchr(driver, ':');
+  if (!parent) {
+    RPRINTF("missing parent for %s\n", name);
+    return -ENOENT;
+  }
+  parent++;
+  /* the path is later handed to the underlying driver's td_open */
+  if (!(s->path = strdup(parent))) {
+    RPRINTF("unable to allocate path\n");
+    return -ENOMEM;
+  }
+
+  RPRINTF("host: %s, port: %d\n", inet_ntoa(s->addr), s->port);
+  for (i = 0; i < nr_drivers; i ++)
+    if (!strncmp(driver, dtypes[i]->handle, strlen(dtypes[i]->handle)))
+      disk_info = dtypes[i];
+
+  if (disk_info) {
+    RPRINTF("found driver %s for %s\n", disk_info->handle, parent);
+    s->driver = disk_info->drv;
+
+    return 0;
+  }
+
+  RPRINTF("no driver found for %s\n", name);
+
+  free(s->path);
+  s->path = NULL;
+  return -ENOENT;
+}
+
+/*
+ * Switch the driver to the requested mode.
+ *
+ * Nothing happens if the driver is already in that mode.  Otherwise any
+ * installed queue_flush handler is run first (its failure aborts the
+ * switch), then the start routine for the new mode is called.  s->mode is
+ * only updated if that routine returns 0.
+ *
+ * Returns 0 on success, the failing routine's result, or -1 for an
+ * unknown mode.
+ */
+static int switch_mode(struct disk_driver* dd, enum tdremus_mode mode)
+{
+  struct tdremus_state *s = (struct tdremus_state *)dd->private;
+  int rc;
+
+  if (mode == s->mode)
+    return 0;
+
+  if (s->queue_flush)
+    if ((rc = s->queue_flush(dd)) < 0)
+      return rc;
+
+  /* one chain: without the "else" after the first branch, mode_pass fell
+   * through to the unknown-mode case and always failed */
+  if (mode == mode_pass)
+    rc = pass_start(dd);
+  else if (mode == mode_client)
+    rc = client_start(dd);
+  else if (mode == mode_server)
+    rc = server_start(dd);
+  else {
+    RPRINTF("unknown mode requested: %d\n", mode);
+    rc = -1;
+  }
+
+  if (!rc)
+    s->mode = mode;
+
+  return rc;
+}
+
+/* Service the control FIFO: read one message (at most 79 bytes).  A
+ * message starting with "flush" runs the installed queue_flush handler,
+ * if any; anything else is logged.  A 0-byte read makes the FIFO be
+ * closed and reopened.
+ * Returns -1 on a read or reopen error, the flush handler's result for
+ * "flush", 0 otherwise. */
+static int do_ctl(struct disk_driver* dd)
+{
+  struct tdremus_state *state = (struct tdremus_state *)dd->private;
+  int fifo = dd->io_fd[state->ctl_fd_idx];
+  char msg[80];
+  int nread;
+
+  /* one byte is held back for the terminating nul */
+  nread = read(fifo, msg, sizeof(msg) - 1);
+
+  if (nread < 0) {
+    RPRINTF("error reading from FIFO: %d\n", errno);
+    return -1;
+  }
+
+  if (nread == 0) {
+    RPRINTF("0-byte read received, reopening FIFO\n");
+    close(fifo);
+    fifo = open(state->ctlfifo, O_RDWR);
+    if (fifo < 0) {
+      RPRINTF("error reopening FIFO: %d\n", errno);
+      return -1;
+    }
+    dd->io_fd[state->ctl_fd_idx] = fifo;
+    return 0;
+  }
+
+  msg[nread] = '\0';
+  if (strncmp(msg, "flush", 5)) {
+    RPRINTF("unknown command: %s\n", msg);
+    return 0;
+  }
+
+  return state->queue_flush ? state->queue_flush(dd) : 0;
+}
+
+/*
+ * Create and open the control and message FIFOs for this device.
+ * Must be called after the underlying driver has been initialized.
+ *
+ * The control FIFO is BLKTAP_CTRL_DIR "/remus_<name>" with every ':' and
+ * '/' of <name> replaced by '_'; the message FIFO has ".msg" appended.
+ * The control FIFO's descriptor is placed in a free io_fd slot
+ * (s->ctl_fd_idx); the message FIFO's descriptor is kept in s->msgfd.
+ *
+ * Returns 0 on success.  On failure returns -1 with s->ctlfifo and
+ * s->msgfifo NULL, s->ctl_fd_idx == -1 and the io_fd slot untouched.
+ */
+static int add_ctl_hook(struct disk_driver* dd, const char* name)
+{
+  struct tdremus_state *s = (struct tdremus_state *)dd->private;
+  int i, l;
+  int ctlfd;
+
+  for (s->ctl_fd_idx = 0; s->ctl_fd_idx < MAX_IOFD; s->ctl_fd_idx++) {
+    if (!dd->io_fd[s->ctl_fd_idx])
+      break;
+  }
+  if (s->ctl_fd_idx == MAX_IOFD) {
+    RPRINTF("no free FD for control channel\n");
+    s->ctl_fd_idx = -1;
+    return -1;
+  }
+
+  /* device name -> FIFO */
+  if (asprintf(&s->ctlfifo, BLKTAP_CTRL_DIR "/remus_%s", name) < 0) {
+    /* the pointer's value is not reliable after a failed asprintf */
+    s->ctlfifo = NULL;
+    s->ctl_fd_idx = -1;
+    return -1;
+  }
+  for (i = strlen(BLKTAP_CTRL_DIR) + 1, l = strlen(s->ctlfifo); i < l; i++) {
+    if (strchr(":/", s->ctlfifo[i]))
+      s->ctlfifo[i] = '_';
+  }
+  if (asprintf(&s->msgfifo, "%s.msg", s->ctlfifo) < 0) {
+    s->msgfifo = NULL;
+    goto err_ctlfifo;
+  }
+
+  if (mkfifo(s->ctlfifo, S_IRWXU|S_IRWXG|S_IRWXO) && errno != EEXIST) {
+    RPRINTF("error creating control FIFO %s: %d\n", s->ctlfifo, errno);
+    goto err_msgfifo;
+  }
+
+  if (mkfifo(s->msgfifo, S_IRWXU|S_IRWXG|S_IRWXO) && errno != EEXIST) {
+    RPRINTF("error creating message FIFO %s: %d\n", s->msgfifo, errno);
+    goto err_msgfifo;
+  }
+
+  /* RDWR so that fd doesn't block select when no writer is present.
+   * Open into a local first: storing a failed open's -1 in the slot
+   * would make the slot look occupied. */
+  if ((ctlfd = open(s->ctlfifo, O_RDWR)) < 0) {
+    RPRINTF("error opening control FIFO %s: %d\n", s->ctlfifo, errno);
+    goto err_msgfifo;
+  }
+
+  if ((s->msgfd = open(s->msgfifo, O_RDWR)) < 0) {
+    RPRINTF("error opening message FIFO %s: %d\n", s->msgfifo, errno);
+    goto err_openctlfifo;
+  }
+
+  dd->io_fd[s->ctl_fd_idx] = ctlfd;
+
+  RPRINTF("control FIFO %s\n", s->ctlfifo);
+  RPRINTF("message FIFO %s\n", s->msgfifo);
+
+  return 0;
+
+ err_openctlfifo:
+  close(ctlfd);
+ err_msgfifo:
+  free(s->msgfifo);
+  s->msgfifo = NULL;
+ err_ctlfifo:
+  free(s->ctlfifo);
+  s->ctlfifo = NULL;
+  s->ctl_fd_idx = -1;
+  return -1;
+}
+
+/*
+ * Undo add_ctl_hook: close the control FIFO and free its io_fd slot,
+ * close the message FIFO, and remove and free both FIFO names.
+ *
+ * Safe to call when no hook is installed (s->ctl_fd_idx < 0 and NULL
+ * names): nothing is indexed, closed or unlinked in that case.
+ */
+static void del_ctl_hook(struct disk_driver* dd)
+{
+  struct tdremus_state *s = (struct tdremus_state *)dd->private;
+
+  /* a negative index means no slot was ever assigned */
+  if (s->ctl_fd_idx >= 0 && dd->io_fd[s->ctl_fd_idx]) {
+    close(dd->io_fd[s->ctl_fd_idx]);
+    dd->io_fd[s->ctl_fd_idx] = 0;
+  }
+  s->ctl_fd_idx = -1;
+
+  if (s->ctlfifo) {
+    unlink(s->ctlfifo);
+    free(s->ctlfifo);
+    s->ctlfifo = NULL;
+  }
+
+  /* s->msgfd is only meaningful while s->msgfifo is set; it can be
+   * negative if an earlier reopen of the FIFO failed */
+  if (s->msgfifo) {
+    if (s->msgfd >= 0)
+      close(s->msgfd);
+    s->msgfd = -1;
+    unlink(s->msgfifo);
+    free(s->msgfifo);
+    s->msgfifo = NULL;
+  }
+}
+
+/* interface */
+
+/*
+ * Open a Remus device.
+ *
+ * Resets the driver state, parses name (see get_driver), allocates and
+ * opens the underlying driver on s->path, sets up the control FIFOs and
+ * then tries to bind the replication port: a successful bind selects
+ * server mode, a bind result of -2 selects client mode, any other bind
+ * failure is an error.
+ *
+ * Returns 0 on success.  On failure everything acquired so far is
+ * released and a non-zero code is returned (-EIO if the mode switch
+ * failed, otherwise the failing step's result).
+ */
+static int tdremus_open (struct disk_driver *dd, const char *name,
+                         td_flag_t flags)
+{
+  struct tdremus_state *s = (struct tdremus_state *)dd->private;
+  int rc;
+
+  RPRINTF("opening %s\n", name);
+
+  memset(s, 0, sizeof(*s));
+  s->sfd = -1;
+  s->rfd = -1;
+  s->ctl_fd_idx = -1;
+  s->rfd_idx = -1;
+
+  if ((rc = get_driver(dd, name)))
+    return rc;
+
+  if (!(s->driver_data = malloc(s->driver->private_data_size))) {
+    RPRINTF("could not allocate driver data\n");
+    rc = -ENOMEM;
+    goto err_path;
+  }
+
+  /* the underlying driver expects its own state in dd->private */
+  dd->private = s->driver_data;
+  rc = s->driver->td_open(dd, s->path, flags);
+  dd->private = s;
+  if (rc) {
+    RPRINTF("could not open driver\n");
+    goto err_data;
+  }
+
+  if ((rc = add_ctl_hook(dd, name))) {
+    RPRINTF("error setting up control channel\n");
+    goto err_driver;
+  }
+
+  if ((rc = remus_bind(s))) {
+    if (rc != -2)
+      goto err_ctl;
+    rc = switch_mode(dd, mode_client);
+  } else
+    rc = switch_mode(dd, mode_server);
+
+  if (rc) {
+    tdremus_close(dd);
+    return -EIO;
+  }
+
+  return 0;
+
+  /* unwind in reverse order of acquisition */
+ err_ctl:
+  del_ctl_hook(dd);
+ err_driver:
+  dd->private = s->driver_data;
+  s->driver->td_close(dd);
+  dd->private = s;
+ err_data:
+  free(s->driver_data);
+  s->driver_data = NULL;
+ err_path:
+  free(s->path);
+  s->path = NULL;
+  return rc;
+}
+
+/* Read entry point: use the mode-specific queue_read handler if one is
+ * installed, otherwise forward straight to the underlying driver.
+ * Returns whatever the chosen handler returns. */
+static int tdremus_queue_read(struct disk_driver *dd, uint64_t sector,
+                              int nb_sectors, char *buf, td_callback_t cb,
+                              int id, void *private)
+{
+  struct tdremus_state *state = (struct tdremus_state *)dd->private;
+  int ret;
+
+  if (!state->queue_read) {
+    /* the underlying driver expects its own state in dd->private */
+    dd->private = state->driver_data;
+    ret = state->driver->td_queue_read(dd, sector, nb_sectors, buf, cb, id,
+                                       private);
+    dd->private = state;
+
+    return ret;
+  }
+
+  return state->queue_read(dd, sector, nb_sectors, buf, cb, id, private);
+}
+
+/* Write entry point: use the mode-specific queue_write handler if one is
+ * installed, otherwise forward straight to the underlying driver.
+ * Returns whatever the chosen handler returns. */
+static int tdremus_queue_write(struct disk_driver *dd, uint64_t sector,
+                               int nb_sectors, char *buf, td_callback_t cb,
+                               int id, void *private)
+{
+  struct tdremus_state *state = (struct tdremus_state *)dd->private;
+  int ret;
+
+  if (!state->queue_write) {
+    /* the underlying driver expects its own state in dd->private */
+    dd->private = state->driver_data;
+    ret = state->driver->td_queue_write(dd, sector, nb_sectors, buf, cb, id,
+                                        private);
+    dd->private = state;
+
+    return ret;
+  }
+
+  return state->queue_write(dd, sector, nb_sectors, buf, cb, id, private);
+}
+
+/* Submit entry point: use the mode-specific submit handler if one is
+ * installed, otherwise forward straight to the underlying driver.
+ * Returns whatever the chosen handler returns. */
+static int tdremus_submit(struct disk_driver *dd)
+{
+  struct tdremus_state *state = (struct tdremus_state *)dd->private;
+  int ret;
+
+  if (!state->submit) {
+    /* the underlying driver expects its own state in dd->private */
+    dd->private = state->driver_data;
+    ret = state->driver->td_submit(dd);
+    dd->private = state;
+
+    return ret;
+  }
+
+  return state->submit(dd);
+}
+
+/* Close the device: close the underlying driver, then release the driver
+ * data, the listening socket, the replication channel, the parent path
+ * and the control hook.  Returns the underlying driver's td_close
+ * result. */
+static int tdremus_close(struct disk_driver *dd)
+{
+  struct tdremus_state *state = (struct tdremus_state *)dd->private;
+  int ret;
+
+  RPRINTF("closing\n");
+
+  /* the underlying driver expects its own state in dd->private */
+  dd->private = state->driver_data;
+  ret = state->driver->td_close(dd);
+  dd->private = state;
+
+  /* free(NULL) is a no-op, so no guard is needed */
+  free(state->driver_data);
+  state->driver_data = NULL;
+
+  if (state->sfd >= 0) {
+    close(state->sfd);
+    state->sfd = -1;
+  }
+
+  if (state->rfd >= 0)
+    close_rfd(dd);
+
+  free(state->path);
+  state->path = NULL;
+
+  del_ctl_hook(dd);
+
+  return ret;
+}
+
+/* Dispatch an event on io_fd slot sid: the control slot goes to do_ctl,
+ * the replication slot to the client or server replication handler
+ * (chosen by s->mode), and any other slot to the underlying driver.
+ * Returns the chosen handler's result. */
+static int tdremus_do_callbacks(struct disk_driver *dd, int sid)
+{
+  struct tdremus_state *state = (struct tdremus_state *)dd->private;
+  int ret;
+
+  if (sid == state->ctl_fd_idx)
+    return do_ctl(dd);
+
+  if (sid == state->rfd_idx)
+    return (state->mode == mode_client) ? client_do_replicate(dd)
+                                        : server_do_replicate(dd);
+
+  /* the underlying driver expects its own state in dd->private */
+  dd->private = state->driver_data;
+  ret = state->driver->td_do_callbacks(dd, sid);
+  dd->private = state;
+
+  return ret;
+}
+
+/* Forward td_get_parent_id to the underlying driver, temporarily
+ * swapping dd->private to that driver's own state.  Returns its
+ * result. */
+static int tdremus_get_parent_id(struct disk_driver *dd, struct disk_id *id)
+{
+  struct tdremus_state *state = (struct tdremus_state *)dd->private;
+  int ret;
+
+  dd->private = state->driver_data;
+  ret = state->driver->td_get_parent_id(dd, id);
+  dd->private = state;
+
+  return ret;
+}
+
+/* Forward td_validate_parent to the underlying driver, temporarily
+ * swapping dd->private to that driver's own state.  Returns its
+ * result. */
+static int tdremus_validate_parent(struct disk_driver *dd,
+                                   struct disk_driver *parent, td_flag_t flags)
+{
+  struct tdremus_state *state = (struct tdremus_state *)dd->private;
+  int ret;
+
+  dd->private = state->driver_data;
+  ret = state->driver->td_validate_parent(dd, parent, flags);
+  dd->private = state;
+
+  return ret;
+}
+
+/* Driver descriptor for the Remus replication driver.  Each td_* hook
+ * points at the tdremus_* wrapper of the same name defined in this file;
+ * private_data_size is the size of struct tdremus_state, the type these
+ * wrappers read from dd->private. */
+struct tap_disk tapdisk_remus = {
+ .disk_type = "tapdisk_remus",
+ .private_data_size = sizeof(struct tdremus_state),
+ .td_open = tdremus_open,
+ .td_queue_read = tdremus_queue_read,
+ .td_queue_write = tdremus_queue_write,
+ .td_submit = tdremus_submit,
+ .td_close = tdremus_close,
+ .td_do_callbacks = tdremus_do_callbacks,
+ .td_get_parent_id = tdremus_get_parent_id,
+ .td_validate_parent = tdremus_validate_parent
+};
diff --git a/tools/blktap/drivers/hashtable.c b/tools/blktap/drivers/hashtable.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap/drivers/hashtable.c
@@ -0,0 +1,274 @@
+/* Copyright (C) 2004 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+#include "hashtable.h"
+#include "hashtable_private.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/*
+Credit for primes table: Aaron Krowne
+ http://br.endernet.org/~akrowne/
+ http://planetmath.org/encyclopedia/GoodHashTablePrimes.html
+*/
+/* Permitted bucket counts, in increasing order (each roughly twice the
+ * previous one).  create_hashtable picks the first entry larger than the
+ * requested minimum; hashtable_expand moves to the next entry. */
+static const unsigned int primes[] = {
+53, 97, 193, 389,
+769, 1543, 3079, 6151,
+12289, 24593, 49157, 98317,
+196613, 393241, 786433, 1572869,
+3145739, 6291469, 12582917, 25165843,
+50331653, 100663319, 201326611, 402653189,
+805306457, 1610612741
+};
+/* number of entries in primes[] */
+const unsigned int prime_table_length = sizeof(primes)/sizeof(primes[0]);
+/* fraction of the bucket count used to compute a table's loadlimit;
+ * hashtable_insert tries to expand once entrycount exceeds loadlimit */
+const float max_load_factor = 0.65;
+
+/*****************************************************************************/
+/* Allocate an empty hashtable whose bucket count is the first entry of
+ * primes[] greater than minsize.  hashf and eqf are stored as the table's
+ * hash and key-equality callbacks.
+ * Returns NULL if minsize exceeds 2^30 or an allocation fails. */
+struct hashtable *
+create_hashtable(unsigned int minsize,
+                 unsigned int (*hashf) (void*),
+                 int (*eqf) (void*,void*))
+{
+    struct hashtable *table;
+    unsigned int slot = 0;
+    unsigned int nbuckets = primes[0];
+
+    /* refuse tables that would be too large */
+    if (minsize > (1u << 30))
+        return NULL;
+
+    /* bucket count is always taken from the prime table */
+    while (slot < prime_table_length) {
+        if (primes[slot] > minsize) {
+            nbuckets = primes[slot];
+            break;
+        }
+        slot++;
+    }
+
+    table = (struct hashtable *)malloc(sizeof(struct hashtable));
+    if (table == NULL)
+        return NULL; /* out of memory */
+
+    table->table = (struct entry **)malloc(sizeof(struct entry*) * nbuckets);
+    if (table->table == NULL) {
+        free(table);
+        return NULL; /* out of memory */
+    }
+    memset(table->table, 0, nbuckets * sizeof(struct entry *));
+
+    table->hashfn      = hashf;
+    table->eqfn        = eqf;
+    table->entrycount  = 0;
+    table->primeindex  = slot;
+    table->tablelength = nbuckets;
+    table->loadlimit   = (unsigned int) ceil(nbuckets * max_load_factor);
+
+    return table;
+}
+
+/*****************************************************************************/
+/* Compute the table's internal hash of key k: the user-supplied hash
+ * function's result is passed through a fixed sequence of shift, add and
+ * xor steps.  The original note says this is meant to protect against
+ * poor hash functions and is taken from the Java 1.4 hashtable source. */
+unsigned int
+hash(struct hashtable *h, void *k)
+{
+    unsigned int mixed = h->hashfn(k);
+
+    mixed += ~(mixed << 9);
+    mixed ^= ((mixed >> 14) | (mixed << 18)); /* >>> */
+    mixed += (mixed << 4);
+    mixed ^= ((mixed >> 10) | (mixed << 22)); /* >>> */
+
+    return mixed;
+}
+
+/*****************************************************************************/
+/*
+ * Grow the table to the next size in primes[] and redistribute every
+ * entry according to its stored hash.
+ *
+ * A fresh bucket array is tried first; if that allocation fails the
+ * existing array is grown in place with realloc.
+ *
+ * Returns non-zero (-1) on success, 0 if the table is already at the
+ * largest size or no memory could be obtained (the table is then left
+ * unchanged).
+ */
+static int
+hashtable_expand(struct hashtable *h)
+{
+    /* Double the size of the table to accommodate more entries */
+    struct entry **newtable;
+    struct entry *e;
+    struct entry **pE;
+    unsigned int newsize, i, index;
+    /* Check we're not hitting max capacity */
+    if (h->primeindex == (prime_table_length - 1)) return 0;
+    newsize = primes[++(h->primeindex)];
+
+    newtable = (struct entry **)malloc(sizeof(struct entry*) * newsize);
+    if (NULL != newtable)
+    {
+        memset(newtable, 0, newsize * sizeof(struct entry *));
+        /* This algorithm is not 'stable'. ie. it reverses the list
+         * when it transfers entries between the tables */
+        for (i = 0; i < h->tablelength; i++) {
+            while (NULL != (e = h->table[i])) {
+                h->table[i] = e->next;
+                index = indexFor(newsize,e->h);
+                e->next = newtable[index];
+                newtable[index] = e;
+            }
+        }
+        free(h->table);
+        h->table = newtable;
+    }
+    /* Plan B: realloc instead */
+    else
+    {
+        newtable = (struct entry **)
+                   realloc(h->table, newsize * sizeof(struct entry *));
+        if (NULL == newtable) { (h->primeindex)--; return 0; }
+        h->table = newtable;
+        /* Clear the newly added buckets.  The destination is the address
+         * of the first new slot and the length is in bytes; the original
+         * passed the slot's (uninitialised) contents as the destination
+         * and an element count as the length. */
+        memset(&newtable[h->tablelength], 0,
+               (newsize - h->tablelength) * sizeof(struct entry *));
+        for (i = 0; i < h->tablelength; i++) {
+            for (pE = &(newtable[i]), e = *pE; e != NULL; e = *pE) {
+                index = indexFor(newsize,e->h);
+                if (index == i)
+                {
+                    /* stays in this bucket */
+                    pE = &(e->next);
+                }
+                else
+                {
+                    /* unlink and push onto its new bucket */
+                    *pE = e->next;
+                    e->next = newtable[index];
+                    newtable[index] = e;
+                }
+            }
+        }
+    }
+    h->tablelength = newsize;
+    h->loadlimit   = (unsigned int) ceil(newsize * max_load_factor);
+    return -1;
+}
+
+/*****************************************************************************/
+/* Return the number of entries currently stored in the table. */
+unsigned int
+hashtable_count(struct hashtable *h)
+{
+    unsigned int stored = h->entrycount;
+
+    return stored;
+}
+
+/*****************************************************************************/
+/* Insert the pair (k, v) at the head of its bucket.  Duplicate keys are
+ * not detected.  Returns non-zero (-1) on success, 0 if the entry could
+ * not be allocated. */
+int
+hashtable_insert(struct hashtable *h, void *k, void *v)
+{
+    struct entry *node;
+    unsigned int bucket;
+
+    h->entrycount++;
+    if (h->entrycount > h->loadlimit)
+    {
+        /* Growing is best effort: if it fails the entry is still added
+         * to the current table, and the next insert tries again. */
+        hashtable_expand(h);
+    }
+
+    node = (struct entry *)malloc(sizeof(struct entry));
+    if (node == NULL)
+    {
+        /* out of memory: undo the count */
+        h->entrycount--;
+        return 0;
+    }
+
+    node->k = k;
+    node->v = v;
+    node->h = hash(h,k);
+
+    bucket = indexFor(h->tablelength,node->h);
+    node->next = h->table[bucket];
+    h->table[bucket] = node;
+
+    return -1;
+}
+
+/*****************************************************************************/
+/* Look up key k.  Returns the value of the first matching entry in its
+ * bucket, or NULL if there is none. */
+void *
+hashtable_search(struct hashtable *h, void *k)
+{
+    unsigned int wanted = hash(h,k);
+    unsigned int bucket = indexFor(h->tablelength,wanted);
+    struct entry *cur;
+
+    for (cur = h->table[bucket]; cur != NULL; cur = cur->next)
+    {
+        /* compare stored hashes first; the key comparison is only made
+         * when they match */
+        if (wanted != cur->h)
+            continue;
+        if (h->eqfn(k, cur->k))
+            return cur->v;
+    }
+
+    return NULL;
+}
+
+/*****************************************************************************/
+/*
+ * Remove the first entry matching key k.
+ *
+ * The entry and its stored key are freed (the key via freekey); the
+ * value is not freed but returned to the caller.  Returns NULL if no
+ * entry matches.
+ */
+void *
+hashtable_remove(struct hashtable *h, void *k)
+{
+    /* TODO: consider compacting the table when the load factor drops enough,
+     *       or provide a 'compact' method. */
+
+    struct entry *e;
+    struct entry **pE;
+    void *v;
+    unsigned int hashvalue, index;
+
+    hashvalue = hash(h,k);
+    /* reuse the hash computed above rather than hashing the key twice */
+    index = indexFor(h->tablelength,hashvalue);
+    pE = &(h->table[index]);
+    e = *pE;
+    while (NULL != e)
+    {
+        /* Check hash value to short circuit heavier comparison */
+        if ((hashvalue == e->h) && (h->eqfn(k, e->k)))
+        {
+            /* unlink via the pointer that led here */
+            *pE = e->next;
+            h->entrycount--;
+            v = e->v;
+            freekey(e->k);
+            free(e);
+            return v;
+        }
+        pE = &(e->next);
+        e = e->next;
+    }
+    return NULL;
+}
+
+/*****************************************************************************/
+/* Free the table, all of its entries and their keys (via freekey).  The
+ * stored values are freed as well when free_values is non-zero. */
+void
+hashtable_destroy(struct hashtable *h, int free_values)
+{
+    struct entry **buckets = h->table;
+    struct entry *cur, *doomed;
+    unsigned int b;
+
+    for (b = 0; b < h->tablelength; b++)
+    {
+        cur = buckets[b];
+        while (cur != NULL)
+        {
+            /* step past the entry before releasing it */
+            doomed = cur;
+            cur = cur->next;
+            freekey(doomed->k);
+            if (free_values)
+                free(doomed->v);
+            free(doomed);
+        }
+    }
+
+    free(h->table);
+    free(h);
+}
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/tools/blktap/drivers/hashtable_itr.c b/tools/blktap/drivers/hashtable_itr.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap/drivers/hashtable_itr.c
@@ -0,0 +1,188 @@
+/* Copyright (C) 2002, 2004 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+#include "hashtable.h"
+#include "hashtable_private.h"
+#include "hashtable_itr.h"
+#include <stdlib.h> /* defines NULL */
+
+/*****************************************************************************/
+/* hashtable_iterator - iterator constructor */
+
+/* Allocate an iterator positioned on the first entry of h.  For an empty
+ * table the iterator has a NULL current entry and its index equals the
+ * table length.  Returns NULL if the allocation fails. */
+struct hashtable_itr *
+hashtable_iterator(struct hashtable *h)
+{
+    unsigned int bucket;
+    unsigned int nbuckets = h->tablelength;
+    struct hashtable_itr *it;
+
+    it = (struct hashtable_itr *)malloc(sizeof(struct hashtable_itr));
+    if (it == NULL)
+        return NULL;
+
+    it->h = h;
+    it->e = NULL;
+    it->parent = NULL;
+    it->index = nbuckets;
+
+    if (h->entrycount != 0)
+    {
+        /* position on the head of the first non-empty bucket */
+        for (bucket = 0; bucket < nbuckets; bucket++)
+        {
+            if (h->table[bucket] == NULL)
+                continue;
+            it->e = h->table[bucket];
+            it->index = bucket;
+            break;
+        }
+    }
+
+    return it;
+}
+
+/*****************************************************************************/
+/* key - return the key of the (key,value) pair at the current position */
+/* value - return the value of the (key,value) pair at the current position */
+
+/* Return the key of the entry the iterator currently points at. */
+void *
+hashtable_iterator_key(struct hashtable_itr *i)
+{
+    struct entry *current = i->e;
+
+    return current->k;
+}
+
+/* Return the value of the entry the iterator currently points at. */
+void *
+hashtable_iterator_value(struct hashtable_itr *i)
+{
+    struct entry *current = i->e;
+
+    return current->v;
+}
+
+/*****************************************************************************/
+/* advance - advance the iterator to the next element
+ * returns zero if advanced to end of table */
+
+/* Move the iterator to the next entry: the next one in the current
+ * chain if there is one, otherwise the head of the next non-empty
+ * bucket.  Returns non-zero (-1) if it now points at an entry, 0 if the
+ * end of the table was reached (the current entry is then NULL). */
+int
+hashtable_iterator_advance(struct hashtable_itr *itr)
+{
+    unsigned int bucket, nbuckets;
+    struct entry **buckets;
+    struct entry *following;
+
+    /* already at the end */
+    if (itr->e == NULL)
+        return 0;
+
+    /* stay within the current chain while it lasts */
+    following = itr->e->next;
+    if (following != NULL)
+    {
+        itr->parent = itr->e;
+        itr->e = following;
+        return -1;
+    }
+
+    /* chain exhausted: continue from the following bucket */
+    nbuckets = itr->h->tablelength;
+    itr->parent = NULL;
+    bucket = ++(itr->index);
+
+    if (bucket < nbuckets)
+    {
+        buckets = itr->h->table;
+        for (; bucket < nbuckets; bucket++)
+        {
+            if (buckets[bucket] != NULL)
+            {
+                itr->index = bucket;
+                itr->e = buckets[bucket];
+                return -1;
+            }
+        }
+        itr->index = nbuckets;
+    }
+
+    itr->e = NULL;
+    return 0;
+}
+
+/*****************************************************************************/
+/* remove - remove the entry at the current iterator position
+ * and advance the iterator, if there is a successive
+ * element.
+ * If you want the value, read it before you remove:
+ * beware memory leaks if you don't.
+ * Returns zero if end of iteration. */
+
+/* Remove the entry at the iterator's position and advance to the next
+ * one.  The entry and its key (via freekey) are freed; the value is not,
+ * so read it beforehand if it must be released.  Returns the result of
+ * the advance: 0 if the end of the table was reached. */
+int
+hashtable_iterator_remove(struct hashtable_itr *itr)
+{
+    struct entry *victim = itr->e;
+    struct entry *before = itr->parent;
+    int moved;
+
+    /* unlink: either from the bucket head or from its predecessor */
+    if (before != NULL)
+        before->next = victim->next;
+    else
+        itr->h->table[itr->index] = victim->next;
+
+    itr->h->entrycount--;
+    freekey(victim->k);
+
+    /* Advancing within the same chain records the removed entry as the
+     * parent; put the real predecessor back in that case. */
+    moved = hashtable_iterator_advance(itr);
+    if (itr->parent == victim)
+        itr->parent = before;
+
+    free(victim);
+    return moved;
+}
+
+/*****************************************************************************/
+/* Position itr on the first entry of h matching key k.  Returns non-zero
+ * (-1) and overwrites the iterator if found; returns 0 and leaves the
+ * iterator untouched otherwise. */
+int
+hashtable_iterator_search(struct hashtable_itr *itr,
+                          struct hashtable *h, void *k)
+{
+    unsigned int wanted = hash(h,k);
+    unsigned int bucket = indexFor(h->tablelength,wanted);
+    struct entry *cur;
+    struct entry *prev = NULL;
+
+    for (cur = h->table[bucket]; cur != NULL; prev = cur, cur = cur->next)
+    {
+        /* compare stored hashes first; the key comparison is only made
+         * when they match */
+        if (wanted != cur->h || !h->eqfn(k, cur->k))
+            continue;
+
+        itr->h = h;
+        itr->index = bucket;
+        itr->parent = prev;
+        itr->e = cur;
+        return -1;
+    }
+
+    return 0;
+}
+
+
+/*
+ * Copyright (c) 2002, 2004, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/tools/blktap/drivers/hashtable_itr.h b/tools/blktap/drivers/hashtable_itr.h
new file mode 100644
--- /dev/null
+++ b/tools/blktap/drivers/hashtable_itr.h
@@ -0,0 +1,112 @@
+/* Copyright (C) 2002, 2004 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+#ifndef __HASHTABLE_ITR_CWC22__
+#define __HASHTABLE_ITR_CWC22__
+#include "hashtable.h"
+#include "hashtable_private.h" /* needed to enable inlining */
+
+/*****************************************************************************/
+/* This struct is only concrete here to allow the inlining of two of the
+ * accessor functions. */
+struct hashtable_itr
+{
+ struct hashtable *h; /* table this iterator walks / was searched in */
+ struct entry *e; /* entry at the current position (read by the key/value accessors) */
+ struct entry *parent; /* entry visited just before e in its chain, as hashtable_iterator_search records it */
+ unsigned int index; /* bucket index stored together with e by hashtable_iterator_search */
+};
+
+
+/*****************************************************************************/
+/* hashtable_iterator
+ */
+
+struct hashtable_itr *
+hashtable_iterator(struct hashtable *h);
+
+/*****************************************************************************/
+/* hashtable_iterator_key
+ * - return the key of the (key,value) pair at the current position */
+/* NOTE(review): "extern inline" presumably relies on GNU89 inline semantics; confirm the build does not use C99 inline rules. */
+extern inline void *
+hashtable_iterator_key(struct hashtable_itr *i)
+{
+ return i->e->k;
+}
+
+/*****************************************************************************/
+/* value - return the value of the (key,value) pair at the current position */
+
+extern inline void *
+hashtable_iterator_value(struct hashtable_itr *i)
+{
+ return i->e->v;
+}
+
+/*****************************************************************************/
+/* advance - advance the iterator to the next element
+ * returns zero if advanced to end of table */
+
+int
+hashtable_iterator_advance(struct hashtable_itr *itr);
+
+/*****************************************************************************/
+/* remove - remove current element and advance the iterator to the next element
+ * NB: if you need the value to free it, read it before
+ * removing. ie: beware memory leaks!
+ * returns zero if advanced to end of table */
+
+int
+hashtable_iterator_remove(struct hashtable_itr *itr);
+
+/*****************************************************************************/
+/* search - overwrite the supplied iterator, to point to the entry
+ * matching the supplied key.
+ * h points to the hashtable to be searched.
+ * returns zero if not found. */
+int
+hashtable_iterator_search(struct hashtable_itr *itr,
+ struct hashtable *h, void *k);
+
+#define DEFINE_HASHTABLE_ITERATOR_SEARCH(fnname, keytype) \
+int fnname (struct hashtable_itr *i, struct hashtable *h, keytype *k) \
+{ \
+ return (hashtable_iterator_search(i,h,k)); \
+}
+
+
+
+#endif /* __HASHTABLE_ITR_CWC22__*/
+
+/*
+ * Copyright (c) 2002, 2004, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/tools/blktap/drivers/hashtable_utility.c b/tools/blktap/drivers/hashtable_utility.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap/drivers/hashtable_utility.c
@@ -0,0 +1,71 @@
+/* Copyright (C) 2002 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+#include "hashtable.h"
+#include "hashtable_private.h"
+#include "hashtable_utility.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/*****************************************************************************/
+/* hashtable_change
+ *
+ * function to change the value associated with a key, where there already
+ * exists a value bound to the key in the hashtable.
+ * Source due to Holger Schemel.
+ * The old value is free()d unless it is the very pointer being stored.
+ * Returns -1 if the key was found and rebound, 0 if it is not present. */
+int
+hashtable_change(struct hashtable *h, void *k, void *v)
+{
+    struct entry *e;
+    unsigned int hashvalue, index;
+    hashvalue = hash(h,k);
+    index = indexFor(h->tablelength,hashvalue);
+    e = h->table[index];
+    while (NULL != e)
+    {
+        /* Check hash value to short circuit heavier comparison */
+        if ((hashvalue == e->h) && (h->eqfn(k, e->k)))
+        {
+            if (e->v != v) free(e->v); /* rebinding the same pointer must not free it */
+            e->v = v;
+            return -1;
+        }
+        e = e->next;
+    }
+    return 0;
+}
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/tools/blktap/drivers/hashtable_utility.h b/tools/blktap/drivers/hashtable_utility.h
new file mode 100644
--- /dev/null
+++ b/tools/blktap/drivers/hashtable_utility.h
@@ -0,0 +1,55 @@
+/* Copyright (C) 2002 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+#ifndef __HASHTABLE_CWC22_UTILITY_H__
+#define __HASHTABLE_CWC22_UTILITY_H__
+
+/*****************************************************************************
+ * hashtable_change
+ *
+ * function to change the value associated with a key, where there already
+ * exists a value bound to the key in the hashtable.
+ * Source due to Holger Schemel.
+ *
+ * @name hashtable_change
+ * @param h the hashtable
+ * @param k key whose existing binding is changed
+ * @param v new value; the value it replaces is passed to free()
+ * @return -1 if the key was found and rebound, 0 if it is not in the table
+ */
+int
+hashtable_change(struct hashtable *h, void *k, void *v);
+
+#endif /* __HASHTABLE_CWC22_UTILITY_H__ */
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/tools/blktap/drivers/tapdisk.h b/tools/blktap/drivers/tapdisk.h
--- a/tools/blktap/drivers/tapdisk.h
+++ b/tools/blktap/drivers/tapdisk.h
@@ -160,6 +160,7 @@
extern struct tap_disk tapdisk_ram;
extern struct tap_disk tapdisk_qcow;
extern struct tap_disk tapdisk_qcow2;
+extern struct tap_disk tapdisk_remus;
/*Define Individual Disk Parameters here */
@@ -229,6 +230,17 @@
#endif
};
+static disk_info_t remus_disk = {
+ DISK_TYPE_REMUS,
+ "replicated disk (remus)",
+ "remus",
+ 0,
+ 0,
+#ifdef TAPDISK
+ &tapdisk_remus,
+#endif
+};
+
/*Main disk info array */
static disk_info_t *dtypes[] = {
&aio_disk,
@@ -237,6 +249,7 @@
&ram_disk,
&qcow_disk,
&qcow2_disk,
+ &remus_disk
};
typedef struct driver_list_entry {
diff --git a/tools/blktap/lib/blktaplib.h b/tools/blktap/lib/blktaplib.h
--- a/tools/blktap/lib/blktaplib.h
+++ b/tools/blktap/lib/blktaplib.h
@@ -219,6 +219,7 @@
#define DISK_TYPE_RAM 3
#define DISK_TYPE_QCOW 4
#define DISK_TYPE_QCOW2 5
+#define DISK_TYPE_REMUS 6
/* xenstore/xenbus: */
#define DOMNAME "Domain-0"
diff --git a/tools/python/xen/xend/server/BlktapController.py b/tools/python/xen/xend/server/BlktapController.py
--- a/tools/python/xen/xend/server/BlktapController.py
+++ b/tools/python/xen/xend/server/BlktapController.py
@@ -14,6 +14,7 @@
'ram',
'qcow',
'qcow2',
+ 'remus',
'ioemu',
'tapdisk',
^ permalink raw reply [flat|nested] 10+ messages in thread* [PATCH 7 of 9] Buffer checkpoint data locally until domain has resumed execution
2009-05-14 0:19 [PATCH 0 of 9] Xen support for Remus Brendan Cully
` (5 preceding siblings ...)
2009-05-14 0:19 ` [PATCH 6 of 9] Remus tapdisk proxy Brendan Cully
@ 2009-05-14 0:19 ` Brendan Cully
2009-05-14 0:19 ` [PATCH 8 of 9] Do not bother with to_skip/to_fix bitmaps after the first final round Brendan Cully
2009-05-14 0:19 ` [PATCH 9 of 9] Do bitmap scan word-by-word before bit-by-bit Brendan Cully
8 siblings, 0 replies; 10+ messages in thread
From: Brendan Cully @ 2009-05-14 0:19 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Brendan Cully <brendan@cs.ubc.ca>
# Date 1240355510 25200
# Node ID dd394ec3ad784da4cfe789c380aa47092dfb0f20
# Parent 03fd0c9729f3d87e7803afb170dfc3cdff184998
Buffer checkpoint data locally until domain has resumed execution.
Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
diff --git a/tools/libxc/xc_domain_save.c b/tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c
+++ b/tools/libxc/xc_domain_save.c
@@ -57,6 +57,15 @@
/* Address size of the guest */
unsigned int guest_width;
+/* buffer for output */
+struct outbuf {
+ void* buf; /* malloc'ed backing store (see outbuf_init) */
+ size_t size; /* capacity of buf in bytes */
+ size_t pos; /* bytes currently queued; the next write lands at buf + pos */
+};
+
+#define OUTBUF_SIZE (16384 * 1024)
+
/* grep fodder: machine_to_phys */
#define mfn_to_pfn(_mfn) (live_m2p[(_mfn)])
@@ -159,6 +168,93 @@
return rc;
}
+static int outbuf_init(struct outbuf* ob, size_t size)
+{
+ memset(ob, 0, sizeof(*ob)); /* start empty: all fields zeroed */
+ /* Allocate size bytes of backing store. Returns 0 on success; on malloc failure returns -1 and ob stays zeroed. */
+ if (!(ob->buf = malloc(size))) {
+ DPRINTF("error allocating output buffer of size %zu\n", size);
+ return -1;
+ }
+ /* record the capacity only once the allocation has succeeded */
+ ob->size = size;
+
+ return 0;
+}
+
+static inline int outbuf_resize(struct outbuf* ob, size_t nsize)
+{
+    void* nbuf;
+    /* Grow-only resize to nsize bytes: 0 on success or no-op, -1 on failure. */
+    if (nsize <= ob->size)
+        return 0;
+    /* realloc leaves the old block intact on failure, so ob stays usable */
+    if (!(nbuf = realloc(ob->buf, nsize))) {
+        DPRINTF("error reallocating output buffer from %zu to %zu\n",
+                ob->size, nsize);
+        return -1;
+    }
+
+    ob->buf = nbuf;
+    ob->size = nsize; /* publish the new capacity so outbuf_write can use it */
+    return 0;
+}
+
+static inline int outbuf_write(struct outbuf* ob, void* buf, size_t len)
+{
+ if (len > ob->size - ob->pos) { /* not enough free space left in ob */
+ DPRINTF("outbuf_write: %zu > %zu@%zu\n", len, ob->size - ob->pos, ob->pos);
+ return -1;
+ }
+ /* Append len bytes from buf at the current position; this never grows or flushes ob. */
+ memcpy(ob->buf + ob->pos, buf, len);
+ ob->pos += len;
+ /* 0 = data queued; the -1 above means it did not fit and nothing was copied */
+ return 0;
+}
+
+/* Drain every queued byte to fd, retrying on EAGAIN/EINTR (prep for
+ * nonblocking I/O). Returns 0 with ob emptied, -1 on any other error. */
+static int outbuf_flush(struct outbuf* ob, int fd)
+{
+    ssize_t rc;     /* write() returns ssize_t; an int could truncate it */
+    size_t cur = 0; /* bytes written so far; unsigned like ob->pos */
+
+    while (cur < ob->pos) {
+        rc = write(fd, (char*)ob->buf + cur, ob->pos - cur);
+        if (rc < 0) {
+            if (errno != EAGAIN && errno != EINTR) {
+                DPRINTF("error flushing output: %d\n", errno);
+                return -1;
+            }
+            /* transient failure: retry the same span */
+            continue;
+        }
+        /* short writes just advance the cursor and loop again */
+        cur += (size_t)rc;
+    }
+
+    ob->pos = 0;
+
+    return 0;
+}
+
+/* if there's no room in the buffer, flush it and try again. */
+static inline int outbuf_hardwrite(struct outbuf* ob, int fd, void* buf,
+ size_t len)
+{
+ if (!len)
+ return 0;
+ /* fast path: the data fits in the space that is left */
+ if (!outbuf_write(ob, buf, len))
+ return 0;
+ /* no room: drain everything queued so far to fd */
+ if (outbuf_flush(ob, fd) < 0)
+ return -1;
+ /* retry on the emptied buffer; this still returns -1 if len exceeds ob->size */
+ return outbuf_write(ob, buf, len);
+}
+
#ifdef ADAPTIVE_SAVE
/*
@@ -799,6 +895,10 @@
unsigned long mfn;
+ struct outbuf ob;
+
+ outbuf_init(&ob, OUTBUF_SIZE);
+
/* If no explicit control parameters given, use defaults */
max_iters = max_iters ? : DEF_MAX_ITERS;
max_factor = max_factor ? : DEF_MAX_FACTOR;
@@ -1171,13 +1271,21 @@
}
}
- if ( write_exact(io_fd, &batch, sizeof(unsigned int)) )
+ if ( last_iter )
+ rc = outbuf_hardwrite(&ob, io_fd, &batch, sizeof(unsigned int));
+ else
+ rc = write_exact(io_fd, &batch, sizeof(unsigned int));
+ if ( rc )
{
PERROR("Error when writing to state file (2)");
goto out;
}
- if ( write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
+ if ( last_iter )
+ rc = outbuf_hardwrite(&ob, io_fd, pfn_type, sizeof(unsigned long)*batch);
+ else
+ rc = write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch);
+ if ( rc )
{
PERROR("Error when writing to state file (3)");
goto out;
@@ -1199,9 +1307,14 @@
run of pages we may have previously acumulated */
if ( run )
{
- if ( ratewrite(io_fd, live,
- (char*)region_base+(PAGE_SIZE*(j-run)),
- PAGE_SIZE*run) != PAGE_SIZE*run )
+ if ( last_iter )
+ rc = outbuf_hardwrite(&ob, io_fd,(char*)region_base+(PAGE_SIZE*(j-run)),
+ PAGE_SIZE*run);
+ else
+ rc = ratewrite(io_fd, live,
+ (char*)region_base+(PAGE_SIZE*(j-run)),
+ PAGE_SIZE*run) != PAGE_SIZE*run;
+ if ( rc )
{
ERROR("Error when writing to state file (4a)"
" (errno %d)", errno);
@@ -1231,7 +1344,12 @@
goto out;
}
- if ( ratewrite(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
+ if ( last_iter )
+ rc = outbuf_hardwrite(&ob, io_fd, page, PAGE_SIZE);
+ else
+ rc = ratewrite(io_fd, live, page, PAGE_SIZE) !=
+ PAGE_SIZE;
+ if ( rc )
{
ERROR("Error when writing to state file (4b)"
" (errno %d)", errno);
@@ -1240,17 +1358,37 @@
}
else
{
+#if 0
/* We have a normal page: accumulate it for writing. */
run++;
+#else
+ /* We have a normal page: just write it directly. */
+ if ( last_iter )
+ rc = outbuf_hardwrite(&ob, io_fd, spage, PAGE_SIZE);
+ else
+ rc = ratewrite(io_fd, live, spage, PAGE_SIZE) !=
+ PAGE_SIZE;
+ if ( rc )
+ {
+ ERROR("Error when writing to state file (5)"
+ " (errno %d)", errno);
+ goto out;
+ }
+#endif
}
} /* end of the write out for this batch */
if ( run )
{
/* write out the last accumulated run of pages */
- if ( ratewrite(io_fd, live,
- (char*)region_base+(PAGE_SIZE*(j-run)),
- PAGE_SIZE*run) != PAGE_SIZE*run )
+ if ( last_iter )
+ rc = outbuf_hardwrite(&ob, io_fd, (char*)region_base+(PAGE_SIZE*(j-run)),
+ PAGE_SIZE*run);
+ else
+ rc = ratewrite(io_fd, live,
+ (char*)region_base+(PAGE_SIZE*(j-run)),
+ PAGE_SIZE*run) != PAGE_SIZE*run;
+ if ( rc )
{
ERROR("Error when writing to state file (4c)"
" (errno %d)", errno);
@@ -1275,9 +1413,11 @@
{
print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
+ /*
DPRINTF("Total pages sent= %ld (%.2fx)\n",
total_sent, ((float)total_sent)/p2m_size );
DPRINTF("(of which %ld were fixups)\n", needed_to_fix );
+ */
}
if ( last_iter && debug )
@@ -1288,7 +1428,7 @@
DPRINTF("Entering debug resend-all mode\n");
/* send "-1" to put receiver into debug mode */
- if ( write_exact(io_fd, &minusone, sizeof(int)) )
+ if ( outbuf_hardwrite(&ob, io_fd, &minusone, sizeof(int)) )
{
PERROR("Error when writing to state file (6)");
goto out;
@@ -1356,7 +1496,9 @@
}
} /* end of infinite for loop */
+ /*
DPRINTF("All memory is saved\n");
+ */
{
struct {
@@ -1380,7 +1522,7 @@
}
chunk.vcpumap = vcpumap;
- if ( write_exact(io_fd, &chunk, sizeof(chunk)) )
+ if ( outbuf_hardwrite(&ob, io_fd, &chunk, sizeof(chunk)) )
{
PERROR("Error when writing to state file");
goto out;
@@ -1400,7 +1542,7 @@
(unsigned long *)&chunk.data);
if ( (chunk.data != 0) &&
- write_exact(io_fd, &chunk, sizeof(chunk)) )
+ outbuf_hardwrite(&ob, io_fd, &chunk, sizeof(chunk)) )
{
PERROR("Error when writing the ident_pt for EPT guest");
goto out;
@@ -1411,7 +1553,7 @@
(unsigned long *)&chunk.data);
if ( (chunk.data != 0) &&
- write_exact(io_fd, &chunk, sizeof(chunk)) )
+ outbuf_hardwrite(&ob, io_fd, &chunk, sizeof(chunk)) )
{
PERROR("Error when writing the vm86 TSS for guest");
goto out;
@@ -1420,7 +1562,7 @@
/* Zero terminate */
i = 0;
- if ( write_exact(io_fd, &i, sizeof(int)) )
+ if ( outbuf_hardwrite(&ob, io_fd, &i, sizeof(int)) )
{
PERROR("Error when writing to state file (6')");
goto out;
@@ -1438,7 +1580,7 @@
(unsigned long *)&magic_pfns[1]);
xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
(unsigned long *)&magic_pfns[2]);
- if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
+ if ( outbuf_hardwrite(&ob, io_fd, magic_pfns, sizeof(magic_pfns)) )
{
PERROR("Error when writing to state file (7)");
goto out;
@@ -1452,13 +1594,13 @@
goto out;
}
- if ( write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
+ if ( outbuf_hardwrite(&ob, io_fd, &rec_size, sizeof(uint32_t)) )
{
PERROR("error write hvm buffer size");
goto out;
}
- if ( write_exact(io_fd, hvm_buf, rec_size) )
+ if ( outbuf_hardwrite(&ob, io_fd, hvm_buf, rec_size) )
{
PERROR("write HVM info failed!\n");
goto out;
@@ -1482,7 +1624,7 @@
j++;
}
- if ( write_exact(io_fd, &j, sizeof(unsigned int)) )
+ if ( outbuf_hardwrite(&ob, io_fd, &j, sizeof(unsigned int)) )
{
PERROR("Error when writing to state file (6a)");
goto out;
@@ -1496,7 +1638,7 @@
i++;
if ( (j == 1024) || (i == p2m_size) )
{
- if ( write_exact(io_fd, &pfntab, sizeof(unsigned long)*j) )
+ if ( outbuf_hardwrite(&ob, io_fd, &pfntab, sizeof(unsigned long)*j) )
{
PERROR("Error when writing to state file (6b)");
goto out;
@@ -1567,9 +1709,9 @@
FOLD_CR3(mfn_to_pfn(UNFOLD_CR3(ctxt.x64.ctrlreg[1])));
}
- if ( write_exact(io_fd, &ctxt, ((guest_width==8)
- ? sizeof(ctxt.x64)
- : sizeof(ctxt.x32))) )
+ if ( outbuf_hardwrite(&ob, io_fd, &ctxt, ((guest_width==8)
+ ? sizeof(ctxt.x64)
+ : sizeof(ctxt.x32))) )
{
PERROR("Error when writing to state file (1)");
goto out;
@@ -1583,7 +1725,7 @@
ERROR("No extended context for VCPU%d", i);
goto out;
}
- if ( write_exact(io_fd, &domctl.u.ext_vcpucontext, 128) )
+ if ( outbuf_hardwrite(&ob, io_fd, &domctl.u.ext_vcpucontext, 128) )
{
PERROR("Error when writing to state file (2)");
goto out;
@@ -1596,7 +1738,7 @@
memcpy(page, live_shinfo, PAGE_SIZE);
SET_FIELD(((shared_info_any_t *)page),
arch.pfn_to_mfn_frame_list_list, 0);
- if ( write_exact(io_fd, page, PAGE_SIZE) )
+ if ( outbuf_hardwrite(&ob, io_fd, page, PAGE_SIZE) )
{
PERROR("Error when writing to state file (1)");
goto out;
@@ -1610,6 +1752,11 @@
callbacks->postcopy(callbacks->data);
/* Flush last write and discard cache for file. */
+ if (outbuf_flush(&ob, io_fd) < 0) {
+ ERROR("Error when flushing output buffer\n");
+ rc = 1;
+ }
+
discard_file_cache(io_fd, 1 /* flush */);
/* checkpoint_cb can spend arbitrarily long in between rounds */
@@ -1627,9 +1774,11 @@
ERROR("Domain appears not to have suspended");
goto out;
}
+ /*
DPRINTF("SUSPEND shinfo %08lx\n", info.shared_info_frame);
+
print_stats(xc_handle, dom, 0, &stats, 1);
-
+ */
if ( xc_shadow_control(xc_handle, dom,
XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
p2m_size, NULL, 0, &stats) != p2m_size )
^ permalink raw reply [flat|nested] 10+ messages in thread* [PATCH 8 of 9] Do not bother with to_skip/to_fix bitmaps after the first final round
2009-05-14 0:19 [PATCH 0 of 9] Xen support for Remus Brendan Cully
` (6 preceding siblings ...)
2009-05-14 0:19 ` [PATCH 7 of 9] Buffer checkpoint data locally until domain has resumed execution Brendan Cully
@ 2009-05-14 0:19 ` Brendan Cully
2009-05-14 0:19 ` [PATCH 9 of 9] Do bitmap scan word-by-word before bit-by-bit Brendan Cully
8 siblings, 0 replies; 10+ messages in thread
From: Brendan Cully @ 2009-05-14 0:19 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Geoffrey Lefebvre <geoffrey@cs.ubc.ca>
# Date 1240355511 25200
# Node ID 0f1925415df0a4323d431085a9ee8956b8a95764
# Parent dd394ec3ad784da4cfe789c380aa47092dfb0f20
Do not bother with to_skip/to_fix bitmaps after the first final round.
diff --git a/tools/libxc/xc_domain_save.c b/tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c
+++ b/tools/libxc/xc_domain_save.c
@@ -897,6 +897,8 @@
struct outbuf ob;
+ int completed = 0;
+
outbuf_init(&ob, OUTBUF_SIZE);
/* If no explicit control parameters given, use defaults */
@@ -1153,54 +1155,66 @@
mfn_to_pfn(pfn_to_mfn(n)&0xFFFFF));
DPRINTF("\n");
}
- if ( !last_iter &&
- test_bit(n, to_send) &&
- test_bit(n, to_skip) )
- skip_this_iter++; /* stats keeping */
- if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
- (test_bit(n, to_send) && last_iter) ||
- (test_bit(n, to_fix) && last_iter)) )
- continue;
+ if ( completed )
+ {
+ if ( !test_bit(n, to_send) )
+ continue;
- /*
- ** we get here if:
- ** 1. page is marked to_send & hasn't already been re-dirtied
- ** 2. (ignore to_skip in last iteration)
- ** 3. add in pages that still need fixup (net bufs)
- */
+ pfn_batch[batch] = n;
+ pfn_type[batch] = live_p2m[n];
+ }
+ else
+ {
+ if ( !last_iter &&
+ test_bit(n, to_send) &&
+ test_bit(n, to_skip) )
+ skip_this_iter++; /* stats keeping */
- pfn_batch[batch] = n;
+ if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
+ (test_bit(n, to_send) && last_iter) ||
+ (test_bit(n, to_fix) && last_iter)) )
+ continue;
- /* Hypercall interfaces operate in PFNs for HVM guests
- * and MFNs for PV guests */
- if ( hvm )
- pfn_type[batch] = n;
- else
- pfn_type[batch] = pfn_to_mfn(n);
+ /*
+ ** we get here if:
+ ** 1. page is marked to_send & hasn't already been re-dirtied
+ ** 2. (ignore to_skip in last iteration)
+ ** 3. add in pages that still need fixup (net bufs)
+ */
+
+ pfn_batch[batch] = n;
+
+ /* Hypercall interfaces operate in PFNs for HVM guests
+ * and MFNs for PV guests */
+ if ( hvm )
+ pfn_type[batch] = n;
+ else
+ pfn_type[batch] = pfn_to_mfn(n);
- if ( !is_mapped(pfn_type[batch]) )
- {
- /*
- ** not currently in psuedo-physical map -- set bit
- ** in to_fix since we must send this page in last_iter
- ** unless its sent sooner anyhow, or it never enters
- ** pseudo-physical map (e.g. for ballooned down doms)
- */
- set_bit(n, to_fix);
- continue;
+ if ( !is_mapped(pfn_type[batch]) )
+ {
+ /*
+ ** not currently in psuedo-physical map -- set bit
+ ** in to_fix since we must send this page in last_iter
+ ** unless its sent sooner anyhow, or it never enters
+ ** pseudo-physical map (e.g. for ballooned down doms)
+ */
+ set_bit(n, to_fix);
+ continue;
+ }
+
+ if ( last_iter &&
+ test_bit(n, to_fix) &&
+ !test_bit(n, to_send) )
+ {
+ needed_to_fix++;
+ DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
+ iter, n, pfn_type[batch]);
+ }
+
+ clear_bit(n, to_fix);
}
-
- if ( last_iter &&
- test_bit(n, to_fix) &&
- !test_bit(n, to_send) )
- {
- needed_to_fix++;
- DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
- iter, n, pfn_type[batch]);
- }
-
- clear_bit(n, to_fix);
batch++;
}
@@ -1748,6 +1762,8 @@
rc = 0;
out:
+ completed = 1;
+
if ( !rc && callbacks->postcopy )
callbacks->postcopy(callbacks->data);
^ permalink raw reply [flat|nested] 10+ messages in thread* [PATCH 9 of 9] Do bitmap scan word-by-word before bit-by-bit
2009-05-14 0:19 [PATCH 0 of 9] Xen support for Remus Brendan Cully
` (7 preceding siblings ...)
2009-05-14 0:19 ` [PATCH 8 of 9] Do not bother with to_skip/to_fix bitmaps after the first final round Brendan Cully
@ 2009-05-14 0:19 ` Brendan Cully
8 siblings, 0 replies; 10+ messages in thread
From: Brendan Cully @ 2009-05-14 0:19 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Brendan Cully <brendan@cs.ubc.ca>
# Date 1240355511 25200
# Node ID 4a9831ea4a7d7bb95cd9584abe94e2fdc50f46f6
# Parent 0f1925415df0a4323d431085a9ee8956b8a95764
Do bitmap scan word-by-word before bit-by-bit.
For sparse bitmaps and large domains this saves a lot of time.
Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
diff --git a/tools/libxc/xc_domain_save.c b/tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c
+++ b/tools/libxc/xc_domain_save.c
@@ -99,6 +99,8 @@
#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+#define ORDER_LONG (sizeof(unsigned long) == 4 ? 5 : 6)
+
static inline int test_bit (int nr, volatile void * addr)
{
return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
@@ -1158,6 +1160,14 @@
if ( completed )
{
+ /* for sparse bitmaps, word-by-word may save time */
+ if ( !to_send[N >> ORDER_LONG] )
+ {
+ /* incremented again in for loop! */
+ N += BITS_PER_LONG - 1;
+ continue;
+ }
+
if ( !test_bit(n, to_send) )
continue;
^ permalink raw reply [flat|nested] 10+ messages in thread