* [PATCH 01 of 11] Add callbacks for suspend, postcopy and preresume in xc_domain_save
2009-11-05 22:58 [PATCH 00 of 11] Remus 0.9 released! Brendan Cully
@ 2009-11-05 22:58 ` Brendan Cully
2009-11-05 22:58 ` [PATCH 02 of 11] Make xc_domain_restore loop until the fd is closed Brendan Cully
` (11 subsequent siblings)
12 siblings, 0 replies; 16+ messages in thread
From: Brendan Cully @ 2009-11-05 22:58 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Brendan Cully <brendan@cs.ubc.ca>
# Date 1240355494 25200
# Node ID d325cdd95abb5f675e3d63fa9cc59ca89271489a
# Parent ac9d4ba48b8334f0adc3a928be4e48d2e6fdebd1
Add callbacks for suspend, postcopy and preresume in xc_domain_save.
This makes it possible to perform repeated checkpoints.
Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
diff --git a/tools/libxc/xc_domain_save.c b/tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c
+++ b/tools/libxc/xc_domain_save.c
@@ -332,11 +332,11 @@
return -1;
}
-
-static int suspend_and_state(int (*suspend)(void), int xc_handle, int io_fd,
- int dom, xc_dominfo_t *info)
+static int suspend_and_state(int (*suspend)(void*), void* data,
+ int xc_handle, int io_fd, int dom,
+ xc_dominfo_t *info)
{
- if ( !(*suspend)() )
+ if ( !(*suspend)(data) )
{
ERROR("Suspend request failed");
return -1;
@@ -742,13 +742,14 @@
}
int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
- uint32_t max_factor, uint32_t flags, int (*suspend)(void),
+ uint32_t max_factor, uint32_t flags,
+ struct save_callbacks* callbacks,
int hvm, void (*switch_qemu_logdirty)(int, unsigned))
{
xc_dominfo_t info;
DECLARE_DOMCTL;
- int rc = 1, frc, i, j, last_iter, iter = 0;
+ int rc = 1, frc, i, j, last_iter = 0, iter = 0;
int live = (flags & XCFLAGS_LIVE);
int debug = (flags & XCFLAGS_DEBUG);
int race = 0, sent_last_iter, skip_this_iter;
@@ -864,7 +865,8 @@
else
{
/* This is a non-live suspend. Suspend the domain .*/
- if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info) )
+ if ( suspend_and_state(callbacks->suspend, callbacks->data, xc_handle,
+ io_fd, dom, &info) )
{
ERROR("Domain appears not to have suspended");
goto out;
@@ -992,6 +994,7 @@
goto out;
}
+ copypages:
/* Now write out each data page, canonicalising page tables as we go... */
for ( ; ; )
{
@@ -1305,7 +1308,8 @@
DPRINTF("Start last iteration\n");
last_iter = 1;
- if ( suspend_and_state(suspend, xc_handle, io_fd, dom, &info) )
+ if ( suspend_and_state(callbacks->suspend, callbacks->data,
+ xc_handle, io_fd, dom, &info) )
{
ERROR("Domain appears not to have suspended");
goto out;
@@ -1586,6 +1590,39 @@
rc = 0;
out:
+ if ( !rc && callbacks->postcopy )
+ callbacks->postcopy(callbacks->data);
+
+ /* Flush last write and discard cache for file. */
+ discard_file_cache(io_fd, 1 /* flush */);
+
+ /* checkpoint_cb can spend arbitrarily long in between rounds */
+ if (!rc && callbacks->checkpoint &&
+ callbacks->checkpoint(callbacks->data) > 0)
+ {
+ /* reset stats timer */
+ print_stats(xc_handle, dom, 0, &stats, 0);
+
+ rc = 1;
+ /* last_iter = 1; */
+ if ( suspend_and_state(callbacks->suspend, callbacks->data, xc_handle,
+ io_fd, dom, &info) )
+ {
+ ERROR("Domain appears not to have suspended");
+ goto out;
+ }
+ DPRINTF("SUSPEND shinfo %08lx\n", info.shared_info_frame);
+ print_stats(xc_handle, dom, 0, &stats, 1);
+
+ if ( xc_shadow_control(xc_handle, dom,
+ XEN_DOMCTL_SHADOW_OP_CLEAN, to_send,
+ p2m_size, NULL, 0, &stats) != p2m_size )
+ {
+ ERROR("Error flushing shadow PT");
+ }
+
+ goto copypages;
+ }
if ( tmem_saved != 0 && live )
xc_tmem_save_done(xc_handle, dom);
@@ -1600,9 +1637,6 @@
switch_qemu_logdirty(dom, 0);
}
- /* Flush last write and discard cache for file. */
- discard_file_cache(io_fd, 1 /* flush */);
-
if ( live_shinfo )
munmap(live_shinfo, PAGE_SIZE);
diff --git a/tools/libxc/xenguest.h b/tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h
+++ b/tools/libxc/xenguest.h
@@ -14,6 +14,19 @@
#define XCFLAGS_HVM 4
#define XCFLAGS_STDVGA 8
+/* Callbacks supplied by the caller to xc_domain_save().
+ * Each one is invoked with the `data` member as its only argument. */
+struct save_callbacks {
+ int (*suspend)(void* data); /* asked to suspend the domain; a zero return is reported as "Suspend request failed" */
+ /* callback to rendezvous with external checkpoint functions;
+ * optional (may be NULL), invoked once a copy round has ended with rc == 0,
+ * and its return value is ignored */
+ int (*postcopy)(void* data);
+ /* optional (may be NULL); invoked after postcopy and after the output
+ * has been flushed.
+ * returns:
+ * 0: terminate checkpointing gracefully
+ * 1: take another checkpoint (any value > 0 is treated this way) */
+ int (*checkpoint)(void* data);
+
+ /* to be provided as the first argument to each callback function */
+ void* data;
+};
/**
* This function will save a running domain.
@@ -25,8 +38,8 @@
*/
int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
uint32_t max_factor, uint32_t flags /* XCFLAGS_xxx */,
- int (*suspend)(void), int hvm,
- void (*switch_qemu_logdirty)(int, unsigned)); /* HVM only */
+ struct save_callbacks* callbacks,
+ int hvm, void (*switch_qemu_logdirty)(int, unsigned)); /* HVM only */
/**
diff --git a/tools/xcutils/xc_save.c b/tools/xcutils/xc_save.c
--- a/tools/xcutils/xc_save.c
+++ b/tools/xcutils/xc_save.c
@@ -71,7 +71,7 @@
return 1;
}
-static int suspend(void)
+static int suspend(void* data)
{
unsigned long sx_state = 0;
@@ -166,6 +166,7 @@
{
unsigned int maxit, max_f;
int io_fd, ret, port;
+ struct save_callbacks callbacks;
if (argc != 6)
errx(1, "usage: %s iofd domid maxit maxf flags", argv[0]);
@@ -202,8 +203,10 @@
"using slow path");
}
}
+ memset(&callbacks, 0, sizeof(callbacks));
+ callbacks.suspend = suspend;
ret = xc_domain_save(si.xc_fd, io_fd, si.domid, maxit, max_f, si.flags,
- &suspend, !!(si.flags & XCFLAGS_HVM),
+ &callbacks, !!(si.flags & XCFLAGS_HVM),
&switch_qemu_logdirty);
if (si.suspend_evtchn > 0)
^ permalink raw reply [flat|nested] 16+ messages in thread* [PATCH 02 of 11] Make xc_domain_restore loop until the fd is closed
2009-11-05 22:58 [PATCH 00 of 11] Remus 0.9 released! Brendan Cully
2009-11-05 22:58 ` [PATCH 01 of 11] Add callbacks for suspend, postcopy and preresume in xc_domain_save Brendan Cully
@ 2009-11-05 22:58 ` Brendan Cully
2009-11-05 22:58 ` [PATCH 03 of 11] Initiate failover if a packet is not received every 500ms Brendan Cully
` (10 subsequent siblings)
12 siblings, 0 replies; 16+ messages in thread
From: Brendan Cully @ 2009-11-05 22:58 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Brendan Cully <brendan@cs.ubc.ca>
# Date 1240355507 25200
# Node ID a82f2e189c29acfc26794ea10d4000d0a350bf40
# Parent d325cdd95abb5f675e3d63fa9cc59ca89271489a
Make xc_domain_restore loop until the fd is closed.
The tail containing the final PFN table, VCPU contexts and
shared_info_page is buffered, then the read loop is restarted.
After the first pass, incoming pages are buffered until the next tail
is read, completing a new consistent checkpoint. At this point, the
memory changes are applied and the loop begins again. When the fd read
fails, the tail buffer is processed.
Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
diff --git a/tools/libxc/xc_domain_restore.c b/tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c
+++ b/tools/libxc/xc_domain_restore.c
@@ -624,6 +624,410 @@
return p2m_frame_list;
}
+typedef struct { /* tail of one checkpoint image, as filled in by buffer_tail() */
+ unsigned int pfncount; /* number of entries in pfntab, read from the stream */
+ unsigned long* pfntab; /* heap buffer holding pfncount entries */
+ unsigned int vcpucount; /* number of vcpumap bits set for ids 0..max_vcpu_id */
+ unsigned char* vcpubuf; /* heap buffer holding the raw VCPU context records */
+ unsigned char shared_info_page[PAGE_SIZE]; /* copy of the shared info page from the stream */
+} tailbuf_t;
+
+/*
+ * Read the tail of one checkpoint image from fd into buf: the PFN table
+ * (a count followed by that many unsigned longs), one context record for
+ * every VCPU whose bit is set in vcpumap (plus 128 bytes of extended
+ * context each when ext_vcpucontext is non-zero), and the shared info page.
+ *
+ * buf->pfntab and buf->vcpubuf are resized to fit what is read, so buf may
+ * either be zero-filled or still hold buffers from an earlier call.
+ *
+ * Returns 0 on success and -1 on failure.  On any failure after the PFN
+ * count has been accepted, both buffers are freed and set to NULL.
+ */
+static int buffer_tail(tailbuf_t* buf, int fd, unsigned int max_vcpu_id,
+                       uint64_t vcpumap, int ext_vcpucontext)
+{
+    unsigned int i;
+    size_t pfnlen, vcpulen;
+    void* ptmp;
+
+    /* vcpumap has 64 bits; a larger id would make the shift below undefined */
+    if ( max_vcpu_id >= 64 )
+    {
+        ERROR("max_vcpu_id out of range");
+        return -1;
+    }
+
+    /* PFN tab */
+    if ( read_exact(fd, &buf->pfncount, sizeof(buf->pfncount)) ||
+         (buf->pfncount > (1U << 28)) ) /* up to 1TB of address space */
+    {
+        ERROR("Error when reading pfn count");
+        return -1;
+    }
+    pfnlen = sizeof(unsigned long) * buf->pfncount;
+    /*
+     * Always resize: a buffer kept from an earlier call may be too small
+     * for this count.  realloc(NULL, n) behaves like malloc(n).  At least
+     * one byte is requested because a zero-size request may return NULL.
+     */
+    if ( !(ptmp = realloc(buf->pfntab, pfnlen ? pfnlen : 1)) ) {
+        ERROR("Error allocating PFN tail buffer");
+        goto fail;
+    }
+    buf->pfntab = ptmp;
+    if ( read_exact(fd, buf->pfntab, pfnlen) ) {
+        ERROR("Error when reading pfntab");
+        goto fail;
+    }
+
+    /* VCPU contexts: one record per bit set in vcpumap */
+    buf->vcpucount = 0;
+    for (i = 0; i <= max_vcpu_id; i++) {
+        if ( vcpumap & (1ULL << i) )
+            buf->vcpucount++;
+    }
+    vcpulen = ((guest_width == 8) ? sizeof(vcpu_guest_context_x86_64_t)
+               : sizeof(vcpu_guest_context_x86_32_t)) * buf->vcpucount;
+    if ( ext_vcpucontext )
+        vcpulen += 128 * buf->vcpucount;
+
+    if ( !(ptmp = realloc(buf->vcpubuf, vcpulen ? vcpulen : 1)) ) {
+        ERROR("Error allocating VCPU ctxt tail buffer");
+        goto fail;
+    }
+    buf->vcpubuf = ptmp;
+    if ( read_exact(fd, buf->vcpubuf, vcpulen) ) {
+        ERROR("Error when reading ctxt");
+        goto fail;
+    }
+
+    /* load shared_info_page */
+    if ( read_exact(fd, buf->shared_info_page, PAGE_SIZE) ) {
+        ERROR("Error when reading shared info page");
+        goto fail;
+    }
+
+    return 0;
+
+ fail:
+    /* free(NULL) is a no-op, so this is safe at every stage */
+    free(buf->vcpubuf);
+    buf->vcpubuf = NULL;
+    free(buf->pfntab);
+    buf->pfntab = NULL;
+
+    return -1;
+}
+
+/*
+ * Release the two heap buffers owned by buf and clear the pointers so the
+ * structure can be freed again or reused safely.
+ */
+static void tailbuf_free(tailbuf_t* buf)
+{
+    /* free(NULL) does nothing, so the pointers need no guarding */
+    free(buf->vcpubuf);
+    buf->vcpubuf = NULL;
+
+    free(buf->pfntab);
+    buf->pfntab = NULL;
+}
+
+typedef struct { /* pages and stream settings accumulated by pagebuf_get_one() */
+ void* pages; /* page contents, PAGE_SIZE bytes per page; entries of type XTAB have no page here */
+ /* pages is of length nr_physpages, pfn_types is of length nr_pages */
+ unsigned int nr_physpages, nr_pages;
+
+ /* Types of the pfns in the current region; each entry is a pfn combined
+ * with type bits selected by XEN_DOMCTL_PFINFO_LTAB_MASK */
+ unsigned long* pfn_types;
+
+ int verify; /* set to 1 once a -1 record ("page verify mode") has been read */
+
+ int new_ctxt_format; /* set to 1 once a -2 record has been read */
+ int max_vcpu_id; /* read from the -2 record */
+ uint64_t vcpumap; /* read from the -2 record */
+ uint64_t identpt; /* EPT identity PT location, read from the -3 record */
+ uint64_t vm86_tss; /* vm86 TSS location, read from the -4 record */
+} pagebuf_t;
+
+/*
+ * Put buf into its empty state: every byte zero, hence no buffers, zero
+ * counts and all flags clear.  Always returns 0.
+ */
+static int pagebuf_init(pagebuf_t* buf)
+{
+    memset(buf, 0, sizeof(pagebuf_t));
+
+    return 0;
+}
+
+/*
+ * Release the page and pfn-type buffers owned by buf and clear the
+ * pointers.  The counts and flags in buf are left untouched.
+ */
+static void pagebuf_free(pagebuf_t* buf)
+{
+    /* free(NULL) does nothing, so the pointers need no guarding */
+    free(buf->pages);
+    buf->pages = NULL;
+
+    free(buf->pfn_types);
+    buf->pfn_types = NULL;
+}
+
+/*
+ * Read records from fd until one batch of pages has been appended to buf
+ * or the end-of-round marker is seen.
+ *
+ * Every record starts with an int:
+ *   > 0: a batch of that many pfn/type entries, followed by the contents
+ *        of each page whose type is not XEN_DOMCTL_PFINFO_XTAB
+ *     0: end of the current round
+ *    -1: enter page verify mode
+ *    -2: max_vcpu_id and vcpumap follow
+ *    -3: EPT identity PT location follows
+ *    -4: vm86 TSS location follows
+ *    -5: tmem data, handed to xc_tmem_restore()
+ *    -6: extra tmem data, handed to xc_tmem_restore_extra()
+ *
+ * Control records (< 0) are consumed in a loop rather than by recursion,
+ * so the stack depth does not depend on the contents of the stream.
+ * nr_pages and nr_physpages are only advanced once a batch has been read
+ * completely, so a failure leaves the counts describing the earlier,
+ * intact batches.
+ *
+ * Returns the batch size (> 0), 0 at the end marker, or -1 on error.
+ */
+static int pagebuf_get_one(pagebuf_t* buf, int fd, int xch, uint32_t dom)
+{
+    int count, countpages, i;
+    size_t newlen;
+    void* ptmp;
+
+    for ( ; ; )
+    {
+        if ( read_exact(fd, &count, sizeof(count)) )
+        {
+            ERROR("Error when reading batch size");
+            return -1;
+        }
+
+        if ( count > 0 )
+            break; /* a batch of pages follows */
+
+        switch ( count )
+        {
+        case 0:
+            /* last batch of this round has been read */
+            return 0;
+
+        case -1:
+            DPRINTF("Entering page verify mode\n");
+            buf->verify = 1;
+            break;
+
+        case -2:
+            buf->new_ctxt_format = 1;
+            /* max_vcpu_id is signed: reject negative ids as well, since
+             * vcpumap only has bits 0..63 */
+            if ( read_exact(fd, &buf->max_vcpu_id, sizeof(buf->max_vcpu_id)) ||
+                 buf->max_vcpu_id < 0 || buf->max_vcpu_id >= 64 ||
+                 read_exact(fd, &buf->vcpumap, sizeof(uint64_t)) ) {
+                ERROR("Error when reading max_vcpu_id");
+                return -1;
+            }
+            break;
+
+        case -3:
+            /* Skip padding 4 bytes then read the EPT identity PT location. */
+            if ( read_exact(fd, &buf->identpt, sizeof(uint32_t)) ||
+                 read_exact(fd, &buf->identpt, sizeof(uint64_t)) )
+            {
+                ERROR("error read the address of the EPT identity map");
+                return -1;
+            }
+            break;
+
+        case -4:
+            /* Skip padding 4 bytes then read the vm86 TSS location. */
+            if ( read_exact(fd, &buf->vm86_tss, sizeof(uint32_t)) ||
+                 read_exact(fd, &buf->vm86_tss, sizeof(uint64_t)) )
+            {
+                ERROR("error read the address of the vm86 TSS");
+                return -1;
+            }
+            break;
+
+        case -5:
+            DPRINTF("xc_domain_restore start tmem\n");
+            if ( xc_tmem_restore(xch, dom, fd) ) {
+                ERROR("error reading/restoring tmem");
+                return -1;
+            }
+            break;
+
+        case -6:
+            if ( xc_tmem_restore_extra(xch, dom, fd) ) {
+                ERROR("error reading/restoring tmem extra");
+                return -1;
+            }
+            break;
+
+        default:
+            /* unknown negative record type */
+            ERROR("Max batch size exceeded (%d). Giving up.", count);
+            return -1;
+        }
+    }
+
+    if ( count > MAX_BATCH_SIZE ) {
+        ERROR("Max batch size exceeded (%d). Giving up.", count);
+        return -1;
+    }
+
+    /* Grow the pfn/type array and append this batch's entries. */
+    newlen = ((size_t)buf->nr_pages + count) * sizeof(*(buf->pfn_types));
+    if (!buf->pfn_types) {
+        if (!(ptmp = malloc(newlen))) {
+            ERROR("Could not allocate PFN type buffer");
+            return -1;
+        }
+    } else {
+        if (!(ptmp = realloc(buf->pfn_types, newlen))) {
+            ERROR("Could not reallocate PFN type buffer");
+            return -1;
+        }
+    }
+    buf->pfn_types = ptmp;
+    if ( read_exact(fd, buf->pfn_types + buf->nr_pages,
+                    count * sizeof(*(buf->pfn_types))) ) {
+        ERROR("Error when reading region pfn types");
+        return -1;
+    }
+
+    /* Only entries that are not XTAB are followed by page contents. */
+    countpages = count;
+    for (i = 0; i < count; ++i)
+        if ((buf->pfn_types[buf->nr_pages + i] & XEN_DOMCTL_PFINFO_LTAB_MASK)
+            == XEN_DOMCTL_PFINFO_XTAB)
+            --countpages;
+
+    if (countpages) {
+        newlen = ((size_t)buf->nr_physpages + countpages) * PAGE_SIZE;
+        if (!buf->pages) {
+            if (!(ptmp = malloc(newlen))) {
+                ERROR("Could not allocate page buffer");
+                return -1;
+            }
+        } else {
+            if (!(ptmp = realloc(buf->pages, newlen))) {
+                ERROR("Could not reallocate page buffer");
+                return -1;
+            }
+        }
+        buf->pages = ptmp;
+        if ( read_exact(fd, (char *)buf->pages + (size_t)buf->nr_physpages * PAGE_SIZE,
+                        (size_t)countpages * PAGE_SIZE) ) {
+            ERROR("Error when reading pages");
+            return -1;
+        }
+        buf->nr_physpages += countpages;
+    }
+
+    /* the batch is complete: publish it */
+    buf->nr_pages += count;
+
+    return count;
+}
+
+/*
+ * Restart buf from empty counts and keep reading batches with
+ * pagebuf_get_one() until it reports the end-of-round marker or an error.
+ *
+ * Returns 0 when the end marker was reached, or a negative value on error.
+ * On error the buffers are freed and the counts are reset, so buf never
+ * claims to hold pages behind NULL pointers.
+ */
+static int pagebuf_get(pagebuf_t* buf, int fd, int xch, uint32_t dom)
+{
+    int rc;
+
+    buf->nr_physpages = buf->nr_pages = 0;
+
+    do {
+        rc = pagebuf_get_one(buf, fd, xch, dom);
+    } while (rc > 0);
+
+    if (rc < 0) {
+        pagebuf_free(buf);
+        /* the storage is gone, so the counts must not keep describing it */
+        buf->nr_physpages = buf->nr_pages = 0;
+    }
+
+    return rc;
+}
+
+/*
+ * Apply one batch of buffered pages to the domain.
+ *
+ * The batch consists of the entries pagebuf->pfn_types[curbatch ...], at
+ * most MAX_BATCH_SIZE of them.  Memory is allocated for them, the frames
+ * are mapped, and each page that is not of type XTAB is copied from
+ * pagebuf->pages into place (or, in verify mode, compared against what is
+ * already there).  Page-table pages are uncanonicalized as they are copied;
+ * for non-HVM guests a machphys update is queued on mmu for every page.
+ *
+ * Returns the number of page-table races seen (>= 0), or -1 on error.
+ * The mapping of the batch is released on every path once it exists.
+ */
+static int apply_batch(int xc_handle, uint32_t dom, xen_pfn_t* region_mfn,
+                       unsigned long* pfn_type, int pae_extended_cr3,
+                       unsigned int hvm, struct xc_mmu* mmu,
+                       pagebuf_t* pagebuf, int curbatch, int superpages)
+{
+    int i, j, curpage;
+    int rc = -1;
+    /* used by debug verify code */
+    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
+    /* Our mapping of the current region (batch) */
+    char *region_base;
+    /* Where the current page is written: the mapping, or buf in verify mode */
+    void *page = NULL;
+    int nraces = 0;
+
+    unsigned long mfn, pfn, pagetype;
+
+    j = pagebuf->nr_pages - curbatch;
+    if (j > MAX_BATCH_SIZE)
+        j = MAX_BATCH_SIZE;
+
+    if (allocate_physmem(xc_handle, dom, &pagebuf->pfn_types[curbatch],
+                         j, hvm, region_mfn, superpages) != 0)
+    {
+        ERROR("allocate_physmem() failed\n");
+        return -1;
+    }
+
+    /* Map relevant mfns */
+    region_base = xc_map_foreign_batch(
+        xc_handle, dom, PROT_WRITE, region_mfn, j);
+
+    if ( region_base == NULL )
+    {
+        ERROR("map batch failed");
+        return -1;
+    }
+
+    /*
+     * pagebuf->pages holds data only for entries that are not XTAB, so the
+     * data for this batch starts after the pages that belong to entries
+     * [0, curbatch) -- which is not page index curbatch as soon as an
+     * earlier entry was XTAB.  Count those pages to find the real offset.
+     */
+    curpage = -1;
+    for ( i = 0; i < curbatch; i++ )
+        if ( (pagebuf->pfn_types[i] & XEN_DOMCTL_PFINFO_LTAB_MASK)
+             != XEN_DOMCTL_PFINFO_XTAB )
+            ++curpage;
+
+    for ( i = 0; i < j; i++ )
+    {
+        pfn = pagebuf->pfn_types[i + curbatch] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+        pagetype = pagebuf->pfn_types[i + curbatch] & XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+        if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+            /* a bogus/unmapped page: skip it */
+            continue;
+
+        ++curpage;
+
+        /* NOTE(review): if pfn_type[] and p2m[] hold exactly p2m_size
+         * entries this should be '>=' -- confirm against the allocation. */
+        if ( pfn > p2m_size )
+        {
+            ERROR("pfn out of range");
+            goto out;
+        }
+
+        pfn_type[pfn] = pagetype;
+
+        mfn = p2m[pfn];
+
+        /* In verify mode, we use a copy; otherwise we work in place */
+        page = pagebuf->verify ? (void *)buf
+                               : (void *)(region_base + i*PAGE_SIZE);
+
+        memcpy(page, (char *)pagebuf->pages + (size_t)curpage * PAGE_SIZE,
+               PAGE_SIZE);
+
+        pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+        if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
+             (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
+        {
+            /*
+            ** A page table page - need to 'uncanonicalize' it, i.e.
+            ** replace all the references to pfns with the corresponding
+            ** mfns for the new domain.
+            **
+            ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
+            ** so we may need to update the p2m after the main loop.
+            ** Hence we defer canonicalization of L1s until then.
+            */
+            if ((pt_levels != 3) ||
+                pae_extended_cr3 ||
+                (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) {
+
+                if (!uncanonicalize_pagetable(xc_handle, dom,
+                                              pagetype, page, superpages)) {
+                    /*
+                    ** Failing to uncanonicalize a page table can be ok
+                    ** under live migration since the pages type may have
+                    ** changed by now (and we'll get an update later).
+                    */
+                    DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
+                            pagetype >> 28, pfn, mfn);
+                    nraces++;
+                    continue;
+                }
+            }
+        }
+        else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
+        {
+            ERROR("Bogus page type %lx page table is out of range: "
+                  "i=%d p2m_size=%lu", pagetype, i, p2m_size);
+            goto out;
+        }
+
+        if ( pagebuf->verify )
+        {
+            int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
+            if ( res )
+            {
+                int v;
+
+                /* The type is looked up per pfn in pfn_type[], and the
+                 * mapping only covers this batch, so it is indexed by i. */
+                DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
+                        "actualcs=%08lx\n", pfn, pfn_type[pfn],
+                        csum_page(region_base + i*PAGE_SIZE),
+                        csum_page(buf));
+
+                for ( v = 0; v < 4; v++ )
+                {
+                    unsigned long *p = (unsigned long *)
+                        (region_base + i*PAGE_SIZE);
+                    if ( buf[v] != p[v] )
+                        DPRINTF("    %d: %08lx %08lx\n", v, buf[v], p[v]);
+                }
+            }
+        }
+
+        if ( !hvm &&
+             xc_add_mmu_update(xc_handle, mmu,
+                               (((unsigned long long)mfn) << PAGE_SHIFT)
+                               | MMU_MACHPHYS_UPDATE, pfn) )
+        {
+            ERROR("failed machpys update mfn=%lx pfn=%lx", mfn, pfn);
+            goto out;
+        }
+    } /* end of 'batch' for loop */
+
+    rc = nraces;
+
+ out:
+    /* drop the mapping on success and on every error path alike */
+    munmap(region_base, j*PAGE_SIZE);
+
+    return rc;
+}
+
int xc_domain_restore(int xc_handle, int io_fd, uint32_t dom,
unsigned int store_evtchn, unsigned long *store_mfn,
unsigned int console_evtchn, unsigned long *console_mfn,
@@ -633,7 +1037,6 @@
int rc = 1, frc, i, j, n, m, pae_extended_cr3 = 0, ext_vcpucontext = 0;
unsigned long mfn, pfn;
unsigned int prev_pc, this_pc;
- int verify = 0;
int nraces = 0;
/* The new domain's shared-info frame number. */
@@ -652,9 +1055,6 @@
/* A table of MFNs to map in the current region */
xen_pfn_t *region_mfn = NULL;
- /* Types of the pfns in the current region */
- unsigned long region_pfn_type[MAX_BATCH_SIZE];
-
/* A copy of the pfn-to-mfn table frame list. */
xen_pfn_t *p2m_frame_list = NULL;
@@ -666,9 +1066,6 @@
struct xc_mmu *mmu = NULL;
- /* used by debug verify code */
- unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
-
struct mmuext_op pin[MAX_PIN_BATCH];
unsigned int nr_pins;
@@ -682,6 +1079,14 @@
/* Buffer for holding HVM context */
uint8_t *hvm_buf = NULL;
+ int completed = 0;
+ pagebuf_t pagebuf;
+ tailbuf_t tailbuf, tmptail;
+ void* vcpup;
+
+ pagebuf_init(&pagebuf);
+ memset(&tailbuf, 0, sizeof(tailbuf));
+
/* For info only */
nr_pfns = 0;
@@ -784,9 +1189,10 @@
prev_pc = 0;
n = m = 0;
+ loadpages:
for ( ; ; )
{
- int j;
+ int j, curbatch;
this_pc = (n * 100) / p2m_size;
if ( (this_pc - prev_pc) >= 5 )
@@ -795,221 +1201,49 @@
prev_pc = this_pc;
}
- if ( read_exact(io_fd, &j, sizeof(int)) )
- {
- ERROR("Error when reading batch size");
- goto out;
+ if ( !completed ) {
+ pagebuf.nr_physpages = pagebuf.nr_pages = 0;
+ if ( pagebuf_get_one(&pagebuf, io_fd, xc_handle, dom) < 0 ) {
+ ERROR("Error when reading batch\n");
+ goto out;
+ }
}
+ j = pagebuf.nr_pages;
PPRINTF("batch %d\n",j);
- if ( j == -1 )
- {
- verify = 1;
- DPRINTF("Entering page verify mode\n");
- continue;
+ if ( j == 0 ) {
+ /* catch vcpu updates */
+ if (pagebuf.new_ctxt_format) {
+ vcpumap = pagebuf.vcpumap;
+ max_vcpu_id = pagebuf.max_vcpu_id;
+ }
+ /* should this be deferred? does it change? */
+ if ( pagebuf.identpt )
+ xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT, pagebuf.identpt);
+ if ( pagebuf.vm86_tss )
+ xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, pagebuf.vm86_tss);
+ break; /* our work here is done */
}
- if ( j == -2 )
- {
- new_ctxt_format = 1;
- if ( read_exact(io_fd, &max_vcpu_id, sizeof(int)) ||
- (max_vcpu_id >= 64) ||
- read_exact(io_fd, &vcpumap, sizeof(uint64_t)) )
- {
- ERROR("Error when reading max_vcpu_id");
+ /* break pagebuf into batches */
+ curbatch = 0;
+ while ( curbatch < j ) {
+ int brc;
+
+ brc = apply_batch(xc_handle, dom, region_mfn, pfn_type,
+ pae_extended_cr3, hvm, mmu, &pagebuf, curbatch, superpages);
+ if ( brc < 0 )
goto out;
- }
- continue;
+
+ nraces += brc;
+
+ curbatch += MAX_BATCH_SIZE;
}
- if ( j == -3 )
- {
- uint64_t ident_pt;
+ pagebuf.nr_physpages = pagebuf.nr_pages = 0;
- /* Skip padding 4 bytes then read the EPT identity PT location. */
- if ( read_exact(io_fd, &ident_pt, sizeof(uint32_t)) ||
- read_exact(io_fd, &ident_pt, sizeof(uint64_t)) )
- {
- ERROR("error read the address of the EPT identity map");
- goto out;
- }
-
- xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT, ident_pt);
- continue;
- }
-
- if ( j == -4 )
- {
- uint64_t vm86_tss;
-
- /* Skip padding 4 bytes then read the vm86 TSS location. */
- if ( read_exact(io_fd, &vm86_tss, sizeof(uint32_t)) ||
- read_exact(io_fd, &vm86_tss, sizeof(uint64_t)) )
- {
- ERROR("error read the address of the vm86 TSS");
- goto out;
- }
-
- xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, vm86_tss);
- continue;
- }
-
- if ( j == -5 )
- {
- DPRINTF("xc_domain_restore start tmem\n");
- if ( xc_tmem_restore(xc_handle, dom, io_fd) )
- {
- ERROR("error reading/restoring tmem");
- goto out;
- }
- continue;
- }
-
- if ( j == -6 )
- {
- if ( xc_tmem_restore_extra(xc_handle, dom, io_fd) )
- {
- ERROR("error reading/restoring tmem extra");
- goto out;
- }
- continue;
- }
-
- if ( j == 0 )
- break; /* our work here is done */
-
- if ( (j > MAX_BATCH_SIZE) || (j < 0) )
- {
- ERROR("Max batch size exceeded. Giving up.");
- goto out;
- }
-
- if ( read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) )
- {
- ERROR("Error when reading region pfn types");
- goto out;
- }
-
- if (allocate_physmem(xc_handle, dom, region_pfn_type,
- j, hvm, region_mfn, superpages) != 0)
- goto out;
-
- /* Map relevant mfns */
- region_base = xc_map_foreign_batch(
- xc_handle, dom, PROT_WRITE, region_mfn, j);
-
- if ( region_base == NULL )
- {
- ERROR("map batch failed");
- goto out;
- }
-
- for ( i = 0; i < j; i++ )
- {
- void *page;
- unsigned long pagetype;
-
- pfn = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
- pagetype = region_pfn_type[i] & XEN_DOMCTL_PFINFO_LTAB_MASK;
-
- if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
- /* a bogus/unmapped page: skip it */
- continue;
-
- if ( pfn > p2m_size )
- {
- ERROR("pfn out of range");
- goto out;
- }
-
- pfn_type[pfn] = pagetype;
-
- mfn = p2m[pfn];
-
- /* In verify mode, we use a copy; otherwise we work in place */
- page = verify ? (void *)buf : (region_base + i*PAGE_SIZE);
-
- if ( read_exact(io_fd, page, PAGE_SIZE) )
- {
- ERROR("Error when reading page (type was %lx)", pagetype);
- goto out;
- }
-
- pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
-
- if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
- (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
- {
- /*
- ** A page table page - need to 'uncanonicalize' it, i.e.
- ** replace all the references to pfns with the corresponding
- ** mfns for the new domain.
- **
- ** On PAE we need to ensure that PGDs are in MFNs < 4G, and
- ** so we may need to update the p2m after the main loop.
- ** Hence we defer canonicalization of L1s until then.
- */
- if ((pt_levels != 3) ||
- pae_extended_cr3 ||
- (pagetype != XEN_DOMCTL_PFINFO_L1TAB)) {
-
- if (!uncanonicalize_pagetable(xc_handle, dom,
- pagetype, page, superpages)) {
- /*
- ** Failing to uncanonicalize a page table can be ok
- ** under live migration since the pages type may have
- ** changed by now (and we'll get an update later).
- */
- DPRINTF("PT L%ld race on pfn=%08lx mfn=%08lx\n",
- pagetype >> 28, pfn, mfn);
- nraces++;
- continue;
- }
- }
- }
- else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
- {
- ERROR("Bogus page type %lx page table is out of range: "
- "i=%d p2m_size=%lu", pagetype, i, p2m_size);
- goto out;
-
- }
-
- if ( verify )
- {
- int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE);
- if ( res )
- {
- int v;
-
- DPRINTF("************** pfn=%lx type=%lx gotcs=%08lx "
- "actualcs=%08lx\n", pfn, pfn_type[pfn],
- csum_page(region_base + i*PAGE_SIZE),
- csum_page(buf));
-
- for ( v = 0; v < 4; v++ )
- {
- unsigned long *p = (unsigned long *)
- (region_base + i*PAGE_SIZE);
- if ( buf[v] != p[v] )
- DPRINTF(" %d: %08lx %08lx\n", v, buf[v], p[v]);
- }
- }
- }
-
- if ( !hvm &&
- xc_add_mmu_update(xc_handle, mmu,
- (((unsigned long long)mfn) << PAGE_SHIFT)
- | MMU_MACHPHYS_UPDATE, pfn) )
- {
- ERROR("failed machpys update mfn=%lx pfn=%lx", mfn, pfn);
- goto out;
- }
- } /* end of 'batch' for loop */
-
- munmap(region_base, j*PAGE_SIZE);
- n+= j; /* crude stats */
+ n += j; /* crude stats */
/*
* Discard cache for portion of file read so far up to last
@@ -1033,7 +1267,7 @@
goto out;
}
- DPRINTF("Received all pages (%d races)\n", nraces);
+ // DPRINTF("Received all pages (%d races)\n", nraces);
if ( hvm )
{
@@ -1107,6 +1341,34 @@
/* Non-HVM guests only from here on */
+ if ( !completed ) {
+ if ( buffer_tail(&tailbuf, io_fd, max_vcpu_id, vcpumap,
+ ext_vcpucontext) < 0 ) {
+ ERROR ("error buffering image tail");
+ goto out;
+ }
+ completed = 1;
+ }
+
+ // DPRINTF("Buffered checkpoint\n");
+
+ if ( pagebuf_get(&pagebuf, io_fd, xc_handle, dom) ) {
+ ERROR("error when buffering batch, finishing\n");
+ goto finish;
+ }
+ memset(&tmptail, 0, sizeof(tmptail));
+ if ( buffer_tail(&tmptail, io_fd, max_vcpu_id, vcpumap,
+ ext_vcpucontext) < 0 ) {
+ ERROR ("error buffering image tail, finishing");
+ goto finish;
+ }
+ tailbuf_free(&tailbuf);
+ memcpy(&tailbuf, &tmptail, sizeof(tailbuf));
+
+ goto loadpages;
+
+ finish:
+
if ( (pt_levels == 3) && !pae_extended_cr3 )
{
/*
@@ -1275,39 +1537,17 @@
/* Get the list of PFNs that are not in the psuedo-phys map */
{
- unsigned int count = 0;
- unsigned long *pfntab;
- int nr_frees;
+ int nr_frees = 0;
- if ( read_exact(io_fd, &count, sizeof(count)) ||
- (count > (1U << 28)) ) /* up to 1TB of address space */
+ for ( i = 0; i < tailbuf.pfncount; i++ )
{
- ERROR("Error when reading pfn count (= %u)", count);
- goto out;
- }
-
- if ( !(pfntab = malloc(sizeof(unsigned long) * count)) )
- {
- ERROR("Out of memory");
- goto out;
- }
-
- if ( read_exact(io_fd, pfntab, sizeof(unsigned long)*count) )
- {
- ERROR("Error when reading pfntab");
- goto out;
- }
-
- nr_frees = 0;
- for ( i = 0; i < count; i++ )
- {
- unsigned long pfn = pfntab[i];
+ unsigned long pfn = tailbuf.pfntab[i];
if ( p2m[pfn] != INVALID_P2M_ENTRY )
{
/* pfn is not in physmap now, but was at some point during
the save/migration process - need to free it */
- pfntab[nr_frees++] = p2m[pfn];
+ tailbuf.pfntab[nr_frees++] = p2m[pfn];
p2m[pfn] = INVALID_P2M_ENTRY; /* not in pseudo-physical map */
}
}
@@ -1319,7 +1559,7 @@
.extent_order = 0,
.domid = dom
};
- set_xen_guest_handle(reservation.extent_start, pfntab);
+ set_xen_guest_handle(reservation.extent_start, tailbuf.pfntab);
if ( (frc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
&reservation)) != nr_frees )
@@ -1328,7 +1568,7 @@
goto out;
}
else
- DPRINTF("Decreased reservation by %d pages\n", count);
+ DPRINTF("Decreased reservation by %d pages\n", tailbuf.pfncount);
}
}
@@ -1338,18 +1578,17 @@
return 1;
}
+ vcpup = tailbuf.vcpubuf;
for ( i = 0; i <= max_vcpu_id; i++ )
{
if ( !(vcpumap & (1ULL << i)) )
continue;
- if ( read_exact(io_fd, &ctxt, ((guest_width == 8)
- ? sizeof(ctxt.x64)
- : sizeof(ctxt.x32))) )
- {
- ERROR("Error when reading ctxt %d", i);
- goto out;
- }
+ memcpy(&ctxt, vcpup, ((guest_width == 8) ? sizeof(ctxt.x64)
+ : sizeof(ctxt.x32)));
+ vcpup += (guest_width == 8) ? sizeof(ctxt.x64) : sizeof(ctxt.x32);
+
+ DPRINTF("read VCPU %d\n", i);
if ( !new_ctxt_format )
SET_FIELD(&ctxt, flags, GET_FIELD(&ctxt, flags) | VGCF_online);
@@ -1454,12 +1693,8 @@
if ( !ext_vcpucontext )
continue;
- if ( read_exact(io_fd, &domctl.u.ext_vcpucontext, 128) ||
- (domctl.u.ext_vcpucontext.vcpu != i) )
- {
- ERROR("Error when reading extended ctxt %d", i);
- goto out;
- }
+ memcpy(&domctl.u.ext_vcpucontext, vcpup, 128);
+ vcpup += 128;
domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
domctl.domain = dom;
frc = xc_domctl(xc_handle, &domctl);
@@ -1470,11 +1705,9 @@
}
}
- if ( read_exact(io_fd, shared_info_page, PAGE_SIZE) )
- {
- ERROR("Error when reading shared info page");
- goto out;
- }
+ memcpy(shared_info_page, tailbuf.shared_info_page, PAGE_SIZE);
+
+ DPRINTF("Completed checkpoint load\n");
/* Restore contents of shared-info page. No checking needed. */
new_shared_info = xc_map_foreign_range(
diff --git a/tools/python/xen/xend/XendCheckpoint.py b/tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py
+++ b/tools/python/xen/xend/XendCheckpoint.py
@@ -345,7 +345,7 @@
restore_image.setCpuid()
- os.read(fd, 1) # Wait for source to close connection
+ # xc_restore will wait for source to close connection
dominfo.completeRestore(handler.store_mfn, handler.console_mfn)
^ permalink raw reply [flat|nested] 16+ messages in thread* [PATCH 03 of 11] Initiate failover if a packet is not received every 500ms
2009-11-05 22:58 [PATCH 00 of 11] Remus 0.9 released! Brendan Cully
2009-11-05 22:58 ` [PATCH 01 of 11] Add callbacks for suspend, postcopy and preresume in xc_domain_save Brendan Cully
2009-11-05 22:58 ` [PATCH 02 of 11] Make xc_domain_restore loop until the fd is closed Brendan Cully
@ 2009-11-05 22:58 ` Brendan Cully
2009-11-05 22:58 ` [PATCH 04 of 11] Buffer checkpoint data locally until domain has resumed execution Brendan Cully
` (9 subsequent siblings)
12 siblings, 0 replies; 16+ messages in thread
From: Brendan Cully @ 2009-11-05 22:58 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Brendan Cully <brendan@cs.ubc.ca>
# Date 1240355510 25200
# Node ID 7a96f23c0e7d1bb142bace2d348d80043bfa518b
# Parent a82f2e189c29acfc26794ea10d4000d0a350bf40
Initiate failover if a packet is not received every 500ms.
This breaks checkpoints at lower frequencies, and should be made
optional.
diff --git a/tools/libxc/xc_domain_restore.c b/tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c
+++ b/tools/libxc/xc_domain_restore.c
@@ -453,6 +453,51 @@
}
+/* set when a consistent image is available */
+static int completed = 0;
+
+#define HEARTBEAT_MS 500
+
+#ifndef __MINIOS__
+/*
+ * Read exactly size bytes from fd into buf.
+ *
+ * Once `completed` is set, every read is preceded by a wait of at most
+ * HEARTBEAT_MS milliseconds for fd to become readable; if nothing arrives
+ * in that time the read is abandoned.  A wait that is cut short by a
+ * signal is simply restarted.
+ *
+ * Returns 0 when all bytes were read, -1 on timeout, end of file or error.
+ */
+static ssize_t read_exact_timed(int fd, void* buf, size_t size)
+{
+    size_t offset = 0;
+    ssize_t len;
+    int nready;
+    struct timeval tv;
+    fd_set rfds;
+
+    while ( offset < size )
+    {
+        if ( completed ) {
+            /* expect a heartbeat every HEARTBEAT_MS ms maximum */
+            tv.tv_sec = HEARTBEAT_MS / 1000;
+            tv.tv_usec = (HEARTBEAT_MS % 1000) * 1000;
+
+            FD_ZERO(&rfds);
+            FD_SET(fd, &rfds);
+            nready = select(fd + 1, &rfds, NULL, NULL, &tv);
+            /* An interrupted wait is not a missed heartbeat, and the fd
+             * set must not be inspected after a failed select(). */
+            if ( (nready == -1) && (errno == EINTR) )
+                continue;
+            if ( (nready <= 0) || !FD_ISSET(fd, &rfds) ) {
+                fprintf(stderr, "read_exact_timed failed (select returned %d)\n", nready);
+                return -1;
+            }
+        }
+
+        len = read(fd, (char *)buf + offset, size - offset);
+        if ( (len == -1) && ((errno == EINTR) || (errno == EAGAIN)) )
+            continue;
+        if ( len <= 0 )
+            return -1;
+        offset += len;
+    }
+
+    return 0;
+}
+
+#define read_exact read_exact_timed
+
+#else
+#define read_exact_timed read_exact
+#endif
/*
** In the state file (or during transfer), all page-table pages are
** converted into a 'canonical' form where references to actual mfns
@@ -1079,7 +1124,6 @@
/* Buffer for holding HVM context */
uint8_t *hvm_buf = NULL;
- int completed = 0;
pagebuf_t pagebuf;
tailbuf_t tailbuf, tmptail;
void* vcpup;
@@ -1342,12 +1386,18 @@
/* Non-HVM guests only from here on */
if ( !completed ) {
+ int flags = 0;
+
if ( buffer_tail(&tailbuf, io_fd, max_vcpu_id, vcpumap,
ext_vcpucontext) < 0 ) {
ERROR ("error buffering image tail");
goto out;
}
completed = 1;
+ /* shift into nonblocking mode for the remainder */
+ if ( (flags = fcntl(io_fd, F_GETFL,0)) < 0 )
+ flags = 0;
+ fcntl(io_fd, F_SETFL, flags | O_NONBLOCK);
}
// DPRINTF("Buffered checkpoint\n");
^ permalink raw reply [flat|nested] 16+ messages in thread* [PATCH 04 of 11] Buffer checkpoint data locally until domain has resumed execution
2009-11-05 22:58 [PATCH 00 of 11] Remus 0.9 released! Brendan Cully
` (2 preceding siblings ...)
2009-11-05 22:58 ` [PATCH 03 of 11] Initiate failover if a packet is not received every 500ms Brendan Cully
@ 2009-11-05 22:58 ` Brendan Cully
2009-11-05 22:58 ` [PATCH 05 of 11] Do not bother with to_skip/to_fix bitmaps after the first final round Brendan Cully
` (8 subsequent siblings)
12 siblings, 0 replies; 16+ messages in thread
From: Brendan Cully @ 2009-11-05 22:58 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Brendan Cully <brendan@cs.ubc.ca>
# Date 1240355510 25200
# Node ID 4771b6cb3a82ae2b7da86388a9cfb472858cd90e
# Parent 7a96f23c0e7d1bb142bace2d348d80043bfa518b
Buffer checkpoint data locally until domain has resumed execution.
Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
diff --git a/tools/libxc/xc_domain_save.c b/tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c
+++ b/tools/libxc/xc_domain_save.c
@@ -52,6 +52,15 @@
/* Address size of the guest */
unsigned int guest_width;
+/* buffer for output */
+struct outbuf {
+ void* buf;
+ size_t size;
+ size_t pos;
+};
+
+#define OUTBUF_SIZE (16384 * 1024)
+
/* grep fodder: machine_to_phys */
#define mfn_to_pfn(_mfn) (live_m2p[(_mfn)])
@@ -154,6 +163,85 @@
return rc;
}
+static int outbuf_init(struct outbuf* ob, size_t size)
+{
+ memset(ob, 0, sizeof(*ob));
+
+ if ( !(ob->buf = malloc(size)) ) {
+ DPRINTF("error allocating output buffer of size %zu\n", size);
+ return -1;
+ }
+
+ ob->size = size;
+
+ return 0;
+}
+
+static inline int outbuf_write(struct outbuf* ob, void* buf, size_t len)
+{
+ if ( len > ob->size - ob->pos ) {
+ DPRINTF("outbuf_write: %zu > %zu@%zu\n", len, ob->size - ob->pos, ob->pos);
+ return -1;
+ }
+
+ memcpy(ob->buf + ob->pos, buf, len);
+ ob->pos += len;
+
+ return 0;
+}
+
+/* prep for nonblocking I/O */
+static int outbuf_flush(struct outbuf* ob, int fd)
+{
+ int rc;
+ int cur = 0;
+
+ if ( !ob->pos )
+ return 0;
+
+ rc = write(fd, ob->buf, ob->pos);
+ while (rc < 0 || cur + rc < ob->pos) {
+ if (rc < 0 && errno != EAGAIN && errno != EINTR) {
+ DPRINTF("error flushing output: %d\n", errno);
+ return -1;
+ }
+ if (rc > 0)
+ cur += rc;
+
+ rc = write(fd, ob->buf + cur, ob->pos - cur);
+ }
+
+ ob->pos = 0;
+
+ return 0;
+}
+
+/* if there's no room in the buffer, flush it and try again. */
+static inline int outbuf_hardwrite(struct outbuf* ob, int fd, void* buf,
+ size_t len)
+{
+ if ( !len )
+ return 0;
+
+ if ( !outbuf_write(ob, buf, len) )
+ return 0;
+
+ if ( outbuf_flush(ob, fd) < 0 )
+ return -1;
+
+ return outbuf_write(ob, buf, len);
+}
+
+/* start buffering output once we've reached checkpoint mode. */
+static inline int write_buffer(int dobuf, struct outbuf* ob, int fd, void* buf,
+ size_t len)
+{
+ if ( dobuf )
+ return outbuf_hardwrite(ob, fd, buf, len);
+ else
+ return write_exact(fd, buf, len);
+}
+
#ifdef ADAPTIVE_SAVE
/*
@@ -246,6 +334,16 @@
#endif
+/* like write_buffer for ratewrite, which returns number of bytes written */
+static inline int ratewrite_buffer(int dobuf, struct outbuf* ob, int fd,
+ int live, void* buf, size_t len)
+{
+ if ( dobuf )
+ return outbuf_hardwrite(ob, fd, buf, len) ? -1 : len;
+ else
+ return ratewrite(fd, live, buf, len);
+}
+
static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
xc_shadow_op_stats_t *stats, int print)
{
@@ -796,6 +894,10 @@
unsigned long mfn;
+ struct outbuf ob;
+
+ outbuf_init(&ob, OUTBUF_SIZE);
+
/* If no explicit control parameters given, use defaults */
max_iters = max_iters ? : DEF_MAX_ITERS;
max_factor = max_factor ? : DEF_MAX_FACTOR;
@@ -995,6 +1097,12 @@
}
copypages:
+#define write_exact(fd, buf, len) write_buffer(last_iter, &ob, (fd), (buf), (len))
+#ifdef ratewrite
+#undef ratewrite
+#endif
+#define ratewrite(fd, live, buf, len) ratewrite_buffer(last_iter, &ob, (fd), (live), (buf), (len))
+
/* Now write out each data page, canonicalising page tables as we go... */
for ( ; ; )
{
@@ -1594,7 +1702,10 @@
callbacks->postcopy(callbacks->data);
/* Flush last write and discard cache for file. */
- discard_file_cache(io_fd, 1 /* flush */);
+ if ( outbuf_flush(&ob, io_fd) < 0 ) {
+ ERROR("Error when flushing output buffer\n");
+ rc = 1;
+ }
/* checkpoint_cb can spend arbitrarily long in between rounds */
if (!rc && callbacks->checkpoint &&
^ permalink raw reply [flat|nested] 16+ messages in thread* [PATCH 05 of 11] Do not bother with to_skip/to_fix bitmaps after the first final round
2009-11-05 22:58 [PATCH 00 of 11] Remus 0.9 released! Brendan Cully
` (3 preceding siblings ...)
2009-11-05 22:58 ` [PATCH 04 of 11] Buffer checkpoint data locally until domain has resumed execution Brendan Cully
@ 2009-11-05 22:58 ` Brendan Cully
2009-11-05 22:58 ` [PATCH 06 of 11] Do bitmap scan word-by-word before bit-by-bit Brendan Cully
` (7 subsequent siblings)
12 siblings, 0 replies; 16+ messages in thread
From: Brendan Cully @ 2009-11-05 22:58 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Geoffrey Lefebvre <geoffrey@cs.ubc.ca>
# Date 1240355511 25200
# Node ID d6c00f82239b2527cbdb8cc0140f67d7b168ae20
# Parent 4771b6cb3a82ae2b7da86388a9cfb472858cd90e
Do not bother with to_skip/to_fix bitmaps after the first final round.
Signed-off-by: Geoffrey Lefebvre <geoffrey@cs.ubc.ca>
diff --git a/tools/libxc/xc_domain_save.c b/tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c
+++ b/tools/libxc/xc_domain_save.c
@@ -896,6 +896,8 @@
struct outbuf ob;
+ int completed = 0;
+
outbuf_init(&ob, OUTBUF_SIZE);
/* If no explicit control parameters given, use defaults */
@@ -1159,54 +1161,69 @@
mfn_to_pfn(pfn_to_mfn(n)&0xFFFFF));
DPRINTF("\n");
}
- if ( !last_iter &&
- test_bit(n, to_send) &&
- test_bit(n, to_skip) )
- skip_this_iter++; /* stats keeping */
- if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
- (test_bit(n, to_send) && last_iter) ||
- (test_bit(n, to_fix) && last_iter)) )
- continue;
+ if ( completed )
+ {
+ if ( !test_bit(n, to_send) )
+ continue;
- /*
- ** we get here if:
- ** 1. page is marked to_send & hasn't already been re-dirtied
- ** 2. (ignore to_skip in last iteration)
- ** 3. add in pages that still need fixup (net bufs)
- */
+ pfn_batch[batch] = n;
+ if ( hvm )
+ pfn_type[batch] = n;
+ else
+ pfn_type[batch] = pfn_to_mfn(n);
+ }
+ else
+ {
+ if ( !last_iter &&
+ test_bit(n, to_send) &&
+ test_bit(n, to_skip) )
+ skip_this_iter++; /* stats keeping */
- pfn_batch[batch] = n;
+ if ( !((test_bit(n, to_send) && !test_bit(n, to_skip)) ||
+ (test_bit(n, to_send) && last_iter) ||
+ (test_bit(n, to_fix) && last_iter)) )
+ continue;
- /* Hypercall interfaces operate in PFNs for HVM guests
- * and MFNs for PV guests */
- if ( hvm )
- pfn_type[batch] = n;
- else
- pfn_type[batch] = pfn_to_mfn(n);
+ /*
+ ** we get here if:
+ ** 1. page is marked to_send & hasn't already been re-dirtied
+ ** 2. (ignore to_skip in last iteration)
+ ** 3. add in pages that still need fixup (net bufs)
+ */
+
+ pfn_batch[batch] = n;
+
+ /* Hypercall interfaces operate in PFNs for HVM guests
+ * and MFNs for PV guests */
+ if ( hvm )
+ pfn_type[batch] = n;
+ else
+ pfn_type[batch] = pfn_to_mfn(n);
- if ( !is_mapped(pfn_type[batch]) )
- {
- /*
- ** not currently in psuedo-physical map -- set bit
- ** in to_fix since we must send this page in last_iter
- ** unless its sent sooner anyhow, or it never enters
- ** pseudo-physical map (e.g. for ballooned down doms)
- */
- set_bit(n, to_fix);
- continue;
+ if ( !is_mapped(pfn_type[batch]) )
+ {
+ /*
+ ** not currently in psuedo-physical map -- set bit
+ ** in to_fix since we must send this page in last_iter
+ ** unless its sent sooner anyhow, or it never enters
+ ** pseudo-physical map (e.g. for ballooned down doms)
+ */
+ set_bit(n, to_fix);
+ continue;
+ }
+
+ if ( last_iter &&
+ test_bit(n, to_fix) &&
+ !test_bit(n, to_send) )
+ {
+ needed_to_fix++;
+ DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
+ iter, n, pfn_type[batch]);
+ }
+
+ clear_bit(n, to_fix);
}
-
- if ( last_iter &&
- test_bit(n, to_fix) &&
- !test_bit(n, to_send) )
- {
- needed_to_fix++;
- DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
- iter, n, pfn_type[batch]);
- }
-
- clear_bit(n, to_fix);
batch++;
}
@@ -1698,6 +1715,8 @@
rc = 0;
out:
+ completed = 1;
+
if ( !rc && callbacks->postcopy )
callbacks->postcopy(callbacks->data);
@@ -1707,6 +1726,8 @@
rc = 1;
}
+ discard_file_cache(io_fd, 1 /* flush */);
+
/* checkpoint_cb can spend arbitrarily long in between rounds */
if (!rc && callbacks->checkpoint &&
callbacks->checkpoint(callbacks->data) > 0)
^ permalink raw reply [flat|nested] 16+ messages in thread* [PATCH 06 of 11] Do bitmap scan word-by-word before bit-by-bit
2009-11-05 22:58 [PATCH 00 of 11] Remus 0.9 released! Brendan Cully
` (4 preceding siblings ...)
2009-11-05 22:58 ` [PATCH 05 of 11] Do not bother with to_skip/to_fix bitmaps after the first final round Brendan Cully
@ 2009-11-05 22:58 ` Brendan Cully
2009-11-05 22:58 ` [PATCH 07 of 11] Make checkpoint buffering HVM-aware Brendan Cully
` (6 subsequent siblings)
12 siblings, 0 replies; 16+ messages in thread
From: Brendan Cully @ 2009-11-05 22:58 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Brendan Cully <brendan@cs.ubc.ca>
# Date 1240355511 25200
# Node ID ae725eba611ff93dd9a09cfa995f61b6db28cf86
# Parent d6c00f82239b2527cbdb8cc0140f67d7b168ae20
Do bitmap scan word-by-word before bit-by-bit.
For sparse bitmaps and large domains this saves a lot of time.
Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
diff --git a/tools/libxc/xc_domain_save.c b/tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c
+++ b/tools/libxc/xc_domain_save.c
@@ -94,6 +94,8 @@
#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+#define ORDER_LONG (sizeof(unsigned long) == 4 ? 5 : 6)
+
static inline int test_bit (int nr, volatile void * addr)
{
return (BITMAP_ENTRY(nr, addr) >> BITMAP_SHIFT(nr)) & 1;
@@ -1164,6 +1166,14 @@
if ( completed )
{
+ /* for sparse bitmaps, word-by-word may save time */
+ if ( !to_send[N >> ORDER_LONG] )
+ {
+ /* incremented again in for loop! */
+ N += BITS_PER_LONG - 1;
+ continue;
+ }
+
if ( !test_bit(n, to_send) )
continue;
^ permalink raw reply [flat|nested] 16+ messages in thread* [PATCH 07 of 11] Make checkpoint buffering HVM-aware
2009-11-05 22:58 [PATCH 00 of 11] Remus 0.9 released! Brendan Cully
` (5 preceding siblings ...)
2009-11-05 22:58 ` [PATCH 06 of 11] Do bitmap scan word-by-word before bit-by-bit Brendan Cully
@ 2009-11-05 22:58 ` Brendan Cully
2009-11-05 22:58 ` [PATCH 08 of 11] blktap2: configurable driver chains Brendan Cully
` (5 subsequent siblings)
12 siblings, 0 replies; 16+ messages in thread
From: Brendan Cully @ 2009-11-05 22:58 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Brendan Cully <brendan@cs.ubc.ca>
# Date 1253640427 25200
# Node ID 6ca67fe3514ada809a62282c30fe46d5df5ce265
# Parent ae725eba611ff93dd9a09cfa995f61b6db28cf86
Make checkpoint buffering HVM-aware
Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
diff --git a/tools/libxc/xc_domain_restore.c b/tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c
+++ b/tools/libxc/xc_domain_restore.c
@@ -670,15 +670,204 @@
}
typedef struct {
- unsigned int pfncount;
- unsigned long* pfntab;
- unsigned int vcpucount;
- unsigned char* vcpubuf;
- unsigned char shared_info_page[PAGE_SIZE];
+ int ishvm;
+ union {
+ struct tailbuf_pv {
+ unsigned int pfncount;
+ unsigned long* pfntab;
+ unsigned int vcpucount;
+ unsigned char* vcpubuf;
+ unsigned char shared_info_page[PAGE_SIZE];
+ } pv;
+ struct tailbuf_hvm {
+ uint64_t magicpfns[3];
+ uint32_t hvmbufsize, reclen;
+ uint8_t* hvmbuf;
+ struct {
+ uint32_t magic;
+ uint32_t version;
+ uint64_t len;
+ } qemuhdr;
+ uint32_t qemubufsize;
+ uint8_t* qemubuf;
+ } hvm;
+ } u;
} tailbuf_t;
-static int buffer_tail(tailbuf_t* buf, int fd, unsigned int max_vcpu_id,
- uint64_t vcpumap, int ext_vcpucontext)
+/* read stream until EOF, growing buffer as necessary */
+static int compat_buffer_qemu(int fd, struct tailbuf_hvm *buf)
+{
+ uint8_t *qbuf, *tmp;
+ int blen = 0, dlen = 0;
+ int rc;
+
+ /* currently save records tend to be about 7K */
+ blen = 8192;
+ if ( !(qbuf = malloc(blen)) ) {
+ ERROR("Error allocating QEMU buffer");
+ return -1;
+ }
+
+ while( (rc = read(fd, qbuf+dlen, blen-dlen)) > 0 ) {
+ DPRINTF("Read %d bytes of QEMU data\n", rc);
+ dlen += rc;
+
+ if (dlen == blen) {
+ DPRINTF("%d-byte QEMU buffer full, reallocating...\n", dlen);
+ blen += 4096;
+ tmp = realloc(qbuf, blen);
+ if ( !tmp ) {
+ ERROR("Error growing QEMU buffer to %d bytes", blen);
+ free(qbuf);
+ return -1;
+ }
+ qbuf = tmp;
+ }
+ }
+
+ if ( rc < 0 ) {
+ ERROR("Error reading QEMU data");
+ free(qbuf);
+ return -1;
+ }
+
+ if ( memcmp(qbuf, "QEVM", 4) ) {
+ ERROR("Invalid QEMU magic: 0x%08x", *(unsigned long*)qbuf);
+ free(qbuf);
+ return -1;
+ }
+
+ buf->qemubuf = qbuf;
+ buf->qemubufsize = dlen;
+
+ return 0;
+}
+
+static int buffer_qemu(int fd, struct tailbuf_hvm *buf)
+{
+ uint32_t qlen;
+ uint8_t *tmp;
+
+ if ( read_exact(fd, &qlen, sizeof(qlen)) ) {
+ ERROR("Error reading QEMU header length");
+ return -1;
+ }
+
+ if ( qlen > buf->qemubufsize ) {
+ if ( buf->qemubuf) {
+ tmp = realloc(buf->qemubuf, qlen);
+ if ( tmp )
+ buf->qemubuf = tmp;
+ else {
+ ERROR("Error reallocating QEMU state buffer");
+ return -1;
+ }
+ } else {
+ buf->qemubuf = malloc(qlen);
+ if ( !buf->qemubuf ) {
+ ERROR("Error allocating QEMU state buffer");
+ return -1;
+ }
+ }
+ }
+ buf->qemubufsize = qlen;
+
+ if ( read_exact(fd, buf->qemubuf, buf->qemubufsize) ) {
+ ERROR("Error reading QEMU state");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int dump_qemu(uint32_t dom, struct tailbuf_hvm *buf)
+{
+ int saved_errno;
+ char path[256];
+ FILE *fp;
+
+ sprintf(path, "/var/lib/xen/qemu-save.%u", dom);
+ fp = fopen(path, "wb");
+ if ( !fp )
+ return -1;
+
+ DPRINTF("Writing %d bytes of QEMU data\n", buf->qemubufsize);
+ if ( fwrite(buf->qemubuf, 1, buf->qemubufsize, fp) != buf->qemubufsize) {
+ saved_errno = errno;
+ fclose(fp);
+ errno = saved_errno;
+ return -1;
+ }
+
+ fclose(fp);
+
+ return 0;
+}
+
+static int buffer_tail_hvm(struct tailbuf_hvm *buf, int fd,
+ unsigned int max_vcpu_id, uint64_t vcpumap,
+ int ext_vcpucontext)
+{
+ uint8_t *tmp;
+ unsigned char qemusig[21];
+
+ if ( read_exact(fd, buf->magicpfns, sizeof(buf->magicpfns)) ) {
+ ERROR("Error reading magic PFNs");
+ return -1;
+ }
+
+ if ( read_exact(fd, &buf->reclen, sizeof(buf->reclen)) ) {
+ ERROR("Error reading HVM params size");
+ return -1;
+ }
+
+ if ( buf->reclen > buf->hvmbufsize ) {
+ if ( buf->hvmbuf) {
+ tmp = realloc(buf->hvmbuf, buf->reclen);
+ if ( tmp ) {
+ buf->hvmbuf = tmp;
+ buf->hvmbufsize = buf->reclen;
+ } else {
+ ERROR("Error reallocating HVM param buffer");
+ return -1;
+ }
+ } else {
+ buf->hvmbuf = malloc(buf->reclen);
+ if ( !buf->hvmbuf ) {
+ ERROR("Error allocating HVM param buffer");
+ return -1;
+ }
+ buf->hvmbufsize = buf->reclen;
+ }
+ }
+
+ if ( read_exact(fd, buf->hvmbuf, buf->reclen) ) {
+ ERROR("Error reading HVM params");
+ return -1;
+ }
+
+ if ( read_exact(fd, qemusig, sizeof(qemusig)) ) {
+ ERROR("Error reading QEMU signature");
+ return -1;
+ }
+
+ /* The normal live-migration QEMU record has no length information.
+ * Short of reimplementing the QEMU parser, we're forced to just read
+ * until EOF. Remus gets around this by sending a different signature
+ * which includes a length prefix */
+ if ( !memcmp(qemusig, "QemuDeviceModelRecord", sizeof(qemusig)) )
+ return compat_buffer_qemu(fd, buf);
+ else if ( !memcmp(qemusig, "RemusDeviceModelState", sizeof(qemusig)) )
+ return buffer_qemu(fd, buf);
+
+ qemusig[20] = '\0';
+ ERROR("Invalid QEMU signature: %s", qemusig);
+ return -1;
+}
+
+static int buffer_tail_pv(struct tailbuf_pv *buf, int fd,
+ unsigned int max_vcpu_id, uint64_t vcpumap,
+ int ext_vcpucontext)
{
unsigned int i;
size_t pfnlen, vcpulen;
@@ -753,18 +942,49 @@
return -1;
}
-static void tailbuf_free(tailbuf_t* buf)
+static int buffer_tail(tailbuf_t *buf, int fd, unsigned int max_vcpu_id,
+ uint64_t vcpumap, int ext_vcpucontext)
{
- if (buf->vcpubuf) {
+ if ( buf->ishvm )
+ return buffer_tail_hvm(&buf->u.hvm, fd, max_vcpu_id, vcpumap,
+ ext_vcpucontext);
+ else
+ return buffer_tail_pv(&buf->u.pv, fd, max_vcpu_id, vcpumap,
+ ext_vcpucontext);
+}
+
+static void tailbuf_free_hvm(struct tailbuf_hvm *buf)
+{
+ if ( buf->hvmbuf ) {
+ free(buf->hvmbuf);
+ buf->hvmbuf = NULL;
+ }
+ if ( buf->qemubuf ) {
+ free(buf->qemubuf);
+ buf->qemubuf = NULL;
+ }
+}
+
+static void tailbuf_free_pv(struct tailbuf_pv *buf)
+{
+ if ( buf->vcpubuf ) {
free(buf->vcpubuf);
buf->vcpubuf = NULL;
}
- if (buf->pfntab) {
+ if ( buf->pfntab ) {
free(buf->pfntab);
buf->pfntab = NULL;
}
}
+static void tailbuf_free(tailbuf_t *buf)
+{
+ if ( buf->ishvm )
+ tailbuf_free_hvm(&buf->u.hvm);
+ else
+ tailbuf_free_pv(&buf->u.pv);
+}
+
typedef struct {
void* pages;
/* pages is of length nr_physpages, pfn_types is of length nr_pages */
@@ -1118,18 +1338,13 @@
unsigned int max_vcpu_id = 0;
int new_ctxt_format = 0;
- /* Magic frames in HVM guests: ioreqs and xenstore comms. */
- uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
-
- /* Buffer for holding HVM context */
- uint8_t *hvm_buf = NULL;
-
pagebuf_t pagebuf;
tailbuf_t tailbuf, tmptail;
void* vcpup;
pagebuf_init(&pagebuf);
memset(&tailbuf, 0, sizeof(tailbuf));
+ tailbuf.ishvm = hvm;
/* For info only */
nr_pfns = 0;
@@ -1313,78 +1528,6 @@
// DPRINTF("Received all pages (%d races)\n", nraces);
- if ( hvm )
- {
- uint32_t rec_len;
-
- /* Set HVM-specific parameters */
- if ( read_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
- {
- ERROR("error reading magic page addresses");
- goto out;
- }
-
- /* These comms pages need to be zeroed at the start of day */
- if ( xc_clear_domain_page(xc_handle, dom, magic_pfns[0]) ||
- xc_clear_domain_page(xc_handle, dom, magic_pfns[1]) ||
- xc_clear_domain_page(xc_handle, dom, magic_pfns[2]) )
- {
- ERROR("error zeroing magic pages");
- goto out;
- }
-
- if ( (frc = xc_set_hvm_param(xc_handle, dom,
- HVM_PARAM_IOREQ_PFN, magic_pfns[0]))
- || (frc = xc_set_hvm_param(xc_handle, dom,
- HVM_PARAM_BUFIOREQ_PFN, magic_pfns[1]))
- || (frc = xc_set_hvm_param(xc_handle, dom,
- HVM_PARAM_STORE_PFN, magic_pfns[2]))
- || (frc = xc_set_hvm_param(xc_handle, dom,
- HVM_PARAM_PAE_ENABLED, pae))
- || (frc = xc_set_hvm_param(xc_handle, dom,
- HVM_PARAM_STORE_EVTCHN,
- store_evtchn)) )
- {
- ERROR("error setting HVM params: %i", frc);
- goto out;
- }
- *store_mfn = magic_pfns[2];
-
- /* Read HVM context */
- if ( read_exact(io_fd, &rec_len, sizeof(uint32_t)) )
- {
- ERROR("error read hvm context size!\n");
- goto out;
- }
-
- hvm_buf = malloc(rec_len);
- if ( hvm_buf == NULL )
- {
- ERROR("memory alloc for hvm context buffer failed");
- errno = ENOMEM;
- goto out;
- }
-
- if ( read_exact(io_fd, hvm_buf, rec_len) )
- {
- ERROR("error loading the HVM context");
- goto out;
- }
-
- frc = xc_domain_hvm_setcontext(xc_handle, dom, hvm_buf, rec_len);
- if ( frc )
- {
- ERROR("error setting the HVM context");
- goto out;
- }
-
- /* HVM success! */
- rc = 0;
- goto out;
- }
-
- /* Non-HVM guests only from here on */
-
if ( !completed ) {
int flags = 0;
@@ -1407,6 +1550,7 @@
goto finish;
}
memset(&tmptail, 0, sizeof(tmptail));
+ tmptail.ishvm = hvm;
if ( buffer_tail(&tmptail, io_fd, max_vcpu_id, vcpumap,
ext_vcpucontext) < 0 ) {
ERROR ("error buffering image tail, finishing");
@@ -1418,6 +1562,8 @@
goto loadpages;
finish:
+ if ( hvm )
+ goto finish_hvm;
if ( (pt_levels == 3) && !pae_extended_cr3 )
{
@@ -1589,15 +1735,15 @@
{
int nr_frees = 0;
- for ( i = 0; i < tailbuf.pfncount; i++ )
+ for ( i = 0; i < tailbuf.u.pv.pfncount; i++ )
{
- unsigned long pfn = tailbuf.pfntab[i];
+ unsigned long pfn = tailbuf.u.pv.pfntab[i];
if ( p2m[pfn] != INVALID_P2M_ENTRY )
{
/* pfn is not in physmap now, but was at some point during
the save/migration process - need to free it */
- tailbuf.pfntab[nr_frees++] = p2m[pfn];
+ tailbuf.u.pv.pfntab[nr_frees++] = p2m[pfn];
p2m[pfn] = INVALID_P2M_ENTRY; /* not in pseudo-physical map */
}
}
@@ -1609,7 +1755,7 @@
.extent_order = 0,
.domid = dom
};
- set_xen_guest_handle(reservation.extent_start, tailbuf.pfntab);
+ set_xen_guest_handle(reservation.extent_start, tailbuf.u.pv.pfntab);
if ( (frc = xc_memory_op(xc_handle, XENMEM_decrease_reservation,
&reservation)) != nr_frees )
@@ -1618,7 +1764,7 @@
goto out;
}
else
- DPRINTF("Decreased reservation by %d pages\n", tailbuf.pfncount);
+ DPRINTF("Decreased reservation by %d pages\n", tailbuf.u.pv.pfncount);
}
}
@@ -1628,7 +1774,7 @@
return 1;
}
- vcpup = tailbuf.vcpubuf;
+ vcpup = tailbuf.u.pv.vcpubuf;
for ( i = 0; i <= max_vcpu_id; i++ )
{
if ( !(vcpumap & (1ULL << i)) )
@@ -1755,7 +1901,7 @@
}
}
- memcpy(shared_info_page, tailbuf.shared_info_page, PAGE_SIZE);
+ memcpy(shared_info_page, tailbuf.u.pv.shared_info_page, PAGE_SIZE);
DPRINTF("Completed checkpoint load\n");
@@ -1812,6 +1958,51 @@
DPRINTF("Domain ready to be built.\n");
rc = 0;
+ goto out;
+
+ finish_hvm:
+ /* Dump the QEMU state to a state file for QEMU to load */
+ if ( dump_qemu(dom, &tailbuf.u.hvm) ) {
+ ERROR("Error dumping QEMU state to file");
+ goto out;
+ }
+
+ /* These comms pages need to be zeroed at the start of day */
+ if ( xc_clear_domain_page(xc_handle, dom, tailbuf.u.hvm.magicpfns[0]) ||
+ xc_clear_domain_page(xc_handle, dom, tailbuf.u.hvm.magicpfns[1]) ||
+ xc_clear_domain_page(xc_handle, dom, tailbuf.u.hvm.magicpfns[2]) )
+ {
+ ERROR("error zeroing magic pages");
+ goto out;
+ }
+
+ if ( (frc = xc_set_hvm_param(xc_handle, dom,
+ HVM_PARAM_IOREQ_PFN, tailbuf.u.hvm.magicpfns[0]))
+ || (frc = xc_set_hvm_param(xc_handle, dom,
+ HVM_PARAM_BUFIOREQ_PFN, tailbuf.u.hvm.magicpfns[1]))
+ || (frc = xc_set_hvm_param(xc_handle, dom,
+ HVM_PARAM_STORE_PFN, tailbuf.u.hvm.magicpfns[2]))
+ || (frc = xc_set_hvm_param(xc_handle, dom,
+ HVM_PARAM_PAE_ENABLED, pae))
+ || (frc = xc_set_hvm_param(xc_handle, dom,
+ HVM_PARAM_STORE_EVTCHN,
+ store_evtchn)) )
+ {
+ ERROR("error setting HVM params: %i", frc);
+ goto out;
+ }
+ *store_mfn = tailbuf.u.hvm.magicpfns[2];
+
+ frc = xc_domain_hvm_setcontext(xc_handle, dom, tailbuf.u.hvm.hvmbuf,
+ tailbuf.u.hvm.reclen);
+ if ( frc )
+ {
+ ERROR("error setting the HVM context");
+ goto out;
+ }
+
+ /* HVM success! */
+ rc = 0;
out:
if ( (rc != 0) && (dom != 0) )
@@ -1819,7 +2010,7 @@
free(mmu);
free(p2m);
free(pfn_type);
- free(hvm_buf);
+ tailbuf_free(&tailbuf);
/* discard cache for save file */
discard_file_cache(io_fd, 1 /*flush*/);
diff --git a/tools/python/xen/xend/XendCheckpoint.py b/tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py
+++ b/tools/python/xen/xend/XendCheckpoint.py
@@ -323,26 +323,6 @@
if not is_hvm and handler.console_mfn is None:
raise XendError('Could not read console MFN')
- # get qemu state and create a tmp file for dm restore
- # Even PV guests may have QEMU stat, but its not currently
- # used so only bother with HVM currently.
- if is_hvm:
- qemu_signature = read_exact(fd, len(QEMU_SIGNATURE),
- "invalid device model signature read")
- if qemu_signature != QEMU_SIGNATURE:
- raise XendError("not a valid device model state: found '%s'" %
- qemu_signature)
- qemu_fd = os.open("/var/lib/xen/qemu-save.%d" % dominfo.getDomid(),
- os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
- while True:
- buf = os.read(fd, dm_batch)
- if len(buf):
- write_exact(qemu_fd, buf,
- "could not write dm state to tmp file")
- else:
- break
- os.close(qemu_fd)
-
restore_image.setCpuid()
# xc_restore will wait for source to close connection
^ permalink raw reply [flat|nested] 16+ messages in thread* [PATCH 08 of 11] blktap2: configurable driver chains
2009-11-05 22:58 [PATCH 00 of 11] Remus 0.9 released! Brendan Cully
` (6 preceding siblings ...)
2009-11-05 22:58 ` [PATCH 07 of 11] Make checkpoint buffering HVM-aware Brendan Cully
@ 2009-11-05 22:58 ` Brendan Cully
2009-11-06 21:12 ` Daniel Stodden
2009-11-05 22:58 ` [PATCH 09 of 11] blktap2: only open driver stack once Brendan Cully
` (4 subsequent siblings)
12 siblings, 1 reply; 16+ messages in thread
From: Brendan Cully @ 2009-11-05 22:58 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Ryan O'Connor <rjo@cs.ubc.ca>
# Date 1252530408 25200
# Node ID 4ca00ee41ce9731b8725c736b27c841b484dce5d
# Parent 6ca67fe3514ada809a62282c30fe46d5df5ce265
blktap2: configurable driver chains
Blktap2 allows block device drivers to be layered to create more
advanced virtual block devices. However, composing a layered driver is
not exposed to the user. This patch fixes this, and allows the user to
explicitly specify a driver chain when starting a tapdisk process, using
the pipe character ('|') to explicitly separate layers in a blktap2
configuration string.
For example, the command:
~$ tapdisk2 -n "log:|aio:/path/to/file.img"
will create a blktap2 device where read and write requests are passed to
the 'log' driver, then forwarded to the 'aio' driver.
Signed-off-by: Ryan O'Connor <rjo@cs.ubc.ca>
diff --git a/tools/blktap2/drivers/tapdisk-vbd.c b/tools/blktap2/drivers/tapdisk-vbd.c
--- a/tools/blktap2/drivers/tapdisk-vbd.c
+++ b/tools/blktap2/drivers/tapdisk-vbd.c
@@ -106,6 +106,7 @@
vbd->callback = tapdisk_vbd_callback;
vbd->argument = vbd;
+ INIT_LIST_HEAD(&vbd->driver_stack);
INIT_LIST_HEAD(&vbd->images);
INIT_LIST_HEAD(&vbd->new_requests);
INIT_LIST_HEAD(&vbd->pending_requests);
@@ -541,6 +542,105 @@
goto out;
}
+/* TODO: ugh, lets not call it parent info... */
+static struct list_head *
+tapdisk_vbd_open_level(td_vbd_t *vbd, char* params, int driver_type, td_disk_info_t *parent_info, td_flag_t flags)
+{
+ char *name;
+ int type, err;
+ td_image_t *image;
+ td_disk_id_t id;
+ struct list_head *images;
+ td_driver_t *driver;
+
+ images = calloc(1, sizeof(struct list_head));
+ INIT_LIST_HEAD(images);
+
+ name = params;
+ type = driver_type;
+
+ for (;;) {
+ err = -ENOMEM;
+ image = tapdisk_image_allocate(name, type,
+ vbd->storage, flags, vbd);
+
+ /* free 'name' if it was created by td_get_parent_id() */
+ if (name != params) {
+ free(name);
+ name = NULL;
+ }
+
+ if (!image)
+ return NULL;
+
+
+ /* We have to do this to set the driver info for child drivers. this conflicts with td_open */
+ driver = image->driver;
+ if (!driver) {
+ driver = tapdisk_driver_allocate(image->type,
+ image->name,
+ image->flags,
+ image->storage);
+ if (!driver)
+ return NULL;
+ }
+ /* the image has a driver, set the info and driver */
+ image->driver = driver;
+ image->info = driver->info;
+
+ /* XXX: we don't touch driver->refcount, broken? */
+ /* XXX: we've replicated about 90% of td_open() gross! */
+ /* XXX: this breaks if a driver modifies its info within a layer */
+
+ /* if the parent info is set, pass it to the child */
+ if(parent_info)
+ {
+ image->driver->info = *parent_info;
+ }
+
+ err = td_load(image);
+ if (err) {
+ if (err != -ENODEV)
+ return NULL;
+
+ err = td_open(image);
+ if (err)
+ return NULL;
+ }
+
+ /* TODO: non-sink drivers that don't care about their child
+ * currently return EINVAL. Could return TD_PARENT_OK or
+ * TD_ANY_PARENT */
+
+ err = td_get_parent_id(image, &id);
+ if (err && (err != TD_NO_PARENT && err != -EINVAL)) {
+ td_close(image);
+ return NULL;
+ }
+
+ if (!image->storage)
+ image->storage = vbd->storage;
+
+ /* add this image to the end of the list */
+ list_add_tail(&image->next, images);
+
+ image = NULL;
+
+ /* if the image does not have a parent we return the
+ * list of images generated by this level of the stack */
+ if (err == TD_NO_PARENT || err == -EINVAL)
+ break;
+
+ name = id.name;
+ type = id.drivertype;
+#if 0
+ /* catch this by validate, not here */
+ flags |= (TD_OPEN_RDONLY | TD_OPEN_SHAREABLE);
+#endif
+ }
+ return images;
+}
+
static int
__tapdisk_vbd_open_vdi(td_vbd_t *vbd, td_flag_t extra_flags)
{
@@ -548,58 +648,36 @@
int err, type;
td_flag_t flags;
td_disk_id_t id;
- td_image_t *image, *tmp;
+ td_image_t *tmp;
struct tfilter *filter = NULL;
+ td_vbd_driver_info_t *driver_info;
+ struct list_head *images;
+ td_disk_info_t *parent_info = NULL;
err = tapdisk_vbd_reactivate_volumes(vbd, 0);
if (err)
return err;
flags = (vbd->flags & ~TD_OPEN_SHAREABLE) | extra_flags;
- file = vbd->name;
- type = vbd->type;
- for (;;) {
- err = -ENOMEM;
- image = tapdisk_image_allocate(file, type,
- vbd->storage, flags, vbd);
- if (file != vbd->name) {
- free(file);
- file = NULL;
- }
+ /* loop on each user specified driver.
+ * NOTE: driver_info is in reverse order. That is, the first
+ * item is the 'parent' or 'sink' driver */
+ list_for_each_entry(driver_info, &vbd->driver_stack, next) {
+ file = driver_info->params;
+ type = driver_info->type;
+ images = tapdisk_vbd_open_level(vbd, file, type, parent_info, flags);
+ if (!images)
+ return -EINVAL;
- if (!image)
- goto fail;
+ /* after each loop, append the created stack to the result stack */
+ list_splice(images, &vbd->images);
+ free(images);
- err = td_load(image);
- if (err) {
- if (err != -ENODEV)
- goto fail;
-
- err = td_open(image);
- if (err)
- goto fail;
- }
-
- err = td_get_parent_id(image, &id);
- if (err && err != TD_NO_PARENT) {
- td_close(image);
- goto fail;
- }
-
- if (!image->storage)
- image->storage = vbd->storage;
-
- tapdisk_vbd_add_image(vbd, image);
- image = NULL;
-
- if (err == TD_NO_PARENT)
- break;
-
- file = id.name;
- type = id.drivertype;
- flags |= (TD_OPEN_RDONLY | TD_OPEN_SHAREABLE);
+ /* set the parent_info to the first diskinfo on the stack */
+ tmp = tapdisk_vbd_first_image(vbd);
+ parent_info = &tmp->info;
}
if (td_flag_test(vbd->flags, TD_OPEN_LOG_DIRTY)) {
@@ -623,14 +701,91 @@
return 0;
fail:
+
+/* TODO: loop over vbd to free images? maybe do that in vbd_close_vdi */
+#if 0
if (image)
tapdisk_image_free(image);
+#endif
+ /* TODO: handle partial stack creation? */
tapdisk_vbd_close_vdi(vbd);
return err;
}
+/* this populates a vbd type based on path */
+int
+tapdisk_vbd_parse_stack(td_vbd_t *vbd, const char *path)
+{
+ int err;
+ char *params, *driver_str;
+ td_vbd_driver_info_t *driver;
+
+ /* make a copy of path */
+ /* TODO: check against MAX_NAME_LEN ? */
+ err = tapdisk_namedup(&params, path);
+ if(err)
+ goto error;
+
+
+ /* tokenize params based on pipe '|' */
+ driver_str = strtok(params, "|");
+ while(driver_str != NULL)
+ {
+ /* parse driver info and add to vbd */
+ driver = calloc(1, sizeof(td_vbd_driver_info_t));
+ INIT_LIST_HEAD(&driver->next);
+ err = tapdisk_parse_disk_type(driver_str, &driver->params, &driver->type);
+ if(err)
+ goto error;
+
+ /* build the list backwards as the last driver will be the first
+ * driver to open in the stack */
+ list_add(&driver->next, &vbd->driver_stack);
+
+ /* get next driver string */
+ driver_str = strtok(NULL, "|");
+ }
+
+ return 0;
+
+ /* error: free any driver_info's and params */
+error:
+ while(!list_empty(&vbd->driver_stack)) {
+ driver = list_entry(vbd->driver_stack.next, td_vbd_driver_info_t, next);
+ list_del(&driver->next);
+ free(driver);
+ }
+
+ return err;
+}
+
+/* NOTE: driver type, etc. must be set */
+static int
+tapdisk_vbd_open_stack(td_vbd_t *vbd, uint16_t storage, td_flag_t flags)
+{
+ int i, err;
+
+ vbd->flags = flags;
+ vbd->storage = storage;
+
+ for (i = 0; i < TD_VBD_EIO_RETRIES; i++) {
+ err = __tapdisk_vbd_open_vdi(vbd, 0);
+ if (err != -EIO)
+ break;
+
+ sleep(TD_VBD_EIO_SLEEP);
+ }
+ if (err)
+ goto fail;
+
+ return 0;
+
+fail:
+ return err;
+}
+
int
tapdisk_vbd_open_vdi(td_vbd_t *vbd, const char *path,
uint16_t drivertype, uint16_t storage, td_flag_t flags)
@@ -759,7 +914,7 @@
{
int err;
- err = tapdisk_vbd_open_vdi(vbd, name, type, storage, flags);
+ err = tapdisk_vbd_open_stack(vbd, storage, flags);
if (err)
goto out;
diff --git a/tools/blktap2/drivers/tapdisk-vbd.h b/tools/blktap2/drivers/tapdisk-vbd.h
--- a/tools/blktap2/drivers/tapdisk-vbd.h
+++ b/tools/blktap2/drivers/tapdisk-vbd.h
@@ -53,6 +53,7 @@
typedef struct td_ring td_ring_t;
typedef struct td_vbd_request td_vbd_request_t;
+typedef struct td_vbd_driver_info td_vbd_driver_info_t;
typedef struct td_vbd_handle td_vbd_t;
typedef void (*td_vbd_cb_t) (void *, blkif_response_t *);
@@ -79,12 +80,20 @@
struct list_head next;
};
+struct td_vbd_driver_info {
+ char *params;
+ int type;
+ struct list_head next;
+};
+
struct td_vbd_handle {
char *name;
td_uuid_t uuid;
int type;
+ struct list_head driver_stack;
+
int storage;
uint8_t reopened;
@@ -164,6 +173,7 @@
int tapdisk_vbd_initialize(int, int, td_uuid_t);
void tapdisk_vbd_set_callback(td_vbd_t *, td_vbd_cb_t, void *);
+int tapdisk_vbd_parse_stack(td_vbd_t *vbd, const char *path);
int tapdisk_vbd_open(td_vbd_t *, const char *, uint16_t,
uint16_t, const char *, td_flag_t);
int tapdisk_vbd_close(td_vbd_t *);
diff --git a/tools/blktap2/drivers/tapdisk2.c b/tools/blktap2/drivers/tapdisk2.c
--- a/tools/blktap2/drivers/tapdisk2.c
+++ b/tools/blktap2/drivers/tapdisk2.c
@@ -264,6 +264,13 @@
return err;
}
+ err = tapdisk_vbd_parse_stack(vbd, name);
+ if (err) {
+ CHILD_ERR(err, "vbd_parse_stack failed: %d\n", err);
+ return err;
+ }
+
+ /* TODO: clean this up */
err = tapdisk_vbd_open(vbd, path, type,
TAPDISK_STORAGE_TYPE_DEFAULT,
devname, 0);
diff --git a/tools/blktap2/include/list.h b/tools/blktap2/include/list.h
--- a/tools/blktap2/include/list.h
+++ b/tools/blktap2/include/list.h
@@ -87,6 +87,26 @@
return list->next == head;
}
+/*
+ * Link every node of 'list' in between 'head' and head->next, keeping the
+ * nodes in their original order. 'list' itself is not reinitialised, and the
+ * function dereferences list->next / list->prev unconditionally, so callers
+ * are expected to pass a non-empty list (see list_splice()).
+ */
+static inline void __list_splice(struct list_head *list,
+ struct list_head *head)
+{
+ struct list_head *first = list->next;
+ struct list_head *last = list->prev;
+ struct list_head *at = head->next;
+
+ first->prev = head;
+ head->next = first;
+
+ last->next = at;
+ at->prev = last;
+}
+
+/*
+ * Insert the nodes of 'list' at the front of 'head'. Does nothing when
+ * 'list' is empty. 'list' is left pointing at the moved nodes.
+ */
+static inline void list_splice(struct list_head *list, struct list_head *head)
+{
+ if (!list_empty(list))
+ __list_splice(list, head);
+}
+
#define list_entry(ptr, type, member) \
((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
^ permalink raw reply [flat|nested] 16+ messages in thread* Re: [PATCH 08 of 11] blktap2: configurable driver chains
2009-11-05 22:58 ` [PATCH 08 of 11] blktap2: configurable driver chains Brendan Cully
@ 2009-11-06 21:12 ` Daniel Stodden
0 siblings, 0 replies; 16+ messages in thread
From: Daniel Stodden @ 2009-11-06 21:12 UTC (permalink / raw)
To: Brendan Cully, Ryan O'Connor
Cc: andy@cs.ubc.ca, xen-devel@lists.xensource.com
Hey Ryan.
The command line idea is ok, but how about "!"?
Also, please check the line width.
Also, please don't alter the kernel list macros [..or submit yours to
lkml].
Daniel
On Thu, 2009-11-05 at 17:58 -0500, Brendan Cully wrote:
> # HG changeset patch
> # User Ryan O'Connor <rjo@cs.ubc.ca>
> # Date 1252530408 25200
> # Node ID 4ca00ee41ce9731b8725c736b27c841b484dce5d
> # Parent 6ca67fe3514ada809a62282c30fe46d5df5ce265
> blktap2: configurable driver chains
>
> Blktap2 allows block device drivers to be layered to create more
> advanced virtual block devices. However, composing a layered driver is
> not exposed to the user. This patch fixes this, and allows the user to
> explicitly specify a driver chain when starting a tapdisk process, using
> the pipe character ('|') to explicitly seperate layers in a blktap2
> configuration string.
>
> for example, the command:
> ~$ tapdisk2 -n "log:|aio:/path/to/file.img"
> will create a blktap2 device where read and write requests are passed to
> the 'log' driver, then forwarded to the 'aio' driver.
>
> Signed-off-by: Ryan O'Connor <rjo@cs.ubc.ca>
>
> diff --git a/tools/blktap2/drivers/tapdisk-vbd.c b/tools/blktap2/drivers/tapdisk-vbd.c
> --- a/tools/blktap2/drivers/tapdisk-vbd.c
> +++ b/tools/blktap2/drivers/tapdisk-vbd.c
> @@ -106,6 +106,7 @@
> vbd->callback = tapdisk_vbd_callback;
> vbd->argument = vbd;
>
> + INIT_LIST_HEAD(&vbd->driver_stack);
> INIT_LIST_HEAD(&vbd->images);
> INIT_LIST_HEAD(&vbd->new_requests);
> INIT_LIST_HEAD(&vbd->pending_requests);
> @@ -541,6 +542,105 @@
> goto out;
> }
>
> +/* TODO: ugh, lets not call it parent info... */
> +static struct list_head *
> +tapdisk_vbd_open_level(td_vbd_t *vbd, char* params, int driver_type, td_disk_info_t *parent_info, td_flag_t flags)
> +{
> + char *name;
> + int type, err;
> + td_image_t *image;
> + td_disk_id_t id;
> + struct list_head *images;
> + td_driver_t *driver;
> +
> + images = calloc(1, sizeof(struct list_head));
> + INIT_LIST_HEAD(images);
> +
> + name = params;
> + type = driver_type;
> +
> + for (;;) {
> + err = -ENOMEM;
> + image = tapdisk_image_allocate(name, type,
> + vbd->storage, flags, vbd);
> +
> + /* free 'name' if it was created by td_get_parent_id() */
> + if (name != params) {
> + free(name);
> + name = NULL;
> + }
> +
> + if (!image)
> + return NULL;
> +
> +
> + /* We have to do this to set the driver info for child drivers. this conflicts with td_open */
> + driver = image->driver;
> + if (!driver) {
> + driver = tapdisk_driver_allocate(image->type,
> + image->name,
> + image->flags,
> + image->storage);
> + if (!driver)
> + return NULL;
> + }
> + /* the image has a driver, set the info and driver */
> + image->driver = driver;
> + image->info = driver->info;
> +
> + /* XXX: we don't touch driver->refcount, broken? */
> + /* XXX: we've replicated about 90% of td_open() gross! */
> + /* XXX: this breaks if a driver modifies its info within a layer */
> +
> + /* if the parent info is set, pass it to the child */
> + if(parent_info)
> + {
> + image->driver->info = *parent_info;
> + }
> +
> + err = td_load(image);
> + if (err) {
> + if (err != -ENODEV)
> + return NULL;
> +
> + err = td_open(image);
> + if (err)
> + return NULL;
> + }
> +
> + /* TODO: non-sink drivers that don't care about their child
> + * currently return EINVAL. Could return TD_PARENT_OK or
> + * TD_ANY_PARENT */
> +
> + err = td_get_parent_id(image, &id);
> + if (err && (err != TD_NO_PARENT && err != -EINVAL)) {
> + td_close(image);
> + return NULL;
> + }
> +
> + if (!image->storage)
> + image->storage = vbd->storage;
> +
> + /* add this image to the end of the list */
> + list_add_tail(&image->next, images);
> +
> + image = NULL;
> +
> + /* if the image does not have a parent we return the
> + * list of images generated by this level of the stack */
> + if (err == TD_NO_PARENT || err == -EINVAL)
> + break;
> +
> + name = id.name;
> + type = id.drivertype;
> +#if 0
> + /* catch this by validate, not here */
> + flags |= (TD_OPEN_RDONLY | TD_OPEN_SHAREABLE);
> +#endif
> + }
> + return images;
> +}
> +
> static int
> __tapdisk_vbd_open_vdi(td_vbd_t *vbd, td_flag_t extra_flags)
> {
> @@ -548,58 +648,36 @@
> int err, type;
> td_flag_t flags;
> td_disk_id_t id;
> - td_image_t *image, *tmp;
> + td_image_t *tmp;
> struct tfilter *filter = NULL;
> + td_vbd_driver_info_t *driver_info;
> + struct list_head *images;
> + td_disk_info_t *parent_info = NULL;
>
> err = tapdisk_vbd_reactivate_volumes(vbd, 0);
> if (err)
> return err;
>
> flags = (vbd->flags & ~TD_OPEN_SHAREABLE) | extra_flags;
> - file = vbd->name;
> - type = vbd->type;
>
> - for (;;) {
> - err = -ENOMEM;
> - image = tapdisk_image_allocate(file, type,
> - vbd->storage, flags, vbd);
>
> - if (file != vbd->name) {
> - free(file);
> - file = NULL;
> - }
> + /* loop on each user specified driver.
> + * NOTE: driver_info is in reverse order. That is, the first
> + * item is the 'parent' or 'sink' driver */
> + list_for_each_entry(driver_info, &vbd->driver_stack, next) {
> + file = driver_info->params;
> + type = driver_info->type;
> + images = tapdisk_vbd_open_level(vbd, file, type, parent_info, flags);
> + if (!images)
> + return -EINVAL;
>
> - if (!image)
> - goto fail;
> + /* after each loop, append the created stack to the result stack */
> + list_splice(images, &vbd->images);
> + free(images);
>
> - err = td_load(image);
> - if (err) {
> - if (err != -ENODEV)
> - goto fail;
> -
> - err = td_open(image);
> - if (err)
> - goto fail;
> - }
> -
> - err = td_get_parent_id(image, &id);
> - if (err && err != TD_NO_PARENT) {
> - td_close(image);
> - goto fail;
> - }
> -
> - if (!image->storage)
> - image->storage = vbd->storage;
> -
> - tapdisk_vbd_add_image(vbd, image);
> - image = NULL;
> -
> - if (err == TD_NO_PARENT)
> - break;
> -
> - file = id.name;
> - type = id.drivertype;
> - flags |= (TD_OPEN_RDONLY | TD_OPEN_SHAREABLE);
> + /* set the parent_info to the first diskinfo on the stack */
> + tmp = tapdisk_vbd_first_image(vbd);
> + parent_info = &tmp->info;
> }
>
> if (td_flag_test(vbd->flags, TD_OPEN_LOG_DIRTY)) {
> @@ -623,14 +701,91 @@
> return 0;
>
> fail:
> +
> +/* TODO: loop over vbd to free images? maybe do that in vbd_close_vdi */
> +#if 0
> if (image)
> tapdisk_image_free(image);
> +#endif
>
> + /* TODO: handle partial stack createion? */
> tapdisk_vbd_close_vdi(vbd);
>
> return err;
> }
>
> +/* this populates a vbd type based on path */
> +int
> +tapdisk_vbd_parse_stack(td_vbd_t *vbd, const char *path)
> +{
> + int err;
> + char *params, *driver_str;
> + td_vbd_driver_info_t *driver;
> +
> + /* make a copy of path */
> + /* TODO: check against MAX_NAME_LEM ? */
> + err = tapdisk_namedup(&params, path);
> + if(err)
> + goto error;
> +
> +
> + /* tokenize params based on pipe '|' */
> + driver_str = strtok(params, "|");
> + while(driver_str != NULL)
> + {
> + /* parse driver info and add to vbd */
> + driver = calloc(1, sizeof(td_vbd_driver_info_t));
> + INIT_LIST_HEAD(&driver->next);
> + err = tapdisk_parse_disk_type(driver_str, &driver->params, &driver->type);
> + if(err)
> + goto error;
> +
> + /* build the list backwards as the last driver will be the first
> + * driver to open in the stack */
> + list_add(&driver->next, &vbd->driver_stack);
> +
> + /* get next driver string */
> + driver_str = strtok(NULL, "|");
> + }
> +
> + return 0;
> +
> + /* error: free any driver_info's and params */
> +error:
> + while(!list_empty(&vbd->driver_stack)) {
> + driver = list_entry(vbd->driver_stack.next, td_vbd_driver_info_t, next);
> + list_del(&driver->next);
> + free(driver);
> + }
> +
> + return err;
> +}
> +
> +/* NOTE: driver type, etc. must be set */
> +static int
> +tapdisk_vbd_open_stack(td_vbd_t *vbd, uint16_t storage, td_flag_t flags)
> +{
> + int i, err;
> +
> + vbd->flags = flags;
> + vbd->storage = storage;
> +
> + for (i = 0; i < TD_VBD_EIO_RETRIES; i++) {
> + err = __tapdisk_vbd_open_vdi(vbd, 0);
> + if (err != -EIO)
> + break;
> +
> + sleep(TD_VBD_EIO_SLEEP);
> + }
> + if (err)
> + goto fail;
> +
> + return 0;
> +
> +fail:
> + return err;
> +}
> +
> int
> tapdisk_vbd_open_vdi(td_vbd_t *vbd, const char *path,
> uint16_t drivertype, uint16_t storage, td_flag_t flags)
> @@ -759,7 +914,7 @@
> {
> int err;
>
> - err = tapdisk_vbd_open_vdi(vbd, name, type, storage, flags);
> + err = tapdisk_vbd_open_stack(vbd, storage, flags);
> if (err)
> goto out;
>
> diff --git a/tools/blktap2/drivers/tapdisk-vbd.h b/tools/blktap2/drivers/tapdisk-vbd.h
> --- a/tools/blktap2/drivers/tapdisk-vbd.h
> +++ b/tools/blktap2/drivers/tapdisk-vbd.h
> @@ -53,6 +53,7 @@
>
> typedef struct td_ring td_ring_t;
> typedef struct td_vbd_request td_vbd_request_t;
> +typedef struct td_vbd_driver_info td_vbd_driver_info_t;
> typedef struct td_vbd_handle td_vbd_t;
> typedef void (*td_vbd_cb_t) (void *, blkif_response_t *);
>
> @@ -79,12 +80,20 @@
> struct list_head next;
> };
>
> +struct td_vbd_driver_info {
> + char *params;
> + int type;
> + struct list_head next;
> +};
> +
> struct td_vbd_handle {
> char *name;
>
> td_uuid_t uuid;
> int type;
>
> + struct list_head driver_stack;
> +
> int storage;
>
> uint8_t reopened;
> @@ -164,6 +173,7 @@
>
> int tapdisk_vbd_initialize(int, int, td_uuid_t);
> void tapdisk_vbd_set_callback(td_vbd_t *, td_vbd_cb_t, void *);
> +int tapdisk_vbd_parse_stack(td_vbd_t *vbd, const char *path);
> int tapdisk_vbd_open(td_vbd_t *, const char *, uint16_t,
> uint16_t, const char *, td_flag_t);
> int tapdisk_vbd_close(td_vbd_t *);
> diff --git a/tools/blktap2/drivers/tapdisk2.c b/tools/blktap2/drivers/tapdisk2.c
> --- a/tools/blktap2/drivers/tapdisk2.c
> +++ b/tools/blktap2/drivers/tapdisk2.c
> @@ -264,6 +264,13 @@
> return err;
> }
>
> + err = tapdisk_vbd_parse_stack(vbd, name);
> + if (err) {
> + CHILD_ERR(err, "vbd_parse_stack failed: %d\n", err);
> + return err;
> + }
> +
> + /* TODO: clean this up */
> err = tapdisk_vbd_open(vbd, path, type,
> TAPDISK_STORAGE_TYPE_DEFAULT,
> devname, 0);
> diff --git a/tools/blktap2/include/list.h b/tools/blktap2/include/list.h
> --- a/tools/blktap2/include/list.h
> +++ b/tools/blktap2/include/list.h
> @@ -87,6 +87,26 @@
> return list->next == head;
> }
>
> +static inline void __list_splice(struct list_head *list,
> + struct list_head *head)
> +{
> + struct list_head *first = list->next;
> + struct list_head *last = list->prev;
> + struct list_head *at = head->next;
> +
> + first->prev = head;
> + head->next = first;
> +
> + last->next = at;
> + at->prev = last;
> +}
> +
> +static inline void list_splice(struct list_head *list, struct list_head *head)
> +{
> + if (!list_empty(list))
> + __list_splice(list, head);
> +}
> +
> #define list_entry(ptr, type, member) \
> ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 09 of 11] blktap2: only open driver stack once
2009-11-05 22:58 [PATCH 00 of 11] Remus 0.9 released! Brendan Cully
` (7 preceding siblings ...)
2009-11-05 22:58 ` [PATCH 08 of 11] blktap2: configurable driver chains Brendan Cully
@ 2009-11-05 22:58 ` Brendan Cully
2009-11-05 22:58 ` [PATCH 10 of 11] Fixup for tap:tapdisk syntax in remus uname Brendan Cully
` (3 subsequent siblings)
12 siblings, 0 replies; 16+ messages in thread
From: Brendan Cully @ 2009-11-05 22:58 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Ryan O'Connor <rjo@cs.ubc.ca>
# Date 1252530408 25200
# Node ID b14bb76b36dd239586b41c47e6f2531cbcbf4b47
# Parent 4ca00ee41ce9731b8725c736b27c841b484dce5d
blktap2: only open driver stack once
Currently blktap2 opens a driver stack, closes it, and re-opens it. This
causes problems with our remus driver: the primary may connect to the
backup in between the first and second open.
This is a temporary fix.
Signed-off-by: Ryan O'Connor <rjo@cs.ubc.ca>
diff --git a/tools/blktap2/drivers/tapdisk-vbd.c b/tools/blktap2/drivers/tapdisk-vbd.c
--- a/tools/blktap2/drivers/tapdisk-vbd.c
+++ b/tools/blktap2/drivers/tapdisk-vbd.c
@@ -1561,9 +1561,11 @@
gettimeofday(&vreq->last_try, NULL);
tapdisk_vbd_move_request(vreq, &vbd->pending_requests);
+#if 0
err = tapdisk_vbd_check_queue(vbd);
if (err)
goto fail;
+#endif
err = tapdisk_image_check_ring_request(image, req);
if (err)
^ permalink raw reply [flat|nested] 16+ messages in thread* [PATCH 10 of 11] Fixup for tap:tapdisk syntax in remus uname
2009-11-05 22:58 [PATCH 00 of 11] Remus 0.9 released! Brendan Cully
` (8 preceding siblings ...)
2009-11-05 22:58 ` [PATCH 09 of 11] blktap2: only open driver stack once Brendan Cully
@ 2009-11-05 22:58 ` Brendan Cully
2009-11-05 22:58 ` [PATCH 11 of 11] blktap2: add remus driver Brendan Cully
` (2 subsequent siblings)
12 siblings, 0 replies; 16+ messages in thread
From: Brendan Cully @ 2009-11-05 22:58 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Brendan Cully <brendan@cs.ubc.ca>
# Date 1252530408 25200
# Node ID 5e0779189ef3b9382675b2e574b75936a6e9fb15
# Parent b14bb76b36dd239586b41c47e6f2531cbcbf4b47
Fixup for tap:tapdisk syntax in remus uname
Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
diff --git a/tools/python/xen/xend/server/BlktapController.py b/tools/python/xen/xend/server/BlktapController.py
--- a/tools/python/xen/xend/server/BlktapController.py
+++ b/tools/python/xen/xend/server/BlktapController.py
@@ -172,6 +172,8 @@
uname = config.get('uname', '')
try:
(typ, subtyp, params, file) = string.split(uname, ':', 3)
+ if subtyp not in ('tapdisk', 'ioemu'):
+ raise ValueError('invalid subtype')
except:
(typ, params, file) = string.split(uname, ':', 2)
subtyp = 'tapdisk'
^ permalink raw reply [flat|nested] 16+ messages in thread* [PATCH 11 of 11] blktap2: add remus driver
2009-11-05 22:58 [PATCH 00 of 11] Remus 0.9 released! Brendan Cully
` (9 preceding siblings ...)
2009-11-05 22:58 ` [PATCH 10 of 11] Fixup for tap:tapdisk syntax in remus uname Brendan Cully
@ 2009-11-05 22:58 ` Brendan Cully
2009-11-09 8:52 ` [PATCH 00 of 11] Remus 0.9 released! Keir Fraser
2009-12-22 2:32 ` Keith Coleman
12 siblings, 0 replies; 16+ messages in thread
From: Brendan Cully @ 2009-11-05 22:58 UTC (permalink / raw)
To: xen-devel; +Cc: andy
# HG changeset patch
# User Brendan Cully <brendan@cs.ubc.ca>
# Date 1252530408 25200
# Node ID aaf56934865afa3f5611cd891347953cdd1e5729
# Parent 5e0779189ef3b9382675b2e574b75936a6e9fb15
blktap2: add remus driver
Blktap2 port of remus disk driver. Backwards compatible with blktap1
implementation.
Signed-off-by: Ryan O'Connor <rjo@cs.ubc.ca>
Signed-off-by: Brendan Cully <brendan@cs.ubc.ca>
diff --git a/tools/blktap2/drivers/Makefile b/tools/blktap2/drivers/Makefile
--- a/tools/blktap2/drivers/Makefile
+++ b/tools/blktap2/drivers/Makefile
@@ -36,7 +36,7 @@
CRYPT_LIB += -lcrypto
endif
-LDFLAGS_img := $(CRYPT_LIB) -lpthread -lz
+LDFLAGS_img := $(CRYPT_LIB) -lpthread -lz -lm
LIBS += -L$(LIBVHDDIR) -lvhd
@@ -44,6 +44,14 @@
LIBS += -luuid
endif
+REMUS-OBJS := block-remus.o
+REMUS-OBJS += hashtable.o
+REMUS-OBJS += hashtable_itr.o
+REMUS-OBJS += hashtable_utility.o
+
+$(REMUS-OBJS): CFLAGS += -fgnu89-inline -I$(XEN_XENSTORE)
+
+
LIBAIO_DIR = $(XEN_ROOT)/tools/libaio/src
tapdisk2 tapdisk-stream tapdisk-diff $(QCOW_UTIL): AIOLIBS := $(LIBAIO_DIR)/libaio.a
tapdisk-client tapdisk-stream tapdisk-diff $(QCOW_UTIL): CFLAGS += -I$(LIBAIO_DIR) -I$(XEN_LIBXC)
@@ -81,6 +89,7 @@
BLK-OBJS-y += block-qcow.o
BLK-OBJS-y += aes.o
BLK-OBJS-y += $(PORTABLE-OBJS-y)
+BLK-OBJS-y += $(REMUS-OBJS)
all: $(IBIN) lock-util qcow-util
diff --git a/tools/blktap2/drivers/block-remus.c b/tools/blktap2/drivers/block-remus.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap2/drivers/block-remus.c
@@ -0,0 +1,1670 @@
+/* block-remus.c
+ *
+ * This disk sends all writes to a backup via a network interface before
+ * passing them to an underlying device.
+ * The backup is a bit more complicated:
+ * 1. It applies all incoming writes to a ramdisk.
+ * 2. When a checkpoint request arrives, it moves the ramdisk to
+ * a committing state and uses a new ramdisk for subsequent writes.
+ * It also acknowledges the request, to let the sender know it can
+ * release output.
+ * 3. The ramdisk flushes its contents to the underlying driver.
+ * 4. At failover, the backup waits for the in-flight ramdisk (if any) to
+ * drain before letting the domain be activated.
+ *
+ * The driver determines whether it is the client or server by attempting
+ * to bind to the replication address. If the address is not local,
+ * the driver acts as client.
+ *
+ * The following messages are defined for the replication stream:
+ * 1. write request
+ * "wreq" 4
+ * num_sectors 4
+ * sector 8
+ * buffer (num_sectors * sector_size)
+ * 2. submit request (may be used as a barrier)
+ * "sreq" 4
+ * 3. commit request
+ * "creq" 4
+ * After a commit request, the client must wait for a completion message:
+ * 4. completion
+ * "done" 4
+ */
+
+/* due to architectural choices in tapdisk, block-buffer is forced to
+ * reimplement some code which is meant to be private */
+#define TAPDISK
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "hashtable.h"
+#include "hashtable_itr.h"
+#include "hashtable_utility.h"
+
+#include <errno.h>
+#include <inttypes.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <unistd.h>
+
+/* timeout for reads and writes in ms */
+#define NET_TIMEOUT 500
+#define RAMDISK_HASHSIZE 128
+
+/* connect retry timeout (seconds) */
+#define REMUS_CONNRETRY_TIMEOUT 10
+
+#define RPRINTF(_f, _a...) syslog (LOG_DEBUG, "remus: " _f, ## _a)
+
+/*
+ * Operating mode of a remus driver instance. mode_invalid is 0, so a
+ * zero-filled state structure starts out in the invalid mode.
+ */
+enum tdremus_mode {
+ mode_invalid = 0,
+ mode_unprotected,
+ mode_primary,
+ mode_backup
+};
+
+/*
+ * One queued request: starting sector, sector count and an inline 4096-byte
+ * data buffer.
+ * NOTE(review): presumably nb_sectors * sector_size must fit in buf; confirm
+ * against the code that fills this structure.
+ */
+struct tdremus_req {
+ uint64_t sector;
+ int nb_sectors;
+ char buf[4096];
+};
+
+/*
+ * Fixed-size circular buffer of requests. It is empty when head == tail and
+ * full when advancing tail would reach head (see ring_isempty() and
+ * ring_isfull()), which is why one slot is never used.
+ */
+struct req_ring {
+ /* waste one slot to distinguish between empty and full */
+ struct tdremus_req requests[MAX_REQUESTS * 2 + 1];
+ unsigned int head;
+ unsigned int tail;
+};
+
+/* TODO: This isn't very pretty, but to properly generate our own treqs (needed
+ * by the backup) we need to know our td_vbd_t and td_image_t (blktap2
+ * internals). As a proper fix, we should consider extending the tapdisk
+ * interface with a td_create_request() function, or something similar.
+ *
+ * For now, we just grab the vbd in the td_open() command, and the td_image_t
+ * from the first read request.
+ */
+td_vbd_t *device_vbd = NULL;
+td_image_t *remus_image = NULL;
+
+/*
+ * In-memory write buffer. Sectors are kept in hashtables keyed by sector
+ * number, one sector_size-byte value per entry.
+ */
+struct ramdisk {
+ size_t sector_size;
+ struct hashtable* h;
+ /* when a ramdisk is flushed, h is given a new empty hash for writes
+ * while the old ramdisk (prev) is drained asynchronously. To avoid
+ * a race where a read request points to a sector in prev which has
+ * not yet been flushed, check prev on a miss in h */
+ struct hashtable* prev;
+ /* count of outstanding requests to the base driver */
+ size_t inflight;
+};
+
+/* the ramdisk intercepts the original callback for reads and writes.
+ * This holds the original data. */
+/* Might be worth making this a static array in struct ramdisk to avoid
+ * a malloc per request */
+
+struct tdremus_state;
+
+/*
+ * Saved context for an intercepted request: the original callback and its
+ * private pointer, a data buffer, and the owning driver state.
+ * NOTE(review): no user of this structure is visible in this part of the
+ * file; field roles are taken from the surrounding comment.
+ */
+struct ramdisk_cbdata {
+ td_callback_t cb;
+ void* private;
+ char* buf;
+ struct tdremus_state* state;
+};
+
+/*
+ * Context handed to ramdisk_write_cb(): the driver state and the write
+ * buffer. The callback frees both 'buf' and the structure itself.
+ */
+struct ramdisk_write_cbdata {
+ struct tdremus_state* state;
+ char* buf;
+};
+
+typedef void (*queue_rw_t) (td_driver_t *driver, td_request_t treq);
+
+/* poll_fd type for blktap2 fd system. taken from block_log.c */
+/* Pairs a file descriptor with an event id (event_id_t). */
+typedef struct poll_fd {
+ int fd;
+ event_id_t id;
+} poll_fd_t;
+
+/*
+ * Per-device state of the remus driver: control/message FIFO endpoints, the
+ * replication socket endpoints, the queue of pending writes, the ramdisk,
+ * and the current mode together with its flush method.
+ */
+struct tdremus_state {
+// struct tap_disk* driver;
+ void* driver_data;
+
+ /* XXX: this is needed so that the server can perform operations on
+ * the driver from the stream_fd event handler. fix this. */
+ td_driver_t *tdremus_driver;
+
+ /* TODO: we may wish to replace these two FIFOs with a unix socket */
+ char* ctl_path; /* receive flush instruction here */
+ poll_fd_t ctl_fd; /* io_fd slot for control FIFO */
+ char* msg_path; /* output completion message here */
+ poll_fd_t msg_fd;
+
+ /* replication host */
+ struct sockaddr_in sa;
+ poll_fd_t server_fd; /* server listen port */
+ poll_fd_t stream_fd; /* replication channel */
+
+ /* queue write requests, batch-replicate at submit */
+ struct req_ring write_ring;
+
+ /* ramdisk data*/
+ struct ramdisk ramdisk;
+
+ /* mode methods */
+ enum tdremus_mode mode;
+ int (*queue_flush)(td_driver_t *driver);
+};
+
+/*
+ * Request descriptor with an operation code, an id, a starting sector and a
+ * sector count.
+ * NOTE(review): presumably mirrors the replication-stream message header
+ * described at the top of this file; the field order and sizes here do not
+ * obviously match that description, so confirm before relying on the layout.
+ */
+typedef struct tdremus_wire {
+ uint32_t op;
+ uint64_t id;
+ uint64_t sec;
+ uint32_t secs;
+} tdremus_wire_t;
+
+#define TDREMUS_READ "rreq"
+#define TDREMUS_WRITE "wreq"
+#define TDREMUS_SUBMIT "sreq"
+#define TDREMUS_COMMIT "creq"
+#define TDREMUS_DONE "done"
+#define TDREMUS_FAIL "fail"
+
+/* primary read/write functions */
+static void primary_queue_read(td_driver_t *driver, td_request_t treq);
+static void primary_queue_write(td_driver_t *driver, td_request_t treq);
+
+/* backup read/write functions */
+static void backup_queue_read(td_driver_t *driver, td_request_t treq);
+static void backup_queue_write(td_driver_t *driver, td_request_t treq);
+
+/* unprotected read/write functions */
+static void unprotected_queue_read(td_driver_t *driver, td_request_t treq);
+static void unprotected_queue_write(td_driver_t *driver, td_request_t treq);
+
+static int tdremus_close(td_driver_t *driver);
+
+static int switch_mode(td_driver_t *driver, enum tdremus_mode mode);
+static int ctl_respond(struct tdremus_state *s, const char *response);
+
+/* ring functions */
+/*
+ * Return the ring index that follows 'pos', wrapping to 0 after the last of
+ * the MAX_REQUESTS * 2 + 1 slots. 'ring' itself is not read.
+ */
+static inline unsigned int ring_next(struct req_ring* ring, unsigned int pos)
+{
+ if (++pos >= MAX_REQUESTS * 2 + 1)
+ return 0;
+
+ return pos;
+}
+
+/* Non-zero when the ring holds no requests (head has caught up with tail). */
+static inline int ring_isempty(struct req_ring* ring)
+{
+ return ring->head == ring->tail;
+}
+
+/* Non-zero when advancing tail by one slot would make it equal to head. */
+static inline int ring_isfull(struct req_ring* ring)
+{
+ return ring_next(ring, ring->tail) == ring->head;
+}
+
+/* functions to create and submit treq's */
+
+/*
+ * Completion callback for write requests issued by create_write_request().
+ *
+ * treq: the completed request; cb_data is the tdremus_state, private is the
+ *       td_vbd_request_t allocated by create_write_request(), and buf is the
+ *       merged data buffer that was submitted.
+ * err:  non-zero if the write failed. A failed write is treated as fatal
+ *       and terminates the process.
+ *
+ * On success the vbd request is unlinked and freed, the data buffer is
+ * released, and the ramdisk's in-flight counter is decremented.
+ */
+static void
+replicated_write_callback(td_request_t treq, int err)
+{
+    struct tdremus_state *s = (struct tdremus_state *) treq.cb_data;
+    td_vbd_request_t *vreq;
+
+    vreq = (td_vbd_request_t *) treq.private;
+
+    /* the write failed for now, lets panic. this is very bad */
+    if (err) {
+        RPRINTF("ramdisk write failed, disk image is not consistent\n");
+        exit(-1);
+    }
+
+    /* The write succeeded. let's pull the vreq off whatever request list
+     * it is on and free() it */
+    list_del(&vreq->next);
+    free(vreq);
+
+    /* The data buffer was allocated for this request alone and nothing
+     * else keeps a reference to it, so it has to be released here. */
+    free(treq.buf);
+
+    s->ramdisk.inflight--;
+    if (!s->ramdisk.inflight && !s->ramdisk.prev) {
+        /* TODO: the ramdisk has been flushed */
+    }
+}
+
+/*
+ * Build a write request for 'secs' sectors starting at 'sec' and forward it
+ * down the driver chain. Completion is reported through
+ * replicated_write_callback().
+ *
+ * state: driver state, passed to the callback as cb_data
+ * sec:   first sector to write
+ * secs:  number of sectors
+ * buf:   data to write
+ *
+ * Uses the file-scope remus_image and device_vbd pointers, which must have
+ * been set before this is called.
+ *
+ * Returns 0 once the request has been forwarded, -1 if the vbd request
+ * could not be allocated (nothing is forwarded in that case and the caller
+ * keeps ownership of buf).
+ */
+static inline int
+create_write_request(struct tdremus_state *state, td_sector_t sec, int secs, char *buf)
+{
+    td_request_t treq;
+    td_vbd_request_t *vreq;
+
+    vreq = calloc(1, sizeof(td_vbd_request_t));
+    if (!vreq)
+        return -1;
+
+    /* treq is passed by value to the lower layers; clear it first so that
+     * any field not set below has a defined value */
+    memset(&treq, 0, sizeof(treq));
+
+    treq.op = TD_OP_WRITE;
+    treq.buf = buf;
+    treq.sec = sec;
+    treq.secs = secs;
+    treq.image = remus_image;
+    treq.cb = replicated_write_callback;
+    treq.cb_data = state;
+    treq.id = 0;
+    treq.sidx = 0;
+    treq.private = vreq;
+
+    vreq->submitting = 1;
+    INIT_LIST_HEAD(&vreq->next);
+    tapdisk_vbd_move_request(treq.private, &device_vbd->pending_requests);
+
+    /* TODO:
+     * we should probably leave it up to the caller to forward the request */
+    td_forward_request(treq);
+
+    /* NOTE(review): if the completion callback can run before
+     * td_forward_request() returns, vreq has already been freed at this
+     * point - confirm against the tapdisk request path. */
+    vreq->submitting--;
+
+    return 0;
+}
+
+
+/* ramdisk methods */
+static int ramdisk_flush(td_driver_t *driver, struct tdremus_state *s);
+
+/* http://www.concentric.net/~Ttwang/tech/inthash.htm */
+/*
+ * Hash callback for the ramdisk tables: k points to a uint64_t key. The key
+ * is mixed with a fixed sequence of shifts, adds, xors and a multiply, and
+ * the result is truncated to unsigned int.
+ */
+static unsigned int uint64_hash(void* k)
+{
+ uint64_t key = *(uint64_t*)k;
+
+ key = (~key) + (key << 18);
+ key = key ^ (key >> 31);
+ key = key * 21;
+ key = key ^ (key >> 11);
+ key = key + (key << 6);
+ key = key ^ (key >> 22);
+
+ return (unsigned int)key;
+}
+
+/*
+ * Key-equality callback for the ramdisk tables: k1 and k2 point to uint64_t
+ * keys. Returns non-zero when the two keys are equal.
+ */
+static int rd_hash_equal(void* k1, void* k2)
+{
+ uint64_t key1, key2;
+
+ key1 = *(uint64_t*)k1;
+ key2 = *(uint64_t*)k2;
+
+ return key1 == key2;
+}
+
+/*
+ * Copy nb_sectors consecutive sectors starting at 'sector' out of the
+ * ramdisk into buf (sector_size bytes each). Each sector is looked up in
+ * the current table first and then in 'prev' if one exists.
+ *
+ * Returns 0 if every sector was found, -1 as soon as one is missing; buf
+ * may already hold the sectors copied before the miss.
+ */
+static int ramdisk_read(struct ramdisk* ramdisk, uint64_t sector,
+ int nb_sectors, char* buf)
+{
+ int i;
+ char* v;
+ uint64_t key;
+
+ for (i = 0; i < nb_sectors; i++) {
+ key = sector + i;
+ if (!(v = hashtable_search(ramdisk->h, &key))) {
+ /* check whether it is queued in a previous flush request */
+ if (!(ramdisk->prev && (v = hashtable_search(ramdisk->prev, &key))))
+ return -1;
+ }
+ memcpy(buf + i * ramdisk->sector_size, v, ramdisk->sector_size);
+ }
+
+ return 0;
+}
+
+/*
+ * Store one sector of data in table h under the key 'sector'.
+ *
+ * If the sector is already present its existing buffer is overwritten in
+ * place. Otherwise a private copy of buf and a heap-allocated key are
+ * inserted into the table.
+ *
+ * h:      table to write into
+ * sector: sector number, used as the key
+ * buf:    source data, at least len bytes
+ * len:    number of bytes to copy
+ *
+ * Returns 0 on success, -1 if an allocation or the insert fails (nothing is
+ * leaked in that case).
+ */
+static int ramdisk_write_hash(struct hashtable* h, uint64_t sector, char* buf,
+                              size_t len)
+{
+    char* v;
+    uint64_t* key;
+
+    /* overwrite in place when the sector is already buffered */
+    if ((v = hashtable_search(h, &sector))) {
+        memcpy(v, buf, len);
+        return 0;
+    }
+
+    if (!(v = malloc(len))) {
+        DPRINTF("ramdisk_write_hash: malloc failed\n");
+        return -1;
+    }
+    memcpy(v, buf, len);
+    if (!(key = malloc(sizeof(*key)))) {
+        DPRINTF("ramdisk_write_hash: error allocating key\n");
+        free(v);
+        return -1;
+    }
+    *key = sector;
+    if (!hashtable_insert(h, key, v)) {
+        DPRINTF("ramdisk_write_hash failed on sector %" PRIu64 "\n", sector);
+        free(key);
+        free(v);
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Write nb_sectors consecutive sectors from buf into the ramdisk's current
+ * table, one ramdisk_write_hash() call per sector.
+ *
+ * Returns 0 on success, or the first non-zero result of
+ * ramdisk_write_hash(); sectors written before the failure stay in the
+ * table.
+ */
+static inline int ramdisk_write(struct ramdisk* ramdisk, uint64_t sector,
+ int nb_sectors, char* buf)
+{
+ int i, rc;
+
+ for (i = 0; i < nb_sectors; i++) {
+ rc = ramdisk_write_hash(ramdisk->h, sector + i,
+ buf + i * ramdisk->sector_size,
+ ramdisk->sector_size);
+ if (rc)
+ return rc;
+ }
+
+ return 0;
+}
+
+/*
+ * Write-completion callback taking a ramdisk_write_cbdata in 'private'.
+ * Frees the callback data and its buffer, decrements the in-flight counter
+ * and, if a previous ramdisk is still waiting to be drained, calls
+ * ramdisk_flush() again.
+ *
+ * res, sector, nb_sectors and id are only referenced by the disabled debug
+ * output.
+ *
+ * Returns the result of ramdisk_flush() when it is called, otherwise 0.
+ */
+static int ramdisk_write_cb(td_driver_t *driver, int res, uint64_t sector,
+ int nb_sectors, int id, void* private)
+{
+ struct ramdisk_write_cbdata *cbdata = (struct ramdisk_write_cbdata*)private;
+ struct tdremus_state *s = cbdata->state;
+ int rc;
+
+ /*
+ RPRINTF("ramdisk write callback: rc %d, %d sectors @ %" PRIu64 "\n", res, nb_sectors,
+ sector);
+ */
+
+ free(cbdata->buf);
+ free(cbdata);
+
+ s->ramdisk.inflight--;
+ if (!s->ramdisk.inflight && !s->ramdisk.prev) {
+ /* when this reaches 0 and prev is empty, the disk is flushed. */
+ /*
+ RPRINTF("ramdisk flush complete\n");
+ */
+ }
+
+ if (s->ramdisk.prev) {
+ /* resubmit as much as possible in the remaining disk */
+ /*
+ RPRINTF("calling ramdisk_flush from write callback\n");
+ */
+ return ramdisk_flush(driver, s);
+ }
+
+ return 0;
+}
+
+/*
+ * qsort() comparator for uint64_t elements, ascending order. Returns -1, 0
+ * or 1; an explicit comparison is used because the difference of two
+ * unsigned values cannot be negative.
+ */
+static int uint64_compare(const void* k1, const void* k2)
+{
+ uint64_t u1 = *(uint64_t*)k1;
+ uint64_t u2 = *(uint64_t*)k2;
+
+ /* u1 - u2 is unsigned */
+ return u1 < u2 ? -1 : u1 > u2 ? 1 : 0;
+}
+
+/*
+ * Collect the sector numbers (keys) stored in table h.
+ *
+ * h:        table to enumerate
+ * psectors: on a positive return, set to a malloc()ed array of the keys, in
+ *           table iteration order; the caller must free() it. Left
+ *           untouched when the table is empty.
+ *
+ * Returns the number of entries, 0 for an empty table, or -1 if an
+ * allocation fails (*psectors is NULL in that case).
+ */
+static int ramdisk_get_sectors(struct hashtable* h, uint64_t** psectors)
+{
+    struct hashtable_itr* itr;
+    uint64_t* sectors;
+    int count;
+
+    if (!(count = hashtable_count(h)))
+        return 0;
+
+    if (!(*psectors = malloc(count * sizeof(uint64_t)))) {
+        DPRINTF("ramdisk_get_sectors: error allocating sector map\n");
+        return -1;
+    }
+    sectors = *psectors;
+
+    /* the iterator is heap-allocated and may therefore be NULL */
+    if (!(itr = hashtable_iterator(h))) {
+        DPRINTF("ramdisk_get_sectors: error allocating iterator\n");
+        free(sectors);
+        *psectors = NULL;
+        return -1;
+    }
+
+    count = 0;
+    do {
+        sectors[count++] = *(uint64_t*)hashtable_iterator_key(itr);
+    } while (hashtable_iterator_advance(itr));
+    free(itr);
+
+    return count;
+}
+
+/*
+ * Gather 'count' consecutive sectors, starting at sector 'start', from
+ * ramdisk->prev into a single buffer obtained from valloc().
+ *
+ * Each per-sector buffer is freed as soon as it has been copied. The table
+ * entries themselves are NOT removed here, so on return they refer to freed
+ * memory and the caller has to remove them.
+ *
+ * Returns the merged buffer (owned by the caller), or NULL if the
+ * allocation fails or a sector is missing from the table. In the latter
+ * case the buffers of the sectors copied before the miss have already been
+ * freed.
+ */
+static char* merge_requests(struct ramdisk* ramdisk, uint64_t start,
+                            size_t count)
+{
+    char* buf;
+    char* sector;
+    size_t i;
+
+    if (!(buf = valloc(count * ramdisk->sector_size))) {
+        DPRINTF("merge_request: allocation failed\n");
+        return NULL;
+    }
+
+    for (i = 0; i < count; i++) {
+        if (!(sector = hashtable_search(ramdisk->prev, &start))) {
+            DPRINTF("merge_request: lookup failed on %"PRIu64"\n", start);
+            /* do not leak the partially filled buffer */
+            free(buf);
+            return NULL;
+        }
+
+        memcpy(buf + i * ramdisk->sector_size, sector, ramdisk->sector_size);
+        free(sector);
+
+        start++;
+    }
+
+    return buf;
+}
+
+/* The underlying driver may not handle having the whole ramdisk queued at
+ * once. We queue what we can and let the callbacks attempt to queue more. */
+/* NOTE: may be called from callback, while dd->private still belongs to
+ * the underlying driver */
+/*
+ * Drain s->ramdisk.prev: the buffered sector numbers are sorted, runs of
+ * consecutive sectors are merged into one buffer each, and every run is
+ * submitted as a single write request. Entries are removed from the table
+ * as they are submitted, and the table is destroyed once it is empty.
+ *
+ * driver: unused here
+ * s:      driver state owning the ramdisk
+ *
+ * Returns 0 on success (or when there is nothing to flush), -1 if a sector
+ * list, a merged buffer or a write request could not be created. When a
+ * write request cannot be created the data of that run is lost.
+ */
+static int ramdisk_flush(td_driver_t *driver, struct tdremus_state* s)
+{
+    uint64_t* sectors;
+    char* buf;
+    uint64_t base, batchlen, j, key;
+    int i, count = 0;
+
+    // RPRINTF("ramdisk flush\n");
+
+    if ((count = ramdisk_get_sectors(s->ramdisk.prev, &sectors)) <= 0)
+        return count;
+
+    /*
+    RPRINTF("ramdisk: flushing %d sectors\n", count);
+    */
+
+    /* sort and merge sectors to improve disk performance */
+    qsort(sectors, count, sizeof(*sectors), uint64_compare);
+
+    for (i = 0; i < count;) {
+        /* find the end of the run of consecutive sectors starting at i */
+        base = sectors[i++];
+        while (i < count && sectors[i] == sectors[i-1] + 1)
+            i++;
+        batchlen = sectors[i-1] - base + 1;
+
+        if (!(buf = merge_requests(&s->ramdisk, base, batchlen))) {
+            RPRINTF("ramdisk_flush: merge_requests failed\n");
+            free(sectors);
+            return -1;
+        }
+
+        /* merge_requests() has freed the per-sector buffers, so the table
+         * entries of this run must go before anything can look them up */
+        for (j = 0; j < batchlen; j++) {
+            key = base + j;
+            hashtable_remove(s->ramdisk.prev, &key);
+        }
+
+        /* Count the request before forwarding it so that the counter is
+         * already correct should the completion callback run before
+         * create_write_request() returns. */
+        s->ramdisk.inflight++;
+
+        /* NOTE: create_write_request() creates a treq AND forwards it down
+         * the driver chain */
+        if (create_write_request(s, base, batchlen, buf)) {
+            RPRINTF("ramdisk_flush: create_write_request failed\n");
+            s->ramdisk.inflight--;
+            free(buf);
+            free(sectors);
+            return -1;
+        }
+    }
+
+    if (!hashtable_count(s->ramdisk.prev)) {
+        /* everything is in flight */
+        hashtable_destroy(s->ramdisk.prev, 0);
+        s->ramdisk.prev = NULL;
+    }
+
+    free(sectors);
+
+    // RPRINTF("ramdisk flush done\n");
+    return 0;
+}
+
+/* flush ramdisk contents to disk */
+/*
+ * Move the current ramdisk table into the draining position and start
+ * writing it out with ramdisk_flush(), replacing it with a fresh empty table
+ * so that new writes can be accepted while the old contents are drained.
+ *
+ * If a previous flush is still draining, the current contents are merged
+ * into it instead (newer data replaces older data for the same sector).
+ *
+ * Returns 0 if there is nothing to flush, -1 if an allocation fails (the
+ * current table is left in place in that case), otherwise the result of
+ * ramdisk_flush().
+ */
+static int ramdisk_start_flush(td_driver_t *driver)
+{
+    struct tdremus_state *s = (struct tdremus_state *)driver->data;
+    struct hashtable *fresh;
+    uint64_t* sectors = NULL;
+    char* buf;
+    int i, count;
+
+    if (!hashtable_count(s->ramdisk.h)) {
+        /*
+        RPRINTF("Nothing to flush\n");
+        */
+        return 0;
+    }
+
+    /* We create a new hashtable so that new writes can be performed before
+     * the old hashtable is completely drained. It is allocated up front so
+     * that a failure leaves the ramdisk untouched. */
+    fresh = create_hashtable(RAMDISK_HASHSIZE, uint64_hash, rd_hash_equal);
+    if (!fresh) {
+        RPRINTF("ramdisk_start_flush: error allocating hashtable\n");
+        return -1;
+    }
+
+    if (s->ramdisk.prev) {
+        /* a flush request issued while a previous flush is still in progress
+         * will merge with the previous request. If you want the previous
+         * request to be consistent, wait for it to complete. */
+        if ((count = ramdisk_get_sectors(s->ramdisk.h, &sectors)) < 0) {
+            hashtable_destroy(fresh, 0);
+            return count;
+        }
+
+        for (i = 0; i < count; i++) {
+            buf = hashtable_search(s->ramdisk.h, sectors + i);
+            if (ramdisk_write_hash(s->ramdisk.prev, sectors[i], buf,
+                                   s->ramdisk.sector_size)) {
+                free(sectors);
+                hashtable_destroy(fresh, 0);
+                return -1;
+            }
+        }
+        free(sectors);
+
+        /* ramdisk_write_hash() copied the data into prev, so the buffers
+         * still held by the old table must be freed along with it */
+        hashtable_destroy(s->ramdisk.h, 1);
+    } else
+        s->ramdisk.prev = s->ramdisk.h;
+
+    s->ramdisk.h = fresh;
+
+    return ramdisk_flush(driver, s);
+}
+
+
+/*
+ * Allocate the active ramdisk table and record the sector size reported
+ * by the driver.  Does nothing if the table already exists.
+ *
+ * Returns 0 on success (or if already allocated) and -1 if the hashtable
+ * cannot be created.
+ */
+static int ramdisk_start(td_driver_t *driver)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+  if (s->ramdisk.h) {
+    RPRINTF("ramdisk already allocated\n");
+    return 0;
+  }
+
+  s->ramdisk.sector_size = driver->info.sector_size;
+  s->ramdisk.h = create_hashtable(RAMDISK_HASHSIZE, uint64_hash,
+                                  rd_hash_equal);
+  if (!s->ramdisk.h) {
+    /* create_hashtable() returns NULL when out of memory */
+    RPRINTF("ramdisk_start: unable to allocate hashtable\n");
+    return -1;
+  }
+
+  DPRINTF("Ramdisk started, %zu bytes/sector\n", s->ramdisk.sector_size);
+
+  return 0;
+}
+
+/* common client/server functions */
+/*
+ * mread ("maybe read"): read exactly len bytes from fd into buf, timing
+ * out after a certain interval.
+ *
+ * A read is attempted first; select() is only used when the read came up
+ * short or would block.  Interrupted calls (EINTR) are retried.  The same
+ * timeval, initialised to NET_TIMEOUT * 1000 microseconds, is passed to
+ * every select() call.
+ *
+ * Returns 0 once len bytes have been read (or len is 0), -1 on
+ * end-of-file, timeout or error.
+ */
+static int mread(int fd, void* buf, size_t len)
+{
+  char *dst = buf;   /* byte pointer: arithmetic on void* is not standard C */
+  fd_set rfds;
+  int rc;
+  size_t cur = 0;
+  struct timeval tv = {
+    .tv_sec = 0,
+    .tv_usec = NET_TIMEOUT * 1000
+  };
+
+  if (!len)
+    return 0;
+
+  for (;;) {
+    /* read first. Only select if read is incomplete. */
+    rc = read(fd, dst + cur, len - cur);
+    if (rc > 0) {
+      cur += rc;
+      if (cur >= len)
+        return 0;
+    } else if (!rc) {
+      RPRINTF("end-of-file");
+      return -1;
+    } else if (errno != EAGAIN && errno != EINTR) {
+      RPRINTF("error during read: %s\n", strerror(errno));
+      return -1;
+    }
+
+    FD_ZERO(&rfds);
+    FD_SET(fd, &rfds);
+    if (!(rc = select(fd + 1, &rfds, NULL, NULL, &tv))) {
+      RPRINTF("time out during read\n");
+      return -1;
+    } else if (rc < 0 && errno != EINTR) {
+      RPRINTF("error during select: %d\n", errno);
+      return -1;
+    }
+  }
+}
+
+/*
+ * mwrite: write exactly len bytes from buf to fd, timing out after a
+ * certain interval.
+ *
+ * A write is attempted first; select() is only used when the write came
+ * up short or would block.  Interrupted calls (EINTR) are retried.  The
+ * same timeval, initialised to NET_TIMEOUT * 1000 microseconds, is passed
+ * to every select() call.
+ *
+ * Returns 0 once len bytes have been written (or len is 0), -1 on
+ * timeout, error, or a write() that returns 0.
+ */
+static int mwrite(int fd, void* buf, size_t len)
+{
+  const char *src = buf; /* byte pointer: arithmetic on void* is not standard C */
+  fd_set wfds;
+  size_t cur = 0;
+  int rc;
+  struct timeval tv = {
+    .tv_sec = 0,
+    .tv_usec = NET_TIMEOUT * 1000
+  };
+
+  if (!len)
+    return 0;
+
+  for (;;) {
+    /* write first. Only select if write is incomplete. */
+    rc = write(fd, src + cur, len - cur);
+    if (rc > 0) {
+      cur += rc;
+      if (cur >= len)
+        return 0;
+    } else if (!rc) {
+      RPRINTF("end-of-file");
+      return -1;
+    } else if (errno != EAGAIN && errno != EINTR) {
+      RPRINTF("error during write: %s\n", strerror(errno));
+      return -1;
+    }
+
+    FD_ZERO(&wfds);
+    FD_SET(fd, &wfds);
+    if (!(rc = select(fd + 1, NULL, &wfds, NULL, &tv))) {
+      RPRINTF("time out during write\n");
+      return -1;
+    } else if (rc < 0 && errno != EINTR) {
+      RPRINTF("error during select: %d\n", errno);
+      return -1;
+    }
+  }
+}
+
+
+/*
+ * Tear down the replication stream: drop its tapdisk event, close the
+ * socket and mark the descriptor with the -2 sentinel.
+ */
+static inline void close_stream_fd(struct tdremus_state *s)
+{
+  tapdisk_server_unregister_event(s->stream_fd.id);
+  close(s->stream_fd.fd);
+
+  /* XXX: -2 is magic. replace with macro perhaps? */
+  s->stream_fd.fd = -2;
+}
+
+/* primary functions */
+/* Forward declarations of the tapdisk event callbacks used by the primary;
+ * they are referenced by the connect helpers below before being defined. */
+static void remus_client_event(event_id_t, char mode, void *private);
+static void remus_connect_event(event_id_t id, char mode, void *private);
+static void remus_retry_connect_event(event_id_t id, char mode, void *private);
+
+/*
+ * Begin a non-blocking connection to the backup at state->sa.
+ *
+ * A non-blocking socket is created and a zero-length timeout event is
+ * registered on it; the connect() itself is issued from
+ * remus_retry_connect_event() when that timeout fires.
+ *
+ * On success the socket and event id are stored in state->stream_fd and 0
+ * is returned.  On failure the socket is closed and -1 is returned.
+ */
+static int primary_do_connect(struct tdremus_state *state)
+{
+  event_id_t id;
+  int fd;
+  int flags;
+
+  RPRINTF("client connecting to %s:%d...\n", inet_ntoa(state->sa.sin_addr), ntohs(state->sa.sin_port));
+
+  if ((fd = socket(PF_INET, SOCK_STREAM, 0)) < 0) {
+    RPRINTF("could not create client socket: %d\n", errno);
+    return -1;
+  }
+
+  /* make socket nonblocking */
+  if ((flags = fcntl(fd, F_GETFL, 0)) == -1)
+    flags = 0;
+  if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) {
+    close(fd);
+    return -1;
+  }
+
+  /* once we have created the socket and populated the address, we can now start
+   * our non-blocking connect. rather than duplicating code we trigger a timeout
+   * on the socket fd, which calls out nonblocking connect code
+   */
+  if((id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, fd, 0, remus_retry_connect_event, state)) < 0) {
+    RPRINTF("error registering timeout client connection event handler: %s\n", strerror(id));
+    close(fd);
+    return -1;
+  }
+
+  state->stream_fd.fd = fd;
+  state->stream_fd.id = id;
+  return 0;
+}
+
+/*
+ * Connect to the backup at state->sa, blocking until the connection is
+ * established.
+ *
+ * A refused connection is retried once per second, indefinitely; any
+ * other connect() error is fatal.  After connecting, the socket is made
+ * non-blocking and remus_client_event() is registered as its read
+ * handler.
+ *
+ * On success the socket and event id are stored in state->stream_fd and 0
+ * is returned.  On failure the socket is closed and -1 is returned.
+ */
+static int primary_blocking_connect(struct tdremus_state *state)
+{
+  int fd;
+  int id;
+  int rc;
+  int flags;
+
+  RPRINTF("client connecting to %s:%d...\n", inet_ntoa(state->sa.sin_addr), ntohs(state->sa.sin_port));
+
+  if ((fd = socket(PF_INET, SOCK_STREAM, 0)) < 0) {
+    RPRINTF("could not create client socket: %d\n", errno);
+    return -1;
+  }
+
+  do {
+    /* connect() takes a generic sockaddr pointer; state->sa is a sockaddr_in */
+    if ((rc = connect(fd, (struct sockaddr *)&state->sa, sizeof(state->sa))) < 0) {
+      if (errno == ECONNREFUSED) {
+        RPRINTF("connection refused -- retrying in 1 second\n");
+        sleep(1);
+      } else {
+        RPRINTF("connection failed: %d\n", errno);
+        close(fd);
+        return -1;
+      }
+    }
+  } while (rc < 0);
+
+  RPRINTF("client connected\n");
+
+  /* make socket nonblocking */
+  if ((flags = fcntl(fd, F_GETFL, 0)) == -1)
+    flags = 0;
+  if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1)
+  {
+    RPRINTF("error making socket nonblocking\n");
+    close(fd);
+    return -1;
+  }
+
+  if((id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, fd, 0, remus_client_event, state)) < 0) {
+    RPRINTF("error registering client event handler: %s\n", strerror(id));
+    close(fd);
+    return -1;
+  }
+
+  state->stream_fd.fd = fd;
+  state->stream_fd.id = id;
+  return 0;
+}
+
+/*
+ * Read handler for primary mode: reads are not replicated, so the request
+ * is handed straight to td_forward_request().
+ */
+static void primary_queue_read(td_driver_t *driver, td_request_t treq)
+{
+  (void)driver; /* no per-driver state is needed for a pass-through */
+
+  td_forward_request(treq);
+}
+
+/*
+ * Write handler for primary mode: replicate the write to the backup, then
+ * forward it down the local driver chain.
+ *
+ * The message sent on the replication stream is the TDREMUS_WRITE tag, a
+ * header made of a uint32_t sector count followed by a uint64_t starting
+ * sector (both in host byte order), and then the sector data.
+ *
+ * If the connection cannot be established or any part of the message
+ * cannot be sent, the driver switches to unprotected mode and the request
+ * is completed with -EBUSY so that tapdisk retries it.
+ *
+ * TODO:
+ * The primary uses mwrite() to write the contents of a write request to the
+ * backup. This effectively blocks until all data has been copied into a system
+ * buffer or a timeout has occured. We may wish to instead use tapdisk's
+ * nonblocking i/o interface, tapdisk_server_register_event(), to set timeouts
+ * and write data in an asynchronous fashion.
+ */
+static void primary_queue_write(td_driver_t *driver, td_request_t treq)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+  char header[sizeof(uint32_t) + sizeof(uint64_t)];
+  uint32_t sectors = treq.secs;
+  uint64_t sector = treq.sec;
+
+  /* -1 means we haven't connected yet, -2 means the connection was lost */
+  if(s->stream_fd.fd == -1) {
+    RPRINTF("connecting to backup...\n");
+    if (primary_blocking_connect(s) < 0)
+      goto fail;
+  }
+
+  /* pack with memcpy: the uint64_t field sits at offset 4 of a char
+   * array, so storing through a casted pointer would be misaligned */
+  memcpy(header, &sectors, sizeof(sectors));
+  memcpy(header + sizeof(sectors), &sector, sizeof(sector));
+
+  if (mwrite(s->stream_fd.fd, TDREMUS_WRITE, strlen(TDREMUS_WRITE)) < 0)
+    goto fail;
+  if (mwrite(s->stream_fd.fd, header, sizeof(header)) < 0)
+    goto fail;
+
+  if (mwrite(s->stream_fd.fd, treq.buf, treq.secs * driver->info.sector_size) < 0)
+    goto fail;
+
+  td_forward_request(treq);
+
+  return;
+
+ fail:
+  /* switch to unprotected mode and tell tapdisk to retry */
+  RPRINTF("write request replication failed, switching to unprotected mode");
+  switch_mode(s->tdremus_driver, mode_unprotected);
+  td_complete_request(treq, -EBUSY);
+}
+
+
+/*
+ * Flush hook for primary mode: send TDREMUS_COMMIT to the backup.
+ *
+ * Returns 0 if the message was sent or if no connection has been made yet
+ * (stream fd still -1).  If sending fails the stream is closed and -1 is
+ * returned.
+ */
+static int client_flush(td_driver_t *driver)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+  int sent;
+
+  /* connection not yet established, nothing to flush */
+  if (s->stream_fd.fd == -1)
+    return 0;
+
+  sent = mwrite(s->stream_fd.fd, TDREMUS_COMMIT, strlen(TDREMUS_COMMIT));
+  if (sent >= 0)
+    return 0;
+
+  RPRINTF("error flushing output");
+  close_stream_fd(s);
+  return -1;
+}
+
+/*
+ * Enter primary (client) mode: mark the replication stream as not yet
+ * connected and install the primary read/write/flush handlers.
+ * Always returns 0.
+ */
+static int primary_start(td_driver_t *driver)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+  RPRINTF("activating client mode\n");
+
+  /* -1: no connection attempted yet */
+  s->stream_fd.id = -1;
+  s->stream_fd.fd = -1;
+
+  s->queue_flush = client_flush;
+  tapdisk_remus.td_queue_write = primary_queue_write;
+  tapdisk_remus.td_queue_read = primary_queue_read;
+
+  return 0;
+}
+
+/*
+ * Timeout callback: issue a non-blocking connect() on s->stream_fd.fd.
+ *
+ *  - connect() succeeds or reports EINPROGRESS: replace this event with a
+ *    write-readiness event handled by remus_connect_event().
+ *  - ECONNREFUSED, ENETUNREACH, EAGAIN or ECONNABORTED: re-arm this
+ *    callback after REMUS_CONNRETRY_TIMEOUT.
+ *  - any other error: log it and give up.
+ */
+static void remus_retry_connect_event(event_id_t id, char mode, void *private)
+{
+  struct tdremus_state *s = (struct tdremus_state *)private;
+
+  /* do a non-blocking connect; connect() takes a generic sockaddr pointer */
+  if (connect(s->stream_fd.fd, (struct sockaddr *)&s->sa, sizeof(s->sa)) && errno != EINPROGRESS) {
+    if(errno == ECONNREFUSED || errno == ENETUNREACH || errno == EAGAIN || errno == ECONNABORTED)
+    {
+      /* try again in a second */
+      tapdisk_server_unregister_event(s->stream_fd.id);
+      if((id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, s->stream_fd.fd, REMUS_CONNRETRY_TIMEOUT, remus_retry_connect_event, s)) < 0) {
+        RPRINTF("error registering timeout client connection event handler: %s\n", strerror(id));
+        return;
+      }
+      s->stream_fd.id = id;
+    }
+    else
+    {
+      /* not recoverable */
+      RPRINTF("error connection to server %s\n", strerror(errno));
+      return;
+    }
+  }
+  else
+  {
+    /* the connect returned EINPROGRESS (nonblocking connect) we must wait for the fd to be writeable to determine if the connect worked */
+
+    tapdisk_server_unregister_event(s->stream_fd.id);
+    if((id = tapdisk_server_register_event(SCHEDULER_POLL_WRITE_FD, s->stream_fd.fd, 0, remus_connect_event, s)) < 0) {
+      RPRINTF("error registering client connection event handler: %s\n", strerror(id));
+      return;
+    }
+    s->stream_fd.id = id;
+  }
+}
+
+/*
+ * Callback run when the non-blocking connect() has finished (the socket
+ * became writable).  Called only by the primary in unprotected state.
+ *
+ * The outcome is read from SO_ERROR:
+ *  - 0: swap this handler for remus_client_event() and switch the driver
+ *    to primary mode;
+ *  - ECONNREFUSED, ENETUNREACH, ETIMEDOUT, ECONNABORTED or EAGAIN: re-arm
+ *    remus_retry_connect_event() after REMUS_CONNRETRY_TIMEOUT;
+ *  - anything else: log and give up.
+ */
+static void remus_connect_event(event_id_t id, char mode, void *private)
+{
+  struct tdremus_state *s = (struct tdremus_state *)private;
+  int so_err;
+  socklen_t so_err_len = sizeof(so_err);
+
+  /* check to see if the connect succeeded */
+  if (getsockopt(s->stream_fd.fd, SOL_SOCKET, SO_ERROR, &so_err, &so_err_len)) {
+    RPRINTF("error getting socket errno\n");
+    return;
+  }
+
+  RPRINTF("socket connect returned %d\n", so_err);
+
+  switch (so_err) {
+  case 0:
+    /* connected: unregister this function and register a new event handler */
+    tapdisk_server_unregister_event(s->stream_fd.id);
+    id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, s->stream_fd.fd, 0, remus_client_event, s);
+    if (id < 0) {
+      RPRINTF("error registering client event handler: %s\n", strerror(id));
+      return;
+    }
+    s->stream_fd.id = id;
+
+    /* switch from unprotected to protected client */
+    switch_mode(s->tdremus_driver, mode_primary);
+    break;
+
+  case ECONNREFUSED:
+  case ENETUNREACH:
+  case ETIMEDOUT:
+  case ECONNABORTED:
+  case EAGAIN:
+    /* we can probably assume that the backup is down. just try again later */
+    tapdisk_server_unregister_event(s->stream_fd.id);
+    id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, s->stream_fd.fd, REMUS_CONNRETRY_TIMEOUT, remus_retry_connect_event, s);
+    if (id < 0) {
+      RPRINTF("error registering timeout client connection event handler: %s\n", strerror(id));
+      return;
+    }
+    s->stream_fd.id = id;
+    break;
+
+  default:
+    RPRINTF("socket connect returned %d, giving up\n", so_err);
+    break;
+  }
+}
+
+
+/*
+ * Read handler installed on the primary once it is connected to the
+ * backup.  It expects a four-byte message: TDREMUS_DONE is relayed through
+ * ctl_respond(); anything else, or a failed read, closes the stream.
+ */
+static void remus_client_event(event_id_t id, char mode, void *private)
+{
+  struct tdremus_state *s = (struct tdremus_state *)private;
+  char req[5];
+
+  if (mread(s->stream_fd.fd, req, sizeof(req) - 1) < 0) {
+    /* replication stream closed or otherwise broken (timeout, reset, &c) */
+    RPRINTF("error reading from backup\n");
+    close_stream_fd(s);
+    return;
+  }
+  req[sizeof(req) - 1] = '\0';
+
+  if (strcmp(req, TDREMUS_DONE) != 0) {
+    RPRINTF("received unknown message: %s\n", req);
+    close_stream_fd(s);
+    return;
+  }
+
+  /* checkpoint committed, inform msg_fd */
+  ctl_respond(s, TDREMUS_DONE);
+}
+
+/* backup functions */
+/* Forward declaration: remus_server_accept() registers this callback
+ * before its definition appears. */
+static void remus_server_event(event_id_t id, char mode, void *private);
+
+/*
+ * Read handler for the listening socket on the backup: accept one
+ * connection, register remus_server_event() as its read handler, and
+ * store the new socket and event id in s->stream_fd.
+ */
+static void remus_server_accept(event_id_t id, char mode, void* private)
+{
+  struct tdremus_state* s = (struct tdremus_state *) private;
+  event_id_t cid;
+  int conn;
+
+  /* XXX: add address-based black/white list */
+  conn = accept(s->server_fd.fd, NULL, NULL);
+  if (conn < 0) {
+    RPRINTF("error accepting connection: %d\n", errno);
+    return;
+  }
+
+  /* TODO: check to see if we are already replicating. if so just close the
+   * connection (or do something smarter) */
+  RPRINTF("server accepted connection\n");
+
+  /* add tapdisk event for replication stream */
+  cid = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, conn, 0,
+                                      remus_server_event, s);
+  if (cid < 0) {
+    RPRINTF("error registering connection event handler: %s\n", strerror(errno));
+    close(conn);
+    return;
+  }
+
+  /* store replication file descriptor */
+  s->stream_fd.id = cid;
+  s->stream_fd.fd = conn;
+}
+
+/*
+ * Create the backup's listening socket, bind it to s->sa, listen on it
+ * and register remus_server_accept() as its read handler.
+ *
+ * Returns 0 on success.  Returns -2 when bind() fails with any error
+ * other than EADDRINUSE, and -1 for every other failure.  On failure the
+ * socket is closed and s->server_fd.fd is reset to -1.
+ *
+ * NOTE(review): the original comment said "-2 if EADDRNOTAVAIL", which is
+ * narrower than what the code tests (errno != EADDRINUSE) -- confirm which
+ * one is intended.
+ */
+static int remus_bind(struct tdremus_state* s)
+{
+  int opt;
+  int rc = -1;
+
+  if ((s->server_fd.fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+    RPRINTF("could not create server socket: %d\n", errno);
+    return rc;
+  }
+  opt = 1;
+  if (setsockopt(s->server_fd.fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0)
+    RPRINTF("Error setting REUSEADDR on %d: %d\n", s->server_fd.fd, errno);
+
+  /* bind() takes a generic sockaddr pointer; s->sa is a sockaddr_in */
+  if (bind(s->server_fd.fd, (struct sockaddr *)&s->sa, sizeof(s->sa)) < 0) {
+    /* capture errno before logging, which may overwrite it */
+    int bind_errno = errno;
+
+    RPRINTF("could not bind server socket %d to %s:%d: %d %s\n", s->server_fd.fd,
+            inet_ntoa(s->sa.sin_addr), ntohs(s->sa.sin_port), bind_errno, strerror(bind_errno));
+    if (bind_errno != EADDRINUSE)
+      rc = -2;
+    goto err_sfd;
+  }
+  if (listen(s->server_fd.fd, 10)) {
+    RPRINTF("could not listen on socket: %d\n", errno);
+    goto err_sfd;
+  }
+
+  /* The socket is now bound to the address and listening so we may now register
+   * the fd with tapdisk */
+
+  if((s->server_fd.id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                                      s->server_fd.fd, 0,
+                                                      remus_server_accept, s)) < 0) {
+    RPRINTF("error registering server connection event handler: %s",
+            strerror(s->server_fd.id));
+    goto err_sfd;
+  }
+
+  return 0;
+
+ err_sfd:
+  close(s->server_fd.fd);
+  s->server_fd.fd = -1;
+
+  return rc;
+}
+
+/*
+ * Report whether the latest checkpoint is still being applied: returns 1
+ * while write requests are in flight or a draining table
+ * (s->ramdisk.prev) exists, 0 otherwise.
+ */
+static inline int server_writes_inflight(td_driver_t *driver)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+  return (s->ramdisk.inflight || s->ramdisk.prev) ? 1 : 0;
+}
+
+/*
+ * Read handler for backup mode.
+ *
+ * Due to block device prefetching this code may be called on the server
+ * side during normal replication.  The first request seen also records
+ * its image in remus_image.  The request is currently forwarded; the
+ * disabled branch would instead complete it with -EBUSY so that the
+ * domain is never started with stale data.
+ */
+void backup_queue_read(td_driver_t *driver, td_request_t treq)
+{
+  (void)driver;
+
+  /* remember the image of the first request that comes through */
+  if (!remus_image)
+    remus_image = treq.image;
+
+#if 0
+  /* due to prefetching, we must return EBUSY on server reads. This
+   * maintains a consistent disk image */
+  td_complete_request(treq, -EBUSY);
+#else
+  /* what exactly is the race that requires the response above? */
+  td_forward_request(treq);
+#endif
+}
+
+/*
+ * Write handler for backup mode.
+ *
+ * On a server write, we know the domain has failed over, so the driver
+ * switches to unprotected mode.  The triggering request itself is
+ * completed with -EBUSY.
+ */
+void backup_queue_write(td_driver_t *driver, td_request_t treq)
+{
+  switch_mode(driver, mode_unprotected);
+
+  /* TODO: call the appropriate write function rather than return EBUSY */
+  td_complete_request(treq, -EBUSY);
+}
+
+/*
+ * Enter backup mode: start the ramdisk and install the backup read/write
+ * handlers.  Returns 0 on success, -1 if the ramdisk cannot be started.
+ */
+static int backup_start(td_driver_t *driver)
+{
+  int started = ramdisk_start(driver);
+
+  if (started < 0)
+    return -1;
+
+  /* TODO set flush function */
+  tapdisk_remus.td_queue_write = backup_queue_write;
+  tapdisk_remus.td_queue_read = backup_queue_read;
+
+  return 0;
+}
+
+/*
+ * Handle a write request on the backup: read the header (uint32_t sector
+ * count followed by uint64_t starting sector) and the sector data from
+ * the replication stream, then buffer the data with ramdisk_write().
+ *
+ * Returns 0 on success.  Returns -1 if the request does not fit the local
+ * 4096-byte buffer, or if reading or buffering fails; in the latter case
+ * the replication stream is also closed.
+ */
+static int server_do_wreq(td_driver_t *driver)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+  char buf[4096];
+  char header[sizeof(uint32_t) + sizeof(uint64_t)];
+  uint32_t sectors;
+  uint64_t sector;
+  size_t sector_size = driver->info.sector_size;
+
+  if (mread(s->stream_fd.fd, header, sizeof(header)) < 0)
+    goto err;
+
+  /* unpack with memcpy: the uint64_t field is at offset 4 of a char array */
+  memcpy(&sectors, header, sizeof(sectors));
+  memcpy(&sector, header + sizeof(sectors), sizeof(sector));
+
+  /* The sector count comes off the wire.  Bound it by division so that a
+   * huge count cannot wrap the byte length and slip past the check. */
+  if (!sector_size || sectors > sizeof(buf) / sector_size) {
+    /* freak out! */
+    RPRINTF("write request too large: %u sectors/%u byte buffer\n",
+            (unsigned)sectors, (unsigned)sizeof(buf));
+    return -1;
+  }
+
+  if (mread(s->stream_fd.fd, buf, sectors * sector_size) < 0)
+    goto err;
+
+  if (ramdisk_write(&s->ramdisk, sector, sectors, buf) < 0)
+    goto err;
+
+  return 0;
+
+ err:
+  /* should start failover */
+  RPRINTF("backup write request error\n");
+  close_stream_fd(s);
+
+  return -1;
+}
+
+/*
+ * Handle a submit request on the backup.  Nothing needs to be done for
+ * it at present; always returns 0.
+ */
+static int server_do_sreq(td_driver_t *driver)
+{
+  (void)driver;
+
+  return 0;
+}
+
+/*
+ * Handle a commit request on the backup: at this point, the server can
+ * start applying the most recent ramdisk.  The flush is started and
+ * TDREMUS_DONE is written back on the replication stream.
+ *
+ * A failure to start the flush is logged; the acknowledgement is still
+ * sent, as before.  Returns 0 if the whole acknowledgement was written,
+ * -1 otherwise.
+ */
+static int server_do_creq(td_driver_t *driver)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+  const size_t done_len = strlen(TDREMUS_DONE);
+
+  if (ramdisk_start_flush(driver) < 0)
+    RPRINTF("error starting ramdisk flush\n");
+
+  /* XXX this message should not be sent until flush completes! */
+  if (write(s->stream_fd.fd, TDREMUS_DONE, done_len) != (ssize_t)done_len)
+    return -1;
+
+  return 0;
+}
+
+
+/*
+ * Read handler for the replication stream on the backup.
+ *
+ * A four-byte request tag is read and dispatched: TDREMUS_WRITE,
+ * TDREMUS_SUBMIT and TDREMUS_COMMIT go to their server_do_*() handlers;
+ * anything else is logged.  If the tag cannot be read, the driver
+ * switches to unprotected mode.
+ */
+static void remus_server_event(event_id_t id, char mode, void *private)
+{
+  struct tdremus_state *s = (struct tdremus_state *)private;
+  td_driver_t *driver = s->tdremus_driver;
+  char req[5];
+
+  /* TODO: add a get_connection_by_event_id() function.
+   * for now we can assume that the fd is s->stream_fd */
+  if (mread(s->stream_fd.fd, req, sizeof(req) - 1) < 0) {
+    RPRINTF("error reading server event, activating backup\n");
+    switch_mode(driver, mode_unprotected);
+    return;
+  }
+  req[sizeof(req) - 1] = '\0';
+
+  if (strcmp(req, TDREMUS_WRITE) == 0) {
+    server_do_wreq(driver);
+    return;
+  }
+  if (strcmp(req, TDREMUS_SUBMIT) == 0) {
+    server_do_sreq(driver);
+    return;
+  }
+  if (strcmp(req, TDREMUS_COMMIT) == 0) {
+    server_do_creq(driver);
+    return;
+  }
+
+  RPRINTF("unknown request received: %s\n", req);
+}
+
+/* unprotected */
+
+/*
+ * Read handler for unprotected mode: reads are passed through, except
+ * while a previous ramdisk flush is still in progress, when the request
+ * is completed with -EBUSY.
+ */
+void unprotected_queue_read(td_driver_t *driver, td_request_t treq)
+{
+  if (!server_writes_inflight(driver)) {
+    /* here we just pass reads through */
+    td_forward_request(treq);
+    return;
+  }
+
+  /* wait for previous ramdisk to flush before servicing reads.
+   * for now lets just return EBUSY. if this becomes an issue we can
+   * do something smarter */
+  td_complete_request(treq, -EBUSY);
+}
+
+/*
+ * Write handler for unprotected mode: writes are passed through, except
+ * while a previous ramdisk flush is still in progress, when the request
+ * is completed with -EBUSY.
+ *
+ * For a recoverable remus solution we need to log unprotected writes here
+ */
+void unprotected_queue_write(td_driver_t *driver, td_request_t treq)
+{
+  if (!server_writes_inflight(driver)) {
+    td_forward_request(treq);
+    return;
+  }
+
+  /* wait for previous ramdisk to flush */
+  RPRINTF("queue_write: waiting for queue to drain");
+  td_complete_request(treq, -EBUSY);
+}
+
+/*
+ * Enter unprotected (pass-through) mode: tear down the replication stream
+ * and the listening socket, then install the unprotected read/write
+ * handlers.  Always returns 0.
+ */
+static int unprotected_start(td_driver_t *driver)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+  RPRINTF("failure detected, activating passthrough\n");
+
+  /* close the replication stream */
+  close_stream_fd(s);
+
+  /* unregister and close the server (listening) socket */
+  tapdisk_server_unregister_event(s->server_fd.id);
+  close(s->server_fd.fd);
+  s->server_fd.fd = -1;
+
+  /* install the unprotected read/write handlers */
+  tapdisk_remus.td_queue_write = unprotected_queue_write;
+  tapdisk_remus.td_queue_read = unprotected_queue_read;
+
+  return 0;
+}
+
+
+/* control */
+
+/*
+ * Resolve addr with gethostbyname() and store the first IPv4 address, in
+ * network byte order, in ia->s_addr.
+ *
+ * Returns 0 on success, -1 if the name cannot be resolved or does not
+ * yield an IPv4 address.
+ */
+static inline int resolve_address(const char* addr, struct in_addr* ia)
+{
+  struct hostent* he;
+
+  if (!(he = gethostbyname(addr))) {
+    RPRINTF("error resolving %s: %d\n", addr, h_errno);
+    return -1;
+  }
+
+  if (!he->h_addr_list[0]) {
+    RPRINTF("no address found for %s\n", addr);
+    return -1;
+  }
+
+  /* only a 4-byte AF_INET address can be stored in an in_addr */
+  if (he->h_addrtype != AF_INET || he->h_length != (int)sizeof(ia->s_addr)) {
+    RPRINTF("no address found for %s\n", addr);
+    return -1;
+  }
+
+  /* network byte order; memcpy avoids reading the char buffer through a
+   * uint32_t pointer */
+  memcpy(&ia->s_addr, he->h_addr_list[0], sizeof(ia->s_addr));
+
+  return 0;
+}
+
+/*
+ * Parse name, which must have the form "host:port", resolve it with
+ * getaddrinfo() and store the first AF_INET result in state->sa.
+ *
+ * Returns 0 on success, -ENOENT if the ':' separator is missing or no
+ * IPv4 address can be resolved, and -ENOMEM if the host string cannot be
+ * allocated.
+ */
+static int get_args(td_driver_t *driver, const char* name)
+{
+  struct tdremus_state *state = (struct tdremus_state *)driver->data;
+  char* host;
+  char* port;
+
+  int gai_status;
+  int valid_addr;
+  struct addrinfo gai_hints;
+  struct addrinfo *servinfo, *servinfo_itr;
+
+  memset(&gai_hints, 0, sizeof gai_hints);
+  gai_hints.ai_family = AF_UNSPEC;
+  gai_hints.ai_socktype = SOCK_STREAM;
+
+  port = strchr(name, ':');
+  if (!port) {
+    RPRINTF("missing host in %s\n", name);
+    return -ENOENT;
+  }
+  if (!(host = strndup(name, port - name))) {
+    RPRINTF("unable to allocate host\n");
+    return -ENOMEM;
+  }
+  port++;
+
+  gai_status = getaddrinfo(host, port, &gai_hints, &servinfo);
+  /* the copy of the host name is only needed for the lookup */
+  free(host);
+  if (gai_status != 0) {
+    RPRINTF("getaddrinfo error: %s\n", gai_strerror(gai_status));
+    return -ENOENT;
+  }
+
+  /* TODO: do something smarter here */
+  valid_addr = 0;
+  for(servinfo_itr = servinfo; servinfo_itr != NULL; servinfo_itr = servinfo_itr->ai_next) {
+    /* take the first IPv4 result */
+    if (servinfo_itr->ai_family == AF_INET) {
+      valid_addr = 1;
+      memset(&state->sa, 0, sizeof(state->sa));
+      state->sa = *(struct sockaddr_in *)servinfo_itr->ai_addr;
+      break;
+    }
+  }
+  freeaddrinfo(servinfo);
+
+  if (!valid_addr)
+    return -ENOENT;
+
+  RPRINTF("host: %s, port: %d\n", inet_ntoa(state->sa.sin_addr), ntohs(state->sa.sin_port));
+
+  return 0;
+}
+
+/*
+ * Move the driver into the requested mode.
+ *
+ * Nothing happens if the driver is already in that mode.  Otherwise the
+ * current flush hook, if any, is run first; if it fails the target mode
+ * is forced to mode_unprotected.  The matching *_start() routine is then
+ * called and s->mode is updated only when it returns 0.
+ *
+ * Returns the start routine's result, or -1 for an unknown mode.
+ */
+static int switch_mode(td_driver_t *driver, enum tdremus_mode mode)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+  int rc;
+
+  if (s->mode == mode)
+    return 0;
+
+  if (s->queue_flush && (rc = s->queue_flush(driver)) < 0) {
+    // fall back to unprotected mode on error
+    RPRINTF("switch_mode: error flushing queue (old: %d, new: %d)", s->mode, mode);
+    mode = mode_unprotected;
+  }
+
+  switch (mode) {
+  case mode_unprotected:
+    rc = unprotected_start(driver);
+    break;
+  case mode_primary:
+    rc = primary_start(driver);
+    break;
+  case mode_backup:
+    rc = backup_start(driver);
+    break;
+  default:
+    RPRINTF("unknown mode requested: %d\n", mode);
+    rc = -1;
+    break;
+  }
+
+  if (rc == 0)
+    s->mode = mode;
+
+  return rc;
+}
+
+/*
+ * Read handler for the control FIFO.
+ *
+ * A zero-byte read closes and reopens the FIFO.  A message beginning with
+ * "flush" runs the current flush hook, if one is set; when the hook
+ * reports an error, TDREMUS_FAIL is sent through ctl_respond().  Any other
+ * message is logged as unknown.
+ */
+static void ctl_request(event_id_t id, char mode, void *private)
+{
+  struct tdremus_state *s = (struct tdremus_state *)private;
+  td_driver_t *driver = s->tdremus_driver;
+  char msg[80];
+  int nread;
+
+  nread = read(s->ctl_fd.fd, msg, sizeof(msg) - 1 /* append nul */);
+
+  if (nread == 0) {
+    RPRINTF("0-byte read received, reopening FIFO\n");
+    /*TODO: we may have to unregister/re-register with tapdisk_server */
+    close(s->ctl_fd.fd);
+    RPRINTF("FIFO closed\n");
+    s->ctl_fd.fd = open(s->ctl_path, O_RDWR);
+    if (s->ctl_fd.fd < 0) {
+      RPRINTF("error reopening FIFO: %d\n", errno);
+    }
+    return;
+  }
+
+  if (nread < 0) {
+    RPRINTF("error reading from FIFO: %d\n", errno);
+    return;
+  }
+
+  /* TODO: need to get driver somehow */
+  msg[nread] = '\0';
+
+  if (strncmp(msg, "flush", 5) != 0) {
+    RPRINTF("unknown command: %s\n", msg);
+    return;
+  }
+
+  if (s->queue_flush && s->queue_flush(driver)) {
+    RPRINTF("error passing flush request to backup");
+    ctl_respond(s, TDREMUS_FAIL);
+  }
+}
+
+/*
+ * Write response to the message FIFO.  If the write fails, the FIFO is
+ * closed and reopened from s->msg_path.  Returns the result of write().
+ */
+static int ctl_respond(struct tdremus_state *s, const char *response)
+{
+  int written = write(s->msg_fd.fd, response, strlen(response));
+
+  if (written >= 0)
+    return written;
+
+  RPRINTF("error writing notification: %d\n", errno);
+  close(s->msg_fd.fd);
+  s->msg_fd.fd = open(s->msg_path, O_RDWR);
+  if (s->msg_fd.fd < 0)
+    RPRINTF("error reopening FIFO: %d\n", errno);
+
+  return written;
+}
+
+/*
+ * Create and open the control and message FIFOs for this device.
+ *
+ * The control FIFO lives at BLKTAP_CTRL_DIR "/remus_<name>", with every
+ * ':' and '/' in <name> replaced by '_'; the message FIFO has the same
+ * path with ".msg" appended.  Both are opened O_RDWR and stored in
+ * s->ctl_fd.fd and s->msg_fd.fd.
+ *
+ * Returns 0 on success.  On failure returns -1 with both paths freed and
+ * set to NULL and s->ctl_fd.fd reset to -1.
+ *
+ * must be called after the underlying driver has been initialized
+ */
+static int ctl_open(td_driver_t *driver, const char* name)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+  int i, l;
+
+  /* first we must ensure that BLKTAP_CTRL_DIR exists */
+  if (mkdir(BLKTAP_CTRL_DIR, 0755) && errno != EEXIST)
+  {
+    DPRINTF("error creating directory %s: %d\n", BLKTAP_CTRL_DIR, errno);
+    return -1;
+  }
+
+  /* use the device name to create the control fifo path */
+  if (asprintf(&s->ctl_path, BLKTAP_CTRL_DIR "/remus_%s", name) < 0) {
+    /* the pointer is undefined after a failed asprintf() */
+    s->ctl_path = NULL;
+    return -1;
+  }
+  /* scrub fifo pathname */
+  for (i = strlen(BLKTAP_CTRL_DIR) + 1, l = strlen(s->ctl_path); i < l; i++) {
+    if (strchr(":/", s->ctl_path[i]))
+      s->ctl_path[i] = '_';
+  }
+  if (asprintf(&s->msg_path, "%s.msg", s->ctl_path) < 0) {
+    s->msg_path = NULL;
+    goto err_ctlfifo;
+  }
+
+  if (mkfifo(s->ctl_path, S_IRWXU|S_IRWXG|S_IRWXO) && errno != EEXIST) {
+    RPRINTF("error creating control FIFO %s: %d\n", s->ctl_path, errno);
+    goto err_msgfifo;
+  }
+
+  if (mkfifo(s->msg_path, S_IRWXU|S_IRWXG|S_IRWXO) && errno != EEXIST) {
+    RPRINTF("error creating message FIFO %s: %d\n", s->msg_path, errno);
+    goto err_msgfifo;
+  }
+
+  /* RDWR so that fd doesn't block select when no writer is present */
+  if ((s->ctl_fd.fd = open(s->ctl_path, O_RDWR)) < 0) {
+    RPRINTF("error opening control FIFO %s: %d\n", s->ctl_path, errno);
+    goto err_msgfifo;
+  }
+
+  if ((s->msg_fd.fd = open(s->msg_path, O_RDWR)) < 0) {
+    RPRINTF("error opening message FIFO %s: %d\n", s->msg_path, errno);
+    goto err_openctlfifo;
+  }
+
+  RPRINTF("control FIFO %s\n", s->ctl_path);
+  RPRINTF("message FIFO %s\n", s->msg_path);
+
+  return 0;
+
+ err_openctlfifo:
+  close(s->ctl_fd.fd);
+  /* do not leave a stale descriptor behind for a later close */
+  s->ctl_fd.fd = -1;
+ err_msgfifo:
+  free(s->msg_path);
+  s->msg_path = NULL;
+ err_ctlfifo:
+  free(s->ctl_path);
+  s->ctl_path = NULL;
+  return -1;
+}
+
+/*
+ * Close the control and message FIFOs, unlink them from the filesystem
+ * and free their path strings.  Descriptors are reset to -1 and paths to
+ * NULL, so calling this again is harmless.
+ */
+static void ctl_close(td_driver_t *driver)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+  /* TODO: close *all* connections */
+
+  /* -1 marks "not open" (see tdremus_open), so test >= 0 rather than != 0 */
+  if (s->ctl_fd.fd >= 0) {
+    close(s->ctl_fd.fd);
+    s->ctl_fd.fd = -1;
+  }
+  if (s->msg_fd.fd >= 0) {
+    close(s->msg_fd.fd);
+    s->msg_fd.fd = -1;
+  }
+
+  if (s->ctl_path) {
+    unlink(s->ctl_path);
+    free(s->ctl_path);
+    s->ctl_path = NULL;
+  }
+  if (s->msg_path) {
+    unlink(s->msg_path);
+    free(s->msg_path);
+    s->msg_path = NULL;
+  }
+}
+
+/*
+ * Register ctl_request() as the read handler for the control FIFO and
+ * store the event id in s->ctl_fd.id.  Returns 0 on success, -1 if the
+ * registration fails.
+ */
+static int ctl_register(struct tdremus_state *s)
+{
+  event_id_t ev;
+
+  RPRINTF("registering ctl fifo\n");
+
+  /* register ctl fd */
+  ev = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, s->ctl_fd.fd, 0, ctl_request, s);
+  s->ctl_fd.id = ev;
+
+  if (s->ctl_fd.id >= 0)
+    return 0;
+
+  RPRINTF("error registering ctrl FIFO %s: %d\n", s->ctl_path, s->ctl_fd.id);
+  return -1;
+}
+
+/* interface */
+
+/*
+ * td_open hook: initialise the remus driver state for the device name.
+ *
+ * The state is zeroed, the address in name is parsed, the control channel
+ * is opened and registered, and the driver then tries to bind the
+ * replication address.  If the bind succeeds the driver starts in backup
+ * mode; if remus_bind() returns -2 it starts in primary mode.
+ *
+ * Returns 0 on success, the error from get_args(), ctl_open() or
+ * ctl_register() if one of those fails, and -EIO otherwise.
+ */
+static int tdremus_open(td_driver_t *driver, const char *name,
+                        td_flag_t flags)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+  int rc;
+
+  RPRINTF("opening %s\n", name);
+
+  /* first we need to get the underlying vbd for this driver stack. To do so we
+   * need to know the vbd's id. Fortunately, for tapdisk2 this is hard-coded as
+   * 0 (see tapdisk2.c)
+   */
+  device_vbd = tapdisk_server_get_vbd(0);
+
+  memset(s, 0, sizeof(*s));
+  s->server_fd.fd = -1;
+  s->stream_fd.fd = -1;
+  s->ctl_fd.fd = -1;
+  s->msg_fd.fd = -1;
+
+  /* TODO: this is only needed so that the server can send writes down
+   * the driver stack from the stream_fd event handler */
+  s->tdremus_driver = driver;
+
+  /* parse name to get info etc */
+  if ((rc = get_args(driver, name)))
+    return rc;
+
+  if ((rc = ctl_open(driver, name))) {
+    RPRINTF("error setting up control channel\n");
+    free(s->driver_data);
+    return rc;
+  }
+
+  if ((rc = ctl_register(s))) {
+    RPRINTF("error registering control channel\n");
+    /* release the FIFOs opened by ctl_open() instead of leaking them */
+    ctl_close(driver);
+    free(s->driver_data);
+    return rc;
+  }
+
+  if (!(rc = remus_bind(s)))
+    rc = switch_mode(driver, mode_backup);
+  else if (rc == -2)
+    rc = switch_mode(driver, mode_primary);
+
+  if (!rc)
+    return 0;
+
+  tdremus_close(driver);
+  return -EIO;
+}
+
+/*
+ * td_close hook: free the driver data, close the listening socket and the
+ * replication stream if they are open, and shut down the control channel.
+ * Always returns 0.
+ */
+static int tdremus_close(td_driver_t *driver)
+{
+  struct tdremus_state *s = (struct tdremus_state *)driver->data;
+
+  RPRINTF("closing\n");
+
+  if (s->driver_data) {
+    free(s->driver_data);
+    s->driver_data = NULL;
+  }
+  if (s->server_fd.fd >= 0) {
+    close(s->server_fd.fd);
+    s->server_fd.fd = -1;
+  }
+  if (s->stream_fd.fd >= 0)
+    close_stream_fd(s);
+
+  ctl_close(driver);
+
+  /* previously an uninitialized local was returned here */
+  return 0;
+}
+
+/*
+ * td_get_parent_id hook.  This driver does not report a parent (for now),
+ * so the call always fails with -EINVAL and id is left untouched.
+ */
+static int tdremus_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+  (void)driver;
+  (void)id;
+
+  return -EINVAL;
+}
+
+/*
+ * td_validate_parent hook.  No checks are performed; always returns 0.
+ */
+static int tdremus_validate_parent(td_driver_t *driver,
+                                   td_driver_t *pdriver, td_flag_t flags)
+{
+  (void)driver;
+  (void)pdriver;
+  (void)flags;
+
+  return 0;
+}
+
+/*
+ * Driver descriptor for the remus disk type.
+ *
+ * The read/write hooks start out as the unprotected handlers; they are
+ * overwritten at run time by primary_start(), backup_start() and
+ * unprotected_start() when the driver changes mode.
+ */
+struct tap_disk tapdisk_remus = {
+ .disk_type = "tapdisk_remus",
+ .private_data_size = sizeof(struct tdremus_state),
+ .td_open = tdremus_open,
+ .td_queue_read = unprotected_queue_read,
+ .td_queue_write = unprotected_queue_write,
+ .td_close = tdremus_close,
+ .td_get_parent_id = tdremus_get_parent_id,
+ .td_validate_parent = tdremus_validate_parent,
+ .td_debug = NULL,
+};
diff --git a/tools/blktap2/drivers/disktypes.h b/tools/blktap2/drivers/disktypes.h
--- a/tools/blktap2/drivers/disktypes.h
+++ b/tools/blktap2/drivers/disktypes.h
@@ -49,6 +49,7 @@
extern struct tap_disk tapdisk_qcow;
extern struct tap_disk tapdisk_block_cache;
extern struct tap_disk tapdisk_log;
+extern struct tap_disk tapdisk_remus;
#define MAX_DISK_TYPES 20
@@ -61,6 +62,7 @@
#define DISK_TYPE_QCOW 6
#define DISK_TYPE_BLOCK_CACHE 7
#define DISK_TYPE_LOG 9
+#define DISK_TYPE_REMUS 10
/*Define Individual Disk Parameters here */
static disk_info_t null_disk = {
@@ -167,6 +169,16 @@
#endif
};
+static disk_info_t remus_disk = { /* disk-type table entry for the remus driver */
+ DISK_TYPE_REMUS, /* numeric disk type id */
+ "remus disk replicator (remus)", /* human-readable description */
+ "remus", /* short name -- presumably the handle used in disk specs; TODO confirm */
+ 0, /* NOTE(review): meaning of this field is not visible here -- confirm */
+#ifdef TAPDISK
+ &tapdisk_remus, /* driver descriptor, only present in TAPDISK builds */
+#endif
+};
+
/*Main disk info array */
static disk_info_t *dtypes[] = {
&aio_disk,
@@ -179,6 +191,7 @@
&block_cache_disk,
&null_disk,
&log_disk,
+ &remus_disk,
};
#endif
diff --git a/tools/blktap2/drivers/hashtable.c b/tools/blktap2/drivers/hashtable.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap2/drivers/hashtable.c
@@ -0,0 +1,274 @@
+/* Copyright (C) 2004 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+#include "hashtable.h"
+#include "hashtable_private.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+/*
+Credit for primes table: Aaron Krowne
+ http://br.endernet.org/~akrowne/
+ http://planetmath.org/encyclopedia/GoodHashTablePrimes.html
+*/
+/* Candidate table sizes in ascending order; create_hashtable() and
+ * hashtable_expand() pick bucket counts from this list. */
+static const unsigned int primes[] = {
+53, 97, 193, 389,
+769, 1543, 3079, 6151,
+12289, 24593, 49157, 98317,
+196613, 393241, 786433, 1572869,
+3145739, 6291469, 12582917, 25165843,
+50331653, 100663319, 201326611, 402653189,
+805306457, 1610612741
+};
+/* Number of entries in primes[] */
+const unsigned int prime_table_length = sizeof(primes)/sizeof(primes[0]);
+/* loadlimit is computed as ceil(tablelength * max_load_factor) */
+const float max_load_factor = 0.65;
+
+/*****************************************************************************/
+/*
+ * Allocate an empty hashtable whose bucket count is the first entry of
+ * primes[] strictly greater than minsize.
+ *
+ * hashf and eqf are stored as the table's hash and key-equality callbacks.
+ * Returns NULL if minsize exceeds 2^30 or if either allocation fails.
+ */
+struct hashtable *
+create_hashtable(unsigned int minsize,
+                 unsigned int (*hashf) (void*),
+                 int (*eqf) (void*,void*))
+{
+    struct hashtable *tbl;
+    unsigned int slot = 0;
+    unsigned int nbuckets;
+
+    /* Refuse requests that are too large */
+    if (minsize > (1u << 30))
+        return NULL;
+
+    /* Walk the prime list until an entry exceeds the requested size */
+    while (slot < prime_table_length && primes[slot] <= minsize)
+        slot++;
+    nbuckets = (slot < prime_table_length) ? primes[slot] : primes[0];
+
+    tbl = (struct hashtable *)malloc(sizeof(struct hashtable));
+    if (tbl == NULL)
+        return NULL; /* oom */
+
+    tbl->table = (struct entry **)malloc(nbuckets * sizeof(struct entry *));
+    if (tbl->table == NULL) {
+        free(tbl);
+        return NULL; /* oom */
+    }
+    memset(tbl->table, 0, nbuckets * sizeof(struct entry *));
+
+    tbl->tablelength = nbuckets;
+    tbl->primeindex  = slot;
+    tbl->entrycount  = 0;
+    tbl->hashfn      = hashf;
+    tbl->eqfn        = eqf;
+    tbl->loadlimit   = (unsigned int) ceil(nbuckets * max_load_factor);
+    return tbl;
+}
+
+/*****************************************************************************/
+/*
+ * Return the table's hash callback applied to k, passed through a fixed
+ * sequence of shift/add/xor mixing steps. The result is the full unsigned
+ * value; callers in this file reduce it to a bucket with indexFor().
+ */
+unsigned int
+hash(struct hashtable *h, void *k)
+{
+ /* Aim to protect against poor hash functions by adding logic here
+ * - logic taken from java 1.4 hashtable source */
+ unsigned int i = h->hashfn(k);
+ i += ~(i << 9);
+ /* the two shift pairs below act as rotates when unsigned int is 32 bits */
+ i ^= ((i >> 14) | (i << 18)); /* >>> */
+ i += (i << 4);
+ i ^= ((i >> 10) | (i << 22)); /* >>> */
+ return i;
+}
+
+/*****************************************************************************/
+/*
+ * Grow the bucket array to the next size in primes[] and redistribute all
+ * entries using their cached hash values (e->h).
+ *
+ * First tries to allocate a fresh array; if that fails, falls back to
+ * realloc() of the existing array and rehashes in place.
+ *
+ * Returns -1 on success, 0 if the table is already at the largest prime or
+ * if no memory could be obtained (the table is left unchanged in that case).
+ */
+static int
+hashtable_expand(struct hashtable *h)
+{
+    /* Roughly double the size of the table to accommodate more entries */
+    struct entry **newtable;
+    struct entry *e;
+    struct entry **pE;
+    unsigned int newsize, i, index;
+
+    /* Check we're not hitting max capacity */
+    if (h->primeindex == (prime_table_length - 1)) return 0;
+    newsize = primes[++(h->primeindex)];
+
+    newtable = (struct entry **)malloc(sizeof(struct entry*) * newsize);
+    if (NULL != newtable)
+    {
+        memset(newtable, 0, newsize * sizeof(struct entry *));
+        /* This algorithm is not 'stable'. ie. it reverses the list
+         * when it transfers entries between the tables */
+        for (i = 0; i < h->tablelength; i++) {
+            while (NULL != (e = h->table[i])) {
+                h->table[i] = e->next;
+                index = indexFor(newsize,e->h);
+                e->next = newtable[index];
+                newtable[index] = e;
+            }
+        }
+        free(h->table);
+        h->table = newtable;
+    }
+    /* Plan B: realloc instead */
+    else
+    {
+        newtable = (struct entry **)
+                   realloc(h->table, newsize * sizeof(struct entry *));
+        /* On failure the old array is still valid; undo the index bump */
+        if (NULL == newtable) { (h->primeindex)--; return 0; }
+        h->table = newtable;
+        /* Zero only the newly added tail of the array: destination is the
+         * address of the first new slot, length is in bytes, not slots */
+        memset(&newtable[h->tablelength], 0,
+               (newsize - h->tablelength) * sizeof(struct entry *));
+        for (i = 0; i < h->tablelength; i++) {
+            for (pE = &(newtable[i]), e = *pE; e != NULL; e = *pE) {
+                index = indexFor(newsize,e->h);
+                if (index == i)
+                {
+                    /* entry stays in this bucket; step past it */
+                    pE = &(e->next);
+                }
+                else
+                {
+                    /* unlink and push onto the head of its new bucket */
+                    *pE = e->next;
+                    e->next = newtable[index];
+                    newtable[index] = e;
+                }
+            }
+        }
+    }
+    h->tablelength = newsize;
+    h->loadlimit   = (unsigned int) ceil(newsize * max_load_factor);
+    return -1;
+}
+
+/*****************************************************************************/
+/* Return the number of entries currently stored in h (h->entrycount). */
+unsigned int
+hashtable_count(struct hashtable *h)
+{
+ return h->entrycount;
+}
+
+/*****************************************************************************/
+/*
+ * Add the pair (k, v) at the head of its bucket chain.
+ *
+ * No check is made for an existing entry with an equal key, so duplicates
+ * are possible (and should be avoided by the caller). The table stores the
+ * k and v pointers as given.
+ *
+ * Returns -1 on success, 0 if the entry node could not be allocated.
+ */
+int
+hashtable_insert(struct hashtable *h, void *k, void *v)
+{
+    struct entry *node;
+    unsigned int bucket;
+
+    h->entrycount += 1;
+    if (h->entrycount > h->loadlimit)
+    {
+        /* The result is deliberately ignored: if growing fails we still
+         * try to squeeze this one entry into the current table, and the
+         * next insert will attempt to grow again. */
+        hashtable_expand(h);
+    }
+
+    node = (struct entry *)malloc(sizeof(struct entry));
+    if (node == NULL)
+    {
+        /* oom: undo the count bump */
+        h->entrycount -= 1;
+        return 0;
+    }
+
+    node->h = hash(h, k);
+    bucket  = indexFor(h->tablelength, node->h);
+    node->k = k;
+    node->v = v;
+
+    /* push onto the front of the chain */
+    node->next = h->table[bucket];
+    h->table[bucket] = node;
+    return -1;
+}
+
+/*****************************************************************************/
+/*
+ * Look up k and return the value pointer stored with the first matching
+ * entry in its bucket chain, or NULL if no entry matches.
+ */
+void * /* returns value associated with key */
+hashtable_search(struct hashtable *h, void *k)
+{
+    const unsigned int wanted = hash(h, k);
+    struct entry *cur;
+
+    for (cur = h->table[indexFor(h->tablelength, wanted)];
+         cur != NULL;
+         cur = cur->next)
+    {
+        /* cheap cached-hash comparison first, full key comparison second */
+        if (cur->h != wanted)
+            continue;
+        if (h->eqfn(k, cur->k))
+            return cur->v;
+    }
+    return NULL;
+}
+
+/*****************************************************************************/
+/*
+ * Unlink the first entry whose key equals k and return its value pointer.
+ *
+ * The stored key is released with freekey() and the entry node is freed;
+ * the value is NOT freed - ownership passes back to the caller.
+ * Returns NULL if no matching entry exists.
+ */
+void * /* returns value associated with key */
+hashtable_remove(struct hashtable *h, void *k)
+{
+    /* TODO: consider compacting the table when the load factor drops enough,
+     *       or provide a 'compact' method. */
+
+    struct entry *e;
+    struct entry **pE;
+    void *v;
+    unsigned int hashvalue, index;
+
+    hashvalue = hash(h,k);
+    /* reuse the hash computed above instead of hashing the key twice */
+    index = indexFor(h->tablelength,hashvalue);
+    pE = &(h->table[index]);
+    e = *pE;
+    while (NULL != e)
+    {
+        /* Check hash value to short circuit heavier comparison */
+        if ((hashvalue == e->h) && (h->eqfn(k, e->k)))
+        {
+            /* pE points at whichever link leads to e; splice e out */
+            *pE = e->next;
+            h->entrycount--;
+            v = e->v;
+            freekey(e->k);
+            free(e);
+            return v;
+        }
+        pE = &(e->next);
+        e = e->next;
+    }
+    return NULL;
+}
+
+/*****************************************************************************/
+/* destroy */
+/*
+ * Tear down the whole table: every entry node and every stored key
+ * (via freekey()) is released, then the bucket array and h itself.
+ * Stored values are passed to free() only when free_values is non-zero.
+ */
+void
+hashtable_destroy(struct hashtable *h, int free_values)
+{
+    struct entry **buckets = h->table;
+    struct entry *cur, *doomed;
+    unsigned int b;
+
+    for (b = 0; b < h->tablelength; b++)
+    {
+        cur = buckets[b];
+        while (cur != NULL)
+        {
+            /* step forward before releasing the node we are leaving */
+            doomed = cur;
+            cur = cur->next;
+            freekey(doomed->k);
+            if (free_values)
+                free(doomed->v);
+            free(doomed);
+        }
+    }
+    free(h->table);
+    free(h);
+}
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/tools/blktap2/drivers/hashtable_itr.c b/tools/blktap2/drivers/hashtable_itr.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap2/drivers/hashtable_itr.c
@@ -0,0 +1,188 @@
+/* Copyright (C) 2002, 2004 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+#include "hashtable.h"
+#include "hashtable_private.h"
+#include "hashtable_itr.h"
+#include <stdlib.h> /* defines NULL */
+
+/*****************************************************************************/
+/* hashtable_iterator - iterator constructor */
+
+/*
+ * Allocate an iterator over h, positioned on the head of the first
+ * non-empty bucket. For an empty table the iterator has e == NULL and
+ * index == tablelength. Returns NULL if the iterator cannot be allocated.
+ */
+struct hashtable_itr *
+hashtable_iterator(struct hashtable *h)
+{
+    const unsigned int nbuckets = h->tablelength;
+    unsigned int b;
+    struct hashtable_itr *it;
+
+    it = (struct hashtable_itr *)malloc(sizeof(struct hashtable_itr));
+    if (it == NULL)
+        return NULL;
+
+    /* start in the "past the end" state */
+    it->h      = h;
+    it->e      = NULL;
+    it->parent = NULL;
+    it->index  = nbuckets;
+
+    if (h->entrycount != 0)
+    {
+        for (b = 0; b < nbuckets; b++)
+        {
+            if (h->table[b] == NULL)
+                continue;
+            it->e     = h->table[b];
+            it->index = b;
+            break;
+        }
+    }
+    return it;
+}
+
+/*****************************************************************************/
+/* key - return the key of the (key,value) pair at the current position */
+/* value - return the value of the (key,value) pair at the current position */
+
+/* Key of the current entry. Dereferences i->e unchecked, so the iterator
+ * must be positioned on an entry (i->e != NULL). */
+void *
+hashtable_iterator_key(struct hashtable_itr *i)
+{ return i->e->k; }
+
+/* Value of the current entry. Dereferences i->e unchecked, so the iterator
+ * must be positioned on an entry (i->e != NULL). */
+void *
+hashtable_iterator_value(struct hashtable_itr *i)
+{ return i->e->v; }
+
+/*****************************************************************************/
+/* advance - advance the iterator to the next element
+ * returns zero if advanced to end of table */
+
+/*
+ * Step to the next entry: first along the current chain, otherwise to the
+ * head of the next non-empty bucket.
+ * Returns -1 if the iterator now points at an entry, 0 if it was already at
+ * the end or has just run off the end (itr->e is NULL in both cases).
+ */
+int
+hashtable_iterator_advance(struct hashtable_itr *itr)
+{
+ unsigned int j,tablelength;
+ struct entry **table;
+ struct entry *next;
+ if (NULL == itr->e) return 0; /* stupidity check */
+
+ /* Case 1: more entries on this chain - current entry becomes the parent */
+ next = itr->e->next;
+ if (NULL != next)
+ {
+ itr->parent = itr->e;
+ itr->e = next;
+ return -1;
+ }
+ /* Case 2: chain exhausted - move on to the following buckets; the new
+ * entry (if any) will be a chain head, hence no parent */
+ tablelength = itr->h->tablelength;
+ itr->parent = NULL;
+ if (tablelength <= (j = ++(itr->index)))
+ {
+ itr->e = NULL;
+ return 0;
+ }
+ table = itr->h->table;
+ /* skip empty buckets */
+ while (NULL == (next = table[j]))
+ {
+ if (++j >= tablelength)
+ {
+ itr->index = tablelength;
+ itr->e = NULL;
+ return 0;
+ }
+ }
+ itr->index = j;
+ itr->e = next;
+ return -1;
+}
+
+/*****************************************************************************/
+/* remove - remove the entry at the current iterator position
+ * and advance the iterator, if there is a successive
+ * element.
+ * If you want the value, read it before you remove:
+ * beware memory leaks if you don't.
+ * Returns zero if end of iteration. */
+
+/*
+ * Unlink and free the entry under the iterator, release its key with
+ * freekey(), and advance. The value pointer is not freed here.
+ * itr->e is dereferenced unchecked, so the iterator must be on an entry.
+ * Returns the result of hashtable_iterator_advance().
+ */
+int
+hashtable_iterator_remove(struct hashtable_itr *itr)
+{
+ struct entry *remember_e, *remember_parent;
+ int ret;
+
+ /* Do the removal */
+ if (NULL == (itr->parent))
+ {
+ /* element is head of a chain */
+ itr->h->table[itr->index] = itr->e->next;
+ } else {
+ /* element is mid-chain */
+ itr->parent->next = itr->e->next;
+ }
+ /* itr->e is now outside the hashtable */
+ remember_e = itr->e;
+ itr->h->entrycount--;
+ freekey(remember_e->k);
+
+ /* Advance the iterator, correcting the parent */
+ remember_parent = itr->parent;
+ /* the node is freed only after advance() has read its next pointer */
+ ret = hashtable_iterator_advance(itr);
+ /* advance() may have recorded the removed node as parent; restore the
+ * real predecessor instead */
+ if (itr->parent == remember_e) { itr->parent = remember_parent; }
+ free(remember_e);
+ return ret;
+}
+
+/*****************************************************************************/
+/*
+ * Position itr on the first entry of h whose key equals k.
+ * On a match, all four iterator fields are overwritten and -1 is returned.
+ * If nothing matches, itr is left untouched and 0 is returned.
+ */
+int /* returns zero if not found */
+hashtable_iterator_search(struct hashtable_itr *itr,
+                          struct hashtable *h, void *k)
+{
+    const unsigned int wanted = hash(h, k);
+    const unsigned int bucket = indexFor(h->tablelength, wanted);
+    struct entry *prev = NULL;
+    struct entry *cur;
+
+    for (cur = h->table[bucket]; cur != NULL; prev = cur, cur = cur->next)
+    {
+        /* cheap cached-hash comparison first, full key comparison second */
+        if (cur->h != wanted)
+            continue;
+        if (!h->eqfn(k, cur->k))
+            continue;
+
+        itr->index  = bucket;
+        itr->e      = cur;
+        itr->parent = prev;
+        itr->h      = h;
+        return -1;
+    }
+    return 0;
+}
+
+
+/*
+ * Copyright (c) 2002, 2004, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/tools/blktap2/drivers/hashtable_itr.h b/tools/blktap2/drivers/hashtable_itr.h
new file mode 100644
--- /dev/null
+++ b/tools/blktap2/drivers/hashtable_itr.h
@@ -0,0 +1,112 @@
+/* Copyright (C) 2002, 2004 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+#ifndef __HASHTABLE_ITR_CWC22__
+#define __HASHTABLE_ITR_CWC22__
+#include "hashtable.h"
+#include "hashtable_private.h" /* needed to enable inlining */
+
+/*****************************************************************************/
+/* This struct is only concrete here to allow the inlining of two of the
+ * accessor functions. */
+struct hashtable_itr
+{
+ struct hashtable *h; /* table being iterated */
+ struct entry *e; /* current entry; NULL once past the end */
+ struct entry *parent; /* predecessor of e in its chain; NULL if e is a chain head */
+ unsigned int index; /* bucket index of e; set to tablelength when past the end */
+};
+
+
+/*****************************************************************************/
+/* hashtable_iterator
+ */
+
+struct hashtable_itr *
+hashtable_iterator(struct hashtable *h);
+
+/*****************************************************************************/
+/* hashtable_iterator_key
+ * - return the key of the (key,value) pair at the current position.
+ * i->e is dereferenced unchecked: the iterator must be on an entry. */
+
+extern inline void *
+hashtable_iterator_key(struct hashtable_itr *i)
+{
+ return i->e->k;
+}
+
+/*****************************************************************************/
+/* value - return the value of the (key,value) pair at the current position */
+
+/* i->e is dereferenced unchecked: the iterator must be on an entry. */
+extern inline void *
+hashtable_iterator_value(struct hashtable_itr *i)
+{
+ return i->e->v;
+}
+
+/*****************************************************************************/
+/* advance - advance the iterator to the next element
+ * returns zero if advanced to end of table */
+
+int
+hashtable_iterator_advance(struct hashtable_itr *itr);
+
+/*****************************************************************************/
+/* remove - remove current element and advance the iterator to the next element
+ * NB: if you need the value to free it, read it before
+ * removing. ie: beware memory leaks!
+ * returns zero if advanced to end of table */
+
+int
+hashtable_iterator_remove(struct hashtable_itr *itr);
+
+/*****************************************************************************/
+/* search - overwrite the supplied iterator, to point to the entry
+ * matching the supplied key.
+ * h points to the hashtable to be searched.
+ * returns zero if not found. */
+int
+hashtable_iterator_search(struct hashtable_itr *itr,
+ struct hashtable *h, void *k);
+
+/* Expands to a function named fnname that takes the key as keytype*
+ * and simply forwards to hashtable_iterator_search(). */
+#define DEFINE_HASHTABLE_ITERATOR_SEARCH(fnname, keytype) \
+int fnname (struct hashtable_itr *i, struct hashtable *h, keytype *k) \
+{ \
+ return (hashtable_iterator_search(i,h,k)); \
+}
+
+
+
+#endif /* __HASHTABLE_ITR_CWC22__*/
+
+/*
+ * Copyright (c) 2002, 2004, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/tools/blktap2/drivers/hashtable_utility.c b/tools/blktap2/drivers/hashtable_utility.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap2/drivers/hashtable_utility.c
@@ -0,0 +1,71 @@
+/* Copyright (C) 2002 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+#include "hashtable.h"
+#include "hashtable_private.h"
+#include "hashtable_utility.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/*****************************************************************************/
+/* hashtable_change
+ *
+ * function to change the value associated with a key, where there already
+ * exists a value bound to the key in the hashtable.
+ * Source due to Holger Schemel.
+ *
+ * */
+/*
+ * Replace the value bound to an existing key.
+ * The previous value is passed to free() and v is stored in its place.
+ * Returns -1 if the key was found and updated, 0 if it is not in the table.
+ */
+int
+hashtable_change(struct hashtable *h, void *k, void *v)
+{
+    const unsigned int wanted = hash(h, k);
+    struct entry *cur;
+
+    for (cur = h->table[indexFor(h->tablelength, wanted)];
+         cur != NULL;
+         cur = cur->next)
+    {
+        /* cheap cached-hash comparison first, full key comparison second */
+        if (cur->h != wanted)
+            continue;
+        if (!h->eqfn(k, cur->k))
+            continue;
+
+        free(cur->v);
+        cur->v = v;
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/tools/blktap2/drivers/hashtable_utility.h b/tools/blktap2/drivers/hashtable_utility.h
new file mode 100644
--- /dev/null
+++ b/tools/blktap2/drivers/hashtable_utility.h
@@ -0,0 +1,55 @@
+/* Copyright (C) 2002 Christopher Clark <firstname.lastname@cl.cam.ac.uk> */
+
+#ifndef __HASHTABLE_CWC22_UTILITY_H__
+#define __HASHTABLE_CWC22_UTILITY_H__
+
+/*****************************************************************************
+ * hashtable_change
+ *
+ * function to change the value associated with a key, where there already
+ * exists a value bound to the key in the hashtable.
+ * Source due to Holger Schemel.
+ *
+ * @name hashtable_change
+ * @param h the hashtable
+ * @param key
+ * @param value
+ *
+ */
+int
+hashtable_change(struct hashtable *h, void *k, void *v);
+
+#endif /* __HASHTABLE_CWC22_UTILITY_H__ */
+
+/*
+ * Copyright (c) 2002, Christopher Clark
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * * Neither the name of the original author; nor the names of any contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
diff --git a/tools/python/xen/xend/server/BlktapController.py b/tools/python/xen/xend/server/BlktapController.py
--- a/tools/python/xen/xend/server/BlktapController.py
+++ b/tools/python/xen/xend/server/BlktapController.py
@@ -28,6 +28,7 @@
'ram',
'qcow',
'vhd',
+ 'remus',
]
blktap_disk_types = blktap1_disk_types + blktap2_disk_types
^ permalink raw reply [flat|nested] 16+ messages in thread* Re: [PATCH 00 of 11] Remus 0.9 released!
2009-11-05 22:58 [PATCH 00 of 11] Remus 0.9 released! Brendan Cully
` (10 preceding siblings ...)
2009-11-05 22:58 ` [PATCH 11 of 11] blktap2: add remus driver Brendan Cully
@ 2009-11-09 8:52 ` Keir Fraser
2009-12-22 2:32 ` Keith Coleman
12 siblings, 0 replies; 16+ messages in thread
From: Keir Fraser @ 2009-11-09 8:52 UTC (permalink / raw)
To: xen-devel@lists.xensource.com; +Cc: andy@cs.ubc.ca, Brendan Cully
On 05/11/2009 22:58, "Brendan Cully" <brendan@cs.ubc.ca> wrote:
> It's my pleasure to announce the release of Remus 0.9!
>
> I believe that it is now ready for inclusion in Xen, and I would love
> to have people try it out and let me know how it goes.
Does anyone on the list have any comments to make on Remus?
Personally I would like to get this in asap ahead of stabilisation for the
next major Xen release. If there are comments/criticisms/support for Remus,
or Kemari, or HA solutions in general, then they need to be aired now!
-- Keir
^ permalink raw reply [flat|nested] 16+ messages in thread* Re: [PATCH 00 of 11] Remus 0.9 released!
2009-11-05 22:58 [PATCH 00 of 11] Remus 0.9 released! Brendan Cully
` (11 preceding siblings ...)
2009-11-09 8:52 ` [PATCH 00 of 11] Remus 0.9 released! Keir Fraser
@ 2009-12-22 2:32 ` Keith Coleman
2009-12-28 20:07 ` gilberto nunes
12 siblings, 1 reply; 16+ messages in thread
From: Keith Coleman @ 2009-12-22 2:32 UTC (permalink / raw)
To: Brendan Cully; +Cc: andy, xen-devel
On Thu, Nov 5, 2009 at 5:58 PM, Brendan Cully <brendan@cs.ubc.ca> wrote:
> This release works with the tip of the xen-unstable repository, and
> supports PV and HVM in 32-on-32, 64-on-64, 32-on-64, and 64-on-32
> configurations. It has been tested using Linux (Ubuntu) PV guests, and
> both Linux and Windows XP under HVM.
It should be noted that HVM guests running PV drivers are unsupported.
Keith Coleman
^ permalink raw reply [flat|nested] 16+ messages in thread