* [RFC][PATCH 01/13] Kemari: add ECS_TAP state to event channel
2009-03-12 1:14 [RFC][PATCH 00/13] Kemari: updated to the 3.4 unstable tree Yoshiaki Tamura
@ 2009-03-12 1:15 ` Yoshiaki Tamura
2009-03-12 1:16 ` [RFC][PATCH 02/13] Kemari: core kemari code Yoshiaki Tamura
` (8 subsequent siblings)
9 siblings, 0 replies; 14+ messages in thread
From: Yoshiaki Tamura @ 2009-03-12 1:15 UTC (permalink / raw)
To: xen-devel
Cc: "柳澤佳里(yanagisawa yoshisato)",
Ian Pratt, ian.jackson, Keir Fraser, Stefano Stabellini
This is an updated version of the following patch. No major changes.
http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00369.html
Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp>
Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp>
---
xen/common/event_channel.c | 150 ++++++++++++++++++++++++++++++++++++++++++++-
xen/include/xen/event.h | 14 ++++
xen/include/xen/sched.h | 10 +++
3 files changed, 173 insertions(+), 1 deletion(-)
diff -r b249f3e979a5 -r cf6a910e3663 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h Mon Mar 09 10:32:24 2009 +0000
+++ b/xen/include/xen/sched.h Wed Mar 11 18:03:47 2009 +0900
@@ -20,6 +20,7 @@
#include <xen/rcupdate.h>
#include <xen/irq.h>
#include <xen/mm.h>
+#include <xen/kemari.h>
#ifdef CONFIG_COMPAT
#include <compat/vcpu.h>
@@ -47,6 +48,7 @@
#define ECS_PIRQ 4 /* Channel is bound to a physical IRQ line. */
#define ECS_VIRQ 5 /* Channel is bound to a virtual IRQ line. */
#define ECS_IPI 6 /* Channel is bound to a virtual IPI line. */
+#define ECS_TAP 7 /* Channel is bound and tapped. */
u8 state; /* ECS_* */
u8 consumer_is_xen; /* Consumed by Xen or by guest? */
u16 notify_vcpu_id; /* VCPU for local delivery notification */
@@ -61,6 +63,11 @@
u16 pirq; /* state == ECS_PIRQ */
u16 virq; /* state == ECS_VIRQ */
} u;
+ struct {
+ u8 mode; /* Tap IN, OUT or both. */
+ /* Function to call when an event is detected. */
+ long (*redirect) (struct evtchn *lchn, struct evtchn *rchn);
+ } tap;
#ifdef FLASK_ENABLE
void *ssid;
#endif
@@ -255,6 +262,9 @@
/* OProfile support. */
struct xenoprof *xenoprof;
int32_t time_offset_seconds;
+
+ /* Kemari support. */
+ struct kemari *kemari;
struct rcu_head rcu;
diff -r b249f3e979a5 -r cf6a910e3663 xen/include/xen/event.h
--- a/xen/include/xen/event.h Mon Mar 09 10:32:24 2009 +0000
+++ b/xen/include/xen/event.h Wed Mar 11 18:03:47 2009 +0900
@@ -82,4 +82,18 @@
mb(); /* set blocked status /then/ caller does his work */ \
} while ( 0 )
+/*
+ * Argument block for evtchn_bind_tap() and evtchn_unbind_tap().
+ * Identifies the channel to tap by (domain id, port) and carries the tap
+ * mode and the callback to install on the channel pair.
+ */
+struct evtchn_bind_tap {
+ /* IN parameters. */
+ /* Domain owning the channel to tap; looked up by id. */
+ domid_t tap_dom;
+ /* Port of the channel inside tap_dom. */
+ uint32_t tap_port;
+ /* Value copied into the tapped channel's tap.mode. */
+ uint8_t mode;
+ /* Callback copied into tap.redirect of both ends (unused by unbind). */
+ long (*redirect) (struct evtchn *lchn, struct evtchn *rchn);
+};
+
+void notify_via_xen_evtchn_tap(struct domain *ld, int lport);
+
+long evtchn_bind_tap(struct evtchn_bind_tap *bind_tap);
+
+long evtchn_unbind_tap(struct evtchn_bind_tap *bind_tap);
+
#endif /* __XEN_EVENT_H__ */
diff -r b249f3e979a5 -r cf6a910e3663 xen/common/event_channel.c
--- a/xen/common/event_channel.c Mon Mar 09 10:32:24 2009 +0000
+++ b/xen/common/event_channel.c Wed Mar 11 18:03:47 2009 +0900
@@ -191,7 +191,8 @@
if ( !port_is_valid(rd, rport) )
ERROR_EXIT_DOM(-EINVAL, rd);
rchn = evtchn_from_port(rd, rport);
- if ( (rchn->state != ECS_UNBOUND) ||
+ /* kemari needs to reuse rchn information */
+ if ( (rchn->state != ECS_UNBOUND) &&
(rchn->u.unbound.remote_domid != ld->domain_id) )
ERROR_EXIT_DOM(-EINVAL, rd);
@@ -338,6 +339,113 @@
return rc;
}
+/*
+ * Switch an established interdomain channel and its peer into ECS_TAP state.
+ *
+ * bind_tap->tap_dom / tap_port name the local end. Both ends must currently
+ * be ECS_INTERDOMAIN. On success both ends get state ECS_TAP and
+ * tap.redirect = bind_tap->redirect; only the local end's tap.mode is set.
+ *
+ * Returns 0 on success, -ESRCH if the domain does not exist, the channel has
+ * no remote domain, or the remote end is not a valid ECS_INTERDOMAIN
+ * channel, and -EINVAL if the local port is invalid or not ECS_INTERDOMAIN.
+ */
+long evtchn_bind_tap(struct evtchn_bind_tap *bind_tap)
+{
+    struct evtchn *lchn, *rchn;
+    struct domain *ld, *rd;
+    int lport = bind_tap->tap_port, rport;
+    domid_t ldom = bind_tap->tap_dom;
+    long ret;
+
+    if ( (ld = rcu_lock_domain_by_id(ldom)) == NULL )
+        return -ESRCH;
+
+    spin_lock(&ld->event_lock);
+
+    ret = -EINVAL;
+    if ( !port_is_valid(ld, lport) )
+        goto lchn_out;
+    lchn = evtchn_from_port(ld, lport);
+    if ( lchn->state != ECS_INTERDOMAIN )
+        goto lchn_out;
+
+    ret = -ESRCH;
+    rd = lchn->u.interdomain.remote_dom;
+    if ( rd == NULL )
+        goto lchn_out;
+
+    /*
+     * A loopback channel has rd == ld; taking the same spinlock twice would
+     * deadlock, so only take the remote lock when it is a different domain.
+     * NOTE(review): for rd != ld the locks are taken in (ld, rd) order with
+     * no global ordering - confirm this cannot race against a path that
+     * takes the two event locks in the opposite order.
+     */
+    if ( rd != ld )
+        spin_lock(&rd->event_lock);
+
+    rport = lchn->u.interdomain.remote_port;
+    if ( !port_is_valid(rd, rport) )
+        goto rchn_out;
+    rchn = evtchn_from_port(rd, rport);
+    if ( rchn->state != ECS_INTERDOMAIN )
+        goto rchn_out;
+
+    lchn->state = ECS_TAP;
+    lchn->tap.mode = bind_tap->mode;
+    lchn->tap.redirect = bind_tap->redirect;
+
+    rchn->state = ECS_TAP;
+    rchn->tap.redirect = bind_tap->redirect;
+
+    ret = 0;
+
+ rchn_out:
+    if ( rd != ld )
+        spin_unlock(&rd->event_lock);
+
+ lchn_out:
+    spin_unlock(&ld->event_lock);
+
+    rcu_unlock_domain(ld);
+
+    return ret;
+}
+
+/*
+ * Undo evtchn_bind_tap(): return a tapped channel pair to ECS_INTERDOMAIN.
+ *
+ * bind_tap->tap_dom / tap_port name the local end. Both ends must currently
+ * be ECS_TAP. On success both ends get state ECS_INTERDOMAIN and a NULL
+ * tap.redirect; the local end's tap.mode is overwritten with bind_tap->mode.
+ * bind_tap->redirect is not read.
+ *
+ * Returns 0 on success, -ESRCH if the domain does not exist, the channel has
+ * no remote domain, or the remote end is not a valid ECS_TAP channel, and
+ * -EINVAL if the local port is invalid or not ECS_TAP.
+ */
+long evtchn_unbind_tap(struct evtchn_bind_tap *bind_tap)
+{
+    struct evtchn *lchn, *rchn;
+    struct domain *ld, *rd;
+    int lport = bind_tap->tap_port, rport;
+    domid_t ldom = bind_tap->tap_dom;
+    long ret;
+
+    if ( (ld = rcu_lock_domain_by_id(ldom)) == NULL )
+        return -ESRCH;
+
+    spin_lock(&ld->event_lock);
+
+    ret = -EINVAL;
+    if ( !port_is_valid(ld, lport) )
+        goto lchn_out;
+    lchn = evtchn_from_port(ld, lport);
+    if ( lchn->state != ECS_TAP )
+        goto lchn_out;
+
+    ret = -ESRCH;
+    rd = lchn->u.interdomain.remote_dom;
+    if ( rd == NULL )
+        goto lchn_out;
+
+    /*
+     * A loopback channel has rd == ld; taking the same spinlock twice would
+     * deadlock, so only take the remote lock when it is a different domain.
+     * NOTE(review): for rd != ld the locks are taken in (ld, rd) order with
+     * no global ordering - confirm this cannot race against a path that
+     * takes the two event locks in the opposite order.
+     */
+    if ( rd != ld )
+        spin_lock(&rd->event_lock);
+
+    rport = lchn->u.interdomain.remote_port;
+    if ( !port_is_valid(rd, rport) )
+        goto rchn_out;
+    rchn = evtchn_from_port(rd, rport);
+    if ( rchn->state != ECS_TAP )
+        goto rchn_out;
+
+    lchn->state = ECS_INTERDOMAIN;
+    lchn->tap.mode = bind_tap->mode;
+    lchn->tap.redirect = NULL;
+
+    rchn->state = ECS_INTERDOMAIN;
+    rchn->tap.redirect = NULL;
+
+    ret = 0;
+
+ rchn_out:
+    if ( rd != ld )
+        spin_unlock(&rd->event_lock);
+
+ lchn_out:
+    spin_unlock(&ld->event_lock);
+
+    rcu_unlock_domain(ld);
+
+    return ret;
+}
static long __evtchn_close(struct domain *d1, int port1)
{
@@ -393,6 +501,7 @@
case ECS_IPI:
break;
+ case ECS_TAP:
case ECS_INTERDOMAIN:
if ( d2 == NULL )
{
@@ -430,6 +539,14 @@
BUG_ON(!port_is_valid(d2, port2));
chn2 = evtchn_from_port(d2, port2);
+
+ if ( chn1->state == ECS_TAP )
+ {
+ chn1->tap.redirect = NULL;
+ chn2->tap.redirect = NULL;
+ chn2->state = ECS_INTERDOMAIN;
+ }
+
BUG_ON(chn2->state != ECS_INTERDOMAIN);
BUG_ON(chn2->u.interdomain.remote_dom != d1);
@@ -499,6 +616,13 @@
switch ( lchn->state )
{
+ case ECS_TAP:
+ rd = lchn->u.interdomain.remote_dom;
+ rport = lchn->u.interdomain.remote_port;
+ rchn = evtchn_from_port(rd, rport);
+
+ lchn->tap.redirect(lchn, rchn);
+
case ECS_INTERDOMAIN:
rd = lchn->u.interdomain.remote_dom;
rport = lchn->u.interdomain.remote_port;
@@ -1009,6 +1133,30 @@
spin_unlock(&ld->event_lock);
}
+/*
+ * Raise the remote end of a Xen-consumed interdomain channel.
+ *
+ * ld->event_lock is taken only when ld is not the current domain. If the
+ * channel at lport is ECS_INTERDOMAIN, the remote port is marked pending on
+ * the remote channel's notify vcpu; in any other state nothing is sent.
+ */
+void notify_via_xen_evtchn_tap(struct domain *ld, int lport)
+{
+    const int take_lock = (ld != current->domain);
+    struct evtchn *local, *remote;
+
+    if ( take_lock )
+        spin_lock(&ld->event_lock);
+
+    ASSERT(port_is_valid(ld, lport));
+    local = evtchn_from_port(ld, lport);
+    ASSERT(local->consumer_is_xen);
+
+    if ( likely(local->state == ECS_INTERDOMAIN) )
+    {
+        struct domain *peer = local->u.interdomain.remote_dom;
+        int peer_port = local->u.interdomain.remote_port;
+
+        remote = evtchn_from_port(peer, peer_port);
+        evtchn_set_pending(peer->vcpu[remote->notify_vcpu_id], peer_port);
+    }
+
+    if ( take_lock )
+        spin_unlock(&ld->event_lock);
+}
int evtchn_init(struct domain *d)
{
^ permalink raw reply [flat|nested] 14+ messages in thread* [RFC][PATCH 02/13] Kemari: core kemari code
2009-03-12 1:14 [RFC][PATCH 00/13] Kemari: updated to the 3.4 unstable tree Yoshiaki Tamura
2009-03-12 1:15 ` [RFC][PATCH 01/13] Kemari: add ECS_TAP state to event channel Yoshiaki Tamura
@ 2009-03-12 1:16 ` Yoshiaki Tamura
2009-03-12 1:16 ` [RFC][PATCH 03/13] Kemari: change parameter type of xc_{set, get}_hvm_param Yoshiaki Tamura
` (7 subsequent siblings)
9 siblings, 0 replies; 14+ messages in thread
From: Yoshiaki Tamura @ 2009-03-12 1:16 UTC (permalink / raw)
To: xen-devel
Cc: "柳澤佳里(yanagisawa yoshisato)",
Ian Pratt, ian.jackson, Keir Fraser, Stefano Stabellini
This is an updated version of the following patch. No major changes.
http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00373.html
Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp>
Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp>
---
xen/arch/x86/Makefile | 1
xen/arch/x86/domain.c | 4
xen/arch/x86/domctl.c | 16
xen/arch/x86/kemari/Makefile | 1
xen/arch/x86/kemari/kemari.c | 670 +++++++++++++++++++++++++++++++++++++++++
xen/include/public/domctl.h | 33 ++
xen/include/public/io/xenbus.h | 4
xen/include/public/kemari.h | 97 +++++
xen/include/xen/kemari.h | 75 ++++
9 files changed, 900 insertions(+), 1 deletion(-)
diff -r b249f3e979a5 -r cf6a910e3663 xen/include/public/kemari.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/public/kemari.h Wed Mar 11 18:03:47 2009 +0900
@@ -0,0 +1,97 @@
+/******************************************************************************
+ * kemari.h
+ *
+ * Tools interface to Kemari.
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef __XEN_PUBLIC_KEMARI_H__
+#define __XEN_PUBLIC_KEMARI_H__
+
+#define KEMARI_TAP_OFF 0
+#define KEMARI_TAP_IN 1
+#define KEMARI_TAP_OUT 2
+
+/*
+ * Header of the ring shared between Xen and the tools.
+ * cons and prod are free-running counters; the KEMARI_RING_GET_* macros
+ * reduce them modulo num_ents to index the kemari_ent array that starts at
+ * data.
+ */
+struct kemari_ring {
+ uint32_t cons;
+ uint32_t prod;
+ uint32_t num_ents;
+ unsigned int dirty_bitmap_size; /* num of dirty bits */
+ /* Location and size of the HVM context save area inside the ring pages. */
+ struct {
+ uint32_t buf_size;
+ uint32_t rec_size;
+ uint32_t buf_offset;
+ } hvm_ctxt;
+ /* First byte of the entry array; accessed as struct kemari_ent[]. */
+ char data[1];
+};
+
+/*
+ * One ring slot. The same slot is interpreted as a header, as an index
+ * range, or as one word of dirty bitmap, depending on its position in the
+ * stream.
+ */
+struct kemari_ent {
+ union {
+ /* NOTE(review): not written by the hypervisor code in this patch. */
+ struct {
+ uint16_t pages;
+ uint16_t port;
+ } header;
+ /* Word range [start, end) of the bitmap words that follow. */
+ struct {
+ uint16_t start;
+ uint16_t end;
+ } index;
+ unsigned long dirty_bitmap;
+ } u;
+};
+
+/*
+ * Address of the ring entry at the producer / consumer position.
+ * The counter is reduced modulo num_ents; _ring is evaluated several times.
+ */
+#define KEMARI_RING_GET_PROD(_ring) \
+ (&((struct kemari_ent *)(_ring)->data)[(_ring)->prod % (_ring)->num_ents])
+
+#define KEMARI_RING_GET_CONS(_ring) \
+ (&((struct kemari_ent *)(_ring)->data)[(_ring)->cons % (_ring)->num_ents])
+
+/*
+ * Consume one entry: store in *buf a pointer to the slot at the consumer
+ * position, issue a write barrier (wmb() in Xen, xen_wmb() in the tools,
+ * none otherwise), then advance cons.
+ * NOTE(review): the slot is returned by pointer, not copied, so the caller
+ * reads it after cons has already moved on - confirm the producer cannot
+ * reuse the slot in the meantime.
+ */
+static inline void kemari_ring_read(struct kemari_ring *ring,
+ struct kemari_ent **buf)
+{
+ *buf = KEMARI_RING_GET_CONS(ring);
+#ifdef __XEN__
+ wmb();
+#elif __XEN_TOOLS__
+ xen_wmb();
+#endif
+ ring->cons++;
+}
+
+/*
+ * Produce one entry: copy *buf into the slot at the producer position,
+ * issue a write barrier so the payload is visible before the counter, then
+ * advance prod. No check is made against cons.
+ */
+static inline void kemari_ring_write(struct kemari_ring *ring,
+ struct kemari_ent *buf)
+{
+ memcpy(KEMARI_RING_GET_PROD(ring), buf, sizeof(struct kemari_ent));
+#ifdef __XEN__
+ wmb();
+#elif __XEN_TOOLS__
+ xen_wmb();
+#endif
+ ring->prod++;
+}
+
+#endif /* __XEN_PUBLIC_KEMARI_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r b249f3e979a5 -r cf6a910e3663 xen/include/public/domctl.h
--- a/xen/include/public/domctl.h Mon Mar 09 10:32:24 2009 +0000
+++ b/xen/include/public/domctl.h Wed Mar 11 18:03:47 2009 +0900
@@ -645,6 +645,38 @@
} xen_domctl_hvmcontext_partial_t;
DEFINE_XEN_GUEST_HANDLE(xen_domctl_hvmcontext_partial_t);
+/* Kemari interface */
+#define XEN_DOMCTL_kemari_op 56
+
+#define _XEN_KEMARI_OP_enable 0
+#define XEN_KEMARI_OP_enable (1UL<<_XEN_KEMARI_OP_enable)
+#define _XEN_KEMARI_OP_off 1
+#define XEN_KEMARI_OP_off (1UL<<_XEN_KEMARI_OP_off)
+#define _XEN_KEMARI_OP_attach 2
+#define XEN_KEMARI_OP_attach (1UL<<_XEN_KEMARI_OP_attach)
+#define _XEN_KEMARI_OP_detach 3
+#define XEN_KEMARI_OP_detach (1UL<<_XEN_KEMARI_OP_detach)
+
+/*
+ * Argument of XEN_DOMCTL_kemari_op. cmd selects one of XEN_KEMARI_OP_*;
+ * the union member named after the command carries its parameters.
+ * NOTE(review): mfn is a plain uint64_t - check whether this should follow
+ * the alignment convention used by other 64-bit domctl fields.
+ */
+struct xen_domctl_kemari_op {
+ uint32_t cmd;
+
+ union {
+ /* Filled in by the hypervisor on a successful enable. */
+ struct {
+ uint32_t port;
+ uint32_t num_pages;
+ uint64_t mfn;
+ } enable; /* XEN_KEMARI_OP_enable */
+ /* Port to tap and the KEMARI_TAP_* mode to apply. */
+ struct {
+ uint32_t port;
+ uint16_t evtchn_tap_mode;
+ } attach; /* XEN_KEMARI_OP_attach */
+ /* Port whose tap is removed. */
+ struct {
+ uint32_t port;
+ } detach; /* XEN_KEMARI_OP_detach */
+ } u;
+};
+typedef struct xen_domctl_kemari_op xen_domctl_kemari_op_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_kemari_op_t);
struct xen_domctl {
uint32_t cmd;
@@ -687,6 +719,7 @@
struct xen_domctl_set_target set_target;
struct xen_domctl_subscribe subscribe;
struct xen_domctl_debug_op debug_op;
+ struct xen_domctl_kemari_op kemari_op;
#if defined(__i386__) || defined(__x86_64__)
struct xen_domctl_cpuid cpuid;
#endif
diff -r b249f3e979a5 -r cf6a910e3663 xen/include/public/io/xenbus.h
--- a/xen/include/public/io/xenbus.h Mon Mar 09 10:32:24 2009 +0000
+++ b/xen/include/public/io/xenbus.h Wed Mar 11 18:03:47 2009 +0900
@@ -63,7 +63,9 @@
*/
XenbusStateReconfiguring = 7,
- XenbusStateReconfigured = 8
+ XenbusStateReconfigured = 8,
+
+ XenbusStateAttached = 9
};
typedef enum xenbus_state XenbusState;
diff -r b249f3e979a5 -r cf6a910e3663 xen/include/xen/kemari.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/xen/kemari.h Wed Mar 11 18:03:47 2009 +0900
@@ -0,0 +1,75 @@
+/******************************************************************************
+ * kemari.h
+ *
+ * Kemari header file.
+ *
+ * Copyright (C) 2008 Nippon Telegraph and Telephone Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef __XEN_KEMARI_H__
+#define __XEN_KEMARI_H__
+
+#include <public/domctl.h>
+
+#define NUM_KEMARI_TAPS 32
+
+#define _KEMARI_TAP_ATTACHED 0
+#define KEMARI_TAP_ATTACHED (1UL<<_KEMARI_TAP_ATTACHED)
+#define _KEMARI_TAP_DETACHED 1
+#define KEMARI_TAP_DETACHED (1UL<<_KEMARI_TAP_DETACHED)
+
+/* Per-port tap bookkeeping, indexed by event channel port. */
+struct kemari_tap {
+ /* KEMARI_TAP_ATTACHED or KEMARI_TAP_DETACHED; zero until first attach. */
+ uint64_t status;
+ /* Number of tapped events seen in the IN direction. */
+ uint64_t in_events;
+ /* Number of tapped events seen in the OUT direction. */
+ uint64_t out_events;
+};
+
+/* Main data structure of Kemari */
+/* Main data structure of Kemari */
+struct kemari {
+ /* Domain this instance belongs to. */
+ struct domain *domain;
+
+ /* Xen-heap ring shared with privileged guests. */
+ struct kemari_ring *ring;
+
+ /* Xen event channel port used to notify the tools. */
+ uint32_t port;
+
+ /* Number of pages backing the ring. */
+ uint32_t num_pages;
+
+ /* Machine frame number of the first ring page. */
+ uint64_t mfn;
+
+ /* NOTE(review): not referenced by the code in this patch. */
+ uint64_t num_events;
+
+ /* NOTE(review): not referenced by the code in this patch. */
+ uint64_t priv_dirty_pages;
+
+ /* Tap state, indexed by port; only NUM_KEMARI_TAPS ports are covered. */
+ struct kemari_tap taps[NUM_KEMARI_TAPS];
+};
+
+long kemari_off(struct domain *d);
+
+/* Entry point to Kemari */
+long do_kemari_op(struct domain *d, struct xen_domctl_kemari_op *kemari_op);
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r b249f3e979a5 -r cf6a910e3663 xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile Mon Mar 09 10:32:24 2009 +0000
+++ b/xen/arch/x86/Makefile Wed Mar 11 18:03:47 2009 +0900
@@ -4,6 +4,7 @@
subdir-y += hvm
subdir-y += mm
subdir-y += oprofile
+subdir-y += kemari
subdir-$(x86_32) += x86_32
subdir-$(x86_64) += x86_64
diff -r b249f3e979a5 -r cf6a910e3663 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c Mon Mar 09 10:32:24 2009 +0000
+++ b/xen/arch/x86/domain.c Wed Mar 11 18:03:47 2009 +0900
@@ -1912,6 +1912,10 @@
BUG();
}
+ /* Turn off Kemari. */
+ if ( d->kemari )
+ kemari_off(d);
+
if ( is_hvm_domain(d) )
hvm_domain_relinquish_resources(d);
diff -r b249f3e979a5 -r cf6a910e3663 xen/arch/x86/domctl.c
--- a/xen/arch/x86/domctl.c Mon Mar 09 10:32:24 2009 +0000
+++ b/xen/arch/x86/domctl.c Wed Mar 11 18:03:47 2009 +0900
@@ -20,6 +20,7 @@
#include <xen/trace.h>
#include <xen/console.h>
#include <xen/iocap.h>
+#include <xen/kemari.h>
#include <xen/paging.h>
#include <asm/irq.h>
#include <asm/hvm/hvm.h>
@@ -1079,6 +1080,21 @@
}
break;
+ case XEN_DOMCTL_kemari_op:
+ {
+ struct domain *d = rcu_lock_domain_by_id(domctl->domain);
+
+ ret = -ESRCH;
+ if ( unlikely(d == NULL) )
+ break;
+
+ ret = do_kemari_op(d, &domctl->u.kemari_op);
+
+ copy_to_guest(u_domctl, domctl, 1);
+ rcu_unlock_domain(d);
+ }
+ break;
+
default:
ret = -ENOSYS;
break;
diff -r b249f3e979a5 -r cf6a910e3663 xen/arch/x86/kemari/Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/kemari/Makefile Wed Mar 11 18:03:47 2009 +0900
@@ -0,0 +1,1 @@
+obj-y += kemari.o
diff -r b249f3e979a5 -r cf6a910e3663 xen/arch/x86/kemari/kemari.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/kemari/kemari.c Wed Mar 11 18:03:47 2009 +0900
@@ -0,0 +1,670 @@
+/******************************************************************************
+ * kemari.c
+ *
+ * The hypervisor part of VM synchronization mechanism (Kemari).
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Copied log_dirty_lock(_d), log_dirty_unlock(_d) and paging_log_dirty_op()
+ * from arch/x86/paging.c.
+ *
+ * x86 specific paging support
+ * Copyright (c) 2007 Advanced Micro Devices (Wei Huang)
+ * Copyright (c) 2007 XenSource Inc.
+ */
+
+#include <xen/config.h>
+#include <xen/errno.h>
+#include <xen/sched.h>
+#include <xen/event.h>
+#include <xen/kemari.h>
+#include <xen/mm.h>
+#include <xen/domain.h>
+
+#include <public/kemari.h>
+#include <asm/domain.h>
+#include <asm/hvm/support.h>
+#include <asm/page.h>
+#include <asm/paging.h>
+#include <asm/shadow.h>
+#include <asm/types.h>
+
+/* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
+
+/*
+ * Acquire the domain's log-dirty lock and record the owning CPU and
+ * function. Calls BUG() if the current CPU is already recorded as holder.
+ */
+#define log_dirty_lock(_d) \
+ do { \
+ if (unlikely((_d)->arch.paging.log_dirty.locker==current->processor))\
+ { \
+ printk("Error: paging log dirty lock held by %s\n", \
+ (_d)->arch.paging.log_dirty.locker_function); \
+ BUG(); \
+ } \
+ spin_lock(&(_d)->arch.paging.log_dirty.lock); \
+ ASSERT((_d)->arch.paging.log_dirty.locker == -1); \
+ (_d)->arch.paging.log_dirty.locker = current->processor; \
+ (_d)->arch.paging.log_dirty.locker_function = __func__; \
+ } while (0)
+
+/*
+ * Release the log-dirty lock taken by log_dirty_lock(), clearing the
+ * recorded owner first. Asserts that the current CPU is the holder.
+ */
+#define log_dirty_unlock(_d) \
+ do { \
+ ASSERT((_d)->arch.paging.log_dirty.locker == current->processor); \
+ (_d)->arch.paging.log_dirty.locker = -1; \
+ (_d)->arch.paging.log_dirty.locker_function = "nobody"; \
+ spin_unlock(&(_d)->arch.paging.log_dirty.lock); \
+ } while (0)
+
+/*
+ * Event channel lookup helpers, defined locally in this file.
+ * bucket_from_port: bucket holding port p of domain d.
+ * port_is_valid: p is within range and its bucket has been allocated.
+ * evtchn_from_port: pointer to the struct evtchn for port p; only
+ * meaningful when port_is_valid(d, p) holds.
+ * The arguments are evaluated more than once.
+ */
+#define bucket_from_port(d,p) \
+ ((d)->evtchn[(p)/EVTCHNS_PER_BUCKET])
+#define port_is_valid(d,p) \
+ (((p) >= 0) && ((p) < MAX_EVTCHNS(d)) && \
+ (bucket_from_port(d,p) != NULL))
+#define evtchn_from_port(d,p) \
+ (&(bucket_from_port(d,p))[(p)&(EVTCHNS_PER_BUCKET-1)])
+
+/*
+ * Save the domain's HVM context into the save area at the tail of the ring
+ * and record the number of bytes written in ring->hvm_ctxt.rec_size.
+ * Does nothing except log an error when the domain is not paused by the
+ * controller. The result of hvm_save() is not checked.
+ */
+static void kemari_send_domaininfo_ctxt(struct kemari_ring *ring,
+                                        struct domain *d)
+{
+    struct hvm_domain_context save;
+
+    if ( d->is_paused_by_controller )
+    {
+        save.cur = 0;
+        save.size = ring->hvm_ctxt.buf_size;
+        save.data = (uint8_t *)ring + ring->hvm_ctxt.buf_offset;
+        hvm_save(d, &save);
+        ring->hvm_ctxt.rec_size = save.cur;
+        return;
+    }
+
+    dprintk(XENLOG_ERR, "Domain isn't paused\n");
+}
+
+/*
+ * Emit the non-zero words of one leaf page of the dirty bitmap to the ring.
+ *
+ * Scans the first bytes / BYTES_PER_LONG words of dirty_bitmap. For every
+ * run of consecutive non-zero words it writes one index entry holding the
+ * word range [start, end), offset by index, followed by one entry per word
+ * of the run.
+ *
+ * The parameter d is unused. The return value is the final loop counter and
+ * is never negative.
+ * NOTE(review): entries are produced without comparing prod against cons,
+ * so nothing here stops the producer from lapping the consumer - confirm
+ * the ring is always large enough.
+ */
+static long kemari_send_dirty_bitmap_page(struct kemari_ring *ring,
+ struct domain *d,
+ unsigned long *dirty_bitmap,
+ uint16_t index, unsigned int bytes)
+{
+ uint16_t i, j;
+ struct kemari_ent *buf;
+
+ for ( i = 0; i < bytes / BYTES_PER_LONG; i++ )
+ {
+ j = i;
+
+ /* Advance j to the end of the run of non-zero words starting at i. */
+ while ( (j < bytes / BYTES_PER_LONG) && (dirty_bitmap[j] != 0) )
+ j++;
+
+ if ( i == j )
+ continue;
+
+ /* Index entry is filled in place, then published by bumping prod. */
+ buf = KEMARI_RING_GET_PROD(ring);
+ buf->u.index.start = i + index;
+ buf->u.index.end = j + index;
+ wmb();
+ ring->prod++;
+
+ /* Each bitmap word is copied into the ring as one entry. */
+ while( i < j )
+ {
+ buf = (struct kemari_ent *)&dirty_bitmap[i];
+ kemari_ring_write(ring, buf);
+ i++;
+ }
+ }
+ return i;
+}
+
+/* Based on paging_log_dirty_op() in xen/arch/x86/mm/paging.c. */
+/*
+ * Walk the domain's log-dirty bitmap tree, send every leaf page through
+ * kemari_send_dirty_bitmap_page(), and clear the leaves afterwards.
+ *
+ * Runs under the log-dirty lock. Returns 0 on success, -EINVAL when the
+ * top-level bitmap page is not valid, -ENOMEM when earlier bitmap
+ * allocations failed, and -EFAULT when sending a leaf reports an error.
+ *
+ * clean and peek are locals fixed to 1, so both the "send" and the "clear"
+ * branches always run.
+ * NOTE(review): the goto out inside the loops skips the unmap_domain_page()
+ * calls for l1..l4.
+ * NOTE(review): index is a uint16_t advanced by PAGE_SIZE / BYTES_PER_LONG
+ * per leaf and will wrap for sufficiently large bitmaps.
+ */
+/* Based on paging_log_dirty_op() in xen/arch/x86/mm/paging.c. */
+static long kemari_send_dirty_bitmap(struct kemari_ring *ring,
+ struct domain *d)
+{
+ long ret = 0, clean = 1, peek = 1;
+ unsigned long pages = 0;
+ unsigned long p2m_size;
+ mfn_t *l4, *l3, *l2;
+ unsigned long *l1;
+ int i4, i3, i2;
+ uint16_t index = 0;
+
+ log_dirty_lock(d);
+
+ if ( clean )
+ {
+ d->arch.paging.log_dirty.fault_count = 0;
+ d->arch.paging.log_dirty.dirty_count = 0;
+ }
+
+ if ( !mfn_valid(d->arch.paging.log_dirty.top) )
+ {
+ ret = -EINVAL; /* perhaps should be ENOMEM? */
+ goto out;
+ }
+
+ if ( unlikely(d->arch.paging.log_dirty.failed_allocs) ) {
+ printk("%s: %d failed page allocs while logging dirty pages\n",
+ __FUNCTION__, d->arch.paging.log_dirty.failed_allocs);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ pages = 0;
+ l4 = map_domain_page(mfn_x(d->arch.paging.log_dirty.top));
+
+ p2m_size = domain_get_maximum_gpfn(d) + 1;
+
+ /* Missing intermediate nodes are walked as NULL so that pages and index
+ * still advance over the range they would have covered. */
+ for ( i4 = 0;
+ (pages < p2m_size) && (i4 < LOGDIRTY_NODE_ENTRIES);
+ i4++ )
+ {
+ l3 = mfn_valid(l4[i4]) ? map_domain_page(mfn_x(l4[i4])) : NULL;
+ for ( i3 = 0;
+ (pages < p2m_size) && (i3 < LOGDIRTY_NODE_ENTRIES);
+ i3++ )
+ {
+ l2 = ((l3 && mfn_valid(l3[i3])) ?
+ map_domain_page(mfn_x(l3[i3])) : NULL);
+ for ( i2 = 0;
+ (pages < p2m_size) && (i2 < LOGDIRTY_NODE_ENTRIES);
+ i2++ )
+ {
+ unsigned int bytes = PAGE_SIZE;
+ l1 = ((l2 && mfn_valid(l2[i2])) ?
+ map_domain_page(mfn_x(l2[i2])) : NULL);
+ /* Last leaf: only cover the remaining part of the p2m. */
+ if ( unlikely(((p2m_size - pages + 7) >> 3) < bytes) )
+ bytes = (unsigned int)((p2m_size - pages +
+ BITS_PER_LONG - 1) >> 3);
+ if ( likely(peek) )
+ {
+ if ( l1 != NULL &&
+ kemari_send_dirty_bitmap_page(ring, d, l1,
+ index, bytes) < 0 )
+ {
+ ret = -EFAULT;
+ dprintk(XENLOG_ERR,
+ "%s: kemari_send_dirty_bitmap_page\n",
+ __FUNCTION__);
+ goto out;
+ }
+ }
+ /* index counts bitmap words, one full leaf page at a time. */
+ index += PAGE_SIZE / BYTES_PER_LONG;
+
+ if ( clean && l1 != NULL )
+ clear_page(l1);
+ pages += bytes << 3;
+ if ( l1 != NULL )
+ unmap_domain_page(l1);
+ }
+ if ( l2 )
+ unmap_domain_page(l2);
+ }
+ if ( l3 )
+ unmap_domain_page(l3);
+ }
+ unmap_domain_page(l4);
+
+ log_dirty_unlock(d);
+
+ if ( clean )
+ {
+ /* We need to further call clean_dirty_bitmap() functions of specific
+ * paging modes (shadow or hap). Safe because the domain is paused. */
+ d->arch.paging.log_dirty.clean_dirty_bitmap(d);
+ }
+
+ return ret;
+
+ out:
+ log_dirty_unlock(d);
+
+ return ret;
+}
+
+/* Signal the tools over the Kemari event channel; no-op for a NULL kemari. */
+static void kemari_guest_notify(struct kemari *kemari)
+{
+    if ( unlikely(kemari == NULL) )
+        return;
+
+    notify_via_xen_evtchn_tap(kemari->domain, kemari->port);
+}
+
+/* VM synchronization entry point. */
+/*
+ * VM synchronization entry point, installed as the tap.redirect callback.
+ *
+ * lchn is the sending end of a tapped channel and rchn its peer. If lchn is
+ * tapped for OUT, the current domain is synchronized; otherwise, if rchn is
+ * tapped for IN, the remote domain is. With neither flag set the call is a
+ * no-op returning 0.
+ *
+ * For the chosen domain: pause it, bump the per-port event counter, send the
+ * dirty bitmap through the ring, notify the tools and prepare the current
+ * vcpu to wait on the Kemari event channel.
+ *
+ * Returns 0 on success, -EINVAL when the domain has no Kemari state or the
+ * port has no slot in taps[] (checked before pausing, so the domain is left
+ * running), or the error from kemari_send_dirty_bitmap().
+ * NOTE(review): when sending the bitmap fails the domain stays paused and
+ * the tools are not notified - confirm who is expected to unpause it.
+ */
+static long run_kemari(struct evtchn *lchn, struct evtchn *rchn)
+{
+    long ret;
+    uint32_t port;
+    uint64_t *events;
+    struct domain *d, *rd = lchn->u.interdomain.remote_dom;
+    struct kemari *kemari;
+    struct kemari_ring *ring;
+    struct evtchn *kemari_evtchn;
+    int tap_out;
+
+    /* In both cases port is the tapped domain's own port number. */
+    if ( lchn->tap.mode & KEMARI_TAP_OUT )
+    {
+        tap_out = 1;
+        d = current->domain;
+        port = rchn->u.interdomain.remote_port;
+    }
+    else if ( rchn->tap.mode & KEMARI_TAP_IN )
+    {
+        tap_out = 0;
+        d = rd;
+        port = lchn->u.interdomain.remote_port;
+    }
+    else
+    {
+        ret = 0;
+        goto out;
+    }
+
+    /* taps[] only has NUM_KEMARI_TAPS slots and Kemari may be off. */
+    kemari = d->kemari;
+    ret = -EINVAL;
+    if ( unlikely(kemari == NULL) || unlikely(port >= NUM_KEMARI_TAPS) )
+        goto out;
+
+    if ( tap_out )
+    {
+        domain_pause_for_debugger();
+        events = &kemari->taps[port].out_events;
+    }
+    else
+    {
+        domain_pause_by_systemcontroller(d);
+        events = &kemari->taps[port].in_events;
+    }
+
+    spin_lock(&d->grant_table->lock);
+
+    ++*events;
+
+    /* Deliver the completion notification to the vcpu that will wait. */
+    kemari_evtchn = evtchn_from_port(d, kemari->port);
+    if ( kemari_evtchn->notify_vcpu_id != current->vcpu_id )
+        kemari_evtchn->notify_vcpu_id = current->vcpu_id;
+
+    ring = kemari->ring;
+
+    ret = kemari_send_dirty_bitmap(ring, d);
+    if ( ret < 0 )
+        goto unlock_out;
+
+    kemari_guest_notify(kemari);
+
+    prepare_wait_on_xen_event_channel(kemari->port);
+
+    test_and_clear_bit(_VPF_blocked_in_xen, &current->pause_flags);
+
+    ret = 0;
+
+ unlock_out:
+    spin_unlock(&d->grant_table->lock);
+
+ out:
+    return ret;
+}
+
+/*
+ * Tap the channel named by the attach request on domain d, installing
+ * run_kemari as the redirect callback. Returns evtchn_bind_tap()'s result.
+ */
+static long kemari_bind_tap(struct domain *d,
+                            struct xen_domctl_kemari_op *kemari_op)
+{
+    struct evtchn_bind_tap args;
+
+    args.redirect = run_kemari;
+    args.mode = kemari_op->u.attach.evtchn_tap_mode;
+    args.tap_port = kemari_op->u.attach.port;
+    args.tap_dom = d->domain_id;
+
+    return evtchn_bind_tap(&args);
+}
+
+/*
+ * Remove the tap from the channel named by the detach request on domain d,
+ * resetting its mode to KEMARI_TAP_OFF. Returns evtchn_unbind_tap()'s result.
+ */
+static long kemari_unbind_tap(struct domain *d,
+                              struct xen_domctl_kemari_op *kemari_op)
+{
+    struct evtchn_bind_tap args;
+
+    args.mode = KEMARI_TAP_OFF;
+    args.tap_port = kemari_op->u.detach.port;
+    args.tap_dom = d->domain_id;
+
+    return evtchn_unbind_tap(&args);
+}
+
+/*
+ * Handle XEN_KEMARI_OP_attach: tap the requested port of domain d and mark
+ * its slot in kemari->taps[] as attached.
+ *
+ * Returns 0 on success, -EINVAL when Kemari is not enabled for d or the
+ * port has no slot in taps[], or the error from kemari_bind_tap().
+ */
+static long kemari_attach(struct domain *d,
+                          struct xen_domctl_kemari_op *kemari_op)
+{
+    long ret;
+    uint32_t port = kemari_op->u.attach.port;
+    struct kemari *kemari = d->kemari;
+    struct kemari_tap *tap;
+
+    dprintk(XENLOG_DEBUG, "%s: in\n", __FUNCTION__);
+
+    ret = -EINVAL;
+    if ( unlikely(kemari == NULL) )
+    {
+        dprintk(XENLOG_ERR, "kemari is off\n");
+        goto out;
+    }
+
+    /* The port comes from the caller and indexes taps[] below. */
+    if ( unlikely(port >= NUM_KEMARI_TAPS) )
+    {
+        dprintk(XENLOG_ERR, "tap port=%u out of range\n", port);
+        goto out;
+    }
+
+    dprintk(XENLOG_DEBUG, "%s: kemari_bind_tap\n", __FUNCTION__);
+    ret = kemari_bind_tap(d, kemari_op);
+    if ( ret < 0 )
+    {
+        dprintk(XENLOG_ERR,
+                "couldn't bind evtchn tap port=%u\n", port);
+        goto out;
+    }
+
+    tap = &kemari->taps[port];
+
+    tap->status = KEMARI_TAP_ATTACHED;
+
+ out:
+    dprintk(XENLOG_DEBUG, "%s: out\n", __FUNCTION__);
+    return ret;
+}
+
+/*
+ * Handle XEN_KEMARI_OP_detach: remove the tap from the requested port of
+ * domain d and mark its slot in kemari->taps[] as detached.
+ *
+ * Returns 0 on success, -EINVAL when Kemari is not enabled for d, the port
+ * has no slot in taps[], or the port is not currently attached, or the
+ * error from kemari_unbind_tap().
+ */
+static long kemari_detach(struct domain *d,
+                          struct xen_domctl_kemari_op *kemari_op)
+{
+    long ret;
+    uint32_t port = kemari_op->u.detach.port;
+    struct kemari *kemari = d->kemari;
+    struct kemari_tap *tap;
+
+    ret = -EINVAL;
+    if ( unlikely(kemari == NULL) )
+    {
+        dprintk(XENLOG_ERR, "kemari is off\n");
+        goto out;
+    }
+
+    /* The port comes from the caller and indexes taps[] below. */
+    if ( unlikely(port >= NUM_KEMARI_TAPS) )
+        goto out;
+
+    /* Only form the slot pointer once kemari and port are known good. */
+    tap = &kemari->taps[port];
+    if ( unlikely(tap->status != KEMARI_TAP_ATTACHED) )
+        goto out;
+
+    ret = kemari_unbind_tap(d, kemari_op);
+    if ( ret < 0 )
+        goto out;
+
+    tap->status = KEMARI_TAP_DETACHED;
+
+ out:
+    return ret;
+}
+
+/* Share every page of the Kemari ring, writable, with privileged guests. */
+static void share_kemari_page_with_privileged_guests(struct kemari *kemari)
+{
+    struct page_info *first = virt_to_page(kemari->ring);
+    int n = 0;
+
+    while ( n < kemari->num_pages )
+    {
+        share_xen_page_with_privileged_guests(first + n, XENSHARE_writable);
+        n++;
+    }
+}
+
+/*
+ * Drop the allocation reference on every ring page that still has
+ * _PGC_allocated set. Each page must be owned by dom_xen.
+ */
+static void unshare_kemari_page_with_privileged_guests(struct kemari *kemari)
+{
+    int idx;
+
+    for ( idx = 0; idx < kemari->num_pages; idx++ )
+    {
+        struct page_info *pg = mfn_to_page(kemari->mfn + idx);
+
+        BUG_ON(page_get_owner(pg) != dom_xen);
+
+        if ( !test_and_clear_bit(_PGC_allocated, &pg->count_info) )
+            continue;
+
+        put_page(pg);
+    }
+}
+
+/*
+ * Release the Kemari event channel and ring pages of domain d and reset the
+ * corresponding fields. Returns without doing anything if the ring, page
+ * count or port is unset.
+ */
+static void kemari_free_ring(struct domain *d)
+{
+    struct kemari *k = d->kemari;
+
+    if ( k->ring == NULL || k->num_pages == 0 || k->port == 0 )
+        return;
+
+    free_xen_event_channel(d->vcpu[0], k->port);
+
+    unshare_kemari_page_with_privileged_guests(k);
+
+    free_xenheap_pages(k->ring, get_order_from_pages(k->num_pages));
+
+    k->port = 0;
+    k->num_pages = 0;
+    k->ring = NULL;
+    k->mfn = 0;
+}
+
+/*
+ * Allocate the Kemari event channel and shared ring for domain d and record
+ * them in *kemari.
+ *
+ * The ring is a power-of-two number of Xen-heap pages, zeroed, with the HVM
+ * context save area placed at its tail and the entry array in between. The
+ * pages are shared writable with privileged guests.
+ *
+ * Returns 0 on success, a negative value from
+ * alloc_unbound_xen_event_channel(), -EINVAL when the domain has no dirty
+ * bitmap, or -ENOMEM when the pages cannot be allocated. On failure the
+ * event channel is freed again.
+ * NOTE(review): dirty_bitmap_size is already in bytes and is shifted right
+ * by 3 again when sizing the ring - confirm the intended sizing.
+ */
+static long kemari_alloc_ring(struct domain *d, struct kemari *kemari)
+{
+    long ret;
+    unsigned int order;
+    unsigned long num_pages;
+    domid_t current_domid = current->domain->domain_id;
+    struct vcpu *v = d->vcpu[0];
+    struct kemari_ring *ring;
+    unsigned long dirty_bitmap_size;
+    uint32_t hvm_buf_size;
+
+    ret = alloc_unbound_xen_event_channel(v, current_domid);
+    if ( ret < 0 )
+    {
+        dprintk(XENLOG_ERR, "couldn't alloc xen_event_channel\n");
+        goto out;
+    }
+    kemari->port = ret;
+
+    dirty_bitmap_size = (BITS_TO_LONGS(domain_get_maximum_gpfn(d) + 1)
+                         * sizeof(unsigned long));
+
+    ret = -EINVAL;
+    if ( dirty_bitmap_size == 0 || !mfn_valid(d->arch.paging.log_dirty.top) )
+    {
+        dprintk(XENLOG_ERR, "dirty_bitmap is EMPTY\n");
+        goto out_evtchn;
+    }
+
+    hvm_buf_size = hvm_save_size(d);
+    num_pages = (sizeof(struct kemari_ring)
+                 + hvm_buf_size
+                 + (dirty_bitmap_size >> 3)
+                 + PAGE_SIZE - 1) / PAGE_SIZE;
+    /* Round up to the power of two that the allocator will hand out. */
+    order = get_order_from_pages(num_pages);
+    num_pages = (1UL << order);
+
+    /* Casts make each argument match its conversion specifier exactly. */
+    dprintk(XENLOG_DEBUG, "ring=%u, bitmap=%lu, ctxt=%u, PAGE=%ld\n",
+            (unsigned int)sizeof(struct kemari_ring), dirty_bitmap_size / 8,
+            hvm_buf_size, (long)PAGE_SIZE);
+
+    ret = -ENOMEM;
+    ring = alloc_xenheap_pages(order, 0);
+    if ( ring == NULL )
+    {
+        dprintk(XENLOG_ERR, "couldn't alloc xenheap_pages\n");
+        goto out_evtchn;
+    }
+    memset(ring, 0, PAGE_SIZE * num_pages);
+
+    /* Entries fill the space between the header and the HVM save area. */
+    ring->num_ents =
+        (PAGE_SIZE * num_pages - hvm_buf_size + (long)ring - (long)ring->data)
+        / sizeof(struct kemari_ent);
+    ring->hvm_ctxt.buf_size = hvm_buf_size;
+    ring->hvm_ctxt.buf_offset = PAGE_SIZE * num_pages - hvm_buf_size;
+
+    kemari->num_pages = num_pages;
+    kemari->mfn = virt_to_mfn(ring);
+    kemari->ring = ring;
+
+    share_kemari_page_with_privileged_guests(kemari);
+
+    dprintk(XENLOG_DEBUG, "num_ents=%u, num_pages=%u\n",
+            ring->num_ents, kemari->num_pages);
+
+    return 0;
+
+ out_evtchn:
+    free_xen_event_channel(v, kemari->port);
+ out:
+    return ret;
+}
+
+/*
+ * Handle XEN_KEMARI_OP_enable: create the Kemari state for domain d.
+ *
+ * With the domain paused, allocates struct kemari and its ring, reports the
+ * port, first mfn and page count back through kemari_op->u.enable, attaches
+ * the state to d and saves the initial HVM context into the ring.
+ *
+ * Returns 0 on success, -EBUSY if Kemari is already enabled for d, -ENOMEM
+ * if the state cannot be allocated, or the error from kemari_alloc_ring().
+ */
+static long kemari_enable(struct domain *d,
+                          struct xen_domctl_kemari_op *kemari_op)
+{
+    long ret;
+    struct kemari *kemari;
+
+    ret = -EBUSY;
+    if ( unlikely(d->kemari != NULL) )
+    {
+        dprintk(XENLOG_ERR, "kemari already enabled\n");
+        goto out;
+    }
+
+    ret = -ENOMEM;
+    kemari = xmalloc_bytes(sizeof(struct kemari));
+    if ( kemari == NULL )
+    {
+        dprintk(XENLOG_ERR, "couldn't alloc kemari\n");
+        goto out;
+    }
+
+    memset(kemari, 0, sizeof(struct kemari));
+
+    domain_pause_by_systemcontroller(d);
+
+    ret = kemari_alloc_ring(d, kemari);
+    if ( ret < 0 )
+        goto kemari_free;
+
+    kemari_op->u.enable.port = kemari->port;
+    kemari_op->u.enable.mfn = kemari->mfn;
+    kemari_op->u.enable.num_pages = kemari->num_pages;
+
+    /* uint64_t is not unsigned long long on every build; cast for %llu. */
+    dprintk(XENLOG_DEBUG, "port=%u, mfn=%llu\n", kemari->port,
+            (unsigned long long)kemari->mfn);
+
+    kemari->domain = d;
+
+    d->kemari = kemari;
+
+    kemari_send_domaininfo_ctxt(kemari->ring, d);
+
+    domain_unpause_by_systemcontroller(d);
+
+    dprintk(XENLOG_DEBUG, "kemari enabled\n");
+    return 0;
+
+ kemari_free:
+    xfree(kemari);
+    domain_unpause_by_systemcontroller(d);
+ out:
+    return ret;
+}
+
+/*
+ * Tear down the Kemari state of domain d.
+ *
+ * With the domain paused, unbinds every tap recorded as attached in
+ * kemari->taps[], frees the ring and event channel, frees the state and
+ * clears d->kemari. A failed unbind is logged and otherwise ignored.
+ *
+ * Returns 0 on success, -EINVAL if Kemari is not enabled for d.
+ */
+long kemari_off(struct domain *d)
+{
+    uint32_t port;
+    struct kemari *kemari = d->kemari;
+    struct kemari_tap *tap;
+    struct evtchn_bind_tap unbind;
+
+    if ( unlikely(kemari == NULL) )
+    {
+        dprintk(XENLOG_ERR, "kemari already off\n");
+        return -EINVAL;
+    }
+
+    domain_pause_by_systemcontroller(d);
+
+    /*
+     * evtchn_unbind_tap() stores unbind.mode into the channel, so every
+     * field must be initialised rather than left as stack contents.
+     */
+    unbind.tap_dom = d->domain_id;
+    unbind.tap_port = 0;
+    unbind.mode = KEMARI_TAP_OFF;
+    unbind.redirect = NULL;
+
+    for ( port = 0; port < NUM_KEMARI_TAPS; port++ )
+    {
+        tap = &kemari->taps[port];
+
+        if ( (tap->status != KEMARI_TAP_ATTACHED) ||
+             (!port_is_valid(d, port)) )
+            continue;
+
+        unbind.tap_port = port;
+
+        if ( evtchn_unbind_tap(&unbind) < 0 )
+            dprintk(XENLOG_ERR,
+                    "couldn't unbind evtchn tap port=%u\n", port);
+    }
+
+    if ( kemari->ring )
+        kemari_free_ring(d);
+
+    xfree(kemari);
+
+    d->kemari = NULL;
+
+    domain_unpause_by_systemcontroller(d);
+
+    return 0;
+}
+
+/*
+ * Entry point for XEN_DOMCTL_kemari_op: dispatch kemari_op->cmd for domain d.
+ *
+ * All commands are serialised by one function-local spinlock. Requests that
+ * target the calling domain or dom0 are rejected.
+ *
+ * Returns the handler's result, or -EINVAL for a rejected target or an
+ * unknown command.
+ */
+long do_kemari_op(struct domain *d, struct xen_domctl_kemari_op *kemari_op)
+{
+    static DEFINE_SPINLOCK(lock);
+    long ret;
+
+    /* We don't support calling kemari by itself or dom0. */
+    if ( d == current->domain || d == dom0 )
+    {
+        dprintk(XENLOG_ERR, "can't attach kemari by itself or to dom0\n");
+        return -EINVAL;
+    }
+
+    spin_lock(&lock);
+
+    switch ( kemari_op->cmd )
+    {
+    case XEN_KEMARI_OP_enable:
+        ret = kemari_enable(d, kemari_op);
+        break;
+
+    case XEN_KEMARI_OP_off:
+        ret = kemari_off(d);
+        break;
+
+    case XEN_KEMARI_OP_attach:
+        ret = kemari_attach(d, kemari_op);
+        break;
+
+    case XEN_KEMARI_OP_detach:
+        ret = kemari_detach(d, kemari_op);
+        break;
+
+    default:
+        ret = -EINVAL;
+        break;
+    }
+
+    spin_unlock(&lock);
+
+    return ret;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
^ permalink raw reply [flat|nested] 14+ messages in thread* [RFC][PATCH 03/13] Kemari: change parameter type of xc_{set, get}_hvm_param
2009-03-12 1:14 [RFC][PATCH 00/13] Kemari: updated to the 3.4 unstable tree Yoshiaki Tamura
2009-03-12 1:15 ` [RFC][PATCH 01/13] Kemari: add ECS_TAP state to event channel Yoshiaki Tamura
2009-03-12 1:16 ` [RFC][PATCH 02/13] Kemari: core kemari code Yoshiaki Tamura
@ 2009-03-12 1:16 ` Yoshiaki Tamura
2009-03-12 1:17 ` [RFC][PATCH 04/13] Kemari: Kemari controller interface in libxc Yoshiaki Tamura
` (6 subsequent siblings)
9 siblings, 0 replies; 14+ messages in thread
From: Yoshiaki Tamura @ 2009-03-12 1:16 UTC (permalink / raw)
To: xen-devel
Cc: "柳澤佳里(yanagisawa yoshisato)",
Ian Pratt, ian.jackson, Keir Fraser, Stefano Stabellini
This is an updated version of the following patch. Modifies files
which use xc_{set,get}_hvm_param.
http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00370.html
Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp>
Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp>
---
tools/libxc/xc_cpuid_x86.c | 2 +-
tools/libxc/xc_domain.c | 4 ++--
tools/libxc/xc_domain_save.c | 10 +++++-----
tools/libxc/xc_resume.c | 2 +-
tools/python/xen/lowlevel/xc/xc.c | 2 +-
tools/xcutils/xc_save.c | 2 +-
6 files changed, 11 insertions(+), 11 deletions(-)
diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xc_domain.c
--- a/tools/libxc/xc_domain.c Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/libxc/xc_domain.c Wed Mar 11 18:03:47 2009 +0900
@@ -792,7 +792,7 @@
return do_domctl(xc_handle, &domctl);
}
-int xc_set_hvm_param(int handle, domid_t dom, int param, unsigned long value)
+int xc_set_hvm_param(int handle, domid_t dom, int param, uint64_t value)
{
DECLARE_HYPERCALL;
xen_hvm_param_t arg;
@@ -811,7 +811,7 @@
return rc;
}
-int xc_get_hvm_param(int handle, domid_t dom, int param, unsigned long *value)
+int xc_get_hvm_param(int handle, domid_t dom, int param, uint64_t *value)
{
DECLARE_HYPERCALL;
xen_hvm_param_t arg;
diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xc_cpuid_x86.c
--- a/tools/libxc/xc_cpuid_x86.c Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/libxc/xc_cpuid_x86.c Wed Mar 11 18:03:47 2009 +0900
@@ -167,7 +167,7 @@
int xc, domid_t domid, const unsigned int *input, unsigned int *regs)
{
char brand[13];
- unsigned long pae;
+ uint64_t pae;
int is_pae;
xc_get_hvm_param(xc, domid, HVM_PARAM_PAE_ENABLED, &pae);
diff -r b249f3e979a5 -r cf6a910e3663 tools/xcutils/xc_save.c
--- a/tools/xcutils/xc_save.c Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/xcutils/xc_save.c Wed Mar 11 18:03:47 2009 +0900
@@ -164,7 +164,7 @@
static int suspend(void)
{
- unsigned long sx_state = 0;
+ uint64_t sx_state = 0;
/* Cannot notify guest to shut itself down if it's in ACPI sleep state. */
if (si.flags & XCFLAGS_HVM)
diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/libxc/xc_domain_save.c Wed Mar 11 18:03:47 2009 +0900
@@ -1395,7 +1395,7 @@
chunk.id = -3;
xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
- (unsigned long *)&chunk.data);
+ &chunk.data);
if ( (chunk.data != 0) &&
write_exact(io_fd, &chunk, sizeof(chunk)) )
@@ -1406,7 +1406,7 @@
chunk.id = -4;
xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS,
- (unsigned long *)&chunk.data);
+ &chunk.data);
if ( (chunk.data != 0) &&
write_exact(io_fd, &chunk, sizeof(chunk)) )
@@ -1431,11 +1431,11 @@
/* Save magic-page locations. */
memset(magic_pfns, 0, sizeof(magic_pfns));
xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
- (unsigned long *)&magic_pfns[0]);
+ &magic_pfns[0]);
xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
- (unsigned long *)&magic_pfns[1]);
+ &magic_pfns[1]);
xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
- (unsigned long *)&magic_pfns[2]);
+ &magic_pfns[2]);
if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
{
PERROR("Error when writing to state file (7)");
diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xc_resume.c
--- a/tools/libxc/xc_resume.c Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/libxc/xc_resume.c Wed Mar 11 18:03:47 2009 +0900
@@ -27,7 +27,7 @@
/* HVM guests without PV drivers do not have a return code to modify. */
if ( info.hvm )
{
- unsigned long irq = 0;
+ uint64_t irq = 0;
xc_get_hvm_param(xc_handle, domid, HVM_PARAM_CALLBACK_IRQ, &irq);
if ( !irq )
return 0;
diff -r b249f3e979a5 -r cf6a910e3663 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/python/xen/lowlevel/xc/xc.c Wed Mar 11 18:03:47 2009 +0900
@@ -490,7 +490,7 @@
{
uint32_t dom;
int param;
- unsigned long value;
+ uint64_t value;
static char *kwd_list[] = { "domid", "param", NULL };
if ( !PyArg_ParseTupleAndKeywords(args, kwds, "ii", kwd_list,
^ permalink raw reply [flat|nested] 14+ messages in thread* [RFC][PATCH 04/13] Kemari: Kemari controller interface in libxc
2009-03-12 1:14 [RFC][PATCH 00/13] Kemari: updated to the 3.4 unstable tree Yoshiaki Tamura
` (2 preceding siblings ...)
2009-03-12 1:16 ` [RFC][PATCH 03/13] Kemari: change parameter type of xc_{set, get}_hvm_param Yoshiaki Tamura
@ 2009-03-12 1:17 ` Yoshiaki Tamura
2009-03-12 1:17 ` [RFC][PATCH 05/13] Kemari: Kemari sender Yoshiaki Tamura
` (5 subsequent siblings)
9 siblings, 0 replies; 14+ messages in thread
From: Yoshiaki Tamura @ 2009-03-12 1:17 UTC (permalink / raw)
To: xen-devel
Cc: "柳澤佳里(yanagisawa yoshisato)",
Ian Pratt, ian.jackson, Keir Fraser, Stefano Stabellini
This is an updated version of the following patch. No major changes.
http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00372.html
Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp>
Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp>
---
tools/libxc/Makefile | 2 +
tools/libxc/xc_dom_kemari.c | 79 ++++++++++++++++++++++++++++++++++++++++++
tools/libxc/xenctrl.h | 13 +++++-
tools/libxc/xenguest.h | 45 +++++++++++++++++++++++
tools/libxc/xg_save_restore.h | 1
tools/xcutils/Makefile | 1
6 files changed, 139 insertions(+), 2 deletions(-)
diff -r b249f3e979a5 -r cf6a910e3663 tools/xcutils/Makefile
--- a/tools/xcutils/Makefile Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/xcutils/Makefile Wed Mar 11 18:03:47 2009 +0900
@@ -15,6 +15,7 @@
CFLAGS += $(CFLAGS_libxenctrl) $(CFLAGS_libxenguest) $(CFLAGS_libxenstore)
PROGRAMS = xc_restore xc_save readnotes lsevtchn
+PROGRAMS += xc_kemari_restore xc_kemari_save
LDLIBS = $(LDFLAGS_libxenctrl) $(LDFLAGS_libxenguest) $(LDFLAGS_libxenstore)
diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/Makefile
--- a/tools/libxc/Makefile Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/libxc/Makefile Wed Mar 11 18:03:47 2009 +0900
@@ -31,6 +31,8 @@
GUEST_SRCS-y :=
GUEST_SRCS-y += xg_private.c
GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_domain_save.c
+GUEST_SRCS-$(CONFIG_MIGRATE) += xc_dom_kemari_restore.c xc_dom_kemari_save.c
+GUEST_SRCS-$(CONFIG_MIGRATE) += xc_dom_kemari.c
GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c
vpath %.c ../../xen/common/libelf
diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/libxc/xenctrl.h Wed Mar 11 18:03:47 2009 +0900
@@ -1041,8 +1041,8 @@
*/
xc_error_handler xc_set_error_handler(xc_error_handler handler);
-int xc_set_hvm_param(int handle, domid_t dom, int param, unsigned long value);
-int xc_get_hvm_param(int handle, domid_t dom, int param, unsigned long *value);
+int xc_set_hvm_param(int handle, domid_t dom, int param, uint64_t value);
+int xc_get_hvm_param(int handle, domid_t dom, int param, uint64_t *value);
/* IA64 specific, nvram save */
int xc_ia64_save_to_nvram(int xc_handle, uint32_t dom);
@@ -1242,4 +1242,13 @@
int xc_set_cpufreq_gov(int xc_handle, int cpuid, char *govname);
int xc_set_cpufreq_para(int xc_handle, int cpuid,
int ctrl_type, int ctrl_value);
+
+/* kemari control interface */
+int xc_kemari_control(int xc_handle,
+ uint32_t domid,
+ uint32_t cmd,
+ evtchn_port_t *port,
+ uint32_t *num_pages,
+ uint64_t *mfn,
+ uint16_t tap_mode);
#endif /* XENCTRL_H */
diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/libxc/xenguest.h Wed Mar 11 18:03:47 2009 +0900
@@ -43,6 +43,51 @@
* @return 0 on success, -1 on failure
*/
int xc_domain_restore(int xc_handle, int io_fd, uint32_t dom,
+ unsigned int store_evtchn, unsigned long *store_mfn,
+ unsigned int console_evtchn, unsigned long *console_mfn,
+ unsigned int hvm, unsigned int pae);
+
+/**
+ * This function will save a running domain for Kemari.
+ *
+ * @parm xc_handle a handle to an open hypervisor interface
+ * @parm fd the file descriptor to save a domain to
+ * @parm dom the id of the domain
+ * @return 0 on success, -1 on failure
+ */
+int xc_kemari_save(int xc_handle, int io_fd, uint32_t dom,
+ void *kemari_ring, uint32_t flags /* XCFLAGS_xxx */,
+ int hvm,
+ void *(*init_qemu_maps)(int, unsigned));
+
+/**
+ * This function will update a domain for Kemari.
+ *
+ * @parm xc_handle a handle to an open hypervisor interface
+ * @parm fd the file descriptor to save a domain to
+ * @parm dom the id of the domain
+ * @return 0 on success, -1 on failure
+ */
+int xc_kemari_update(int xc_handle, int io_fd, uint32_t dom,
+ void *kemari_ring, uint32_t flags,
+ void (*qemu_save_image)(int),
+ void (*qemu_end_flip)(void),
+ void (*qemu_end_save)(void),
+ void (*qemu_image_sent)(void));
+
+/**
+ * This function will restore a saved domain for Kemari.
+ *
+ * @parm xc_handle a handle to an open hypervisor interface
+ * @parm fd the file descriptor to restore a domain from
+ * @parm dom the id of the domain
+ * @parm store_evtchn the store event channel for this domain to use
+ * @parm store_mfn returned with the mfn of the store page
+ * @parm hvm non-zero if this is a HVM restore
+ * @parm pae non-zero if this HVM domain has PAE support enabled
+ * @return 0 on success, -1 on failure
+ */
+int xc_kemari_restore(int xc_handle, int io_fd, uint32_t dom,
unsigned int store_evtchn, unsigned long *store_mfn,
unsigned int console_evtchn, unsigned long *console_mfn,
unsigned int hvm, unsigned int pae);
diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xg_save_restore.h
--- a/tools/libxc/xg_save_restore.h Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/libxc/xg_save_restore.h Wed Mar 11 18:03:47 2009 +0900
@@ -8,6 +8,7 @@
#include <xen/foreign/x86_32.h>
#include <xen/foreign/x86_64.h>
+#include <xen/kemari.h>
/*
** We process save/restore/migrate in batches of pages; the below
diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xc_dom_kemari.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_dom_kemari.c Wed Mar 11 18:03:47 2009 +0900
@@ -0,0 +1,79 @@
+/*
+ * xc_dom_kemari.c
+ *
+ * The API for manipulating and obtaining information on kemari-domains.
+ *
+ * Copyright (C) 2008 Nippon Telegraph and Telephone Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include "xc_private.h"
+
+/*
+ * Kemari controller interface: issue XEN_DOMCTL_kemari_op for domid.
+ * attach and detach read *port; enable writes *port, *mfn and *num_pages.
+ * Pointers a command does not use may be NULL.  Returns do_domctl()'s result.
+ */
+int xc_kemari_control(int xc_handle,
+                      uint32_t domid,
+                      uint32_t cmd,
+                      evtchn_port_t *port,
+                      uint32_t *num_pages,
+                      uint64_t *mfn,
+                      uint16_t tap_mode)
+{
+    int rc;
+    struct xen_domctl_kemari_op *kemari_op;
+    DECLARE_DOMCTL;
+
+    domctl.cmd = XEN_DOMCTL_kemari_op;
+    domctl.domain = (domid_t)domid;
+    kemari_op = &domctl.u.kemari_op;
+    kemari_op->cmd = cmd;
+
+    if ( cmd == XEN_KEMARI_OP_attach )
+    {
+        kemari_op->u.attach.port = *port;
+        kemari_op->u.attach.evtchn_tap_mode = tap_mode;
+    }
+
+    /* Compare for equality: callers of other commands may pass port == NULL. */
+    if ( cmd == XEN_KEMARI_OP_detach )
+        kemari_op->u.detach.port = *port;
+
+    DPRINTF("xc_kemari_control: cmd=%d\n", cmd);
+    rc = do_domctl(xc_handle, &domctl);
+
+    if ( cmd == XEN_KEMARI_OP_enable )
+    {
+        *port = kemari_op->u.enable.port;
+        *mfn = kemari_op->u.enable.mfn;
+        *num_pages = kemari_op->u.enable.num_pages;
+    }
+
+    return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
^ permalink raw reply [flat|nested] 14+ messages in thread* [RFC][PATCH 05/13] Kemari: Kemari sender
2009-03-12 1:14 [RFC][PATCH 00/13] Kemari: updated to the 3.4 unstable tree Yoshiaki Tamura
` (3 preceding siblings ...)
2009-03-12 1:17 ` [RFC][PATCH 04/13] Kemari: Kemari controller interface in libxc Yoshiaki Tamura
@ 2009-03-12 1:17 ` Yoshiaki Tamura
2009-03-24 6:59 ` Yoshiaki Tamura
2009-03-12 1:18 ` [RFC][PATCH 06/13] Kemari: Kemari receiver Yoshiaki Tamura
` (4 subsequent siblings)
9 siblings, 1 reply; 14+ messages in thread
From: Yoshiaki Tamura @ 2009-03-12 1:17 UTC (permalink / raw)
To: xen-devel
Cc: "柳澤佳里(yanagisawa yoshisato)",
Ian Pratt, ian.jackson, Keir Fraser, Stefano Stabellini
This is an updated version of the following patch. Followed the
changes in live migration code.
http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00374.html
Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp>
Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp>
---
tools/libxc/xc_dom_kemari_save.c | 1139 +++++++++++++++++++++++++++++++++++++++
tools/xcutils/xc_kemari_save.c | 518 +++++++++++++++++
2 files changed, 1657 insertions(+)
diff -r b249f3e979a5 -r cf6a910e3663 tools/xcutils/xc_kemari_save.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xcutils/xc_kemari_save.c Wed Mar 11 18:03:47 2009 +0900
@@ -0,0 +1,518 @@
+/*
+ * xc_kemari_save.c
+ *
+ * Save the state of a running Linux session.
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License. See the file "COPYING" in the main directory of
+ * this archive for more details.
+ *
+ * This source code is based on xc_save.c.
+ * Copied qemu_destroy_buffer and init_qemu_maps from xc_save.c.
+ *
+ * Copyright (C) 2005 by Christian Limpach
+ *
+ */
+
+
+#include <err.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+
+#include <xs.h>
+#include <xenctrl.h>
+#include <xenguest.h>
+#include <xc_private.h>
+#include <xen/kemari.h>
+
+static volatile sig_atomic_t run = 1;
+static int xc_handle, xce_handle, io_fd;
+static struct kemari_ring *ring = NULL;
+static uint32_t kemari_ring_size = 0;
+static pid_t qemu_pid;
+static int is_finalized = 0;
+static int domid;
+
+/* For HVM guests, there are two sources of dirty pages: the Xen shadow
+ * log-dirty bitmap, which we get with a hypercall, and qemu's version.
+ * The protocol for getting page-dirtying data from qemu uses a
+ * double-buffered shared memory interface directly between xc_save and
+ * qemu-dm.
+ *
+ * xc_save calculates the size of the bitmaps and notifies qemu-dm
+ * through the store that it wants to share the bitmaps. qemu-dm then
+ * starts filling in the 'active' buffer.
+ *
+ * To change the buffers over, xc_save writes the other buffer number to
+ * the store and waits for qemu to acknowledge that it is now writing to
+ * the new active buffer. xc_save can then process and clear the old
+ * active buffer. */
+
+static char *qemu_active_path;
+static char *qemu_next_active_path;
+static int qemu_shmid = -1;
+static struct xs_handle *xs;
+
+
+/* Mark the shared-memory segment for destruction */
+static void qemu_destroy_buffer(void)
+{
+ if (qemu_shmid != -1)
+ shmctl(qemu_shmid, IPC_RMID, NULL);
+ qemu_shmid = -1;
+}
+
+static char *kemari_qemu_info = NULL;
+static void qemu_save_image(int next_active)
+{
+ kemari_qemu_info[0] = next_active;
+ kemari_qemu_info[1] = 0;
+ xen_wmb();
+ kill(qemu_pid, SIGUSR1);
+}
+
+static void qemu_end_flip(void)
+{
+ while (kemari_qemu_info[1] == 0)
+ xen_rmb();
+}
+
+static void qemu_end_save(void)
+{
+ while (kemari_qemu_info[2] == 0)
+ xen_rmb();
+}
+
+static void qemu_image_sent(void)
+{
+ /* after QEMU image sent */
+ kemari_qemu_info[2] = 0;
+ xen_wmb();
+}
+
+static void *init_qemu_maps(int domid, unsigned int bitmap_size)
+{
+ key_t key;
+ char key_ascii[17] = {0,};
+ void *seg;
+ char *path, *p;
+
+ /* Make a shared-memory segment */
+ do {
+ key = rand(); /* No security, just a sequence of numbers */
+ qemu_shmid = shmget(key, 2 * bitmap_size + PAGE_SIZE,
+ IPC_CREAT|IPC_EXCL|S_IRUSR|S_IWUSR);
+ if (qemu_shmid == -1 && errno != EEXIST)
+ errx(1, "can't get shmem to talk to qemu-dm");
+ } while (qemu_shmid == -1);
+
+ /* Remember to tidy up after ourselves */
+ atexit(qemu_destroy_buffer);
+
+ /* Map it into our address space */
+ seg = shmat(qemu_shmid, NULL, 0);
+ if (seg == (void *) -1)
+ errx(1, "can't map shmem to talk to qemu-dm");
+ memset(seg, 0, 2 * bitmap_size + PAGE_SIZE);
+
+ /* Write the size of it into the first 32 bits */
+ *(uint32_t *)seg = bitmap_size;
+
+ /* Tell qemu about it */
+ if ((xs = xs_daemon_open()) == NULL)
+ errx(1, "Couldn't contact xenstore");
+ if (!(path = strdup("/local/domain/0/device-model/")))
+ errx(1, "can't get domain path in store");
+ if (!(path = realloc(path, strlen(path)
+ + 10
+ + strlen("/logdirty/next-active") + 1)))
+ errx(1, "no memory for constructing xenstore path");
+ snprintf(path + strlen(path), 11, "%i", domid);
+ strcat(path, "/logdirty/");
+ p = path + strlen(path);
+
+ strcpy(p, "key");
+ snprintf(key_ascii, 17, "%16.16llx", (unsigned long long) key);
+ if (!xs_write(xs, XBT_NULL, path, key_ascii, 16))
+ errx(1, "can't write key (%s) to store path (%s)\n", key_ascii, path);
+
+ /* Watch for qemu's indication of the active buffer, and request it
+ * to start writing to buffer 0 */
+ strcpy(p, "active");
+ if (!xs_watch(xs, path, "qemu-active-buffer"))
+ errx(1, "can't set watch in store (%s)\n", path);
+ if (!(qemu_active_path = strdup(path)))
+ errx(1, "no memory for copying xenstore path");
+
+ strcpy(p, "next-active");
+ if (!(qemu_next_active_path = strdup(path)))
+ errx(1, "no memory for copying xenstore path");
+
+ kemari_qemu_info = seg + 2 * bitmap_size;
+ xen_wmb();
+ qemu_save_image(0);
+
+ free(path);
+ return seg;
+}
+
+static void close_handler(int sig_type)
+{
+ run = 0;
+}
+
+static int handle_event(int domid, unsigned int flags)
+{
+ int ret = 1, rcv_port;
+
+ if ((rcv_port = xc_evtchn_pending(xce_handle)) < 0) {
+ ERROR("Failed to read from event fd");
+ goto out;
+ }
+
+ if (xc_kemari_update(xc_handle, io_fd, domid, ring, flags,
+ qemu_save_image, qemu_end_flip, qemu_end_save, qemu_image_sent) != 0) {
+ xc_domain_pause(xc_handle, domid);
+ kill(qemu_pid, SIGSTOP);
+ ERROR("xc_kemari_update failed");
+ goto out;
+ }
+
+ if (xc_evtchn_unmask(xce_handle, rcv_port) < 0) {
+ ERROR("Failed to write to event fd");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static void set_signal_handler(void (*handler)(int))
+{
+ struct sigaction act;
+
+ act.sa_handler = handler;
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = 0;
+ sigaction(SIGQUIT, &act, 0);
+ sigaction(SIGINT, &act, 0);
+ sigaction(SIGHUP, &act, 0);
+ sigaction(SIGTERM, &act, 0);
+}
+
+/* Tap (KEMARI_TAP_OUT) every vbd and vif event channel of domid found in
+ * xenstore.  Returns 0 on success, non-zero once an attach fails. */
+static int attach_ports(int domid)
+{
+    struct xs_handle *xs_handle;
+    char **list, *data;
+    unsigned int i, list_size, data_size;
+    char path[128];
+    uint32_t port;
+    int ret = 1;
+
+    if ((xs_handle = xs_daemon_open()) == NULL)
+        errx(1, "Couldn't contact xenstore");
+
+    /* attach block port. */
+    snprintf(path, sizeof(path), "/local/domain/%d/device/vbd", domid);
+    list = xs_directory(xs_handle, XBT_NULL, path, &list_size);
+    if (list == NULL)
+        errx(1, "xs_directory (%s) failed", path);
+
+    for (i = 0; i < list_size; i++) {
+        snprintf(path, sizeof(path),
+                 "/local/domain/%d/device/vbd/%s/event-channel", domid, list[i]);
+        data = xs_read(xs_handle, XBT_NULL, path, &data_size);
+        if (data == NULL)
+            continue;
+        port = strtoul(data, NULL, 10);
+        free(data); /* value parsed; free before any early exit */
+        if ((ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_attach,
+                                     &port, NULL,
+                                     NULL, KEMARI_TAP_OUT)) != 0) {
+            ERROR("Error when attaching blk_port (%d) on kemari", port);
+            free(list);
+            goto out;
+        }
+        DPRINTF("blk_port %d attached\n", port);
+    }
+    free(list);
+
+    /* attach net port. */
+    snprintf(path, sizeof(path), "/local/domain/%d/device/vif", domid);
+    list = xs_directory(xs_handle, XBT_NULL, path, &list_size);
+    if (list == NULL)
+        errx(1, "xs_directory (%s) failed", path);
+
+    for (i = 0; i < list_size; i++) {
+        snprintf(path, sizeof(path),
+                 "/local/domain/%d/device/vif/%s/event-channel", domid, list[i]);
+        data = xs_read(xs_handle, XBT_NULL, path, &data_size);
+        if (data == NULL)
+            continue;
+        port = strtoul(data, NULL, 10);
+        free(data); /* value parsed; free before any early exit */
+        if ((ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_attach,
+                                     &port, NULL,
+                                     NULL, KEMARI_TAP_OUT)) != 0) {
+            ERROR("Error when attaching net_port (%d) on kemari", port);
+            free(list);
+            goto out;
+        }
+        DPRINTF("net_port %d attached\n", port);
+    }
+    free(list);
+
+    /* attach success */
+    ret = 0;
+
+out:
+    xs_daemon_close(xs_handle);
+
+    return ret;
+}
+
+static pid_t get_qemu_pid(int domid)
+{
+ struct xs_handle *xs_handle;
+ char path[128];
+ char *data;
+ unsigned int data_size;
+ pid_t pid = 0;
+
+ if ((xs_handle = xs_daemon_open()) == NULL)
+ errx(1, "Couldn't contact xenstore");
+
+ snprintf(path, sizeof(path),
+ "/local/domain/%d/image/device-model-pid", domid);
+ data = xs_read(xs_handle, XBT_NULL, path, &data_size);
+ if (data == NULL) {
+ ERROR("Could not find QEMU pid for domid %d", domid);
+ goto out;
+ }
+ pid = strtoul(data, NULL, 10);
+ free(data);
+
+out:
+ xs_daemon_close(xs_handle);
+
+ return pid;
+}
+
+static void finalize(void)
+{
+ int ret;
+
+ if (is_finalized)
+ return;
+
+ set_signal_handler(SIG_IGN);
+ if (ring != NULL)
+ munmap(ring, kemari_ring_size * PAGE_SIZE);
+
+ if ((ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_off,
+ NULL, NULL, NULL, 0)) != 0) {
+ ERROR("Error when turning off kemari");
+ } else {
+ DPRINTF("successufully execute KEMARI_OP_off\n");
+ }
+
+ if ( xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_OFF,
+ NULL, 0, NULL, 0, NULL) < 0 ) {
+ ERROR("Warning - couldn't disable shadow mode");
+ }
+
+ if (!run)
+ xc_domain_destroy(xc_handle, domid);
+
+ xc_interface_close(xc_handle);
+
+ is_finalized = 1;
+}
+
+/* Kemari sender: enable kemari on <domid>, save the initial image to <iofd>,
+ * then call xc_kemari_update() per event until a termination signal. */
+int
+main(int argc, char **argv)
+{
+    unsigned int maxit, max_f, flags;
+    int ret = 1; /* exit status; stays non-zero on every failure path */
+    int evtchn_fd, port; /* port is signed so a bind error (< 0) is seen */
+    uint32_t kemari_port;
+    uint64_t kemari_mfn;
+    fd_set inset;
+
+    if (argc != 6)
+        errx(1, "usage: %s iofd domid maxit maxf flags", argv[0]);
+
+    xc_handle = xc_interface_open();
+    if (xc_handle < 0)
+        errx(1, "failed to open control interface");
+
+    io_fd = atoi(argv[1]);
+    domid = atoi(argv[2]);
+    maxit = atoi(argv[3]);
+    max_f = atoi(argv[4]);
+    flags = atoi(argv[5]);
+
+    set_signal_handler(close_handler);
+    if ((qemu_pid = get_qemu_pid(domid)) == 0)
+        errx(1, "failed to get qemu pid");
+    atexit(finalize);
+
+    if (io_fd == -1) /* means test mode */
+    {
+        io_fd = open("/dev/null", O_RDWR);
+        flags |= XCFLAGS_DEBUG;
+    }
+    else
+    {
+        int one = 1;
+        if (setsockopt(io_fd, IPPROTO_TCP, TCP_NODELAY,
+                       &one, sizeof(one)) < 0) {
+            ERROR("failed to set TCP_NODELAY");
+        }
+    }
+
+    if ((xce_handle = xc_evtchn_open()) < 0) {
+        errx(1, "failed to open control interface");
+    }
+
+    evtchn_fd = xc_evtchn_fd(xce_handle);
+
+    if ( xc_shadow_control(xc_handle, domid,
+                           XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                           NULL, 0, NULL, 0, NULL) < 0 )
+    {
+        int frc;
+        /* log-dirty already enabled? There's no test op,
+           so attempt to disable then reenable it */
+        frc = xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_OFF,
+                                NULL, 0, NULL, 0, NULL);
+        if ( frc >= 0 )
+        {
+            frc = xc_shadow_control(xc_handle, domid,
+                                    XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                                    NULL, 0, NULL, 0, NULL);
+        }
+
+        if ( frc < 0 )
+        {
+            err(errno, "Couldn't enable shadow mode (rc %d)", frc);
+        }
+    }
+
+    if (xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_enable,
+                          &kemari_port, &kemari_ring_size,
+                          &kemari_mfn, 0) != 0) {
+        errx(1, "Error when turning on kemari");
+    }
+
+    DPRINTF("kemari_port=%u, kemari_mfn=%llu, kemari_ring_size=%u\n",
+            kemari_port, (unsigned long long)kemari_mfn, kemari_ring_size);
+
+    if (attach_ports(domid) != 0) {
+        ERROR("attaching port failed ");
+        goto out;
+    }
+
+    if ((port = xc_evtchn_bind_interdomain(xce_handle, domid,
+                                           kemari_port)) < 0) {
+        ERROR("xc_evtchn_bind_interdomain failed ");
+        goto out;
+    }
+
+    if ((ring = xc_map_foreign_range(xc_handle, DOMID_XEN,
+                                     kemari_ring_size * PAGE_SIZE,
+                                     PROT_READ | PROT_WRITE, kemari_mfn)) == 0) {
+        ERROR("xc_map_foreign_range failed");
+        goto out;
+    }
+
+    if (xc_domain_pause(xc_handle, domid) < 0) {
+        ERROR("Domain appears not to have paused");
+        goto out;
+    }
+
+    if (xc_kemari_save(xc_handle, io_fd, domid, ring, flags,
+                       !!(flags & XCFLAGS_HVM),
+                       &init_qemu_maps) != 0) {
+        ERROR("xc_kemari_save failed");
+        goto out;
+    }
+
+    FD_ZERO(&inset);
+    FD_SET(evtchn_fd, &inset);
+
+    if (xc_domain_unpause(xc_handle, domid) < 0) {
+        ERROR("Domain appears not to have unpaused");
+        goto out;
+    }
+
+    DPRINTF("running start");
+
+    while (run) {
+
+        if (select(evtchn_fd + 1, &inset, NULL, NULL, NULL) < 0) {
+            if (errno == EINTR)
+                continue;
+            ERROR("Error when waiting events by select()");
+            goto out;
+        }
+
+        if (evtchn_fd != -1 && FD_ISSET(evtchn_fd, &inset)) {
+
+            if (handle_event(domid, flags) != 0) {
+                ERROR("Error when handling events");
+                goto out;
+            }
+
+            if (xc_evtchn_notify(xce_handle, port) < 0) {
+                ERROR("xc_evtchn_notify failed");
+                goto out;
+            }
+
+            if(xc_domain_unpause(xc_handle, domid) < 0) {
+                ERROR("xc_domain_unpause");
+                goto out;
+            }
+
+        }
+    }
+
+    /* The loop only falls through once a signal cleared run: clean exit. */
+    ret = 0;
+
+ out:
+    close(io_fd);
+    finalize();
+
+    return ret;
+}
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xc_dom_kemari_save.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_dom_kemari_save.c Wed Mar 11 18:03:47 2009 +0900
@@ -0,0 +1,1139 @@
+/******************************************************************************
+ * xc_dom_kemari_save.c
+ *
+ * Save the state of a running Linux session.
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * This source code is based on xc_domain_save.c.
+ * Copied BITS_PER_LONG, BITS_TO_LONGS, BITMAP_SIZE, BITMAP_SHIFT,
+ * RATE_IS_MAX, test_bit, clear_bit, set_bit, tv_delta, noncached_write,
+ * initialize_mbit_rate, and ratewrite from xc_domain_save.c
+ *
+ * Copyright (c) 2003, K A Fraser.
+ */
+
+#include <inttypes.h>
+#include <time.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/time.h>
+
+#include "xc_private.h"
+#include "xc_dom.h"
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+#include <xen/hvm/params.h>
+#include "xc_e820.h"
+
+#ifdef __MINIOS__
+/*
+ * Caution: atomicity of following alternative libc functions are broken.
+ */
+/*
+ * Minimal sendfile(2) replacement for MiniOS stub domains.
+ *
+ * Copies up to `count` bytes from in_fd to out_fd through a small stack
+ * buffer.  `offset` must be NULL (positional reads are not supported here).
+ *
+ * Returns the number of bytes copied, which is smaller than `count` when
+ * in_fd reaches end-of-file first, or -1 on error.
+ */
+static ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
+{
+    char buf[1024];
+    ssize_t wrote_len = 0;
+
+    if (offset != NULL) {
+        ERROR("Sorry sendfile for stubdomain should not have offset");
+        errno = EIO;
+        return -1;
+    }
+
+    while (count > 0) {
+        size_t chunk = (count < sizeof(buf)) ? count : sizeof(buf);
+        ssize_t len = read(in_fd, buf, chunk);
+
+        if (len < 0)
+            return -1;
+        /* End of file: without this check `count` would never shrink and
+         * the loop would spin forever. */
+        if (len == 0)
+            break;
+        if (write_exact(out_fd, buf, len))
+            return -1;
+        wrote_len += len;
+        count -= (size_t)len;
+    }
+    return wrote_len;
+}
+
+/* Upper bound on the number of iovec elements accepted by writev() below. */
+#define IOV_MAX 1024
+/* Scatter/gather element: one contiguous buffer handed to writev(). */
+struct iovec {
+    void *iov_base; /* Base address. */
+    size_t iov_len; /* Length. */
+};
+
+/*
+ * Minimal writev(2) replacement for MiniOS stub domains.
+ *
+ * Writes the buffers one after another with write(), so the call is not
+ * atomic.  Stops at the first short write.
+ *
+ * Returns the total number of bytes written, or -1 with errno set when
+ * iovcnt is out of range, a write() fails, or the total would not fit in
+ * ssize_t.
+ */
+static ssize_t writev(int d, const struct iovec *iov, int iovcnt)
+{
+    ssize_t wrote_len = 0;
+    int i;
+
+    if (iovcnt < 0 || iovcnt > IOV_MAX) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    for (i = 0; i < iovcnt; i++) {
+        /* write() returns ssize_t; keep the full width. */
+        ssize_t len = write(d, iov[i].iov_base, iov[i].iov_len);
+
+        if (len < 0)
+            return -1;
+
+        /* Check before adding: signed overflow is undefined behaviour and
+         * cannot be detected reliably after the fact. */
+        if (len > SSIZE_MAX - wrote_len) {
+            errno = EINVAL;
+            return -1;
+        }
+        wrote_len += len;
+
+        /* Short write: report what went out and let the caller decide. */
+        if ((size_t)len != iov[i].iov_len)
+            return wrote_len;
+    }
+
+    return wrote_len;
+}
+#else /* !__MINIOS__ */
+#include <sys/sendfile.h>
+#include <sys/uio.h>
+#endif /* __MINIOS__ */
+
+/*
+ * HVM: shared-memory bitmaps for getting log-dirty bits from qemu-dm.
+ * Both halves come from a single init_qemu_maps() segment; qemu_active and
+ * qemu_non_active are the indices (0 or 1) of the two halves and are swapped
+ * at the start of every xc_kemari_update() call.
+ */
+static unsigned long *qemu_bitmaps[2];
+static int qemu_active;
+static int qemu_non_active;
+
+/* number of pfns this guest has (i.e. number of entries in the P2M) */
+static unsigned long p2m_size;
+
+/*
+ * Page frame numbers of the batch currently being sent.  Allocated with
+ * room for MAX_BATCH_SIZE entries in xc_kemari_save() and reused by
+ * xc_kemari_update().
+ */
+static unsigned long *pfn_type = NULL;
+
+/* The new domain's shared-info frame number. */
+static unsigned long shared_info_frame;
+
+/*
+ * guest memory
+ *
+ * The guest is mapped in windows of GUEST_MEM_ENTRY_SIZE pages:
+ * guest_memory[w] is the base of window w (NULL when not mapped),
+ * guest_memory_status[w][p] holds the page-type bits reported for page p of
+ * window w when it was mapped, and guest_memory_size is the number of
+ * windows.
+ */
+#define GUEST_MEM_ENTRY_SIZE 1024 /* up to 4MB at a time. */
+static unsigned char ** guest_memory = NULL;
+static unsigned long ** guest_memory_status = NULL;
+static unsigned long guest_memory_size = 0;
+
+/*
+ * Map window `base` (GUEST_MEM_ENTRY_SIZE consecutive guest frames) read-only
+ * and record the per-page type bits in guest_memory_status[base].
+ *
+ * Returns  0 when the window is mapped (guest_memory[base] is its address),
+ *          1 when every page in the window is XTAB; the window is left
+ *            unmapped and guest_memory[base] is NULL,
+ *         -1 when the mapping call fails.
+ */
+static inline int map_guest_mem(int xc_handle, uint32_t domid,
+                                unsigned long base)
+{
+    int j;
+    unsigned char *region_base;
+    unsigned long *pfn_base;
+
+    pfn_base = guest_memory_status[base];
+
+    /* Every entry is assigned here, so no separate clearing pass is needed
+     * (the former memset() also cleared only GUEST_MEM_ENTRY_SIZE bytes, not
+     * GUEST_MEM_ENTRY_SIZE elements). */
+    for (j = 0; j < GUEST_MEM_ENTRY_SIZE; j++)
+        pfn_base[j] = base * GUEST_MEM_ENTRY_SIZE + j;
+
+    region_base = xc_map_foreign_batch(
+        xc_handle, domid, PROT_READ, pfn_base, GUEST_MEM_ENTRY_SIZE);
+    if ( region_base == NULL )
+    {
+        PERROR("map failed at guest memory frame 0x%lx - 0x%lx (%lu)",
+               base * GUEST_MEM_ENTRY_SIZE, (base + 1)* GUEST_MEM_ENTRY_SIZE - 1,
+               base);
+        /* Never leave a stale pointer behind for search_guest_mem(). */
+        guest_memory[base] = NULL;
+        return -1;
+    }
+
+    /* Keep only the type bits: from here on the array is a status table. */
+    for ( j = 0; j < GUEST_MEM_ENTRY_SIZE; j++ )
+        pfn_base[j] &= XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+    /* Look for and skip completely empty batches. */
+    for ( j = 0; j < GUEST_MEM_ENTRY_SIZE; j++ )
+        if ( pfn_base[j] != XEN_DOMCTL_PFINFO_XTAB )
+            break;
+    if ( j == GUEST_MEM_ENTRY_SIZE )
+    {
+        munmap(region_base, GUEST_MEM_ENTRY_SIZE*PAGE_SIZE);
+        guest_memory[base] = NULL;
+        return 1;
+    }
+
+    guest_memory[base] = region_base;
+
+    return 0;
+}
+
+/*
+ * Return the address at which guest frame `mfn` is mapped, mapping (or
+ * remapping) its GUEST_MEM_ENTRY_SIZE-page window on demand.
+ *
+ * A window is remapped when the requested page was recorded as XTAB the
+ * last time the window was mapped.
+ *
+ * Returns NULL when `mfn` lies outside the tracked range or when the window
+ * cannot be mapped (including the case where every page in it is XTAB).
+ */
+static inline unsigned char * search_guest_mem(int xc_handle, uint32_t domid,
+                                               unsigned long mfn)
+{
+    unsigned long base = mfn / GUEST_MEM_ENTRY_SIZE;
+    unsigned long offset = mfn % GUEST_MEM_ENTRY_SIZE;
+
+    if (base >= guest_memory_size) {
+        ERROR("Error base(%lu) is greater than guest_memory_size(%lu)\n",
+              base, guest_memory_size);
+        return NULL;
+    }
+
+    if ( guest_memory_status[base][offset] == XEN_DOMCTL_PFINFO_XTAB ) {
+        /* reload XTAB place */
+        if (guest_memory[base] != NULL)
+            munmap(guest_memory[base], GUEST_MEM_ENTRY_SIZE*PAGE_SIZE);
+        guest_memory[base] = NULL;
+        DPRINTF("guest_memory[%lu] (frame 0x%lx - 0x%lx) will be remapped\n",
+                base, base * GUEST_MEM_ENTRY_SIZE,
+                (base + 1) * GUEST_MEM_ENTRY_SIZE - 1);
+    }
+
+    /* map_guest_mem() is indexed by window number, not by the page offset
+     * inside the window. */
+    if (guest_memory[base] == NULL)
+        if (map_guest_mem(xc_handle, domid, base))
+            return NULL;
+
+    return guest_memory[base] + offset * PAGE_SIZE;
+    /* Since I don't care of XEN_DOMCTL_PFINFO_LTAB_MASK,
+       this program may cause some accidents. */
+}
+
+/*
+ * Allocate the window tables (guest_memory, guest_memory_status) sized from
+ * p2m_size and map every window of the guest read-only.
+ *
+ * Returns 0 on success, -1 on allocation, locking or mapping failure.
+ * Nothing is released on failure.
+ */
+static inline int init_guest_mem(int xc_handle, uint32_t dom)
+{
+    unsigned long i;
+
+    guest_memory_size = p2m_size / GUEST_MEM_ENTRY_SIZE + 1;
+    DPRINTF("guest_memory_size: %lu\n", guest_memory_size);
+
+    /* mapped memory */
+    guest_memory = xg_memalign(PAGE_SIZE,
+                               guest_memory_size * sizeof(guest_memory[0]));
+    if (guest_memory == NULL)
+    {
+        PERROR("failed to allocate guest_memory");
+        return -1;
+    }
+    /* Start with every window marked "not mapped". */
+    memset(guest_memory, 0, guest_memory_size * sizeof(guest_memory[0]));
+    if ( lock_pages(guest_memory, guest_memory_size * sizeof(guest_memory[0])))
+    {
+        ERROR("Unable to lock guest_memory array");
+        return -1;
+    }
+
+    /* memory status */
+    guest_memory_status = xg_memalign(PAGE_SIZE,
+        guest_memory_size * sizeof(guest_memory_status[0]));
+    if ( guest_memory_status == NULL )
+    {
+        ERROR("failed to alloc memory for guest_memory_status");
+        errno = ENOMEM;
+        return -1;
+    }
+    if ( lock_pages(guest_memory_status,
+                    guest_memory_size * sizeof(guest_memory_status[0])))
+    {
+        ERROR("Unable to lock guest_memory_status array");
+        return -1;
+    }
+
+    for (i = 0; i < guest_memory_size; i++) {
+        guest_memory_status[i] = xg_memalign(PAGE_SIZE,
+            GUEST_MEM_ENTRY_SIZE * sizeof(guest_memory_status[0][0]));
+        if (guest_memory_status[i] == NULL) {
+            ERROR("failed to alloc memory for guest_memory_status[%lu]", i);
+            errno = ENOMEM;
+            return -1;
+        }
+        /* Lock the row just allocated (row i, GUEST_MEM_ENTRY_SIZE
+         * entries), not the outer array again. */
+        if ( lock_pages(guest_memory_status[i],
+                 GUEST_MEM_ENTRY_SIZE * sizeof(guest_memory_status[0][0])))
+        {
+            ERROR("Unable to lock guest_memory_status[%lu]", i);
+            return -1;
+        }
+    }
+
+    /* A return of 1 (window entirely XTAB) is not an error here. */
+    for (i = 0; i < guest_memory_size; i++)
+        if (map_guest_mem(xc_handle, dom, i) < 0)
+            return -1;
+
+    return 0;
+}
+
+/*
+ * Write all `count` buffers with a single writev() call.
+ *
+ * Returns 0 when writev() reports exactly the summed length of the buffers,
+ * -1 otherwise (error or short write; no retry is attempted).
+ */
+static int writev_exact(int fd, const struct iovec *iov, size_t count)
+{
+    size_t expected = 0;
+    size_t idx;
+
+    for (idx = 0; idx < count; idx++)
+        expected += iov[idx].iov_len;
+
+    return (writev(fd, iov, count) == expected) ? 0 : -1;
+}
+
+/* grep fodder: machine_to_phys */
+
+
+/*
+** During (live) save/migrate, we maintain a number of bitmaps to track
+** which pages we have to send, to fixup, and to skip.
+*/
+
+/* Number of bits in one bitmap word. */
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+/* Number of words needed to hold `bits` bits. */
+#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
+/* Size in bytes of a bitmap with one bit per P2M entry. */
+#define BITMAP_SIZE (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long))
+
+/* The word of bitmap `_bmap` that holds bit `_nr`. */
+#define BITMAP_ENTRY(_nr,_bmap) \
+    ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
+
+/* Position of bit `_nr` inside its word. */
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+
+/* Return 1 if bit `nr` of the bitmap at `addr` is set, 0 otherwise. */
+static inline int test_bit (int nr, volatile void * addr)
+{
+    unsigned long word = BITMAP_ENTRY(nr, addr);
+
+    return (word >> BITMAP_SHIFT(nr)) & 1;
+}
+
+/* Clear bit `nr` of the bitmap at `addr` (not atomic). */
+static inline void clear_bit (int nr, volatile void * addr)
+{
+    unsigned long mask = 1UL << BITMAP_SHIFT(nr);
+
+    BITMAP_ENTRY(nr, addr) &= ~mask;
+}
+
+/* Set bit `nr` of the bitmap at `addr` (not atomic). */
+static inline void set_bit ( int nr, volatile void * addr)
+{
+    unsigned long mask = 1UL << BITMAP_SHIFT(nr);
+
+    BITMAP_ENTRY(nr, addr) |= mask;
+}
+
+/*
+ * Return (*new - *old) in microseconds.
+ *
+ * The arithmetic is done in 64 bits so that the seconds-to-microseconds
+ * multiplication cannot overflow where time_t/long is 32 bits wide.
+ */
+static uint64_t tv_delta(struct timeval *new, struct timeval *old)
+{
+    int64_t sec = (int64_t)new->tv_sec - (int64_t)old->tv_sec;
+    int64_t usec = (int64_t)new->tv_usec - (int64_t)old->tv_usec;
+
+    return (uint64_t)(sec * 1000000 + usec);
+}
+
+/*
+ * Write `len` bytes from `buffer` to `fd` and periodically ask for the
+ * file's page cache to be dropped.
+ *
+ * A running byte count is kept across calls; once it reaches
+ * MAX_PAGECACHE_USAGE pages the cache is discarded (without flushing) and
+ * the count restarts.
+ *
+ * Returns `len` on success, -1 if the write failed.
+ */
+static int noncached_write(int fd, void *buffer, int len)
+{
+    static int bytes_since_discard = 0;
+    int result;
+
+    if ( write_exact(fd, buffer, len) == 0 )
+        result = len;
+    else
+        result = -1;
+
+    bytes_since_discard += len;
+    if ( bytes_since_discard >= (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
+    {
+        /* Time to discard cache - dont care if this fails */
+        discard_file_cache(fd, 0 /* no flush */);
+        bytes_since_discard = 0;
+    }
+
+    return result;
+}
+
+#ifdef ADAPTIVE_SAVE
+
+/*
+** We control the rate at which we transmit (or save) to minimize impact
+** on running domains (including the target if we're doing live migrate).
+*/
+
+#define MAX_MBIT_RATE    500      /* maximum transmit rate for migrate */
+#define START_MBIT_RATE  100      /* initial transmit rate for migrate */
+
+/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
+#define RATE_TO_BTU      781250
+
+/* Amount in bytes we allow ourselves to send in a burst */
+#define BURST_BUDGET (100*1024)
+
+/* We keep track of the current and previous transmission rate */
+static int mbit_rate, ombit_rate = 0;
+
+/* Have we reached the maximum transmission rate? */
+#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
+
+/* Reset the transmit rate to its starting value. */
+static inline void initialize_mbit_rate(void)
+{
+    mbit_rate = START_MBIT_RATE;
+}
+
+/*
+ * Rate-limited write: send `n` bytes from `buf` to `io_fd`, sleeping as
+ * needed so that no more than BURST_BUDGET bytes go out per time slot of
+ * RATE_TO_BTU / mbit_rate microseconds.
+ *
+ * Returns what noncached_write() returns: `n` on success, -1 on failure.
+ */
+static int ratewrite(int io_fd, void *buf, int n)
+{
+    static int budget = 0;
+    static int burst_time_us = -1;
+    static struct timeval last_put = { 0 };
+    struct timeval now;
+    struct timespec delay;
+    long long delta;
+
+    /* A start rate of 0 disables rate limiting altogether. */
+    if ( START_MBIT_RATE == 0 )
+        return noncached_write(io_fd, buf, n);
+
+    budget -= n;
+    if ( budget < 0 )
+    {
+        /* Recompute the slot length whenever the rate has changed. */
+        if ( mbit_rate != ombit_rate )
+        {
+            burst_time_us = RATE_TO_BTU / mbit_rate;
+            ombit_rate = mbit_rate;
+            DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
+                    mbit_rate, BURST_BUDGET, burst_time_us);
+        }
+        if ( last_put.tv_sec == 0 )
+        {
+            /* First call: grant one burst and start the clock. */
+            budget += BURST_BUDGET;
+            gettimeofday(&last_put, NULL);
+        }
+        else
+        {
+            while ( budget < 0 )
+            {
+                gettimeofday(&now, NULL);
+                delta = tv_delta(&now, &last_put);
+                /* Credit one burst for every whole slot that has elapsed. */
+                while ( delta > burst_time_us )
+                {
+                    budget += BURST_BUDGET;
+                    last_put.tv_usec += burst_time_us;
+                    /* tv_usec must stay below one second, so normalise at
+                     * exactly 1000000 as well. */
+                    if ( last_put.tv_usec >= 1000000 )
+                    {
+                        last_put.tv_usec -= 1000000;
+                        last_put.tv_sec++;
+                    }
+                    delta -= burst_time_us;
+                }
+                if ( budget > 0 )
+                    break;
+                /* Sleep out the rest of the current slot, resuming after
+                 * interruptions with the remaining time. */
+                delay.tv_sec = 0;
+                delay.tv_nsec = 1000 * (burst_time_us - delta);
+                while ( delay.tv_nsec > 0 )
+                    if ( nanosleep(&delay, &delay) == 0 )
+                        break;
+            }
+        }
+    }
+    return noncached_write(io_fd, buf, n);
+}
+
+#else /* ! ADAPTIVE SAVE */
+
+/* Without ADAPTIVE_SAVE there is no rate limiting: plain writes only. */
+#define RATE_IS_MAX() (0)
+#define ratewrite(_io_fd, _buf, _n) noncached_write((_io_fd), (_buf), (_n))
+#define initialize_mbit_rate()
+
+#endif
+
+/*
+ * Report CPU usage and transfer rates since the previous call.
+ *
+ * Wall-clock time and the CPU usage of domain 0 and of `domid` are sampled
+ * and compared with the values remembered from the last call.  When `print`
+ * is non-zero the deltas are logged together with the send rate (from
+ * `pages_sent`) and the dirty rate (from `stats->dirty_count`).  With
+ * ADAPTIVE_SAVE the transmit rate is raised when the dirty rate exceeds it.
+ *
+ * Always returns 0.
+ */
+static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
+                       xc_shadow_op_stats_t *stats, int print)
+{
+    static struct timeval wall_last;
+    static long long      d0_cpu_last;
+    static long long      d1_cpu_last;
+
+    struct timeval        wall_now;
+    long long             wall_delta;
+    long long             d0_cpu_now, d0_cpu_delta;
+    long long             d1_cpu_now, d1_cpu_delta;
+
+    gettimeofday(&wall_now, NULL);
+
+    d0_cpu_now = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0);
+    d1_cpu_now = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0);
+
+    /* Test the raw values: after dividing by 1000 a -1 becomes 0 and the
+     * check could never fire.  (Presumes failure is reported as -1.) */
+    if ( (d0_cpu_now == -1) || (d1_cpu_now == -1) )
+        DPRINTF("ARRHHH!!\n");
+
+    d0_cpu_now /= 1000;
+    d1_cpu_now /= 1000;
+
+    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
+    if ( wall_delta == 0 )
+        wall_delta = 1;   /* avoid dividing by zero below */
+
+    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
+    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
+
+    if ( print )
+        DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
+                "dirtied %dMb/s %" PRId32 " pages\n",
+                wall_delta,
+                (int)((d0_cpu_delta*100)/wall_delta),
+                (int)((d1_cpu_delta*100)/wall_delta),
+                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
+                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
+                stats->dirty_count);
+
+#ifdef ADAPTIVE_SAVE
+    if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
+    {
+        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
+            + 50;
+        if ( mbit_rate > MAX_MBIT_RATE )
+            mbit_rate = MAX_MBIT_RATE;
+    }
+#endif
+
+    d0_cpu_last = d0_cpu_now;
+    d1_cpu_last = d1_cpu_now;
+    wall_last   = wall_now;
+
+    return 0;
+}
+
+/*
+ * Send the qemu-dm state file /dev/shm/qemu-save.<dom> over io_fd.
+ *
+ * The stream gets an 8-byte header (a marker followed by the 32-bit image
+ * size) and then the file contents.
+ * NOTE(review): the marker field is called `minusfour` but is initialised
+ * to -1, while send_hvm_params() uses -4 for the VM86 TSS chunk; confirm
+ * the intended value against the restore side.
+ *
+ * Returns 0 on success, -1 on failure.  The image file descriptor is
+ * closed on every path.
+ */
+static int send_qemu_image(int xc_handle, int io_fd, uint32_t dom)
+{
+    char path[128];
+    struct stat st;
+    struct {
+        int minusfour;
+        uint32_t image_size;
+    } chunk = { -1, 0 };
+    int qemu_fd = -1;
+    int rc = -1;
+
+    snprintf(path, sizeof(path), "/dev/shm/qemu-save.%d", dom);
+    if ((qemu_fd = open(path, O_RDONLY)) == -1)
+    {
+        PERROR("Error when opening qemu image %s", path);
+        goto out;
+    }
+
+    if (fstat(qemu_fd, &st) == -1)
+    {
+        PERROR("Error fstat qemu file %s", path);
+        goto out;
+    }
+    /* The header carries a 32-bit length; refuse anything it cannot
+     * describe rather than announcing a truncated size. */
+    if ( st.st_size < 0 || (uint64_t)st.st_size > UINT32_MAX )
+    {
+        ERROR("qemu image %s does not fit in a 32-bit length", path);
+        goto out;
+    }
+    chunk.image_size = st.st_size;
+
+    if ( write_exact(io_fd, &chunk, sizeof(chunk)) )
+    {
+        PERROR("Error when writing header for qemu image");
+        goto out;
+    }
+
+    if ( sendfile(io_fd, qemu_fd, NULL, chunk.image_size) !=
+         chunk.image_size)
+    {
+        PERROR("Error when writing qemu image");
+        goto out;
+    }
+
+    rc = 0;
+out:
+    if (qemu_fd != -1)
+        close(qemu_fd);
+    return rc;
+}
+
+/*
+ * Send the optional HVM parameter chunks: id -3 carries
+ * HVM_PARAM_IDENT_PT and id -4 carries HVM_PARAM_VM86_TSS.
+ *
+ * A chunk is written only when its parameter value is non-zero.  A
+ * parameter that cannot be queried is treated as unset, so that a value
+ * left over from the previous query is never sent under the wrong id.
+ *
+ * Returns 0 on success, -1 if writing to io_fd fails.
+ */
+static int send_hvm_params(int xc_handle, int io_fd, uint32_t dom)
+{
+    struct {
+        int id;
+        uint32_t pad;
+        uint64_t data;
+    } chunk = { 0, 0 };
+
+    chunk.id = -3;
+    chunk.data = 0;
+    if ( xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
+                          &chunk.data) != 0 )
+        chunk.data = 0;
+
+    if ( (chunk.data != 0) &&
+         write_exact(io_fd, &chunk, sizeof(chunk)) )
+    {
+        PERROR("Error when writing the ident_pt for EPT guest");
+        return -1;
+    }
+
+    chunk.id = -4;
+    chunk.data = 0;
+    if ( xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS,
+                          &chunk.data) != 0 )
+        chunk.data = 0;
+
+    if ( (chunk.data != 0) &&
+         write_exact(io_fd, &chunk, sizeof(chunk)) )
+    {
+        PERROR("Error when writing the vm86 TSS for guest");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Fetch the domain's HVM context into the buffer that lives inside the
+ * Kemari ring mapping (at ring->hvm_ctxt.buf_offset, ring->hvm_ctxt.buf_size
+ * bytes) and send it as a 32-bit length followed by the context bytes.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int send_hvm_context(int xc_handle, int io_fd,
+                            struct kemari_ring *ring, uint32_t dom)
+{
+    uint32_t buf_size = ring->hvm_ctxt.buf_size;
+    uint8_t *hvm_buf = (uint8_t *)ring + ring->hvm_ctxt.buf_offset;
+    uint32_t rec_size;
+    int ctxt_len;
+
+    /* Get HVM context from Xen and save it too.  Keep the result signed so
+     * that any negative error return is caught, not only -1. */
+    ctxt_len = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf, buf_size);
+    if ( ctxt_len < 0 )
+    {
+        ERROR("HVM:Could not get hvm buffer");
+        return -1;
+    }
+    rec_size = (uint32_t)ctxt_len;
+
+    if ( write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
+    {
+        PERROR("error write hvm buffer size");
+        return -1;
+    }
+
+    if ( write_exact(io_fd, hvm_buf, rec_size) )
+    {
+        PERROR("write HVM info failed!\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Send the initial, complete image of an HVM domain over io_fd.
+ *
+ * The stream consists of: p2m_size, the shared-info frame number, the three
+ * magic pfns (ioreq, bufioreq, store), the callback IRQ, every mapped page
+ * in batches (count, pfn array, page data), the optional HVM parameter
+ * chunks, an int 0 terminator and the HVM context.  Unless XCFLAGS_DEBUG is
+ * set, an int status is then read back from the receiver.  Finally the
+ * whole guest is mapped (init_guest_mem) for later xc_kemari_update() calls.
+ *
+ * kemari_ring     - mapping of the Kemari ring (struct kemari_ring *)
+ * hvm             - must be non-zero; only HVM guests are supported
+ * init_qemu_maps  - returns a segment of 2 * BITMAP_SIZE bytes that qemu-dm
+ *                   fills with its dirty bits
+ *
+ * The file-scope pfn_type array allocated here is deliberately not freed:
+ * xc_kemari_update() keeps using it.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+int xc_kemari_save(int xc_handle, int io_fd, uint32_t dom,
+                   void *kemari_ring, uint32_t flags,
+                   int hvm, void *(*init_qemu_maps)(int, unsigned))
+{
+    int rc = 1, i, j, iter = 0;
+    int debug = (flags & XCFLAGS_DEBUG);
+    int skip_this_iter;
+    xc_dominfo_t info;
+    struct kemari_ring *ring = (struct kemari_ring *)kemari_ring;
+
+    /* base and byte length of the region in which the current batch of
+       domain memory is mapped (NULL when nothing is mapped) */
+    unsigned char *region_base = NULL;
+    size_t region_len = 0;
+
+    /* bitmap of pages:
+       - that should be sent this iteration;
+       - to fixup by sending at the end if not already resent; */
+    unsigned long *to_send = NULL, *to_fix = NULL;
+
+    xc_shadow_op_stats_t stats;
+
+    unsigned long needed_to_fix = 0;
+    unsigned long total_sent    = 0;
+
+    /* HVM: magic frames for ioreqs and xenstore comms. */
+    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
+
+    /* callback irq */
+    uint64_t callback_irq = 0;
+
+    if ( !hvm )
+    {
+        ERROR("HVM domain is required for the kemari migration.");
+        return 1;
+    }
+
+    /* No shadow statistics are collected here; make sure print_stats()
+     * reads zeroes rather than stack garbage. */
+    memset(&stats, 0, sizeof(stats));
+
+    initialize_mbit_rate();
+
+    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
+    {
+        ERROR("Could not get domain info");
+        return 1;
+    }
+
+    shared_info_frame = info.shared_info_frame;
+    DPRINTF("xc_kemari_save: shared_info_frame: %lu\n", shared_info_frame);
+
+    /* Get the size of the P2M table */
+    p2m_size = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom) + 1;
+    DPRINTF("xc_kemari_save: p2m_size: %lu\n", p2m_size);
+
+    /* Domain is still running at this point */
+    {
+        /* Get qemu-dm logging dirty pages too */
+        void *seg = init_qemu_maps(dom, BITMAP_SIZE);
+        if ( seg == NULL )
+        {
+            ERROR("Couldn't set up the qemu-dm dirty bitmaps");
+            goto out;
+        }
+        qemu_bitmaps[0] = seg;
+        /* The second bitmap starts BITMAP_SIZE *bytes* into the segment. */
+        qemu_bitmaps[1] = (unsigned long *)((char *)seg + BITMAP_SIZE);
+        qemu_active = 0;
+        qemu_non_active = 1;
+    }
+
+    /* Setup to_send / to_fix bitmaps */
+    to_send = xg_memalign(PAGE_SIZE, ROUNDUP(BITMAP_SIZE, PAGE_SHIFT));
+    to_fix  = calloc(1, BITMAP_SIZE);
+
+    if ( !to_send || !to_fix )
+    {
+        ERROR("Couldn't allocate to_send array");
+        goto out;
+    }
+
+    /* Everything has to be sent in this first pass. */
+    memset(to_send, 0xff, BITMAP_SIZE);
+
+    if ( lock_pages(to_send, BITMAP_SIZE) )
+    {
+        ERROR("Unable to lock to_send");
+        goto out;   /* releases to_send and to_fix */
+    }
+
+    pfn_type = xg_memalign(PAGE_SIZE, ROUNDUP(
+                              MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
+    if ( pfn_type == NULL )
+    {
+        ERROR("failed to alloc memory for pfn_type arrays");
+        errno = ENOMEM;
+        goto out;
+    }
+    memset(pfn_type, 0,
+           ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
+
+    if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
+    {
+        ERROR("Unable to lock pfn_type array");
+        goto out;
+    }
+
+    /* Start writing out the saved-domain record. */
+    if ( write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
+    {
+        PERROR("write: p2m_size");
+        goto out;
+    }
+
+    /* send shared_info_frame */
+    if ( write_exact(io_fd, &shared_info_frame, sizeof(unsigned long)) )
+    {
+        PERROR("write: shared_info_frame");
+        goto out;
+    }
+
+    /* Save magic-page locations. */
+    memset(magic_pfns, 0, sizeof(magic_pfns));
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
+                     &magic_pfns[0]);
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
+                     &magic_pfns[1]);
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
+                     &magic_pfns[2]);
+    DPRINTF("kemari_restore: magic_pfns 0: %lld, 1: %lld, 2: %lld\n",
+            magic_pfns[0], magic_pfns[1], magic_pfns[2]);
+    if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
+    {
+        PERROR("Error when writing to state file (7)");
+        goto out;
+    }
+
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_CALLBACK_IRQ,
+                     &callback_irq);
+    DPRINTF("kemari_restore: callback irq %llx", callback_irq);
+    if ( write_exact(io_fd, &callback_irq, sizeof(callback_irq)) )
+    {
+        PERROR("Error when writing to state file (8)");
+        goto out;
+    }
+
+    /* Prime the statistics baseline without printing anything. */
+    print_stats(xc_handle, dom, 0, &stats, 0);
+
+    /* Now write out each data page, canonicalising page tables as we go... */
+    {
+        unsigned int prev_pc, sent_this_iter, N, batch, run;
+
+        iter++;
+        sent_this_iter = 0;
+        skip_this_iter = 0;
+        prev_pc = 0;
+        N = 0;
+
+        DPRINTF("Saving memory pages: iter %d   0%%", iter);
+
+        while ( N < p2m_size )
+        {
+            unsigned int this_pc = (N * 100) / p2m_size;
+
+            if ( (this_pc - prev_pc) >= 5 )
+            {
+                DPRINTF("\b\b\b\b%3d%%", this_pc);
+                prev_pc = this_pc;
+            }
+
+            /* load pfn_type[] with the mfn of all the pages we're doing in
+               this batch. */
+            for ( batch = 0;
+                  (batch < MAX_BATCH_SIZE) && (N < p2m_size);
+                  N++ )
+            {
+                int n = N;
+
+                if ( debug )
+                {
+                    DPRINTF("%d pfn= %08lx mfn= %08lx %d",
+                            iter, (unsigned long)n,
+                            (long unsigned int)0,
+                            test_bit(n, to_send));
+                    DPRINTF("\n");
+                }
+
+                if ( !( (test_bit(n, to_send)) || (test_bit(n, to_fix))) )
+                    continue;
+
+#if 0
+                /* Skip PFNs that aren't really there */
+                if (((n >= 0xa0 && n < 0xc0) /* VGA hole */
+                        || (n >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT)
+                            && n < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ ) {
+                    if (n >= shared_info_frame && n <= shared_info_frame + 32) {
+                        /* DPRINTF("shared_info_frame or grant: %d\n", n); */
+                    } else {
+                        continue;
+                    }
+                }
+#endif
+
+                /*
+                ** we get here if:
+                **  1. page is marked to_send & hasn't already been re-dirtied
+                **  2. add in pages that still need fixup (net bufs)
+                */
+
+                /* Hypercall interfaces operate in PFNs for HVM guests
+                 * and MFNs for PV guests */
+                pfn_type[batch] = n;
+
+                if ( !is_mapped(pfn_type[batch]) )
+                {
+                    /*
+                    ** not currently in pseudo-physical map -- set bit
+                    ** in to_fix since we must send this page in last_iter
+                    ** unless its sent sooner anyhow, or it never enters
+                    ** pseudo-physical map (e.g. for ballooned down doms)
+                    */
+                    set_bit(n, to_fix);
+                    continue;
+                }
+
+                if ( test_bit(n, to_fix) &&
+                     !test_bit(n, to_send) )
+                {
+                    needed_to_fix++;
+                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
+                            iter, n, pfn_type[batch]);
+                }
+
+                clear_bit(n, to_fix);
+
+                batch++;
+            }
+
+            if ( batch == 0 )
+                goto skip; /* vanishingly unlikely... */
+
+            region_base = xc_map_foreign_batch(
+                xc_handle, dom, PROT_READ, pfn_type, batch);
+            if ( region_base == NULL )
+            {
+                ERROR("map batch failed");
+                goto out;
+            }
+            /* Remember the length so the error path can unmap it too. */
+            region_len = (size_t)batch * PAGE_SIZE;
+
+            {
+                /* Look for and skip completely empty batches. */
+                for ( j = 0; j < batch; j++ )
+                    if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) !=
+                         XEN_DOMCTL_PFINFO_XTAB )
+                        break;
+                if ( j == batch )
+                {
+                    munmap(region_base, region_len);
+                    region_base = NULL;
+                    continue; /* bail on this batch: no valid pages */
+                }
+            }
+
+            if ( write_exact(io_fd, &batch, sizeof(unsigned int)) )
+            {
+                PERROR("Error when writing to state file (2)");
+                goto out;
+            }
+
+            if ( write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
+            {
+                PERROR("Error when writing to state file (3)");
+                goto out;
+            }
+
+            /* entering this loop, pfn_type is now in pfns (Not mfns) */
+            run = 0;
+            for ( j = 0; j < batch; j++ )
+            {
+                unsigned long pfn, pagetype;
+
+                pfn      = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = pfn_type[j] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                if ( pagetype != 0 )
+                {
+                    /* If the page is not a normal data page, write out any
+                       run of pages we may have previously accumulated */
+                    if ( run )
+                    {
+                        if ( ratewrite(io_fd,
+                                       (char*)region_base+(PAGE_SIZE*(j-run)),
+                                       PAGE_SIZE*run) != PAGE_SIZE*run )
+                        {
+                            ERROR("Error when writing to state file (4a)"
+                                  " (errno %d)", errno);
+                            goto out;
+                        }
+                        run = 0;
+                    }
+                }
+
+                /* skip pages that aren't present */
+                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+                    continue;
+
+                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
+                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
+                {
+                    DPRINTF("canonicalize_pagetable pagetype = %lx pfn = %lu\n", pagetype, pfn);
+                }
+                else
+                {
+                    /* We have a normal page: accumulate it for writing. */
+                    run++;
+                }
+            } /* end of the write out for this batch */
+
+            if ( run )
+            {
+                /* write out the last accumulated run of pages */
+                if ( ratewrite(io_fd,
+                               (char*)region_base+(PAGE_SIZE*(j-run)),
+                               PAGE_SIZE*run) != PAGE_SIZE*run )
+                {
+                    ERROR("Error when writing to state file (4c)"
+                          " (errno %d)", errno);
+                    goto out;
+                }
+            }
+
+            sent_this_iter += batch;
+
+            munmap(region_base, region_len);
+            region_base = NULL;
+
+        } /* end of this while loop for this iteration */
+
+      skip:
+
+        total_sent += sent_this_iter;
+
+        DPRINTF("\r %d: sent %d, skipped %d, ",
+                iter, sent_this_iter, skip_this_iter );
+
+        {
+            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
+
+            DPRINTF("Total pages sent= %ld (%.2fx)\n",
+                    total_sent, ((float)total_sent)/p2m_size );
+            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
+        }
+    } /* end of the single full-memory pass */
+
+    DPRINTF("All memory is saved\n");
+
+    if (send_hvm_params(xc_handle, io_fd, dom) < 0)
+        goto out;
+
+    /* Zero terminate */
+    i = 0;
+    if ( write_exact(io_fd, &i, sizeof(int)) )
+    {
+        PERROR("Error when writing to state file (6')");
+        goto out;
+    }
+
+    if (send_hvm_context(xc_handle, io_fd, ring, dom) < 0)
+        goto out;
+
+    if (!debug)
+    {
+        int rcv_status;
+        if ( read_exact(io_fd, &rcv_status, sizeof(int))) {
+            ERROR("Error when reading receiver status");
+            goto out;
+        }
+        DPRINTF("status received: %d\n", rcv_status);
+    }
+
+    if (init_guest_mem(xc_handle, dom) < 0)
+        goto out;
+
+    /* HVM guests are done now */
+    rc = 0;
+
+ out:
+
+    /* Drop a batch mapping left behind by an error in the page loop. */
+    if ( region_base != NULL )
+        munmap(region_base, region_len);
+
+    /* Flush last write and discard cache for file. */
+    discard_file_cache(io_fd, 1 /* flush */);
+
+    free(to_send);
+    free(to_fix);
+
+    DPRINTF("Save exit rc=%d\n",rc);
+
+    return !!rc;
+}
+
+
+/*
+ * Queue guest page `pfn` for the batch being assembled: append its mapped
+ * address to iov[] and its frame number to pfn_type[].
+ *
+ * Returns 0 on success, -1 when the batch is already full or the page
+ * cannot be located.
+ */
+static int kemari_queue_page(int xc_handle, uint32_t dom, unsigned long pfn,
+                             struct iovec *iov, int *iovcnt,
+                             unsigned int *batch)
+{
+    unsigned char *page;
+
+    /* Check before writing: pfn_type[] holds MAX_BATCH_SIZE entries and
+     * iov[] holds MAX_BATCH_SIZE pages plus the two header slots. */
+    if (*batch >= MAX_BATCH_SIZE) {
+        ERROR("Sorry, reached MAX_BATCH_SIZE. "
+              "We will fix this lator.");
+        return -1;
+    }
+
+    page = search_guest_mem(xc_handle, dom, pfn);
+    if (page == NULL) {
+        ERROR("Could not locate guest page for pfn 0x%lx", pfn);
+        return -1;
+    }
+
+    iov[*iovcnt].iov_base = page;
+    iov[*iovcnt].iov_len = PAGE_SIZE;
+    (*iovcnt)++;
+
+    pfn_type[*batch] = pfn;
+    (*batch)++;
+
+    return 0;
+}
+
+/*
+ * Send one incremental checkpoint: every page reported dirty through the
+ * Kemari ring or by qemu-dm, followed by the HVM parameter chunks, the
+ * qemu-dm image (skipped with XCFLAGS_DEBUG), an int 0 terminator and the
+ * HVM context.  Unless XCFLAGS_DEBUG is set an int status is then read back
+ * from the receiver.
+ *
+ * The ring is consumed as a sequence of records: an index record giving a
+ * range [start, end) of bitmap word numbers, followed by one dirty-bitmap
+ * word per index in that range.
+ *
+ * qemu_save_image - called with the index of the bitmap qemu-dm should use
+ *                   from now on
+ * qemu_end_flip   - called before the previously active bitmap is read
+ * qemu_end_save   - called before the qemu-dm image is sent
+ * qemu_image_sent - called after the qemu-dm image has been sent
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+int xc_kemari_update(int xc_handle, int io_fd, uint32_t dom,
+                     void *kemari_ring, uint32_t flags,
+                     void (*qemu_save_image)(int),
+                     void (*qemu_end_flip)(void),
+                     void (*qemu_end_save)(void),
+                     void (*qemu_image_sent)(void))
+{
+    int rc = 1, k;
+    int debug = (flags & XCFLAGS_DEBUG);
+    uint32_t i, j;
+    unsigned long index, w;
+    unsigned int batch = 0;
+    struct kemari_ring *ring = (struct kemari_ring *)kemari_ring;
+    struct kemari_ent *buf;
+    struct iovec iov[MAX_BATCH_SIZE + 2]; /* 2 for batch and pfn_type */
+    int iovcnt = 2;                       /* slots 0 and 1 are the header */
+
+    /* flip active qemu */
+    qemu_active = qemu_non_active;
+    qemu_non_active = qemu_active ? 0 : 1;
+    qemu_save_image(qemu_active);
+
+    /*
+     * main iteration starts from here
+     */
+    while (ring->cons < ring->prod) {
+
+        /* index record: range of bitmap words that follow */
+        kemari_ring_read(ring, &buf);
+
+        for (i = buf->u.index.start, j = buf->u.index.end; i < j; i++) {
+
+            unsigned long word;
+            unsigned int bit;
+
+            index = (unsigned long)i * BITS_PER_LONG;
+
+            kemari_ring_read(ring, &buf);
+
+            /*
+             * Walk the set bits of this word.  The ring entry ends up
+             * cleared, as before.  A plain shift loop avoids ffs(), which
+             * takes an int and both misses the upper half of a long-sized
+             * word and can spin forever on it.
+             * NOTE(review): assumes dirty_bitmap is no wider than
+             * unsigned long - confirm against struct kemari_ent.
+             */
+            word = buf->u.dirty_bitmap;
+            buf->u.dirty_bitmap = 0;
+            for (bit = 0; word != 0; bit++, word >>= 1) {
+                if (!(word & 1UL))
+                    continue;
+#if 0
+                if ((((index + bit) >= 0xa0 && (index + bit) < 0xc0) /* VGA hole */
+                     || ((index + bit) >= (HVM_BELOW_4G_MMIO_START >> PAGE_SHIFT)
+                         && (index + bit) < (1ULL<<32) >> PAGE_SHIFT)) /* MMIO */ ) {
+                    if ((index + bit) >= shared_info_frame &&
+                        (index + bit) <= shared_info_frame + 32) {
+                        ;
+                    } else {
+                        continue;
+                    }
+                }
+#endif
+                if (kemari_queue_page(xc_handle, dom, index + bit,
+                                      iov, &iovcnt, &batch) < 0)
+                    goto out;
+            }
+
+            /* Keep accumulating while another full word is guaranteed to
+             * fit and the ring still has entries. */
+            if ((batch + BITS_PER_LONG - 1 < MAX_BATCH_SIZE) &&
+                !(ring->cons == ring->prod))
+                continue;
+
+            /* Pull in the dirty bits from qemu-dm too */
+            qemu_end_flip();
+            /* Scan every word of the bitmap: BITMAP_SIZE is a byte count,
+             * so the word count is BITS_TO_LONGS(p2m_size). */
+            for (w = 0; w < BITS_TO_LONGS(p2m_size); w++) {
+                word = qemu_bitmaps[qemu_non_active][w];
+                if (word == 0)
+                    continue;
+                qemu_bitmaps[qemu_non_active][w] = 0;
+
+                index = w * BITS_PER_LONG;
+                for (bit = 0; word != 0; bit++, word >>= 1) {
+                    if (!(word & 1UL))
+                        continue;
+                    if (kemari_queue_page(xc_handle, dom, index + bit,
+                                          iov, &iovcnt, &batch) < 0)
+                        goto out;
+                }
+            }
+
+            /* A zero count is the end-of-pages marker in the stream, so an
+             * empty batch must not be written here. */
+            if (batch == 0)
+                continue;
+
+            PPRINTF("batch %d\n", batch);
+
+            /* send pages */
+            iov[0].iov_base = &batch;
+            iov[0].iov_len = sizeof(batch);
+
+            iov[1].iov_base = pfn_type;
+            iov[1].iov_len = sizeof(pfn_type[0]) * batch;
+
+            /* writev() accepts at most IOV_MAX elements per call. */
+            for (k = 0; k * IOV_MAX < iovcnt; k++) {
+                int count = iovcnt - IOV_MAX * k;
+
+                if (count > IOV_MAX)
+                    count = IOV_MAX;
+                if (writev_exact(io_fd, &iov[IOV_MAX * k], count)) {
+                    ERROR("Error when writing pages state file (2--4)"
+                          " (errno %d)", errno);
+                    goto out;
+                }
+            }
+
+            /* Start the next batch from scratch; leaving iovcnt untouched
+             * would run off the end of iov[]. */
+            batch = 0;
+            iovcnt = 2;
+        }
+    }
+
+    if (send_hvm_params(xc_handle, io_fd, dom) < 0)
+        goto out;
+    qemu_end_save();
+    if (!debug && send_qemu_image(xc_handle, io_fd, dom) < 0)
+        goto out;
+    qemu_image_sent();
+
+    /* Zero terminate */
+    i = 0;
+    if ( write_exact(io_fd, &i, sizeof(int)) )
+    {
+        PERROR("Error when writing to state file (6')");
+        goto out;
+    }
+
+    if (send_hvm_context(xc_handle, io_fd, ring, dom) < 0)
+        goto out;
+
+    if (!debug)
+    {
+        int rcv_status;
+        if ( read_exact(io_fd, &rcv_status, sizeof(int))) {
+            ERROR("Error when reading receiver status");
+            goto out;
+        }
+    }
+
+    rc = 0;
+out:
+
+    return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [RFC][PATCH 05/13] Kemari: Kemari sender
2009-03-12 1:17 ` [RFC][PATCH 05/13] Kemari: Kemari sender Yoshiaki Tamura
@ 2009-03-24 6:59 ` Yoshiaki Tamura
0 siblings, 0 replies; 14+ messages in thread
From: Yoshiaki Tamura @ 2009-03-24 6:59 UTC (permalink / raw)
To: xen-devel
Cc: Ian Pratt, ian.jackson,
"柳澤佳里(yanagisawa yoshisato)",
Stefano Stabellini, Keir Fraser
This is an updated version of the following patch. It uses an event
channel instead of a signal to notify QEMU of a buffer flip and to
request that it save its state.
http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00749.html
Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp>
Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp>
---
tools/libxc/xc_dom_kemari_save.c | 1114 +++++++++++++++++++++++++++++++++++++++
tools/xcutils/xc_kemari_save.c | 525 ++++++++++++++++++
2 files changed, 1639 insertions(+)
diff -r b249f3e979a5 -r 06b950859c92 tools/libxc/xc_dom_kemari_save.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_dom_kemari_save.c Tue Mar 24 15:11:38 2009 +0900
@@ -0,0 +1,1114 @@
+/******************************************************************************
+ * xc_dom_kemari_save.c
+ *
+ * Save the state of a running Linux session.
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * This source code is based on xc_domain_save.c.
+ * Copied BITS_PER_LONG, BITS_TO_LONGS, BITMAP_SIZE, BITMAP_SHIFT,
+ * RATE_IS_MAX, test_bit, clear_bit, set_bit, tv_delta, noncached_write,
+ * initialize_mbit_rate, and ratewrite from xc_domain_save.c
+ *
+ * Copyright (c) 2003, K A Fraser.
+ */
+
+#include <inttypes.h>
+#include <time.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/time.h>
+
+#include "xc_private.h"
+#include "xc_dom.h"
+#include "xg_private.h"
+#include "xg_save_restore.h"
+
+#include <xen/hvm/params.h>
+#include "xc_e820.h"
+
+#ifdef __MINIOS__
+/*
+ * Caution: the following replacement libc functions are not atomic.
+ */
+/*
+ * Minimal sendfile() replacement for stub domains.
+ *
+ * Copies up to @count bytes from @in_fd to @out_fd through a small bounce
+ * buffer.  @offset must be NULL; a non-NULL offset fails with EIO.
+ *
+ * Returns the number of bytes copied, which is smaller than @count when
+ * @in_fd reaches end-of-file first, or -1 on a read/write failure.
+ */
+static ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
+{
+    char buf[1024];
+    ssize_t len, wrote_len = 0;
+
+    if (offset != NULL) {
+        ERROR("Sorry sendfile for stubdomain should not have offset");
+        errno = EIO;
+        return -1;
+    }
+
+    while (count > 0) {
+        size_t want = (count < sizeof(buf)) ? count : sizeof(buf);
+
+        len = read(in_fd, buf, want);
+        if (len < 0)
+            return -1;
+        /* End of input: stop instead of spinning with count unchanged. */
+        if (len == 0)
+            break;
+        if (write_exact(out_fd, buf, len))
+            return -1;
+        wrote_len += len;
+        count -= len;
+    }
+    return wrote_len;
+}
+
+#define IOV_MAX 1024
+struct iovec {
+ void *iov_base; /* Base address. */
+ size_t iov_len; /* Length. */
+};
+/*
+ * Simple writev() replacement: issues one write() per vector element.
+ *
+ * Returns the total number of bytes written.  Stops early (returning the
+ * running total) after the first short write, and returns -1 if a write
+ * fails, if @iovcnt is out of range, or if the running total overflows.
+ */
+static ssize_t writev(int d, const struct iovec *iov, int iovcnt)
+{
+    int idx = 0;
+    int nbytes, total = 0;
+
+    if (!(iovcnt >= 0 && iovcnt <= IOV_MAX)) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    while (idx < iovcnt) {
+        const struct iovec *cur = &iov[idx];
+
+        nbytes = write(d, cur->iov_base, cur->iov_len);
+        if (nbytes < 0)
+            return -1;
+
+        total += nbytes;
+        if (total < 0) { /* integer overflow */
+            errno = EINVAL;
+            return -1;
+        }
+
+        /* A short write ends the call with what has gone out so far. */
+        if (nbytes != cur->iov_len)
+            break;
+
+        idx++;
+    }
+
+    return total;
+}
+#else /* !__MINIOS__ */
+#include <sys/sendfile.h>
+#include <sys/uio.h>
+#endif /* __MINIOS__ */
+
+/* HVM: shared-memory bitmaps for getting log-dirty bits from qemu-dm */
+static unsigned long *qemu_bitmaps[2];
+static int qemu_active;
+static int qemu_non_active;
+
+/* number of pfns this guest has (i.e. number of entries in the P2M) */
+static unsigned long p2m_size;
+
+/* page frame numbers */
+static unsigned long *pfn_type = NULL;
+
+/* The new domain's shared-info frame number. */
+static unsigned long shared_info_frame;
+
+/*
+ * guest memory
+ */
+#define GUEST_MEM_ENTRY_SIZE 1024 /* up to 4MB at a time. */
+static unsigned char ** guest_memory = NULL;
+static unsigned long ** guest_memory_status = NULL;
+static unsigned long guest_memory_size = 0;
+
+/*
+ * Map chunk @base of the guest (GUEST_MEM_ENTRY_SIZE consecutive frames
+ * starting at base * GUEST_MEM_ENTRY_SIZE) read-only into this process.
+ *
+ * guest_memory_status[base] is used as the frame list handed to
+ * xc_map_foreign_batch() and afterwards holds only the
+ * XEN_DOMCTL_PFINFO_LTAB_MASK bits of each entry.
+ *
+ * Returns 0 when the chunk is mapped (guest_memory[base] is set),
+ *         1 when every entry is XEN_DOMCTL_PFINFO_XTAB (the mapping is
+ *           dropped and guest_memory[base] is NULL),
+ *        -1 when the mapping call fails.
+ */
+static inline int map_guest_mem(int xc_handle, uint32_t domid,
+                                unsigned long base)
+{
+    unsigned long first_frame = base * GUEST_MEM_ENTRY_SIZE;
+    unsigned long *pfn_base = guest_memory_status[base];
+    unsigned char *region_base;
+    int j;
+
+    /* Every entry is assigned here, so no separate clearing is needed. */
+    for (j = 0; j < GUEST_MEM_ENTRY_SIZE; j++)
+        pfn_base[j] = first_frame + j;
+
+    region_base = xc_map_foreign_batch(
+        xc_handle, domid, PROT_READ, pfn_base, GUEST_MEM_ENTRY_SIZE);
+    if ( region_base == NULL )
+    {
+        PERROR("map failed at guest memory frame 0x%lx - 0x%lx (%lu)",
+               first_frame, first_frame + GUEST_MEM_ENTRY_SIZE - 1,
+               base);
+        return -1;
+    }
+
+    /* Keep only the type bits and look for a completely empty batch. */
+    for ( j = 0; j < GUEST_MEM_ENTRY_SIZE; j++ )
+        pfn_base[j] &= XEN_DOMCTL_PFINFO_LTAB_MASK;
+    for ( j = 0; j < GUEST_MEM_ENTRY_SIZE; j++ )
+        if ( pfn_base[j] != XEN_DOMCTL_PFINFO_XTAB )
+            break;
+    if ( j == GUEST_MEM_ENTRY_SIZE )
+    {
+        munmap(region_base, GUEST_MEM_ENTRY_SIZE*PAGE_SIZE);
+        guest_memory[base] = NULL;
+        return 1;
+    }
+
+    guest_memory[base] = region_base;
+
+    return 0;
+}
+
+/*
+ * Return a pointer to the mapped contents of guest frame @mfn, mapping (or
+ * remapping) the enclosing chunk on demand.
+ *
+ * A chunk is remapped when the frame's recorded status is
+ * XEN_DOMCTL_PFINFO_XTAB.  Returns NULL when @mfn lies outside the tracked
+ * range or when the chunk cannot be mapped.
+ *
+ * NOTE: other XEN_DOMCTL_PFINFO_LTAB_MASK types are not examined here.
+ */
+static inline unsigned char * search_guest_mem(int xc_handle, uint32_t domid,
+                                               unsigned long mfn)
+{
+    unsigned long base = mfn / GUEST_MEM_ENTRY_SIZE;
+    unsigned long offset = mfn % GUEST_MEM_ENTRY_SIZE;
+
+    if (base >= guest_memory_size) {
+        ERROR("Error base(%lu) is greater than guest_memory_size(%lu)\n",
+              base, guest_memory_size);
+        return NULL;
+    }
+
+    if ( guest_memory_status[base][offset] == XEN_DOMCTL_PFINFO_XTAB ) {
+        /* reload XTAB place; the chunk may never have been mapped */
+        if (guest_memory[base] != NULL) {
+            munmap(guest_memory[base], GUEST_MEM_ENTRY_SIZE*PAGE_SIZE);
+            guest_memory[base] = NULL;
+        }
+        DPRINTF("guest_memory[%lu] (frame 0x%lx - 0x%lx) will be remapped\n",
+                base, base * GUEST_MEM_ENTRY_SIZE,
+                (base + 1) * GUEST_MEM_ENTRY_SIZE - 1);
+    }
+
+    /* map_guest_mem() takes the chunk index, not the offset within it. */
+    if (guest_memory[base] == NULL)
+        if (map_guest_mem(xc_handle, domid, base))
+            return NULL;
+
+    return guest_memory[base] + offset * PAGE_SIZE;
+}
+
+/*
+ * Allocate and lock the guest-memory tracking tables, then map every chunk
+ * of the guest.
+ *
+ * Sizes guest_memory[] and guest_memory_status[] from p2m_size, allocates
+ * one status row of GUEST_MEM_ENTRY_SIZE entries per chunk and calls
+ * map_guest_mem() for each chunk.
+ *
+ * Returns 0 on success, -1 on any allocation, locking or mapping failure.
+ * Nothing is released on failure.
+ */
+static inline int init_guest_mem(int xc_handle, uint32_t dom)
+{
+    unsigned long i;
+    size_t row_bytes = GUEST_MEM_ENTRY_SIZE * sizeof(guest_memory_status[0][0]);
+
+    guest_memory_size = p2m_size / GUEST_MEM_ENTRY_SIZE + 1;
+    DPRINTF("guest_memory_size: %lu\n", guest_memory_size);
+
+    /* mapped memory */
+    guest_memory = xg_memalign(PAGE_SIZE,
+                               guest_memory_size * sizeof(guest_memory[0]));
+    if (guest_memory == NULL)
+    {
+        PERROR("failed to allocate guest_memory");
+        return -1;
+    }
+    if ( lock_pages(guest_memory, guest_memory_size * sizeof(guest_memory[0])))
+    {
+        ERROR("Unable to lock guest_memory array");
+        return -1;
+    }
+
+    /* memory status */
+    guest_memory_status = xg_memalign(PAGE_SIZE,
+        guest_memory_size * sizeof(guest_memory_status[0]));
+    if ( guest_memory_status == NULL )
+    {
+        ERROR("failed to alloc memory for guest_memory_status");
+        errno = ENOMEM;
+        return -1;
+    }
+    if ( lock_pages(guest_memory_status,
+                    guest_memory_size * sizeof(guest_memory_status[0])))
+    {
+        ERROR("Unable to lock guest_memory_status array");
+        return -1;
+    }
+
+    for (i = 0; i < guest_memory_size; i++) {
+        guest_memory_status[i] = xg_memalign(PAGE_SIZE, row_bytes);
+        if (guest_memory_status[i] == NULL) {
+            ERROR("failed to alloc memory for guest_memory_status[%lu]", i);
+            errno = ENOMEM;
+            return -1;
+        }
+        /* Lock the row just allocated, using the row's own size. */
+        if ( lock_pages(guest_memory_status[i], row_bytes) )
+        {
+            ERROR("Unable to lock guest_memory_status[%lu]", i);
+            return -1;
+        }
+    }
+
+    for (i = 0; i < guest_memory_size; i++)
+        if (map_guest_mem(xc_handle, dom, i) < 0)
+            return -1;
+
+    return 0;
+}
+
+/*
+ * Write all @count elements of @iov to @fd.
+ *
+ * writev() may legitimately stop short (e.g. on a socket).  Elements that
+ * went out completely are skipped, the partially written one is finished
+ * with write_exact(), and the remainder is retried.
+ *
+ * Returns 0 when every byte has been written, -1 on error.
+ */
+static int writev_exact(int fd, const struct iovec *iov, size_t count)
+{
+    size_t idx = 0;
+
+    while (idx < count) {
+        ssize_t done = writev(fd, &iov[idx], count - idx);
+
+        if (done < 0) {
+            if (errno == EINTR)
+                continue;
+            return -1;
+        }
+
+        /* Step over every element that was written in full. */
+        while (idx < count && (size_t)done >= iov[idx].iov_len) {
+            done -= iov[idx].iov_len;
+            idx++;
+        }
+
+        /* Finish the element writev() stopped in, guaranteeing progress. */
+        if (idx < count) {
+            if (write_exact(fd, (char *)iov[idx].iov_base + done,
+                            iov[idx].iov_len - done))
+                return -1;
+            idx++;
+        }
+    }
+
+    return 0;
+}
+
+/* grep fodder: machine_to_phys */
+
+
+/*
+** During (live) save/migrate, we maintain a number of bitmaps to track
+** which pages we have to send, to fixup, and to skip.
+*/
+
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
+#define BITMAP_SIZE   (BITS_TO_LONGS(p2m_size) * sizeof(unsigned long))
+
+#define BITMAP_ENTRY(_nr,_bmap) \
+   ((volatile unsigned long *)(_bmap))[(_nr)/BITS_PER_LONG]
+
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+
+/* Return 1 if bit @nr of the bitmap at @addr is set, 0 otherwise. */
+static inline int test_bit (int nr, volatile void * addr)
+{
+    unsigned long word = BITMAP_ENTRY(nr, addr);
+
+    return (word >> BITMAP_SHIFT(nr)) & 1;
+}
+
+/* Clear bit @nr of the bitmap at @addr. */
+static inline void clear_bit (int nr, volatile void * addr)
+{
+    unsigned long mask = 1UL << BITMAP_SHIFT(nr);
+
+    BITMAP_ENTRY(nr, addr) &= ~mask;
+}
+
+/* Set bit @nr of the bitmap at @addr. */
+static inline void set_bit ( int nr, volatile void * addr)
+{
+    unsigned long mask = 1UL << BITMAP_SHIFT(nr);
+
+    BITMAP_ENTRY(nr, addr) |= mask;
+}
+
+/* Elapsed time from @old to @new, in microseconds. */
+static uint64_t tv_delta(struct timeval *new, struct timeval *old)
+{
+    time_t sec_diff = new->tv_sec - old->tv_sec;
+    suseconds_t usec_diff = new->tv_usec - old->tv_usec;
+
+    return (sec_diff * 1000000) + usec_diff;
+}
+
+/*
+ * Write @len bytes of @buffer to @fd, discarding the file's page cache
+ * once MAX_PAGECACHE_USAGE pages' worth of data has been handed to it.
+ *
+ * Returns @len on success, -1 if the write failed.
+ */
+static int noncached_write(int fd, void *buffer, int len)
+{
+    static int write_count = 0;
+    int rc;
+
+    if ( write_exact(fd, buffer, len) == 0 )
+        rc = len;
+    else
+        rc = -1;
+
+    write_count += len;
+    if ( write_count < (MAX_PAGECACHE_USAGE * PAGE_SIZE) )
+        return rc;
+
+    /* Time to discard cache - dont care if this fails */
+    discard_file_cache(fd, 0 /* no flush */);
+    write_count = 0;
+
+    return rc;
+}
+
+#ifdef ADAPTIVE_SAVE
+
+/*
+** We control the rate at which we transmit (or save) to minimize impact
+** on running domains (including the target if we're doing live migrate).
+*/
+
+#define MAX_MBIT_RATE 500 /* maximum transmit rate for migrate */
+#define START_MBIT_RATE 100 /* initial transmit rate for migrate */
+
+/* Scaling factor to convert between a rate (in Mb/s) and time (in usecs) */
+#define RATE_TO_BTU 781250
+
+/* Amount in bytes we allow ourselves to send in a burst */
+#define BURST_BUDGET (100*1024)
+
+/* We keep track of the current and previous transmission rate */
+static int mbit_rate, ombit_rate = 0;
+
+/* Have we reached the maximum transmission rate? */
+#define RATE_IS_MAX() (mbit_rate == MAX_MBIT_RATE)
+
+/* Reset the current transmit rate to START_MBIT_RATE. */
+static inline void initialize_mbit_rate(void)
+{
+    mbit_rate = START_MBIT_RATE;
+}
+
+/*
+ * Rate-limited variant of noncached_write().
+ *
+ * Each call consumes @n bytes of a byte budget.  When the budget is
+ * exhausted, the call sleeps until enough slots of burst_time_us have
+ * elapsed to refill it (BURST_BUDGET bytes per slot), then writes.
+ * With START_MBIT_RATE == 0 no limiting is applied.
+ *
+ * Returns whatever noncached_write() returns.
+ */
+static int ratewrite(int io_fd, void *buf, int n)
+{
+    static int budget = 0;
+    static int burst_time_us = -1;
+    static struct timeval last_put = { 0 };
+    struct timeval now;
+    struct timespec delay;
+    long long delta;
+
+    if ( START_MBIT_RATE == 0 )
+        return noncached_write(io_fd, buf, n);
+
+    budget -= n;
+    if ( budget >= 0 )
+        return noncached_write(io_fd, buf, n);
+
+    /* Recompute the slot length whenever the target rate has changed. */
+    if ( mbit_rate != ombit_rate )
+    {
+        burst_time_us = RATE_TO_BTU / mbit_rate;
+        ombit_rate = mbit_rate;
+        DPRINTF("rate limit: %d mbit/s burst budget %d slot time %d\n",
+                mbit_rate, BURST_BUDGET, burst_time_us);
+    }
+
+    if ( last_put.tv_sec == 0 )
+    {
+        /* First time through: grant one burst and start the clock. */
+        budget += BURST_BUDGET;
+        gettimeofday(&last_put, NULL);
+        return noncached_write(io_fd, buf, n);
+    }
+
+    while ( budget < 0 )
+    {
+        gettimeofday(&now, NULL);
+        delta = tv_delta(&now, &last_put);
+
+        /* Credit one burst for every complete slot that has elapsed. */
+        while ( delta > burst_time_us )
+        {
+            budget += BURST_BUDGET;
+            last_put.tv_usec += burst_time_us;
+            /* tv_usec must stay below one second, hence >=. */
+            if ( last_put.tv_usec >= 1000000 )
+            {
+                last_put.tv_usec -= 1000000;
+                last_put.tv_sec++;
+            }
+            delta -= burst_time_us;
+        }
+        if ( budget > 0 )
+            break;
+
+        /* Sleep for the rest of the current slot, resuming if interrupted. */
+        delay.tv_sec = 0;
+        delay.tv_nsec = 1000 * (burst_time_us - delta);
+        while ( delay.tv_nsec > 0 )
+            if ( nanosleep(&delay, &delay) == 0 )
+                break;
+    }
+
+    return noncached_write(io_fd, buf, n);
+}
+
+#else /* ! ADAPTIVE SAVE */
+
+#define RATE_IS_MAX() (0)
+#define ratewrite(_io_fd, _buf, _n) noncached_write((_io_fd), (_buf), (_n))
+#define initialize_mbit_rate()
+
+#endif
+
+/*
+ * Sample wall-clock time and the CPU usage of domain 0 and @domid, and,
+ * when @print is non-zero, log the rates since the previous call together
+ * with @pages_sent and @stats->dirty_count.
+ *
+ * The samples are kept in static storage, so the first call only
+ * establishes a baseline.  With ADAPTIVE_SAVE the transmit rate is raised
+ * towards the observed dirtying rate.  Always returns 0.
+ */
+static int print_stats(int xc_handle, uint32_t domid, int pages_sent,
+                       xc_shadow_op_stats_t *stats, int print)
+{
+    static struct timeval wall_last;
+    static long long      d0_cpu_last;
+    static long long      d1_cpu_last;
+
+    struct timeval        wall_now;
+    long long             wall_delta;
+    long long             d0_cpu_raw, d1_cpu_raw;
+    long long             d0_cpu_now, d0_cpu_delta;
+    long long             d1_cpu_now, d1_cpu_delta;
+
+    gettimeofday(&wall_now, NULL);
+
+    d0_cpu_raw = xc_domain_get_cpu_usage(xc_handle, 0, /* FIXME */ 0);
+    d1_cpu_raw = xc_domain_get_cpu_usage(xc_handle, domid, /* FIXME */ 0);
+
+    /* Test for failure before scaling: -1/1000 would no longer be -1. */
+    if ( (d0_cpu_raw == -1) || (d1_cpu_raw == -1) )
+        DPRINTF("ARRHHH!!\n");
+
+    d0_cpu_now = d0_cpu_raw/1000;
+    d1_cpu_now = d1_cpu_raw/1000;
+
+    wall_delta = tv_delta(&wall_now,&wall_last)/1000;
+    if ( wall_delta == 0 )
+        wall_delta = 1;
+
+    d0_cpu_delta = (d0_cpu_now - d0_cpu_last)/1000;
+    d1_cpu_delta = (d1_cpu_now - d1_cpu_last)/1000;
+
+    if ( print )
+        DPRINTF("delta %lldms, dom0 %d%%, target %d%%, sent %dMb/s, "
+                "dirtied %dMb/s %" PRId32 " pages\n",
+                wall_delta,
+                (int)((d0_cpu_delta*100)/wall_delta),
+                (int)((d1_cpu_delta*100)/wall_delta),
+                (int)((pages_sent*PAGE_SIZE)/(wall_delta*(1000/8))),
+                (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))),
+                stats->dirty_count);
+
+#ifdef ADAPTIVE_SAVE
+    if ( ((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8))) > mbit_rate )
+    {
+        mbit_rate = (int)((stats->dirty_count*PAGE_SIZE)/(wall_delta*(1000/8)))
+            + 50;
+        if ( mbit_rate > MAX_MBIT_RATE )
+            mbit_rate = MAX_MBIT_RATE;
+    }
+#endif
+
+    d0_cpu_last = d0_cpu_now;
+    d1_cpu_last = d1_cpu_now;
+    wall_last   = wall_now;
+
+    return 0;
+}
+
+/*
+ * Send the device-model image /dev/shm/qemu-save.<dom> to @io_fd.
+ *
+ * A header consisting of a marker and the 32-bit image size is written
+ * first, followed by the file contents via sendfile().
+ *
+ * Returns 0 on success, -1 on failure.  The image file descriptor is
+ * closed on every path.
+ */
+static int send_qemu_image(int xc_handle, int io_fd, uint32_t dom)
+{
+    char path[128];
+    struct stat st;
+    struct {
+        int minusfour;
+        uint32_t image_size;
+    } chunk = { -1, 0 };
+    int qemu_fd = -1;
+    int rc = -1;
+
+    snprintf(path, sizeof(path), "/dev/shm/qemu-save.%d", dom);
+    if ((qemu_fd = open(path, O_RDONLY)) == -1)
+    {
+        PERROR("Error when opening qemu image %s", path);
+        goto out;
+    }
+
+    if (fstat(qemu_fd, &st) == -1)
+    {
+        PERROR("Error fstat qemu file %s", path);
+        goto out;
+    }
+    chunk.image_size = st.st_size;
+
+    if ( write_exact(io_fd, &chunk, sizeof(chunk)) )
+    {
+        PERROR("Error when writing header for qemu image");
+        goto out;
+    }
+
+    if ( sendfile(io_fd, qemu_fd, NULL, chunk.image_size) !=
+         chunk.image_size)
+    {
+        PERROR("Error when writing qemu image");
+        goto out;
+    }
+
+    rc = 0;
+out:
+    /* Release the descriptor on success and on every failure path. */
+    if (qemu_fd != -1)
+        close(qemu_fd);
+    return rc;
+}
+
+/*
+ * Send the optional HVM parameter chunks to @io_fd.
+ *
+ * HVM_PARAM_IDENT_PT is sent with chunk id -3 and HVM_PARAM_VM86_TSS with
+ * chunk id -4; a chunk is emitted only when the parameter is non-zero.
+ *
+ * Returns 0 on success, -1 if a write fails.
+ */
+static int send_hvm_params(int xc_handle, int io_fd, uint32_t dom)
+{
+    struct {
+        int id;
+        uint32_t pad;
+        uint64_t data;
+    } chunk = { 0, 0 };
+
+    chunk.id = -3;
+    chunk.data = 0;
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
+                     &chunk.data);
+
+    if ( (chunk.data != 0) &&
+         write_exact(io_fd, &chunk, sizeof(chunk)) )
+    {
+        PERROR("Error when writing the ident_pt for EPT guest");
+        return -1;
+    }
+
+    chunk.id = -4;
+    /* Clear the previous value so a failed query cannot re-send it. */
+    chunk.data = 0;
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS,
+                     &chunk.data);
+
+    if ( (chunk.data != 0) &&
+         write_exact(io_fd, &chunk, sizeof(chunk)) )
+    {
+        PERROR("Error when writing the vm86 TSS for guest");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Fetch the HVM context of @dom into the buffer embedded in @ring and send
+ * it to @io_fd as a 32-bit record size followed by the record itself.
+ *
+ * The buffer is located at ring + ring->hvm_ctxt.buf_offset and is
+ * ring->hvm_ctxt.buf_size bytes long.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int send_hvm_context(int xc_handle, int io_fd,
+                            struct kemari_ring *ring, uint32_t dom)
+{
+    uint32_t buf_size = ring->hvm_ctxt.buf_size;
+    uint8_t *hvm_buf = (uint8_t *)ring + ring->hvm_ctxt.buf_offset;
+    uint32_t rec_size;
+    int rec_len;
+    int rc = -1;
+
+    /* Keep the result signed so that any negative value reads as failure. */
+    rec_len = xc_domain_hvm_getcontext(xc_handle, dom, hvm_buf, buf_size);
+    if ( rec_len < 0 )
+    {
+        ERROR("HVM:Could not get hvm buffer");
+        goto out;
+    }
+    rec_size = rec_len;
+
+    if ( write_exact(io_fd, &rec_size, sizeof(uint32_t)) )
+    {
+        PERROR("error write hvm buffer size");
+        goto out;
+    }
+
+    if ( write_exact(io_fd, hvm_buf, rec_size) )
+    {
+        PERROR("write HVM info failed!\n");
+        goto out;
+    }
+    rc = 0;
+
+out:
+    return rc;
+}
+
+/*
+ * Send a complete initial image of HVM domain @dom to @io_fd.
+ *
+ * Stream layout, in the order written here: p2m_size, shared_info_frame,
+ * the three magic PFNs, the callback IRQ, the memory batches (page count,
+ * frame list, page data), the optional HVM parameter chunks, a zero
+ * terminator and the HVM context.  Unless XCFLAGS_DEBUG is set in @flags,
+ * an int status is then read back from @io_fd.  Finally the guest memory
+ * is mapped with init_guest_mem().
+ *
+ * @kemari_ring    ring whose embedded buffer receives the HVM context
+ * @hvm            must be non-zero; other guests are rejected
+ * @init_qemu_maps callback returning two consecutive bitmaps of
+ *                 BITMAP_SIZE bytes each
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+int xc_kemari_save(int xc_handle, int io_fd, uint32_t dom,
+                   void *kemari_ring, uint32_t flags,
+                   int hvm, void *(*init_qemu_maps)(int, unsigned))
+{
+    int rc = 1, i, j, iter = 0;
+    int debug = (flags & XCFLAGS_DEBUG);
+    int skip_this_iter;
+    long max_gpfn;
+    xc_dominfo_t info;
+    struct kemari_ring *ring = (struct kemari_ring *)kemari_ring;
+
+    /* base of the region in which domain memory is mapped, and the number
+       of pages in it (needed to unmap it on an error path) */
+    unsigned char *region_base = NULL;
+    unsigned int region_pages = 0;
+
+    /* bitmap of pages:
+       - that should be sent this iteration (unless later marked as skip);
+       - to fixup by sending at the end if not already resent; */
+    unsigned long *to_send = NULL, *to_fix = NULL;
+
+    xc_shadow_op_stats_t stats;
+
+    unsigned long needed_to_fix = 0;
+    unsigned long total_sent    = 0;
+
+    /* HVM: magic frames for ioreqs and xenstore comms. */
+    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
+
+    /* callback irq */
+    uint64_t callback_irq = 0;
+
+    if ( !hvm )
+    {
+        ERROR("HVM domain is required for the kemari migration.");
+        return 1;
+    }
+
+    /* Nothing below fills in stats; make sure print_stats() reads zeroes. */
+    memset(&stats, 0, sizeof(stats));
+
+    initialize_mbit_rate();
+
+    if ( xc_domain_getinfo(xc_handle, dom, 1, &info) != 1 )
+    {
+        ERROR("Could not get domain info");
+        return 1;
+    }
+
+    shared_info_frame = info.shared_info_frame;
+    DPRINTF("xc_kemari_save: shared_info_frame: %lu\n", shared_info_frame);
+
+    /* Get the size of the P2M table */
+    max_gpfn = xc_memory_op(xc_handle, XENMEM_maximum_gpfn, &dom);
+    if ( max_gpfn < 0 )
+    {
+        ERROR("Could not get the maximum gpfn");
+        return 1;
+    }
+    p2m_size = max_gpfn + 1;
+    DPRINTF("xc_kemari_save: p2m_size: %lu\n", p2m_size);
+
+    /* Domain is still running at this point */
+    {
+        /* Get qemu-dm logging dirty pages too */
+        char *seg = init_qemu_maps(dom, BITMAP_SIZE);
+        if ( seg == NULL )
+        {
+            ERROR("Could not set up the qemu dirty bitmaps");
+            goto out;
+        }
+        qemu_bitmaps[0] = (unsigned long *)seg;
+        qemu_bitmaps[1] = (unsigned long *)(seg + BITMAP_SIZE);
+        qemu_active = 0;
+        qemu_non_active = 1;
+    }
+
+    /* Setup to_send / to_fix bitmaps */
+    to_send = xg_memalign(PAGE_SIZE, ROUNDUP(BITMAP_SIZE, PAGE_SHIFT));
+    to_fix  = calloc(1, BITMAP_SIZE);
+
+    if ( !to_send || !to_fix )
+    {
+        ERROR("Couldn't allocate to_send array");
+        goto out;
+    }
+
+    memset(to_send, 0xff, BITMAP_SIZE);
+
+    if ( lock_pages(to_send, BITMAP_SIZE) )
+    {
+        ERROR("Unable to lock to_send");
+        goto out; /* release the bitmaps instead of leaking them */
+    }
+
+    pfn_type = xg_memalign(PAGE_SIZE, ROUNDUP(
+                              MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
+    if ( pfn_type == NULL )
+    {
+        ERROR("failed to alloc memory for pfn_type arrays");
+        errno = ENOMEM;
+        goto out;
+    }
+    memset(pfn_type, 0,
+           ROUNDUP(MAX_BATCH_SIZE * sizeof(*pfn_type), PAGE_SHIFT));
+
+    if ( lock_pages(pfn_type, MAX_BATCH_SIZE * sizeof(*pfn_type)) )
+    {
+        ERROR("Unable to lock pfn_type array");
+        goto out;
+    }
+
+    /* Start writing out the saved-domain record. */
+    if ( write_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
+    {
+        PERROR("write: p2m_size");
+        goto out;
+    }
+
+    /* send shared_info_frame */
+    if ( write_exact(io_fd, &shared_info_frame, sizeof(unsigned long)) )
+    {
+        PERROR("write: shared_info_frame");
+        goto out;
+    }
+
+    /* Save magic-page locations. */
+    memset(magic_pfns, 0, sizeof(magic_pfns));
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
+                     &magic_pfns[0]);
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
+                     &magic_pfns[1]);
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
+                     &magic_pfns[2]);
+    DPRINTF("kemari_restore: magic_pfns 0: %" PRIu64 ", 1: %" PRIu64
+            ", 2: %" PRIu64 "\n",
+            magic_pfns[0], magic_pfns[1], magic_pfns[2]);
+    if ( write_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
+    {
+        PERROR("Error when writing to state file (7)");
+        goto out;
+    }
+
+    xc_get_hvm_param(xc_handle, dom, HVM_PARAM_CALLBACK_IRQ,
+                     &callback_irq);
+    DPRINTF("kemari_restore: callback irq %" PRIx64, callback_irq);
+    if ( write_exact(io_fd, &callback_irq, sizeof(callback_irq)) )
+    {
+        PERROR("Error when writing to state file (8)");
+        goto out;
+    }
+
+    print_stats(xc_handle, dom, 0, &stats, 0);
+
+    /* Now write out each data page, canonicalising page tables as we go... */
+    {
+        unsigned int prev_pc, sent_this_iter, N, batch, run;
+
+        iter++;
+        sent_this_iter = 0;
+        skip_this_iter = 0;
+        prev_pc = 0;
+        N = 0;
+
+        DPRINTF("Saving memory pages: iter %d   0%%", iter);
+
+        while ( N < p2m_size )
+        {
+            unsigned int this_pc = (N * 100) / p2m_size;
+
+            if ( (this_pc - prev_pc) >= 5 )
+            {
+                DPRINTF("\b\b\b\b%3d%%", this_pc);
+                prev_pc = this_pc;
+            }
+
+            /* load pfn_type[] with the mfn of all the pages we're doing in
+               this batch. */
+            for  ( batch = 0;
+                   (batch < MAX_BATCH_SIZE) && (N < p2m_size);
+                   N++ )
+            {
+                int n = N;
+
+                if ( debug )
+                {
+                    DPRINTF("%d pfn= %08lx mfn= %08lx %d",
+                            iter, (unsigned long)n,
+                            (long unsigned int)0,
+                            test_bit(n, to_send));
+                    DPRINTF("\n");
+                }
+
+                if ( !( (test_bit(n, to_send)) || (test_bit(n, to_fix))) )
+                    continue;
+
+                /*
+                ** we get here if:
+                **  1. page is marked to_send & hasn't already been re-dirtied
+                **  2. add in pages that still need fixup (net bufs)
+                */
+
+                /* Hypercall interfaces operate in PFNs for HVM guests
+                 * and MFNs for PV guests */
+                pfn_type[batch] = n;
+
+                if ( !is_mapped(pfn_type[batch]) )
+                {
+                    /*
+                    ** not currently in pseudo-physical map -- set bit
+                    ** in to_fix since we must send this page in last_iter
+                    ** unless its sent sooner anyhow, or it never enters
+                    ** pseudo-physical map (e.g. for ballooned down doms)
+                    */
+                    set_bit(n, to_fix);
+                    continue;
+                }
+
+                if ( test_bit(n, to_fix) &&
+                     !test_bit(n, to_send) )
+                {
+                    needed_to_fix++;
+                    DPRINTF("Fix! iter %d, pfn %x. mfn %lx\n",
+                            iter, n, pfn_type[batch]);
+                }
+
+                clear_bit(n, to_fix);
+
+                batch++;
+            }
+
+            if ( batch == 0 )
+                goto skip; /* vanishingly unlikely... */
+
+            region_base = xc_map_foreign_batch(
+                xc_handle, dom, PROT_READ, pfn_type, batch);
+            if ( region_base == NULL )
+            {
+                ERROR("map batch failed");
+                goto out;
+            }
+            region_pages = batch;
+
+            {
+                /* Look for and skip completely empty batches. */
+                for ( j = 0; j < batch; j++ )
+                    if ( (pfn_type[j] & XEN_DOMCTL_PFINFO_LTAB_MASK) !=
+                         XEN_DOMCTL_PFINFO_XTAB )
+                        break;
+                if ( j == batch )
+                {
+                    munmap(region_base, batch*PAGE_SIZE);
+                    region_base = NULL;
+                    continue; /* bail on this batch: no valid pages */
+                }
+            }
+
+            if ( write_exact(io_fd, &batch, sizeof(unsigned int)) )
+            {
+                PERROR("Error when writing to state file (2)");
+                goto out;
+            }
+
+            if ( write_exact(io_fd, pfn_type, sizeof(unsigned long)*batch) )
+            {
+                PERROR("Error when writing to state file (3)");
+                goto out;
+            }
+
+            /* entering this loop, pfn_type is now in pfns (Not mfns) */
+            run = 0;
+            for ( j = 0; j < batch; j++ )
+            {
+                unsigned long pfn, pagetype;
+
+                pfn      = pfn_type[j] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = pfn_type[j] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                if ( pagetype != 0 )
+                {
+                    /* If the page is not a normal data page, write out any
+                       run of pages we may have previously accumulated */
+                    if ( run )
+                    {
+                        if ( ratewrite(io_fd,
+                                       (char*)region_base+(PAGE_SIZE*(j-run)),
+                                       PAGE_SIZE*run) != PAGE_SIZE*run )
+                        {
+                            ERROR("Error when writing to state file (4a)"
+                                  " (errno %d)", errno);
+                            goto out;
+                        }
+                        run = 0;
+                    }
+                }
+
+                /* skip pages that aren't present */
+                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+                    continue;
+
+                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
+                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
+                {
+                    DPRINTF("canonicalize_pagetable pagetype = %lx pfn = %lu\n", pagetype, pfn);
+                }
+                else
+                {
+                    /* We have a normal page: accumulate it for writing. */
+                    run++;
+                }
+            } /* end of the write out for this batch */
+
+            if ( run )
+            {
+                /* write out the last accumulated run of pages */
+                if ( ratewrite(io_fd,
+                               (char*)region_base+(PAGE_SIZE*(j-run)),
+                               PAGE_SIZE*run) != PAGE_SIZE*run )
+                {
+                    ERROR("Error when writing to state file (4c)"
+                          " (errno %d)", errno);
+                    goto out;
+                }
+            }
+
+            sent_this_iter += batch;
+
+            munmap(region_base, batch*PAGE_SIZE);
+            region_base = NULL;
+
+        } /* end of this while loop for this iteration */
+
+      skip:
+
+        total_sent += sent_this_iter;
+
+        DPRINTF("\r %d: sent %d, skipped %d, ",
+                iter, sent_this_iter, skip_this_iter );
+
+        {
+            print_stats( xc_handle, dom, sent_this_iter, &stats, 1);
+
+            DPRINTF("Total pages sent= %ld (%.2fx)\n",
+                    total_sent, ((float)total_sent)/p2m_size );
+            DPRINTF("(of which %ld were fixups)\n", needed_to_fix  );
+        }
+    } /* end of the single memory pass */
+
+    DPRINTF("All memory is saved\n");
+
+    if (send_hvm_params(xc_handle, io_fd, dom) < 0)
+        goto out;
+
+    /* Zero terminate */
+    i = 0;
+    if ( write_exact(io_fd, &i, sizeof(int)) )
+    {
+        PERROR("Error when writing to state file (6')");
+        goto out;
+    }
+
+    if (send_hvm_context(xc_handle, io_fd, ring, dom) < 0)
+        goto out;
+
+    if (!debug)
+    {
+        int rcv_status;
+        if ( read_exact(io_fd, &rcv_status, sizeof(int))) {
+            ERROR("Error when reading receiver status");
+            goto out;
+        }
+        DPRINTF("status received: %d\n", rcv_status);
+    }
+
+    if (init_guest_mem(xc_handle, dom) < 0)
+        goto out;
+
+    /* HVM guests are done now */
+    rc = 0;
+
+ out:
+
+    /* Drop a batch mapping left behind by an error path. */
+    if ( region_base != NULL )
+        munmap(region_base, region_pages*PAGE_SIZE);
+
+    /* Flush last write and discard cache for file. */
+    discard_file_cache(io_fd, 1 /* flush */);
+
+    free(to_send);
+    free(to_fix);
+
+    DPRINTF("Save exit rc=%d\n",rc);
+
+    return !!rc;
+}
+
+
+/*
+ * Append guest frame @pfn to the batch being assembled: one iovec entry
+ * pointing at the mapped page and one pfn_type[] slot.
+ *
+ * Returns 0 on success, -1 if the batch is already full or the page
+ * cannot be located in the guest-memory mappings.
+ */
+static int kemari_queue_page(int xc_handle, uint32_t dom, unsigned long pfn,
+                             struct iovec *iov, int *iovcnt,
+                             unsigned int *batch)
+{
+    unsigned char *page;
+
+    /* iov[] and pfn_type[] are sized for MAX_BATCH_SIZE pages. */
+    if (*batch >= MAX_BATCH_SIZE) {
+        ERROR("Sorry, reached MAX_BATCH_SIZE. "
+              "We will fix this lator.");
+        return -1;
+    }
+
+    page = search_guest_mem(xc_handle, dom, pfn);
+    if (page == NULL) {
+        ERROR("Could not find guest memory for pfn 0x%lx", pfn);
+        return -1;
+    }
+
+    iov[*iovcnt].iov_base = page;
+    iov[*iovcnt].iov_len = PAGE_SIZE;
+    (*iovcnt)++;
+
+    pfn_type[*batch] = pfn;
+    (*batch)++;
+
+    return 0;
+}
+
+/*
+ * Send one incremental update of domain @dom to @io_fd.
+ *
+ * Dirty pages are collected from the entries of @kemari_ring and from the
+ * inactive qemu bitmap, and sent in batches (page count, frame list, page
+ * data).  Then the optional HVM parameter chunks, the qemu image (skipped
+ * when XCFLAGS_DEBUG is set in @flags), a zero terminator and the HVM
+ * context follow.  Unless XCFLAGS_DEBUG is set, an int status is finally
+ * read back from @io_fd.
+ *
+ * Callbacks, in call order: @qemu_save_image (with the new active buffer
+ * index), @qemu_end_flip (before each scan of the qemu bitmap),
+ * @qemu_end_save (before the qemu image is sent) and @qemu_image_sent.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+int xc_kemari_update(int xc_handle, int io_fd, uint32_t dom,
+                     void *kemari_ring, uint32_t flags,
+                     void (*qemu_save_image)(int),
+                     void (*qemu_end_flip)(void),
+                     void (*qemu_end_save)(void),
+                     void (*qemu_image_sent)(void))
+{
+    int rc = 1, k;
+    int debug = (flags & XCFLAGS_DEBUG);
+    uint32_t i, j, index = 0;
+    unsigned int batch = 0;
+    struct kemari_ring *ring = (struct kemari_ring *)kemari_ring;
+    struct kemari_ent *buf;
+    struct iovec iov[MAX_BATCH_SIZE + 2]; /* 2 for batch and pfn_type */
+    int iovcnt = 2;
+    /* BITMAP_SIZE is in bytes; this is the number of words to scan. */
+    size_t qemu_words = BITMAP_SIZE / sizeof(unsigned long);
+    size_t w;
+
+    /* flip active qemu */
+    qemu_active = qemu_non_active;
+    qemu_non_active = qemu_active ? 0 : 1;
+    qemu_save_image(qemu_active);
+
+    /*
+     * main iteration starts from here
+     */
+    while (ring->cons < ring->prod) {
+
+        kemari_ring_read(ring, &buf);
+
+        for (i = buf->u.index.start, j = buf->u.index.end; i < j; i++) {
+            unsigned long bits;
+            unsigned int bit;
+
+            index = i * BITS_PER_LONG;
+
+            kemari_ring_read(ring, &buf);
+
+            /* Consume the word (the entry is left zeroed) and queue each
+               set bit, testing bits directly so any width is handled. */
+            bits = buf->u.dirty_bitmap;
+            buf->u.dirty_bitmap = 0;
+            for (bit = 0; bits != 0; bit++, bits >>= 1) {
+                if (!(bits & 1UL))
+                    continue;
+                if (kemari_queue_page(xc_handle, dom,
+                                      (unsigned long)index + bit,
+                                      iov, &iovcnt, &batch) < 0)
+                    goto out;
+            }
+
+            /* Keep collecting while another word fits and data remains. */
+            if ((batch + BITS_PER_LONG - 1 < MAX_BATCH_SIZE) &&
+                !(ring->cons == ring->prod))
+                continue;
+
+            /* Pull in the dirty bits from qemu-dm too */
+            qemu_end_flip();
+            for (w = 0; w < qemu_words; w++) {
+                unsigned long bmp = qemu_bitmaps[qemu_non_active][w];
+
+                if (bmp != 0) {
+                    unsigned long first = w * BITS_PER_LONG;
+
+                    for (bit = 0; bmp != 0; bit++, bmp >>= 1) {
+                        if (!(bmp & 1UL))
+                            continue;
+                        if (kemari_queue_page(xc_handle, dom, first + bit,
+                                              iov, &iovcnt, &batch) < 0)
+                            goto out;
+                    }
+                    qemu_bitmaps[qemu_non_active][w] = 0;
+                }
+                if (batch >= MAX_BATCH_SIZE) {
+                    ERROR("Sorry, reached MAX_BATCH_SIZE. "
+                          "We will fix this lator.");
+                    goto out;
+                }
+            }
+
+            PPRINTF("batch %d\n", batch);
+
+            /* send pages */
+            iov[0].iov_base = &batch;
+            iov[0].iov_len = sizeof(batch);
+
+            iov[1].iov_base = pfn_type;
+            iov[1].iov_len = sizeof(pfn_type[0]) * batch;
+
+            for (k = 0; k < iovcnt / IOV_MAX + 1; k++) {
+                int count = (iovcnt<IOV_MAX*(k+1))?(iovcnt-IOV_MAX*k):IOV_MAX;
+                if (writev_exact(io_fd, &iov[IOV_MAX * k], count)) {
+                    ERROR("Error when writing pages state file (2--4)"
+                          " (errno %d)", errno);
+                    goto out;
+                }
+            }
+
+            /* Start the next batch from scratch, vector included. */
+            batch = 0;
+            iovcnt = 2;
+        }
+    }
+
+    if (send_hvm_params(xc_handle, io_fd, dom) < 0)
+        goto out;
+    qemu_end_save();
+    if (!debug && send_qemu_image(xc_handle, io_fd, dom) < 0)
+        goto out;
+    qemu_image_sent();
+
+    /* Zero terminate */
+    i = 0;
+    if ( write_exact(io_fd, &i, sizeof(int)) )
+    {
+        PERROR("Error when writing to state file (6')");
+        goto out;
+    }
+
+    if (send_hvm_context(xc_handle, io_fd, ring, dom) < 0)
+        goto out;
+
+    if (!debug)
+    {
+        int rcv_status;
+        if ( read_exact(io_fd, &rcv_status, sizeof(int))) {
+            ERROR("Error when reading receiver status");
+            goto out;
+        }
+    }
+
+    rc = 0;
+out:
+
+    return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r b249f3e979a5 -r 06b950859c92 tools/xcutils/xc_kemari_save.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xcutils/xc_kemari_save.c Tue Mar 24 15:11:38 2009 +0900
@@ -0,0 +1,525 @@
+/*
+ * xc_kemari_save.c
+ *
+ * Save the state of a running Linux session.
+ *
+ * Copyright (c) 2008-2009 Nippon Telegraph and Telephone Corporation.
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License. See the file "COPYING" in the main directory of
+ * this archive for more details.
+ *
+ * This source code is based on xc_save.c.
+ * Copied qemu_destroy_buffer and init_qemu_maps from xc_save.c.
+ *
+ * Copyright (C) 2005 by Christian Limpach
+ *
+ */
+
+
+#include <err.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+
+#include <xs.h>
+#include <xenctrl.h>
+#include <xenguest.h>
+#include <xc_private.h>
+#include <xen/kemari.h>
+
+static volatile sig_atomic_t run = 1;
+static int xc_handle, xce_handle, io_fd;
+static struct kemari_ring *ring = NULL;
+static uint32_t kemari_ring_size = 0;
+static int qemu_port;
+static int is_finalized = 0;
+static int domid;
+
+/* For HVM guests, there are two sources of dirty pages: the Xen shadow
+ * log-dirty bitmap, which we get with a hypercall, and qemu's version.
+ * The protocol for getting page-dirtying data from qemu uses a
+ * double-buffered shared memory interface directly between xc_save and
+ * qemu-dm.
+ *
+ * xc_save calculates the size of the bitmaps and notifies qemu-dm
+ * through the store that it wants to share the bitmaps. qemu-dm then
+ * starts filling in the 'active' buffer.
+ *
+ * To change the buffers over, xc_save writes the other buffer number to
+ * the store and waits for qemu to acknowledge that it is now writing to
+ * the new active buffer. xc_save can then process and clear the old
+ * active buffer. */
+
+static char *qemu_active_path;
+static char *qemu_next_active_path;
+static int qemu_shmid = -1;
+static struct xs_handle *xs;
+
+
+/* Flag the shared-memory segment for removal; does nothing if none exists. */
+static void qemu_destroy_buffer(void)
+{
+    if (qemu_shmid == -1)
+        return;
+
+    shmctl(qemu_shmid, IPC_RMID, NULL);
+    qemu_shmid = -1;
+}
+
+/*
+ * Control bytes shared with qemu-dm; set up by init_qemu_maps() to point just
+ * past the two dirty bitmaps in the shared-memory segment.
+ * NOTE(review): byte 0 = requested buffer, byte 1 = flip acknowledged,
+ * byte 2 = image saved, judging by how the helpers below use them -- confirm
+ * against the qemu-dm side.
+ */
+static char *kemari_qemu_info = NULL;
+/*
+ * Ask qemu to switch to buffer 'next_active': store the buffer number in
+ * byte 0, clear byte 1, then kick qemu over the qemu_port event channel.
+ */
+static void qemu_save_image(int next_active)
+{
+ kemari_qemu_info[0] = next_active;
+ kemari_qemu_info[1] = 0;
+ /* Make both stores visible before the notification is sent. */
+ xen_wmb();
+ xc_evtchn_notify(xce_handle, qemu_port);
+}
+
+/*
+ * Busy-wait until byte 1 of kemari_qemu_info becomes non-zero.
+ * NOTE(review): presumably qemu-dm sets it once the buffer flip requested by
+ * qemu_save_image() is done -- confirm.  There is no timeout.
+ */
+static void qemu_end_flip(void)
+{
+ while (kemari_qemu_info[1] == 0)
+ xen_rmb();
+}
+
+/*
+ * Busy-wait until byte 2 of kemari_qemu_info becomes non-zero.
+ * NOTE(review): presumably qemu-dm sets it when its state image has been
+ * written out -- confirm.  There is no timeout.
+ */
+static void qemu_end_save(void)
+{
+ while (kemari_qemu_info[2] == 0)
+ xen_rmb();
+}
+
+/*
+ * Clear byte 2 of kemari_qemu_info (the flag qemu_end_save() waits on) and
+ * issue a write barrier so the store is published.
+ */
+static void qemu_image_sent(void)
+{
+ /* after QEMU image sent */
+ kemari_qemu_info[2] = 0;
+ xen_wmb();
+}
+
+/*
+ * Create the shared-memory segment used to talk to qemu-dm, publish its key
+ * in xenstore and request that qemu start with buffer 0.
+ *
+ * domid:       domain whose device-model node is used in xenstore.
+ * bitmap_size: size of one bitmap; the segment holds two of them plus one
+ *              extra page, whose start becomes kemari_qemu_info.
+ *
+ * Returns the address at which the segment is attached.  Every failure
+ * terminates the process through errx().
+ */
+static void *init_qemu_maps(int domid, unsigned int bitmap_size)
+{
+    key_t shm_key;
+    char key_text[17] = {0,};
+    void *segment;
+    char *store_path, *leaf;
+    size_t prefix_len;
+
+    /* Draw random keys until one is not already taken. */
+    for ( ; ; ) {
+        shm_key = rand(); /* No security, just a sequence of numbers */
+        qemu_shmid = shmget(shm_key, 2 * bitmap_size + PAGE_SIZE,
+                            IPC_CREAT|IPC_EXCL|S_IRUSR|S_IWUSR);
+        if (qemu_shmid != -1)
+            break;
+        if (errno != EEXIST)
+            errx(1, "can't get shmem to talk to qemu-dm");
+    }
+
+    /* Have the segment removed when the process exits. */
+    atexit(qemu_destroy_buffer);
+
+    /* Attach the segment and start from an all-zero state. */
+    segment = shmat(qemu_shmid, NULL, 0);
+    if (segment == (void *) -1)
+        errx(1, "can't map shmem to talk to qemu-dm");
+    memset(segment, 0, 2 * bitmap_size + PAGE_SIZE);
+
+    /* The first 32 bits carry the bitmap size. */
+    *(uint32_t *)segment = bitmap_size;
+
+    /* Build "/local/domain/0/device-model/<domid>/logdirty/" */
+    xs = xs_daemon_open();
+    if (xs == NULL)
+        errx(1, "Couldn't contact xenstore");
+    store_path = strdup("/local/domain/0/device-model/");
+    if (store_path == NULL)
+        errx(1, "can't get domain path in store");
+    prefix_len = strlen(store_path);
+    store_path = realloc(store_path, prefix_len
+                         + 10
+                         + strlen("/logdirty/next-active") + 1);
+    if (store_path == NULL)
+        errx(1, "no memory for constructing xenstore path");
+    snprintf(store_path + prefix_len, 11, "%i", domid);
+    strcat(store_path, "/logdirty/");
+    leaf = store_path + strlen(store_path);
+
+    /* .../key: the shared-memory key as 16 hex digits. */
+    strcpy(leaf, "key");
+    snprintf(key_text, 17, "%16.16llx", (unsigned long long) shm_key);
+    if (!xs_write(xs, XBT_NULL, store_path, key_text, 16))
+        errx(1, "can't write key (%s) to store path (%s)\n",
+             key_text, store_path);
+
+    /* .../active: watched for qemu's indication of the active buffer. */
+    strcpy(leaf, "active");
+    if (!xs_watch(xs, store_path, "qemu-active-buffer"))
+        errx(1, "can't set watch in store (%s)\n", store_path);
+    qemu_active_path = strdup(store_path);
+    if (qemu_active_path == NULL)
+        errx(1, "no memory for copying xenstore path");
+
+    /* .../next-active: remembered for later use. */
+    strcpy(leaf, "next-active");
+    qemu_next_active_path = strdup(store_path);
+    if (qemu_next_active_path == NULL)
+        errx(1, "no memory for copying xenstore path");
+
+    /* The control bytes live right after the two bitmaps. */
+    kemari_qemu_info = (char *)segment + 2 * bitmap_size;
+    xen_wmb();
+    qemu_save_image(0);
+
+    free(store_path);
+    return segment;
+}
+
+/* Signal handler: clear 'run' so the main event loop stops. */
+static void close_handler(int sig_type)
+{
+    (void)sig_type;
+    run = 0;
+}
+
+/*
+ * Service one pending event-channel notification: fetch the pending port,
+ * run xc_kemari_update() for the domain, then unmask the port again.
+ *
+ * Returns 0 on success and 1 on any failure.  If xc_kemari_update() fails
+ * the domain is paused and the port is left masked.
+ */
+static int handle_event(int domid, unsigned int flags)
+{
+    int pending_port = xc_evtchn_pending(xce_handle);
+
+    if (pending_port < 0) {
+        ERROR("Failed to read from event fd");
+        return 1;
+    }
+
+    if (xc_kemari_update(xc_handle, io_fd, domid, ring, flags,
+                         qemu_save_image, qemu_end_flip, qemu_end_save,
+                         qemu_image_sent) != 0) {
+        xc_domain_pause(xc_handle, domid);
+        ERROR("xc_kemari_update failed");
+        return 1;
+    }
+
+    if (xc_evtchn_unmask(xce_handle, pending_port) < 0) {
+        ERROR("Failed to write to event fd");
+        return 1;
+    }
+
+    return 0;
+}
+
+/* Install 'handler' for SIGQUIT, SIGINT, SIGHUP and SIGTERM. */
+static void set_signal_handler(void (*handler)(int))
+{
+    static const int handled[] = { SIGQUIT, SIGINT, SIGHUP, SIGTERM };
+    struct sigaction sa;
+    unsigned int idx;
+
+    sa.sa_handler = handler;
+    sigemptyset(&sa.sa_mask);
+    sa.sa_flags = 0;
+
+    for (idx = 0; idx < sizeof(handled) / sizeof(handled[0]); idx++)
+        sigaction(handled[idx], &sa, 0);
+}
+
+/*
+ * Attach the event channel of every device listed under
+ * /local/domain/<domid>/device/<dev_class> to Kemari as an outgoing tap.
+ *
+ * xs_handle: open xenstore connection.
+ * dev_class: xenstore device class directory ("vbd" or "vif").
+ * label:     name used for the port in log messages.
+ *
+ * Devices without an "event-channel" node are skipped.  Returns 0 on
+ * success, or the non-zero result of the first failing xc_kemari_control()
+ * call.  A missing device directory terminates the process via errx().
+ */
+static int attach_device_class(struct xs_handle *xs_handle, int domid,
+                               const char *dev_class, const char *label)
+{
+    char path[128];
+    char **list;
+    unsigned int list_size, data_size, i;
+    int ret = 0;
+
+    snprintf(path, sizeof(path), "/local/domain/%d/device/%s",
+             domid, dev_class);
+    list = xs_directory(xs_handle, XBT_NULL, path, &list_size);
+    if (list == NULL)
+        errx(1, "xs_directory (%s) failed", path);
+
+    for (i = 0; i < list_size; i++) {
+        char *data;
+        uint32_t port;
+
+        snprintf(path, sizeof(path),
+                 "/local/domain/%d/device/%s/%s/event-channel",
+                 domid, dev_class, list[i]);
+        data = xs_read(xs_handle, XBT_NULL, path, &data_size);
+        if (data == NULL)
+            continue;
+        port = strtoul(data, NULL, 10);
+        /* Release the value right away so no exit path can leak it. */
+        free(data);
+
+        ret = xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_attach,
+                                &port, NULL, NULL, KEMARI_TAP_OUT);
+        if (ret != 0) {
+            ERROR("Error when attaching %s (%u) on kemari", label, port);
+            break;
+        }
+        DPRINTF("%s %u attached\n", label, port);
+    }
+
+    free(list);
+    return ret;
+}
+
+/*
+ * Attach all block (vbd) and network (vif) event channels of 'domid' to
+ * Kemari.
+ *
+ * Returns 0 when every port was attached, otherwise the non-zero result of
+ * the failing xc_kemari_control() call.  Failure to reach xenstore or to
+ * list a device directory terminates the process via errx().
+ */
+static int attach_ports(int domid)
+{
+    struct xs_handle *xs_handle;
+    int ret;
+
+    if ((xs_handle = xs_daemon_open()) == NULL)
+        errx(1, "Couldn't contact xenstore");
+
+    ret = attach_device_class(xs_handle, domid, "vbd", "blk_port");
+    if (ret == 0)
+        ret = attach_device_class(xs_handle, domid, "vif", "net_port");
+
+    xs_daemon_close(xs_handle);
+
+    return ret;
+}
+
+/*
+ * Read the event-channel number stored at
+ * /local/domain/<domid>/kemari/event-channel and bind a local interdomain
+ * port to it.
+ *
+ * Returns the local port, or -1 if the xenstore node does not exist.
+ * Failure to reach xenstore or to bind the port terminates the process.
+ */
+static int get_qemu_port(unsigned int domid)
+{
+    struct xs_handle *store;
+    char key[128];
+    char *value;
+    unsigned int value_len;
+    int remote_port, local_port = -1;
+
+    store = xs_daemon_open();
+    if (store == NULL)
+        errx(1, "Couldn't contact xenstore");
+
+    snprintf(key, sizeof(key),
+             "/local/domain/%u/kemari/event-channel", domid);
+
+    value = xs_read(store, XBT_NULL, key, &value_len);
+    if (value != NULL) {
+        remote_port = strtoul(value, NULL, 10);
+        free(value);
+
+        local_port = xc_evtchn_bind_interdomain(xce_handle, DOMID_SELF,
+                                                remote_port);
+        if (local_port < 0)
+            errx(1, "Port assigned by Xen is strange: %d", local_port);
+
+        DPRINTF("qemu_port: %d %d\n", remote_port, local_port);
+    } else {
+        ERROR("Could not find QEMU port for domid %d", domid);
+    }
+
+    xs_daemon_close(store);
+
+    return local_port;
+}
+
+/*
+ * One-shot teardown, safe to call more than once: ignore further signals,
+ * unmap the Kemari ring, switch Kemari and shadow mode off, destroy the
+ * domain if shutdown was requested by a signal (run == 0), and close the
+ * control interface.
+ */
+static void finalize(void)
+{
+    if (is_finalized)
+        return;
+
+    set_signal_handler(SIG_IGN);
+
+    if (ring != NULL)
+        munmap(ring, kemari_ring_size * PAGE_SIZE);
+
+    if (xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_off,
+                          NULL, NULL, NULL, 0) == 0) {
+        DPRINTF("successufully execute KEMARI_OP_off\n");
+    } else {
+        ERROR("Error when turning off kemari");
+    }
+
+    if ( xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_OFF,
+                           NULL, 0, NULL, 0, NULL) < 0 ) {
+        ERROR("Warning - couldn't disable shadow mode");
+    }
+
+    if (run == 0)
+        xc_domain_destroy(xc_handle, domid);
+
+    xc_interface_close(xc_handle);
+
+    is_finalized = 1;
+}
+
+/*
+ * Kemari sender entry point.
+ *
+ * usage: <prog> iofd domid maxit maxf flags
+ *   iofd   file descriptor the checkpoint stream is written to; -1 selects
+ *          test mode (output goes to /dev/null and XCFLAGS_DEBUG is set).
+ *   domid  domain to protect.
+ *   maxit, maxf  accepted for command-line compatibility; not used here.
+ *   flags  XCFLAGS_* bits handed to xc_kemari_save()/xc_kemari_update().
+ *
+ * Enables log-dirty mode and Kemari for the domain, taps its block and
+ * network event channels, sends the initial image with xc_kemari_save() and
+ * then services Kemari events until a signal clears 'run' or an error
+ * occurs.
+ *
+ * Returns 0 on a clean, signal-requested shutdown and non-zero on failure.
+ */
+int
+main(int argc, char **argv)
+{
+    unsigned int flags;
+    int ret = 1;            /* pessimistic until proven otherwise */
+    int evtchn_fd;
+    int port;               /* signed: the bind call reports errors as < 0 */
+    uint32_t kemari_port;
+    uint64_t kemari_mfn;
+    fd_set inset;
+
+    if (argc != 6)
+        errx(1, "usage: %s iofd domid maxit maxf flags", argv[0]);
+
+    xc_handle = xc_interface_open();
+    if (xc_handle < 0)
+        errx(1, "failed to open control interface");
+
+    io_fd = atoi(argv[1]);
+    domid = atoi(argv[2]);
+    flags = atoi(argv[5]);
+
+    set_signal_handler(close_handler);
+    atexit(finalize);
+
+    if (io_fd == -1) /* means test mode */
+    {
+        io_fd = open("/dev/null", O_RDWR);
+        if (io_fd < 0)
+            err(1, "failed to open /dev/null");
+        flags |= XCFLAGS_DEBUG;
+    }
+    else
+    {
+        int one = 1;
+        if (setsockopt(io_fd, IPPROTO_TCP, TCP_NODELAY,
+                       &one, sizeof(one)) < 0) {
+            ERROR("failed to set TCP_NODELAY");
+        }
+    }
+
+    if ((xce_handle = xc_evtchn_open()) < 0) {
+        errx(1, "failed to open control interface");
+    }
+
+    evtchn_fd = xc_evtchn_fd(xce_handle);
+    if (evtchn_fd < 0)
+        errx(1, "failed to get event channel fd");
+
+    if ((qemu_port = get_qemu_port(domid)) < 0)
+        errx(1, "failed to get qemu port");
+
+    if ( xc_shadow_control(xc_handle, domid,
+                           XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                           NULL, 0, NULL, 0, NULL) < 0 )
+    {
+        int frc;
+        /* log-dirty already enabled? There's no test op,
+           so attempt to disable then reenable it */
+        frc = xc_shadow_control(xc_handle, domid, XEN_DOMCTL_SHADOW_OP_OFF,
+                                NULL, 0, NULL, 0, NULL);
+        if ( frc >= 0 )
+        {
+            frc = xc_shadow_control(xc_handle, domid,
+                                    XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+                                    NULL, 0, NULL, 0, NULL);
+        }
+
+        if ( frc < 0 )
+        {
+            err(errno, "Couldn't enable shadow mode (rc %d)", frc);
+        }
+    }
+
+    if (xc_kemari_control(xc_handle, domid, XEN_KEMARI_OP_enable,
+                          &kemari_port, &kemari_ring_size,
+                          &kemari_mfn, 0) != 0) {
+        errx(1, "Error when turning on kemari");
+    }
+
+    DPRINTF("kemari_port=%u, kemari_mfn=%llu, kemari_ring_size=%u\n",
+            kemari_port, (unsigned long long)kemari_mfn, kemari_ring_size);
+
+    if (attach_ports(domid) != 0) {
+        ERROR("attaching port failed ");
+        goto out;
+    }
+
+    port = xc_evtchn_bind_interdomain(xce_handle, domid, kemari_port);
+    if (port < 0) {
+        ERROR("xc_evtchn_bind_interdomain failed ");
+        goto out;
+    }
+
+    if ((ring = xc_map_foreign_range(xc_handle, DOMID_XEN,
+                                     kemari_ring_size * PAGE_SIZE,
+                                     PROT_READ | PROT_WRITE,
+                                     kemari_mfn)) == NULL) {
+        ERROR("xc_map_foreign_range failed");
+        goto out;
+    }
+
+    if (xc_domain_pause(xc_handle, domid) < 0) {
+        ERROR("Domain appears not to have paused");
+        goto out;
+    }
+
+    ret = xc_kemari_save(xc_handle, io_fd, domid, ring, flags,
+                         !!(flags & XCFLAGS_HVM),
+                         &init_qemu_maps);
+    if (ret != 0) {
+        ERROR("xc_kemari_save failed");
+        goto out;
+    }
+
+    if (xc_domain_unpause(xc_handle, domid) < 0) {
+        ERROR("Domain appears not to have unpaused");
+        ret = 1;
+        goto out;
+    }
+
+    DPRINTF("running start");
+
+    while (run) {
+
+        /* select() overwrites the set (and leaves it unspecified on
+         * error), so it has to be rebuilt before every call. */
+        FD_ZERO(&inset);
+        FD_SET(evtchn_fd, &inset);
+
+        if (select(evtchn_fd + 1, &inset, NULL, NULL, NULL) < 0) {
+            if (errno == EINTR)
+                continue;
+            ERROR("Error when waiting events by select()");
+            ret = 1;
+            break;
+        }
+
+        if (!FD_ISSET(evtchn_fd, &inset))
+            continue;
+
+        if ((ret = handle_event(domid, flags)) != 0) {
+            ERROR("Error when handling events");
+            break;
+        }
+
+        if (xc_evtchn_notify(xce_handle, port) < 0) {
+            ERROR("xc_evtchn_notify failed");
+            ret = 1;
+            break;
+        }
+
+        if (xc_domain_unpause(xc_handle, domid) < 0) {
+            ERROR("xc_domain_unpause");
+            ret = 1;
+            break;
+        }
+    }
+
+ out:
+    close(io_fd);
+    finalize();
+
+    return ret;
+}
+
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
^ permalink raw reply [flat|nested] 14+ messages in thread
* [RFC][PATCH 06/13] Kemari: Kemari receiver
2009-03-12 1:14 [RFC][PATCH 00/13] Kemari: updated to the 3.4 unstable tree Yoshiaki Tamura
` (4 preceding siblings ...)
2009-03-12 1:17 ` [RFC][PATCH 05/13] Kemari: Kemari sender Yoshiaki Tamura
@ 2009-03-12 1:18 ` Yoshiaki Tamura
2009-03-12 1:19 ` [RFC][PATCH 07/13] Kemari: add Kemari support to python Yoshiaki Tamura
` (3 subsequent siblings)
9 siblings, 0 replies; 14+ messages in thread
From: Yoshiaki Tamura @ 2009-03-12 1:18 UTC (permalink / raw)
To: xen-devel
Cc: "柳澤佳里(yanagisawa yoshisato)",
Ian Pratt, ian.jackson, Keir Fraser, Stefano Stabellini
This is an updated version of the following patch. It follows the
changes in the live migration code.
http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00375.html
Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp>
Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp>
---
tools/libxc/xc_dom_kemari_restore.c | 727 ++++++++++++++++++++++++++++++++++++
tools/xcutils/xc_kemari_restore.c | 88 ++++
2 files changed, 815 insertions(+)
diff -r b249f3e979a5 -r cf6a910e3663 tools/xcutils/xc_kemari_restore.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/xcutils/xc_kemari_restore.c Wed Mar 11 18:03:47 2009 +0900
@@ -0,0 +1,88 @@
+/*
+ * xc_kemari_restore.c
+ *
+ * Restore the state of a running Linux session.
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License. See the file "COPYING" in the main directory of
+ * this archive for more details.
+ *
+ * This source code is based on xc_restore.c.
+ *
+ * Copyright (C) 2005 by Christian Limpach
+ *
+ */
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+
+#include <err.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include <xenctrl.h>
+#include <xenguest.h>
+#include <xc_private.h>
+
+static int io_fd;
+
+/*
+ * Signal handler: close the input descriptor so that xc_kemari_restore
+ * moves on to the build process.
+ */
+static void close_handler(int sig_type)
+{
+    (void)sig_type;
+    close(io_fd);
+}
+
+/*
+ * Kemari receiver entry point.
+ *
+ * usage: <prog> iofd domid store_evtchn console_evtchn hvm pae apic
+ *
+ * Installs close_handler for SIGHUP/SIGINT, enables TCP_NODELAY on iofd and
+ * runs xc_kemari_restore().  On success prints "store-mfn <n>" (and
+ * "console-mfn <n>" for non-HVM guests) on stdout.  Returns the result of
+ * xc_kemari_restore().
+ */
+int
+main(int argc, char **argv)
+{
+    static const int handled[] = { SIGHUP, SIGINT };
+    unsigned int domid, store_evtchn, console_evtchn;
+    unsigned int hvm, pae, apic;
+    unsigned long store_mfn, console_mfn;
+    struct sigaction sa;
+    unsigned int idx;
+    int handle, status, nodelay = 1;
+
+    if ( argc != 8 )
+        errx(1, "usage: %s iofd domid store_evtchn "
+             "console_evtchn hvm pae apic", argv[0]);
+
+    handle = xc_interface_open();
+    if ( handle < 0 )
+        errx(1, "failed to open control interface");
+
+    io_fd          = atoi(argv[1]);
+    domid          = atoi(argv[2]);
+    store_evtchn   = atoi(argv[3]);
+    console_evtchn = atoi(argv[4]);
+    hvm            = atoi(argv[5]);
+    pae            = atoi(argv[6]);
+    apic           = atoi(argv[7]);
+    (void)apic; /* parsed but not consumed */
+
+    sa.sa_handler = close_handler;
+    sigemptyset(&sa.sa_mask);
+    sa.sa_flags = 0;
+    for ( idx = 0; idx < sizeof(handled) / sizeof(handled[0]); idx++ )
+        sigaction(handled[idx], &sa, 0);
+
+    if ( setsockopt(io_fd, IPPROTO_TCP, TCP_NODELAY,
+                    &nodelay, sizeof(nodelay)) < 0 )
+        DPRINTF("failed to set TCP_NODELAY");
+
+    status = xc_kemari_restore(handle, io_fd, domid, store_evtchn,
+                               &store_mfn, console_evtchn, &console_mfn,
+                               hvm, pae);
+
+    if ( status == 0 )
+    {
+        printf("store-mfn %li\n", store_mfn);
+        if ( !hvm )
+            printf("console-mfn %li\n", console_mfn);
+        fflush(stdout);
+    }
+
+    xc_interface_close(handle);
+
+    return status;
+}
diff -r b249f3e979a5 -r cf6a910e3663 tools/libxc/xc_dom_kemari_restore.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_dom_kemari_restore.c Wed Mar 11 18:03:47 2009 +0900
@@ -0,0 +1,727 @@
+/******************************************************************************
+ * xc_dom_kemari_restore.c
+ *
+ * Restore the state of a guest session for kemari.
+ *
+ * Copyright (c) 2008 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * This source code is based on xc_domain_restore.c.
+ *
+ * Copyright (c) 2003, K A Fraser.
+ * Copyright (c) 2006, Intel Corporation
+ * Copyright (c) 2007, XenSource Inc.
+ */
+
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "xg_private.h"
+#include "xg_save_restore.h"
+#include "xc_dom.h"
+
+#include <xen/hvm/ioreq.h>
+#include <xen/hvm/params.h>
+
+/* number of pfns this guest has (i.e. number of entries in the P2M) */
+static unsigned long p2m_size;
+
+/* number of 'in use' pfns in the guest (i.e. #P2M entries with a valid mfn) */
+static unsigned long nr_pfns;
+
+/* A table mapping each PFN to its new MFN. */
+static xen_pfn_t *p2m = NULL;
+
+/* A table of P2M mappings in the current region */
+static xen_pfn_t *p2m_batch = NULL;
+
+/*
+ * Receive Kemari checkpoints for an HVM guest from io_fd and apply them to
+ * domain 'dom'.
+ *
+ * Stream layout, as consumed below: p2m_size, shared_info_frame, three
+ * magic pfns and the callback irq, followed by any number of checkpoints.
+ * A checkpoint is a sequence of records introduced by an int:
+ *    -1  qemu image (uint32_t length + data)
+ *    -3  EPT identity page-table location
+ *    -4  VM86 TSS location
+ *    >0  batch of that many pages (pfn/type array + page contents)
+ *     0  end of records
+ * then the HVM context (uint32_t length + data).  An int 0 is written back
+ * as acknowledgement before the checkpoint is committed.  The first
+ * checkpoint writes pages straight into the guest; later ones are buffered
+ * in tmp_region and copied in only after the acknowledgement.
+ *
+ * When reading fails after at least one committed checkpoint, the last
+ * committed HVM parameters and context are applied and the qemu image is
+ * written to /var/lib/xen/qemu-save.<dom>.
+ *
+ * xc_handle:      open control interface.
+ * io_fd:          stream descriptor (read for data, written for acks).
+ * dom:            domain to populate.
+ * store_evtchn:   value for HVM_PARAM_STORE_EVTCHN.
+ * store_mfn:      out: receives the xenstore pfn (magic_pfns[2]).
+ * console_evtchn, console_mfn: unused for HVM guests.
+ * hvm:            must be non-zero; PV guests are rejected.
+ * pae:            value for HVM_PARAM_PAE_ENABLED.
+ *
+ * Returns 0 on success and non-zero on failure; on failure the domain is
+ * destroyed (unless dom == 0).
+ */
+int xc_kemari_restore(int xc_handle, int io_fd, uint32_t dom,
+                      unsigned int store_evtchn, unsigned long *store_mfn,
+                      unsigned int console_evtchn, unsigned long *console_mfn,
+                      unsigned int hvm, unsigned int pae)
+{
+    int rc = 1, frc, i, n;
+    unsigned long pfn;
+    unsigned int prev_pc, this_pc;
+
+    /* The new domain's shared-info frame number. */
+    unsigned long shared_info_frame;
+
+    /* A table containing the type of each PFN (/not/ MFN!). */
+    unsigned long *pfn_type = NULL;
+
+    /* A table of MFNs to map in the current region */
+    xen_pfn_t *region_mfn = NULL;
+
+    /* Types of the pfns in the current region */
+    unsigned long region_pfn_type[MAX_BATCH_SIZE];
+
+    /* Our mapping of the current region (batch) */
+    char *region_base;
+
+    /* Magic frames in HVM guests: ioreqs and xenstore comms. */
+    uint64_t magic_pfns[3]; /* ioreq_pfn, bufioreq_pfn, store_pfn */
+
+    /* Temporary buffered memory space until all pages are read. */
+    char *tmp_region = NULL;
+
+    /* if true, go into transaction mode */
+    int kemari_transaction_mode = 0;
+
+    /* index for grant table */
+    int grant_idx = 0;
+
+    /* Callback IRQ */
+    uint64_t callback_irq = 0;
+
+    /* active and non-active id of flip buffer */
+    int info_active = 0, info_non_active = 1;
+
+    /* Buffer for holding HVM context */
+    uint8_t *hvm_buf[2] = {NULL,NULL};
+    uint32_t hvm_buf_size = 0;
+
+    /* Buffer for qemu image */
+    uint8_t *qemu_image[2] = {NULL,NULL};
+    uint32_t qemu_image_size[2] = {0,0};
+    uint32_t qemu_buff_size = 0;
+
+    /* Buffer for the EPT identity PT location. */
+    uint64_t ident_pt[2] = {0,0};
+    /* Buffer for the VM86 TSS. */
+    uint64_t vm86_tss[2] = {0,0};
+
+    if ( !hvm ) {
+        ERROR("Kemari only works on HVM domain.");
+        goto out;
+    }
+
+    /* For info only */
+    nr_pfns = 0;
+
+    if ( read_exact(io_fd, &p2m_size, sizeof(unsigned long)) )
+    {
+        ERROR("read: p2m_size");
+        goto out;
+    }
+    DPRINTF("xc_kemari_restore start: p2m_size = %lx\n", p2m_size);
+
+    /* p2m_size is used as a divisor for the progress display below. */
+    if ( p2m_size == 0 )
+    {
+        ERROR("invalid p2m_size");
+        goto out;
+    }
+
+    /* We want zeroed memory so use calloc rather than malloc. */
+    p2m        = calloc(p2m_size, sizeof(xen_pfn_t));
+    pfn_type   = calloc(p2m_size, sizeof(unsigned long));
+
+    region_mfn = xg_memalign(PAGE_SIZE, ROUNDUP(
+                              MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
+    p2m_batch  = xg_memalign(PAGE_SIZE, ROUNDUP(
+                              MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
+
+    /* use aligned page for speed up memmove(3) */
+    tmp_region = xg_memalign(PAGE_SIZE, PAGE_SIZE * MAX_BATCH_SIZE);
+
+    if ( (p2m == NULL) || (pfn_type == NULL) ||
+         (region_mfn == NULL) || (p2m_batch == NULL) ||
+         (tmp_region == NULL) )
+    {
+        ERROR("memory alloc failed");
+        errno = ENOMEM;
+        goto out;
+    }
+
+    memset(region_mfn, 0,
+           ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
+    memset(p2m_batch, 0,
+           ROUNDUP(MAX_BATCH_SIZE * sizeof(xen_pfn_t), PAGE_SHIFT));
+    memset(tmp_region, 0, PAGE_SIZE * MAX_BATCH_SIZE);
+
+    if ( lock_pages(region_mfn, sizeof(xen_pfn_t) * MAX_BATCH_SIZE) )
+    {
+        ERROR("Could not lock region_mfn");
+        goto out;
+    }
+
+    if ( lock_pages(p2m_batch, sizeof(xen_pfn_t) * MAX_BATCH_SIZE) )
+    {
+        ERROR("Could not lock p2m_batch");
+        goto out;
+    }
+
+    /* tmp_region holds whole pages, so lock its full extent. */
+    if ( lock_pages(tmp_region, PAGE_SIZE * MAX_BATCH_SIZE) )
+    {
+        ERROR("Could not lock tmp_region");
+        goto out;
+    }
+
+    /* Get the domain's shared-info frame. */
+    if ( read_exact(io_fd, &shared_info_frame, sizeof(unsigned long)))
+    {
+        ERROR("Error when reading shared_info_frame");
+        goto out;
+    }
+    DPRINTF("xc_kemari_restore: shared_info_frame: %lx\n", shared_info_frame);
+
+    /* read HVM-specific parameters */
+    if ( read_exact(io_fd, magic_pfns, sizeof(magic_pfns)) )
+    {
+        ERROR("error reading magic page addresses");
+        goto out;
+    }
+
+    if (read_exact(io_fd, &callback_irq, sizeof(callback_irq)))
+    {
+        ERROR("error reading callback irq");
+        goto out;
+    }
+
+    /* Mark all PFNs as invalid; we allocate on demand */
+    for ( pfn = 0; pfn < p2m_size; pfn++ )
+        p2m[pfn] = INVALID_P2M_ENTRY;
+
+    /*
+     * Now simply read each saved frame into its new machine frame.
+     * We uncanonicalise page tables as we go.
+     */
+    prev_pc = 0;
+
+    n = 0;
+    for ( ; ; )
+    {
+        int num_pages;
+        int nr_mfns;
+
+        num_pages = 0;
+        for ( ; ; ) {
+            int j;
+
+            this_pc = (n * 100) / p2m_size;
+            if ( (this_pc - prev_pc) >= 5 )
+            {
+                PPRINTF("\b\b\b\b%3d%%", this_pc);
+                prev_pc = this_pc;
+            }
+
+            if ( read_exact(io_fd, &j, sizeof(int)) )
+            {
+                ERROR("Error when reading batch size");
+                goto build;
+            }
+
+            PPRINTF("batch %d\n",j);
+
+            if (j == -1)
+            {
+                uint32_t rec_size;
+                if ( read_exact(io_fd, &rec_size, sizeof(uint32_t)) )
+                {
+                    ERROR("error read the qemu file size");
+                    goto build;
+                }
+
+                if (qemu_buff_size < rec_size)
+                {
+                    int b;
+
+                    /* Grow both flip buffers; go through a temporary so
+                     * the old buffer is still freed at 'out' on failure. */
+                    for (b = 0; b < 2; b++)
+                    {
+                        uint8_t *grown = realloc(qemu_image[b], rec_size);
+                        if (grown == NULL)
+                        {
+                            ERROR("error allocate memory");
+                            goto out;
+                        }
+                        qemu_image[b] = grown;
+                    }
+                    qemu_buff_size = rec_size;
+                }
+
+                qemu_image_size[info_non_active] = rec_size;
+                if ( read_exact(io_fd, qemu_image[info_non_active],
+                                qemu_image_size[info_non_active]) )
+                {
+                    ERROR("error read the qemu image file");
+                    goto build;
+                }
+
+                continue;
+            }
+
+            if ( j == -3 )
+            {
+                /* Skip padding 4 bytes then read the EPT identity PT location. */
+                if ( read_exact(io_fd, &ident_pt[info_non_active],
+                                sizeof(uint32_t)) ||
+                     read_exact(io_fd, &ident_pt[info_non_active],
+                                sizeof(uint64_t)) )
+                {
+                    ERROR("error read the address of the EPT identity map");
+                    goto build;
+                }
+
+                continue;
+            }
+
+            if ( j == -4 )
+            {
+                /* Skip padding 4 bytes then read the vm86 TSS location. */
+                if ( read_exact(io_fd, &vm86_tss[info_non_active],
+                                sizeof(uint32_t)) ||
+                     read_exact(io_fd, &vm86_tss[info_non_active],
+                                sizeof(uint64_t)) )
+                {
+                    ERROR("error read the address of the vm86 TSS");
+                    /* A stream failure is handled like in every other
+                     * record: fall back to the last committed state. */
+                    goto build;
+                }
+
+                continue;
+            }
+
+            if ( j == 0 )
+                break;  /* our work here is done */
+
+            /* j > 0: Read pages here */
+            if ( (j > MAX_BATCH_SIZE) || (j < 0) )
+            {
+                ERROR("Max batch size exceeded. Giving up. %d", j);
+                goto out;
+            }
+
+            if ( read_exact(io_fd, region_pfn_type, j*sizeof(unsigned long)) )
+            {
+                ERROR("Error when reading region pfn types");
+                goto build;
+            }
+
+            if (kemari_transaction_mode) {
+                if (num_pages != 0)
+                {
+                    ERROR("Sorry!  You cannot execute page-send-phase "
+                          "twice.  We will fix this bug in the future.");
+                    DPRINTF("Sorry\n");
+                    goto out;
+                }
+                num_pages = j;
+
+                /* Since there are not invalid pages, we don't need to skip */
+                if ( read_exact(io_fd, tmp_region, PAGE_SIZE * num_pages) )
+                {
+                    ERROR("Error when reading page at kemari transaction mode");
+                    goto build;
+                }
+
+                continue;
+            }
+
+            /* Normal mode */
+            /* First pass for this batch: work out how much memory to alloc */
+            nr_mfns = 0;
+            for ( i = 0; i < j; i++ )
+            {
+                unsigned long pfn, pagetype;
+                pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+                    continue;
+
+                /* Validate before p2m[] is indexed with a stream value. */
+                if ( pfn >= p2m_size )
+                {
+                    ERROR("pfn out of range");
+                    goto out;
+                }
+
+                if ( p2m[pfn] == INVALID_P2M_ENTRY )
+                {
+                    /* Have a live PFN which hasn't had an MFN allocated */
+                    p2m_batch[nr_mfns++] = pfn;
+                    p2m[pfn]--;
+                }
+            }
+
+            /* Now allocate a bunch of mfns for this batch */
+            if ( nr_mfns &&
+                 (xc_domain_memory_populate_physmap(xc_handle, dom, nr_mfns, 0,
+                                                    0, p2m_batch) != 0) )
+            {
+                ERROR("Failed to allocate memory for batch.! %d\n", nr_mfns);
+                for (i = 0; i < nr_mfns; i++)
+                    DPRINTF("p2m_batch[%d] = %lx\n", i, p2m_batch[i]);
+                errno = ENOMEM;
+                goto out;
+            }
+
+            /* set special pages */
+            {
+                struct xen_add_to_physmap xatp;
+                for (i = 0; i < nr_mfns; i++)
+                    if (p2m_batch[i] == shared_info_frame) {
+                        xatp.domid = dom;
+                        xatp.space = XENMAPSPACE_shared_info;
+                        xatp.idx = 0;
+                        xatp.gpfn = shared_info_frame;
+                        DPRINTF("setting up shared_info_frame: %lu\n",
+                                shared_info_frame);
+                        if (xc_memory_op(xc_handle, XENMEM_add_to_physmap, &xatp)
+                            != 0)
+                        {
+                            ERROR("Error setting shared_info_frame");
+                            goto out;
+                        }
+                    } else if ((p2m_batch[i] > shared_info_frame)
+                               && (p2m_batch[i] <= shared_info_frame + 32)) {
+                        xatp.domid = dom;
+                        xatp.space = XENMAPSPACE_grant_table;
+                        xatp.idx = grant_idx;
+                        xatp.gpfn = p2m_batch[i];
+                        DPRINTF("grant[%d]: %lu\n", grant_idx, xatp.gpfn);
+                        if (xc_memory_op(xc_handle, XENMEM_add_to_physmap,
+                                         &xatp) != 0)
+                        {
+                            PERROR("Cannot map grant table pfn: %lu", xatp.gpfn);
+                            goto out;
+                        }
+                        grant_idx++;
+                    }
+            }
+
+            /* Second pass for this batch: update p2m[] and region_mfn[] */
+            nr_mfns = 0;
+            for ( i = 0; i < j; i++ )
+            {
+                unsigned long pfn, pagetype;
+                pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+                    region_mfn[i] = ~0UL; /* map will fail but we don't care */
+                else
+                {
+                    if ( p2m[pfn] == (INVALID_P2M_ENTRY-1) )
+                    {
+                        /* We just allocated a new mfn above; update p2m */
+                        p2m[pfn] = p2m_batch[nr_mfns++];
+                        nr_pfns++;
+                    }
+
+                    /* setup region_mfn[] for batch map.
+                     * For HVM guests, this interface takes PFNs, not MFNs */
+                    region_mfn[i] = pfn;
+                }
+            }
+
+            /* Map relevant mfns */
+            region_base = xc_map_foreign_batch(
+                xc_handle, dom, PROT_WRITE, region_mfn, j);
+
+            if ( region_base == NULL )
+            {
+                ERROR("map batch failed");
+                goto out;
+            }
+
+            for ( i = 0; i < j; i++ )
+            {
+                void *page;
+                unsigned long pagetype;
+                pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+                    /* a bogus/unmapped page: skip it */
+                    continue;
+
+                if ( pfn >= p2m_size )
+                {
+                    ERROR("pfn out of range");
+                    munmap(region_base, j*PAGE_SIZE);
+                    goto out;
+                }
+
+                pfn_type[pfn] = pagetype;
+
+                page = region_base + i*PAGE_SIZE;
+
+                if ( read_exact(io_fd, page, PAGE_SIZE) )
+                {
+                    ERROR("Error when reading page (type was %lx)", pagetype);
+                    munmap(region_base, j*PAGE_SIZE);
+                    goto out;
+                }
+
+                pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
+
+                if ( (pagetype >= XEN_DOMCTL_PFINFO_L1TAB) &&
+                     (pagetype <= XEN_DOMCTL_PFINFO_L4TAB) )
+                {
+                    DPRINTF("uncanonicalize_pagetable pagetype = %lx pfn = %lu\n", pagetype, pfn);
+                }
+                else if ( pagetype != XEN_DOMCTL_PFINFO_NOTAB )
+                {
+                    ERROR("Bogus page type %lx page table is out of range: "
+                          "i=%d p2m_size=%lu", pagetype, i, p2m_size);
+                    munmap(region_base, j*PAGE_SIZE);
+                    goto out;
+
+                }
+            } /* end of 'batch' for loop */
+
+            munmap(region_base, j*PAGE_SIZE);
+            n+= j; /* crude stats */
+        }
+
+        /* HVM specific */
+        {
+            uint32_t rec_len;
+
+            /* Read HVM context */
+            if ( read_exact(io_fd, &rec_len, sizeof(uint32_t)) )
+            {
+                ERROR("error read hvm context size!\n");
+                goto build;
+            }
+
+            if (rec_len != hvm_buf_size)
+            {
+                if (hvm_buf[info_non_active] == NULL)
+                { /* hvm_buf will be reused. */
+                    hvm_buf_size = rec_len;
+                    hvm_buf[0] = malloc(hvm_buf_size);
+                    hvm_buf[1] = malloc(hvm_buf_size);
+                    if ( hvm_buf[0] == NULL || hvm_buf[1] == NULL)
+                    {
+                        ERROR("memory alloc for hvm context buffer failed");
+                        errno = ENOMEM;
+                        goto out;
+                    }
+                } else {
+                    ERROR("Sorry, we did not thought about HVM image size "
+                          "change.");
+                    goto out;
+                }
+            }
+
+            if ( read_exact(io_fd, hvm_buf[info_non_active], hvm_buf_size) )
+            {
+                ERROR("error loading the HVM context");
+                goto build;
+            }
+        }
+
+        /*
+         * Commit!
+         */
+        {
+            int zero = 0;
+
+            if ( write_exact(io_fd, &zero, sizeof(int))) {
+                ERROR("Error when replying to sender (errno %d)", errno);
+                goto out;
+            }
+        }
+
+        /* commit pages */
+        if (kemari_transaction_mode && num_pages > 0)
+        {
+            int nr_mfns;
+            /* First pass for this batch: work out how much memory to alloc */
+            nr_mfns = 0;
+            for ( i = 0; i < num_pages; i++ )
+            {
+                unsigned long pfn, pagetype;
+                pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB )
+                    continue;
+
+                /* Validate before p2m[] is indexed with a stream value. */
+                if ( pfn >= p2m_size )
+                {
+                    ERROR("pfn out of range");
+                    goto out;
+                }
+
+                if ( p2m[pfn] == INVALID_P2M_ENTRY )
+                {
+                    /* Have a live PFN which hasn't had an MFN allocated */
+                    p2m_batch[nr_mfns++] = pfn;
+                    p2m[pfn]--;
+                    DPRINTF("Cannot be occur!!! no map for pfn: %lu\n", pfn);
+                }
+            }
+
+            /* Now allocate a bunch of mfns for this batch */
+            if ( nr_mfns &&
+                 (xc_domain_memory_populate_physmap(xc_handle, dom, nr_mfns, 0,
+                                                    0, p2m_batch) != 0) )
+            {
+                ERROR("Failed to allocate memory for batch.!\n");
+                errno = ENOMEM;
+                goto out;
+            }
+
+            /* Second pass for this batch: update p2m[] and region_mfn[] */
+            nr_mfns = 0;
+            for ( i = 0; i < num_pages; i++ )
+            {
+                unsigned long pfn, pagetype;
+                pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                if ( pagetype == XEN_DOMCTL_PFINFO_XTAB ) {
+                    DPRINTF("pfn %lu = XEN_DOMCTL_PFINFO_XTAB\n", pfn);
+                    region_mfn[i] = ~0UL; /* map will fail but we don't care */
+                }
+                else
+                {
+                    if ( p2m[pfn] == (INVALID_P2M_ENTRY-1) )
+                    {
+                        /* We just allocated a new mfn above; update p2m */
+                        p2m[pfn] = p2m_batch[nr_mfns++];
+                        nr_pfns++;
+                    }
+
+                    /* setup region_mfn[] for batch map.
+                     * For HVM guests, this interface takes PFNs, not MFNs */
+                    region_mfn[i] = pfn;
+                }
+            }
+
+            /* Map relevant mfns */
+            region_base = xc_map_foreign_batch(
+                xc_handle, dom, PROT_WRITE, region_mfn, num_pages);
+
+            if ( region_base == NULL )
+            {
+                ERROR("map batch failed");
+                goto out;
+            }
+
+            for ( i = 0; i < num_pages; i++ )
+            {
+                void *page, *spage;
+                unsigned long pagetype;
+
+                pfn      = region_pfn_type[i] & ~XEN_DOMCTL_PFINFO_LTAB_MASK;
+                pagetype = region_pfn_type[i] &  XEN_DOMCTL_PFINFO_LTAB_MASK;
+
+                if ( pfn >= p2m_size )
+                {
+                    ERROR("pfn out of range");
+                    munmap(region_base, num_pages*PAGE_SIZE);
+                    goto out;
+                }
+
+                pfn_type[pfn] = pagetype;
+
+                page = region_base + i*PAGE_SIZE;
+                spage = tmp_region + i*PAGE_SIZE;
+
+                if ( !memmove(page, spage, PAGE_SIZE) )
+                {
+                    ERROR("Error when reading page (type was %lx)", pagetype);
+                    munmap(region_base, num_pages*PAGE_SIZE);
+                    goto out;
+                }
+
+            } /* end of 'batch' for loop */
+
+            munmap(region_base, num_pages*PAGE_SIZE);
+            num_pages = 0; /* clear num_pages for refill */
+        }
+
+        /* commit HVM specific status */
+        info_active = info_non_active;
+        info_non_active = info_active ? 0 : 1;
+
+        /* HVM success! */
+        rc = 0;
+        kemari_transaction_mode = 1;
+    }
+
+ build: /* building HVM context */
+    DPRINTF("building status %d\n", rc);
+    if (rc == 0)
+    {
+        FILE *qemu_fp;
+        char path[128];
+
+        /* set the EPT identity PT location */
+        xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
+                         ident_pt[info_active]);
+        xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS,
+                         vm86_tss[info_active]);
+
+        if ( (frc = xc_set_hvm_param(xc_handle, dom,
+                                     HVM_PARAM_IOREQ_PFN, magic_pfns[0]))
+             || (frc = xc_set_hvm_param(xc_handle, dom,
+                                        HVM_PARAM_BUFIOREQ_PFN, magic_pfns[1]))
+             || (frc = xc_set_hvm_param(xc_handle, dom,
+                                        HVM_PARAM_STORE_PFN, magic_pfns[2]))
+             || (frc = xc_set_hvm_param(xc_handle, dom,
+                                        HVM_PARAM_PAE_ENABLED, pae))
+             || (frc = xc_set_hvm_param(xc_handle, dom,
+                                        HVM_PARAM_STORE_EVTCHN,
+                                        store_evtchn))
+             || (frc = xc_set_hvm_param(xc_handle, dom,
+                                        HVM_PARAM_CALLBACK_IRQ,
+                                        callback_irq)) )
+        {
+            ERROR("error setting HVM params: %i", frc);
+            rc = 3;
+            goto out;
+        }
+        *store_mfn = magic_pfns[2];
+        DPRINTF("kemari_restore: magic_pfns 0: %lld, 1: %lld, 2: %lld\n",
+                magic_pfns[0], magic_pfns[1], magic_pfns[2]);
+
+        frc = xc_domain_hvm_setcontext(xc_handle, dom, hvm_buf[info_active],
+                                       hvm_buf_size);
+        if ( frc )
+        {
+            ERROR("error setting the HVM context");
+            rc = 4;
+            goto out;
+        }
+
+        if (qemu_image_size[info_active] == 0)
+        {
+            ERROR("Did not received QEMU image");
+            rc = 5;
+            goto out;
+        }
+        snprintf(path, sizeof(path), "/var/lib/xen/qemu-save.%d", dom);
+        if ((qemu_fp = fopen(path, "w")) == NULL)
+        {
+            ERROR("error opening QEMU image");
+            rc = 5;
+            goto out;
+        }
+        if (fwrite(qemu_image[info_active], qemu_image_size[info_active],
+                   1, qemu_fp) != 1)
+        {
+            ERROR("error writing QEMU image");
+            fclose(qemu_fp);
+            rc = 5;
+            goto out;
+        }
+        /* Buffered data is flushed here, so a failure still means the
+         * image on disk is incomplete. */
+        if (fclose(qemu_fp) != 0)
+        {
+            ERROR("error writing QEMU image");
+            rc = 5;
+            goto out;
+        }
+    }
+
+ out:
+    if ( (rc != 0) && (dom != 0) )
+        xc_domain_destroy(xc_handle, dom);
+    free(p2m);
+    free(pfn_type);
+    free(region_mfn);
+    free(p2m_batch);
+    free(tmp_region);
+    free(hvm_buf[0]);
+    free(hvm_buf[1]);
+    free(qemu_image[0]);
+    free(qemu_image[1]);
+
+    /* discard cache for save file */
+    discard_file_cache(io_fd, 1 /*flush*/);
+
+    DPRINTF("Restore exit with rc=%d\n", rc);
+
+    return rc;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
+
^ permalink raw reply [flat|nested] 14+ messages in thread
* [RFC][PATCH 07/13] Kemari: add Kemari support to python
2009-03-12 1:14 [RFC][PATCH 00/13] Kemari: updated to the 3.4 unstable tree Yoshiaki Tamura
` (5 preceding siblings ...)
2009-03-12 1:18 ` [RFC][PATCH 06/13] Kemari: Kemari receiver Yoshiaki Tamura
@ 2009-03-12 1:19 ` Yoshiaki Tamura
2009-03-12 1:19 ` [RFC][PATCH 08/13] Kemari: add dev state "Attached" " Yoshiaki Tamura
` (2 subsequent siblings)
9 siblings, 0 replies; 14+ messages in thread
From: Yoshiaki Tamura @ 2009-03-12 1:19 UTC (permalink / raw)
To: xen-devel
Cc: "柳澤佳里(yanagisawa yoshisato)",
Ian Pratt, ian.jackson, Keir Fraser, Stefano Stabellini
This is an updated version of the following patch. It follows the
changes made to the live migration code.
http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00376.html
Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp>
Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp>
---
tools/python/xen/xend/XendAPI.py | 3 -
tools/python/xen/xend/XendCheckpoint.py | 86 +++++++++++++++++++++++++++-----
tools/python/xen/xend/XendDomain.py | 6 +-
tools/python/xen/xm/migrate.py | 10 ++-
4 files changed, 88 insertions(+), 17 deletions(-)
diff -r b249f3e979a5 -r cf6a910e3663 tools/python/xen/xm/migrate.py
--- a/tools/python/xen/xm/migrate.py Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/python/xen/xm/migrate.py Wed Mar 11 18:03:47 2009 +0900
@@ -51,6 +51,10 @@
fn=set_true, default=None,
use="Use ssl connection for migration.")
+gopts.opt('kemari', short='k',
+ fn=set_true, default=None,
+ use="Use the Kemari fault tolerant migration.")
+
def help():
return str(gopts)
@@ -70,7 +74,8 @@
other_config = {
"port": opts.vals.port,
"node": opts.vals.node,
- "ssl": opts.vals.ssl
+ "ssl": opts.vals.ssl,
+ "kemari": opts.vals.kemari
}
server.xenapi.VM.migrate(vm_ref, dst, bool(opts.vals.live),
other_config)
@@ -78,4 +83,5 @@
server.xend.domain.migrate(dom, dst, opts.vals.live,
opts.vals.port,
opts.vals.node,
- opts.vals.ssl)
+ opts.vals.ssl,
+ opts.vals.kemari)
diff -r b249f3e979a5 -r cf6a910e3663 tools/python/xen/xend/XendAPI.py
--- a/tools/python/xen/xend/XendAPI.py Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/python/xen/xend/XendAPI.py Wed Mar 11 18:03:47 2009 +0900
@@ -1797,9 +1797,10 @@
port = other_config.get("port", 0)
node = other_config.get("node", -1)
ssl = other_config.get("ssl", None)
+ kemari = other_config.get("kemari", None)
xendom.domain_migrate(xeninfo.getDomid(), destination_url,
- bool(live), port, node, ssl)
+ bool(live), port, node, ssl, kemari)
return xen_api_success_void()
def VM_save(self, _, vm_ref, dest, checkpoint):
diff -r b249f3e979a5 -r cf6a910e3663 tools/python/xen/xend/XendCheckpoint.py
--- a/tools/python/xen/xend/XendCheckpoint.py Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/python/xen/xend/XendCheckpoint.py Wed Mar 11 18:03:47 2009 +0900
@@ -29,6 +29,8 @@
dm_batch = 512
XC_SAVE = "xc_save"
XC_RESTORE = "xc_restore"
+XC_KEMARI_SAVE = "xc_kemari_save"
+XC_KEMARI_RESTORE = "xc_kemari_restore"
sizeof_int = calcsize("i")
@@ -64,8 +66,15 @@
list.insert (i+1, value)
return
+def get_dev_info(info, n):
+ i = 0
+ while i < len(info):
+ if (info[i][0] == n):
+ return [n, info[i][1]]
+ i = i + 1
+ return [n, '']
-def save(fd, dominfo, network, live, dst, checkpoint=False, node=-1):
+def save(fd, dominfo, network, live, dst, checkpoint=False, node=-1, kemari=False):
try:
if not os.path.isdir("/var/lib/xen"):
os.makedirs("/var/lib/xen")
@@ -76,6 +85,30 @@
write_exact(fd, SIGNATURE, "could not write guest state file: signature")
sxprep = dominfo.sxpr()
+
+ # Add kemari option if enabled.
+ if kemari:
+ sxprep.append(['kemari', kemari])
+ pv_devlist = []
+ pv_devs = dominfo.getDeviceSxprs('vbd')
+ for x in pv_devs:
+ devinfo = []
+ for n in ['event-channel', 'ring-ref']:
+ devinfo.append(get_dev_info(x[1], n))
+ pv_devlist.append([x[0], devinfo])
+ pv_devs = dominfo.getDeviceSxprs('vif')
+ for x in pv_devs:
+ devinfo = []
+ for n in ['event-channel', 'tx-ring-ref', 'rx-ring-ref',
+ 'request-rx-copy', 'feature-rx-notify', 'feature-sg',
+ 'feature-gso-tcpv4']:
+ devinfo.append(get_dev_info(x[1], n))
+ pv_devlist.append([x[0], devinfo])
+ sxprep.append(['kemari-device-info', pv_devlist])
+
+ # Add kemari option if enabled.
+ if kemari:
+ sxprep.append(['kemari', kemari])
if node > -1:
insert_after(sxprep,'vcpus',['node', str(node)])
@@ -104,9 +137,17 @@
# enabled. Passing "0" simply uses the defaults compiled into
# libxenguest; see the comments and/or code in xc_linux_save() for
# more information.
- cmd = [xen.util.auxbin.pathTo(XC_SAVE), str(fd),
- str(dominfo.getDomid()), "0", "0",
- str(int(live) | (int(hvm) << 2)) ]
+ if kemari:
+ if not hvm:
+ raise XendError("You can only use kemari on HVM domain.")
+
+ cmd = [xen.util.auxbin.pathTo(XC_KEMARI_SAVE), str(fd),
+ str(dominfo.getDomid()), "0", "0",
+ str(int(live) | (int(hvm) << 2)) ]
+ else:
+ cmd = [xen.util.auxbin.pathTo(XC_SAVE), str(fd),
+ str(dominfo.getDomid()), "0", "0",
+ str(int(live) | (int(hvm) << 2)) ]
log.debug("[xc_save]: %s", string.join(cmd))
def saveInputHandler(line, tochild):
@@ -132,7 +173,7 @@
forkHelper(cmd, fd, saveInputHandler, False)
# put qemu device model state
- if os.path.exists("/var/lib/xen/qemu-save.%d" % dominfo.getDomid()):
+ if not kemari and os.path.exists("/var/lib/xen/qemu-save.%d" % dominfo.getDomid()):
write_exact(fd, QEMU_SIGNATURE, "could not write qemu signature")
qemu_fd = os.open("/var/lib/xen/qemu-save.%d" % dominfo.getDomid(),
os.O_RDONLY)
@@ -198,6 +239,16 @@
raise XendError("not a valid guest state file: config parse")
vmconfig = p.get_val()
+
+ # Checks if kemari is enabled or not.
+ # Since Xen do not know kemari option, this option will not be migrated.
+ is_kemari = False
+ kemari_device_info = []
+ for v in vmconfig:
+ if v[0] == 'kemari' and v[1]:
+ is_kemari = True
+ if v[0] == 'kemari-device-info' and v[1]:
+ kemari_device_info = v[1]
if not relocating:
domconfig = XendConfig(sxp_obj = vmconfig)
@@ -272,14 +323,21 @@
shadow_cur = xc.shadow_mem_control(dominfo.getDomid(), shadow / 1024)
dominfo.info['shadow_memory'] = shadow_cur
- cmd = map(str, [xen.util.auxbin.pathTo(XC_RESTORE),
- fd, dominfo.getDomid(),
- store_port, console_port, int(is_hvm), pae, apic])
+ if is_kemari:
+ cmd = map(str, [xen.util.auxbin.pathTo(XC_KEMARI_RESTORE),
+ fd, dominfo.getDomid(),
+ store_port, console_port, int(is_hvm), pae, apic])
+ else:
+ cmd = map(str, [xen.util.auxbin.pathTo(XC_RESTORE),
+ fd, dominfo.getDomid(),
+ store_port, console_port, int(is_hvm), pae, apic])
log.debug("[xc_restore]: %s", string.join(cmd))
handler = RestoreInputHandler()
forkHelper(cmd, fd, handler.handler, True)
+ if is_kemari:
+ os.close(fd)
# We don't want to pass this fd to any other children -- we
# might need to recover the disk space that backs it.
@@ -299,7 +357,7 @@
# get qemu state and create a tmp file for dm restore
# Even PV guests may have QEMU stat, but its not currently
# used so only bother with HVM currently.
- if is_hvm:
+ if is_hvm and not is_kemari:
qemu_signature = read_exact(fd, len(QEMU_SIGNATURE),
"invalid device model signature read")
if qemu_signature != QEMU_SIGNATURE:
@@ -318,7 +376,10 @@
restore_image.setCpuid()
- os.read(fd, 1) # Wait for source to close connection
+ if is_kemari:
+ restore_image.setCpuid()
+ else:
+ os.read(fd, 1) # Wait for source to close connection
dominfo.completeRestore(handler.store_mfn, handler.console_mfn)
@@ -336,7 +397,10 @@
lock = False;
try:
- dominfo.waitForDevices() # Wait for backends to set up
+ if is_kemari:
+ dominfo.waitForAttachedDevices(kemari_device_info)
+ else:
+ dominfo.waitForDevices() # Wait for backends to set up
except Exception, exn:
log.exception(exn)
diff -r b249f3e979a5 -r cf6a910e3663 tools/python/xen/xend/XendDomain.py
--- a/tools/python/xen/xend/XendDomain.py Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/python/xen/xend/XendDomain.py Wed Mar 11 18:03:47 2009 +0900
@@ -1273,7 +1273,7 @@
return val
- def domain_migrate(self, domid, dst, live=False, port=0, node=-1, ssl=None):
+ def domain_migrate(self, domid, dst, live=False, port=0, node=-1, ssl=None, kemari=None):
"""Start domain migration.
@param domid: Domain ID or Name
@@ -1338,7 +1338,7 @@
try:
XendCheckpoint.save(p2cwrite, dominfo, True, live, dst,
- node=node)
+ node=node, kemari=kemari)
finally:
sock.shutdown()
sock.close()
@@ -1364,7 +1364,7 @@
try:
XendCheckpoint.save(sock.fileno(), dominfo, True, live,
- dst, node=node)
+ dst, node=node, kemari=kemari)
finally:
sock.close()
^ permalink raw reply [flat|nested] 14+ messages in thread* [RFC][PATCH 08/13] Kemari: add dev state "Attached" to python
2009-03-12 1:14 [RFC][PATCH 00/13] Kemari: updated to the 3.4 unstable tree Yoshiaki Tamura
` (6 preceding siblings ...)
2009-03-12 1:19 ` [RFC][PATCH 07/13] Kemari: add Kemari support to python Yoshiaki Tamura
@ 2009-03-12 1:19 ` Yoshiaki Tamura
2009-03-12 1:20 ` [RFC][PATCH 09/13] Kemari: add XenbusStateAttached to xenbus Yoshiaki Tamura
2009-03-12 1:23 ` [RFC][PATCH 13/13] Kemari: use shared region with to flip logdirty_bitmap Yoshiaki Tamura
9 siblings, 0 replies; 14+ messages in thread
From: Yoshiaki Tamura @ 2009-03-12 1:19 UTC (permalink / raw)
To: xen-devel
Cc: "柳澤佳里(yanagisawa yoshisato)",
Ian Pratt, ian.jackson, Keir Fraser, Stefano Stabellini
This is an updated version of the following patch. No major changes.
http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00377.html
Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp>
Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp>
---
tools/python/xen/xend/XendDomainInfo.py | 8 +++
tools/python/xen/xend/server/DevConstants.py | 1
tools/python/xen/xend/server/DevController.py | 60 ++++++++++++++++++++++++++
tools/python/xen/xend/server/vfbif.py | 4 +
4 files changed, 73 insertions(+)
diff -r b249f3e979a5 -r cf6a910e3663 tools/python/xen/xend/server/DevConstants.py
--- a/tools/python/xen/xend/server/DevConstants.py Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/python/xen/xend/server/DevConstants.py Wed Mar 11 18:03:47 2009 +0900
@@ -40,6 +40,7 @@
'Closed' : 6,
'Reconfiguring' : 7,
'Reconfigured' : 8,
+ 'Attached' : 9,
}
xenbusState.update(dict(zip(xenbusState.values(), xenbusState.keys())))
diff -r b249f3e979a5 -r cf6a910e3663 tools/python/xen/xend/server/DevController.py
--- a/tools/python/xen/xend/server/DevController.py Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/python/xen/xend/server/DevController.py Wed Mar 11 18:03:47 2009 +0900
@@ -176,6 +176,59 @@
(devid, self.deviceClass, err))
+ def waitForAttachedDevices(self, devinfo):
+ log.debug("Waiting for attached devices %s.", self.deviceClass)
+ seq = self.deviceIDs()
+ return [self.waitForAttachedDevice(item, devinfo) for item in seq]
+
+
+ def waitForAttachedDevice(self, devid, devinfo):
+ log.debug("Waiting for attached %s.", devid)
+
+ if not self.hotplug:
+ return
+
+ (status, err) = self.waitForBackend(devid)
+
+ if status == Timeout:
+ self.destroyDevice(devid, False)
+ raise VmError("Device %s (%s) could not be connected. "
+ "Hotplug scripts not working." %
+ (devid, self.deviceClass))
+
+ elif status == Error:
+ self.destroyDevice(devid, False)
+ raise VmError("Device %s (%s) could not be connected. "
+ "Backend device not found." %
+ (devid, self.deviceClass))
+
+ elif status == Missing:
+ # Don't try to destroy the device; it's already gone away.
+ raise VmError("Device %s (%s) could not be connected. "
+ "Device not found." % (devid, self.deviceClass))
+
+ elif status == Busy:
+ err = None
+ frontpath = self.frontendPath(devid)
+ backpath = xstransact.Read(frontpath, "backend")
+ if backpath:
+ err = xstransact.Read(backpath, HOTPLUG_ERROR_NODE)
+ if not err:
+ err = "Busy."
+
+ self.destroyDevice(devid, False)
+ raise VmError("Device %s (%s) could not be connected.\n%s" %
+ (devid, self.deviceClass, err))
+
+ for x in devinfo:
+ if x[0] == str(devid): # x[0] was changed to string for transfer.
+ for y in x[1]:
+ if y[0] and y[1]:
+ self.writeFrontend(devid, y[0], str(y[1]))
+ log.debug("%s %s set for %s.", y[0], y[1], devid)
+ self.writeFrontend(devid, 'state', str(xenbusState['Attached']))
+
+
def waitForDevice_destroy(self, devid, backpath):
log.debug("Waiting for %s - destroyDevice.", devid)
@@ -473,6 +526,13 @@
else:
raise VmError("Device %s not connected" % devid)
+ def writeFrontend(self, devid, *args):
+ frontpath = self.frontendPath(devid)
+
+ if frontpath:
+ xstransact.Write(frontpath, *args)
+ else:
+ raise VmError("Device %s not connected" % devid)
## private:
diff -r b249f3e979a5 -r cf6a910e3663 tools/python/xen/xend/server/vfbif.py
--- a/tools/python/xen/xend/server/vfbif.py Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/python/xen/xend/server/vfbif.py Wed Mar 11 18:03:47 2009 +0900
@@ -39,6 +39,10 @@
if devinfo[i] is not None])
def waitForDevice(self, devid):
+ # is a qemu-dm managed device, don't wait for hotplug for these.
+ return
+
+ def waitForAttachedDevice(self, devid, devinfo):
# is a qemu-dm managed device, don't wait for hotplug for these.
return
diff -r b249f3e979a5 -r cf6a910e3663 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py Mon Mar 09 10:32:24 2009 +0000
+++ b/tools/python/xen/xend/XendDomainInfo.py Wed Mar 11 18:03:47 2009 +0900
@@ -1018,6 +1018,14 @@
"""
for devclass in XendDevices.valid_devices():
self.getDeviceController(devclass).waitForDevices()
+
+ def waitForAttachedDevices(self, devinfo):
+ """Wait for this domain's configured devices to connect.
+
+ @raise VmError: if any device fails to initialise.
+ """
+ for devclass in XendDevices.valid_devices():
+ self.getDeviceController(devclass).waitForAttachedDevices(devinfo)
def hvm_destroyPCIDevice(self, vslot):
log.debug("hvm_destroyPCIDevice called %s", vslot)
^ permalink raw reply [flat|nested] 14+ messages in thread* [RFC][PATCH 09/13] Kemari: add XenbusStateAttached to xenbus
2009-03-12 1:14 [RFC][PATCH 00/13] Kemari: updated to the 3.4 unstable tree Yoshiaki Tamura
` (7 preceding siblings ...)
2009-03-12 1:19 ` [RFC][PATCH 08/13] Kemari: add dev state "Attached" " Yoshiaki Tamura
@ 2009-03-12 1:20 ` Yoshiaki Tamura
2009-03-12 1:23 ` [RFC][PATCH 13/13] Kemari: use shared region with to flip logdirty_bitmap Yoshiaki Tamura
9 siblings, 0 replies; 14+ messages in thread
From: Yoshiaki Tamura @ 2009-03-12 1:20 UTC (permalink / raw)
To: xen-devel
Cc: "柳澤佳里(yanagisawa yoshisato)",
Ian Pratt, ian.jackson, Keir Fraser, Stefano Stabellini
This is an updated version of the following patch. No major changes.
http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00378.html
Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp>
Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp>
---
drivers/xen/xenbus/xenbus_client.c | 3 ++-
include/xen/interface/io/xenbus.h | 4 +++-
2 files changed, 5 insertions(+), 2 deletions(-)
diff -r 0430b1dbfb3a -r e183d2114ea1 include/xen/interface/io/xenbus.h
--- a/include/xen/interface/io/xenbus.h Fri Mar 06 12:51:33 2009 +0000
+++ b/include/xen/interface/io/xenbus.h Tue Mar 10 15:40:44 2009 +0900
@@ -63,6 +63,8 @@ enum xenbus_state {
*/
XenbusStateReconfiguring = 7,
- XenbusStateReconfigured = 8
+ XenbusStateReconfigured = 8,
+
+ XenbusStateAttached = 9
};
typedef enum xenbus_state XenbusState;
diff -r 0430b1dbfb3a -r e183d2114ea1 drivers/xen/xenbus/xenbus_client.c
--- a/drivers/xen/xenbus/xenbus_client.c Fri Mar 06 12:51:33 2009 +0000
+++ b/drivers/xen/xenbus/xenbus_client.c Tue Mar 10 15:40:44 2009 +0900
@@ -52,8 +52,9 @@ const char *xenbus_strstate(enum xenbus_
[ XenbusStateInitialised ] = "Initialised",
[ XenbusStateConnected ] = "Connected",
[ XenbusStateClosing ] = "Closing",
- [ XenbusStateClosed ] = "Closed",
+ [ XenbusStateClosed ] = "Closed",
+ [ XenbusStateAttached ] = "Attached",
};
return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
}
^ permalink raw reply [flat|nested] 14+ messages in thread* [RFC][PATCH 13/13] Kemari: use shared region with to flip logdirty_bitmap
2009-03-12 1:14 [RFC][PATCH 00/13] Kemari: updated to the 3.4 unstable tree Yoshiaki Tamura
` (8 preceding siblings ...)
2009-03-12 1:20 ` [RFC][PATCH 09/13] Kemari: add XenbusStateAttached to xenbus Yoshiaki Tamura
@ 2009-03-12 1:23 ` Yoshiaki Tamura
2009-03-24 6:59 ` Yoshiaki Tamura
9 siblings, 1 reply; 14+ messages in thread
From: Yoshiaki Tamura @ 2009-03-12 1:23 UTC (permalink / raw)
To: xen-devel
Cc: "柳澤佳里(yanagisawa yoshisato)",
Ian Pratt, ian.jackson, Keir Fraser, Stefano Stabellini
This is an updated version of the following patch. No major changes.
http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00382.html
Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp>
Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp>
---
xenstore.c | 30 +++++++++++++++++++++++++++++-
1 file changed, 29 insertions(+), 1 deletion(-)
diff --git a/xenstore.c b/xenstore.c
index 928e950..4333c79 100644
--- a/xenstore.c
+++ b/xenstore.c
@@ -639,6 +639,8 @@ void xenstore_process_logdirty_event(void)
static char *active_path = NULL;
static char *next_active_path = NULL;
static char *seg = NULL;
+ static char *kemari_qemu_info = NULL;
+ static char *qemu_file = NULL;
unsigned int len;
int i;
@@ -705,6 +707,8 @@ void xenstore_process_logdirty_event(void)
seg = NULL;
return;
}
+ kemari_qemu_info = seg + logdirty_bitmap_size * 2;
+ asprintf(&qemu_file, "/dev/shm/qemu-save.%d", domid); /* use tmpfs */
#endif
/* Remember the paths for the next-active and active entries */
@@ -722,8 +726,32 @@ void xenstore_process_logdirty_event(void)
}
}
+#ifndef CONFIG_STUBDOM
+ if (kemari_enabled) {
+ while (kemari_qemu_info[1])
+ xen_rmb();
+
+ /* Switch buffers */
+ i = kemari_qemu_info[0];
+ if (i != 0 && i != 1) {
+ fprintf(logfile, "Log-dirty: bad next-active entry: %s\n", act);
+ exit(1);
+ }
+ logdirty_bitmap = (unsigned long *)(seg + i * logdirty_bitmap_size);
+ kemari_qemu_info[1] = 1;
+ xen_wmb();
+
+ /* Save QEMU status */
+ while (kemari_qemu_info[2])
+ xen_rmb();
+ do_savevm(qemu_file);
+ kemari_qemu_info[2] = 1;
+ xen_wmb();
+ return;
+ }
+#endif /* !CONFIG_STUBDOM */
fprintf(logfile, "Triggered log-dirty buffer switch\n");
-
+
/* Read the required active buffer from the store */
act = xs_read(xsh, XBT_NULL, next_active_path, &len);
if (!act) {
^ permalink raw reply related [flat|nested] 14+ messages in thread* Re: [RFC][PATCH 13/13] Kemari: use shared region with to flip logdirty_bitmap
2009-03-12 1:23 ` [RFC][PATCH 13/13] Kemari: use shared region with to flip logdirty_bitmap Yoshiaki Tamura
@ 2009-03-24 6:59 ` Yoshiaki Tamura
0 siblings, 0 replies; 14+ messages in thread
From: Yoshiaki Tamura @ 2009-03-24 6:59 UTC (permalink / raw)
To: xen-devel
Cc: Ian Pratt, ian.jackson,
"柳澤佳里(yanagisawa yoshisato)",
Stefano Stabellini, Keir Fraser
This is an updated version of the following patch. In this version, the
notification of a logdirty_bitmap flip is received through an event channel.
http://lists.xensource.com/archives/html/xen-devel/2009-03/msg00757.html
Signed-off-by: Yoshisato Yanagisawa <yanagisawa.yoshisato@lab.ntt.co.jp>
Signed-off-by: Yoshi Tamura <tamura.yoshiaki@lab.ntt.co.jp>
---
xenstore.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 92 insertions(+)
diff --git a/xenstore.c b/xenstore.c
index 928e950..caef5ef 100644
--- a/xenstore.c
+++ b/xenstore.c
@@ -33,6 +33,7 @@
struct xs_handle *xsh = NULL;
static char *media_filename[MAX_DRIVES+1];
static QEMUTimer *insert_timer = NULL;
+static int xce = 0;
#define UWAIT_MAX (30*1000000) /* thirty seconds */
#define UWAIT (100000) /* 1/10th second */
@@ -301,6 +302,42 @@ const char *xenstore_get_guest_uuid(void)
return already_computed;
}
+/* prototype of xenstore_process_logdirty_event */
+void xenstore_process_logdirty_event(void);
+
+static int get_kemari_port(void)
+{
+ static int kemari_port = -1;
+ if (kemari_port > 0)
+ return kemari_port;
+
+ kemari_port = xc_evtchn_bind_unbound_port(xce, DOMID_SELF);
+ return kemari_port;
+}
+
+static void kemari_handler(void *dummy)
+{
+ int port;
+
+ port = xc_evtchn_pending(xce);
+ if (port < 0) {
+ fprintf(logfile, "xc_evtchn_pending failed");
+ return;
+ }
+
+ if (port == get_kemari_port()) {
+ kemari_enabled = 1; /* QEMU will run in kemari mode */
+ xenstore_process_logdirty_event();
+ } else {
+ fprintf(logfile, "unexpected port %d fired", port);
+ }
+
+ if (xc_evtchn_unmask(xce, port) < 0) {
+ fprintf(logfile, "xc_evtchn_unmask failed");
+ return;
+ }
+}
+
#define DIRECT_PCI_STR_LEN 512
#define PT_PCI_MSITRANSLATE_DEFAULT 1
char direct_pci_str[DIRECT_PCI_STR_LEN];
@@ -326,6 +363,12 @@ void xenstore_parse_domain_config(int hvm_domid)
xenstore_get_guest_uuid();
+ xce = xc_evtchn_open();
+ if (xce < 0) {
+ fprintf(logfile, "Could not open event channel\n");
+ return;
+ }
+
xsh = xs_daemon_open();
if (xsh == NULL) {
fprintf(logfile, "Could not contact xenstore for domain config\n");
@@ -363,6 +406,27 @@ void xenstore_parse_domain_config(int hvm_domid)
break;
}
}
+
+ /* kemari */
+ {
+ int port;
+ char port_string[128];
+ port = get_kemari_port();
+ if (port < 0) {
+ fprintf(stderr, "failed to get kemari port\n");
+ goto out;
+ }
+ snprintf(port_string, sizeof(port_string), "%d", port);
+
+ if (pasprintf(&buf, "/local/domain/%u/kemari/event-channel",
+ hvm_domid) == -1)
+ goto out;
+
+ xs_write(xsh, XBT_NULL, buf, port_string, strlen(port_string));
+ qemu_set_fd_handler2(xc_evtchn_fd(xce),
+ NULL, kemari_handler, NULL, NULL);
+ fprintf(stderr, "Kemari port is enabled: %d\n", port);
+ }
for (i = 0; i < num; i++) {
format = NULL; /* don't know what the format is yet */
@@ -639,6 +703,8 @@ void xenstore_process_logdirty_event(void)
static char *active_path = NULL;
static char *next_active_path = NULL;
static char *seg = NULL;
+ static char *kemari_qemu_info = NULL;
+ static char *qemu_file = NULL;
unsigned int len;
int i;
@@ -705,6 +771,8 @@ void xenstore_process_logdirty_event(void)
seg = NULL;
return;
}
+ kemari_qemu_info = seg + logdirty_bitmap_size * 2;
+ asprintf(&qemu_file, "/dev/shm/qemu-save.%d", domid); /* use tmpfs */
#endif
/* Remember the paths for the next-active and active entries */
@@ -722,6 +790,30 @@ void xenstore_process_logdirty_event(void)
}
}
+#ifndef CONFIG_STUBDOM
+ if (kemari_enabled) {
+ while (kemari_qemu_info[1])
+ xen_rmb();
+
+ /* Switch buffers */
+ i = kemari_qemu_info[0];
+ if (i != 0 && i != 1) {
+ fprintf(logfile, "Log-dirty: bad next-active entry: %d\n", i);
+ exit(1);
+ }
+ logdirty_bitmap = (unsigned long *)(seg + i * logdirty_bitmap_size);
+ kemari_qemu_info[1] = 1;
+ xen_wmb();
+
+ /* Save QEMU status */
+ while (kemari_qemu_info[2])
+ xen_rmb();
+ do_savevm(qemu_file);
+ kemari_qemu_info[2] = 1;
+ xen_wmb();
+ return;
+ }
+#endif /* !CONFIG_STUBDOM */
fprintf(logfile, "Triggered log-dirty buffer switch\n");
/* Read the required active buffer from the store */
^ permalink raw reply related [flat|nested] 14+ messages in thread