* PATCH: Enable QEMU booting of blktap disks
@ 2007-07-19 17:09 Daniel P. Berrange
2007-07-19 17:34 ` Andrew Warfield
0 siblings, 1 reply; 9+ messages in thread
From: Daniel P. Berrange @ 2007-07-19 17:09 UTC (permalink / raw)
To: xen-devel
[-- Attachment #1: Type: text/plain, Size: 1156 bytes --]
This is a re-send of previous patches:
http://lists.xensource.com/archives/html/xen-devel/2007-06/msg01021.html
The only change is that it explicitly looks for the driver type in xenstore
rather than assuming 'xvd' == 'tap' - this is because tap could be configured
with 'hd' or 'sd' nodenames too, and we still need to strip the leading
':aio' or ':vmdk', etc prefix from the path.
There are two patches:
- xen-revert-phantom-2.patch removes the phantom device code since it
doesn't work & is redundant if QEMU can process tap devices straight
from xenstore
- xen-qemu-blktap-2.patch makes QEMU able to handle disks with xvd prefix
treating them as IDE. Also makes QEMU strip the driver type prefix from
tap disks since it can auto-guess driver
Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
Dan.
--
|=- Red Hat, Engineering, Emerging Technologies, Boston. +1 978 392 2496 -=|
|=- Perl modules: http://search.cpan.org/~danberr/ -=|
|=- Projects: http://freshmeat.net/~danielpb/ -=|
|=- GnuPG: 7D3B9505 F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 -=|
[-- Attachment #2: xen-qemu-blktap-2.patch --]
[-- Type: text/plain, Size: 2846 bytes --]
diff -r 7871916794c1 tools/ioemu/xenstore.c
--- a/tools/ioemu/xenstore.c Thu Jul 19 13:00:18 2007 -0400
+++ b/tools/ioemu/xenstore.c Thu Jul 19 13:00:37 2007 -0400
@@ -64,7 +64,7 @@ void xenstore_parse_domain_config(int do
{
char **e = NULL;
char *buf = NULL, *path;
- char *bpath = NULL, *dev = NULL, *params = NULL, *type = NULL;
+ char *bpath = NULL, *dev = NULL, *params = NULL, *type = NULL, *drv = NULL;
int i, is_scsi;
unsigned int len, num, hd_index;
@@ -98,6 +98,13 @@ void xenstore_parse_domain_config(int do
bpath = xs_read(xsh, XBT_NULL, buf, &len);
if (bpath == NULL)
continue;
+ /* read the driver type of the device */
+ if (pasprintf(&buf, "%s/type", bpath) == -1)
+ continue;
+ free(drv);
+ drv = xs_read(xsh, XBT_NULL, buf, &len);
+ if (drv == NULL)
+ continue;
/* read the name of the device */
if (pasprintf(&buf, "%s/dev", bpath) == -1)
continue;
@@ -105,6 +112,13 @@ void xenstore_parse_domain_config(int do
dev = xs_read(xsh, XBT_NULL, buf, &len);
if (dev == NULL)
continue;
+ /* Force xvdN to look like hdN */
+ if (!strncmp(dev, "xvd", 3)) {
+ fprintf(logfile, "Converting device type '%s'\n", dev);
+ memmove(dev, dev+1, strlen(dev));
+ dev[0] = 'h';
+ dev[1] = 'd';
+ }
is_scsi = !strncmp(dev, "sd", 2);
if ((strncmp(dev, "hd", 2) && !is_scsi) || strlen(dev) != 3 )
continue;
@@ -122,7 +136,15 @@ void xenstore_parse_domain_config(int do
params = xs_read(xsh, XBT_NULL, buf, &len);
if (params == NULL)
continue;
-
+ /* Strip off blktap sub-type prefix aio: - QEMU can autodetect this */
+ if (!strcmp(drv, "tap") && params[0]) {
+ char *offset = strchr(params, ':');
+ if (!offset)
+ continue;
+ fprintf(logfile, "Stripping blktap sub-type prefix from %s\n", params);
+ memmove(params, offset+1, strlen(offset+1)+1);
+ }
+ fprintf(logfile, "Creating disk '%s' with driver '%s'\n", dev, drv);
bs_table[hd_index + (is_scsi ? MAX_DISKS : 0)] = bdrv_new(dev);
/* check if it is a cdrom */
if (type && !strcmp(type, "cdrom")) {
@@ -131,6 +153,7 @@ void xenstore_parse_domain_config(int do
}
/* open device now if media present */
if (params[0]) {
+ fprintf(logfile, "Initializing disk '%s' with media '%s'\n", dev, params);
if (bdrv_open(bs_table[hd_index + (is_scsi ? MAX_DISKS : 0)],
params, 0 /* snapshot */) < 0)
fprintf(stderr, "qemu: could not open hard disk image '%s'\n",
@@ -146,6 +169,7 @@ void xenstore_parse_domain_config(int do
out:
+ free(drv);
free(type);
free(params);
free(dev);
[-- Attachment #3: xen-revert-phantom-2.patch --]
[-- Type: text/plain, Size: 12452 bytes --]
diff -r 1f348e70a5af tools/ioemu/xenstore.c
--- a/tools/ioemu/xenstore.c Tue Jul 10 11:10:38 2007 +0100
+++ b/tools/ioemu/xenstore.c Thu Jul 19 12:58:53 2007 -0400
@@ -10,7 +10,6 @@
#include "vl.h"
#include "block_int.h"
-#include <unistd.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/types.h>
@@ -61,28 +60,11 @@ void xenstore_check_new_media_present(in
qemu_mod_timer(insert_timer, qemu_get_clock(rt_clock) + timeout);
}
-static void waitForDevice(char *fn)
-{
- struct stat sbuf;
- int status;
- int uwait = UWAIT_MAX;
-
- do {
- status = stat(fn, &sbuf);
- if (!status) break;
- usleep(UWAIT);
- uwait -= UWAIT;
- } while (uwait > 0);
-
- return;
-}
-
void xenstore_parse_domain_config(int domid)
{
char **e = NULL;
char *buf = NULL, *path;
- char *fpath = NULL, *bpath = NULL,
- *dev = NULL, *params = NULL, *type = NULL;
+ char *bpath = NULL, *dev = NULL, *params = NULL, *type = NULL;
int i, is_scsi;
unsigned int len, num, hd_index;
@@ -140,36 +122,12 @@ void xenstore_parse_domain_config(int do
params = xs_read(xsh, XBT_NULL, buf, &len);
if (params == NULL)
continue;
- /*
- * check if device has a phantom vbd; the phantom is hooked
- * to the frontend device (for ease of cleanup), so lookup
- * the frontend device, and see if there is a phantom_vbd
- * if there is, we will use resolution as the filename
- */
- if (pasprintf(&buf, "%s/device/vbd/%s/phantom_vbd", path, e[i]) == -1)
- continue;
- free(fpath);
- fpath = xs_read(xsh, XBT_NULL, buf, &len);
- if (fpath) {
- if (pasprintf(&buf, "%s/dev", fpath) == -1)
- continue;
- free(params);
- params = xs_read(xsh, XBT_NULL, buf , &len);
- if (params) {
- /*
- * wait for device, on timeout silently fail because we will
- * fail to open below
- */
- waitForDevice(params);
- }
- }
bs_table[hd_index + (is_scsi ? MAX_DISKS : 0)] = bdrv_new(dev);
/* check if it is a cdrom */
if (type && !strcmp(type, "cdrom")) {
bdrv_set_type_hint(bs_table[hd_index], BDRV_TYPE_CDROM);
- if (pasprintf(&buf, "%s/params", bpath) != -1)
- xs_watch(xsh, buf, dev);
+ xs_watch(xsh, buf, dev);
}
/* open device now if media present */
if (params[0]) {
diff -r 1f348e70a5af tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py Tue Jul 10 11:10:38 2007 +0100
+++ b/tools/python/xen/xend/XendConfig.py Thu Jul 19 12:58:53 2007 -0400
@@ -1235,47 +1235,6 @@ class XendConfig(dict):
# no valid device to add
return ''
- def phantom_device_add(self, dev_type, cfg_xenapi = None,
- target = None):
- """Add a phantom tap device configuration in XenAPI struct format.
- """
-
- if target == None:
- target = self
-
- if dev_type not in XendDevices.valid_devices() and \
- dev_type not in XendDevices.pseudo_devices():
- raise XendConfigError("XendConfig: %s not a valid device type" %
- dev_type)
-
- if cfg_xenapi == None:
- raise XendConfigError("XendConfig: device_add requires some "
- "config.")
-
- if cfg_xenapi:
- log.debug("XendConfig.phantom_device_add: %s" % str(cfg_xenapi))
-
- if cfg_xenapi:
- dev_info = {}
- if dev_type in ('vbd', 'tap'):
- if dev_type == 'vbd':
- dev_info['uname'] = cfg_xenapi.get('image', '')
- dev_info['dev'] = '%s:disk' % cfg_xenapi.get('device')
- elif dev_type == 'tap':
- if cfg_xenapi.get('image').find('tap:') == -1:
- dev_info['uname'] = 'tap:qcow:%s' % cfg_xenapi.get('image')
- dev_info['dev'] = '/dev/%s' % cfg_xenapi.get('device')
- dev_info['uname'] = cfg_xenapi.get('image')
- dev_info['mode'] = cfg_xenapi.get('mode')
- dev_info['backend'] = '0'
- dev_uuid = cfg_xenapi.get('uuid', uuid.createString())
- dev_info['uuid'] = dev_uuid
- self['devices'][dev_uuid] = (dev_type, dev_info)
- self['vbd_refs'].append(dev_uuid)
- return dev_uuid
-
- return ''
-
def console_add(self, protocol, location, other_config = {}):
dev_uuid = uuid.createString()
if protocol == 'vt100':
diff -r 1f348e70a5af tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py Tue Jul 10 11:10:38 2007 +0100
+++ b/tools/python/xen/xend/XendDomainInfo.py Thu Jul 19 12:59:28 2007 -0400
@@ -1649,51 +1649,16 @@ class XendDomainInfo:
# VM Destroy
#
- def _prepare_phantom_paths(self):
- # get associated devices to destroy
- # build list of phantom devices to be removed after normal devices
- plist = []
- if self.domid is not None:
- from xen.xend.xenstore.xstransact import xstransact
- t = xstransact("%s/device/vbd" % GetDomainPath(self.domid))
- for dev in t.list():
- backend_phantom_vbd = xstransact.Read("%s/device/vbd/%s/phantom_vbd" \
- % (self.dompath, dev))
- if backend_phantom_vbd is not None:
- frontend_phantom_vbd = xstransact.Read("%s/frontend" \
- % backend_phantom_vbd)
- plist.append(backend_phantom_vbd)
- plist.append(frontend_phantom_vbd)
- return plist
-
- def _cleanup_phantom_devs(self, plist):
- # remove phantom devices
- if not plist == []:
- time.sleep(2)
- for paths in plist:
- if paths.find('backend') != -1:
- from xen.xend.server import DevController
- # Modify online status /before/ updating state (latter is watched by
- # drivers, so this ordering avoids a race).
- xstransact.Write(paths, 'online', "0")
- xstransact.Write(paths, 'state', str(DevController.xenbusState['Closing']))
- # force
- xstransact.Remove(paths)
-
def destroy(self):
"""Cleanup VM and destroy domain. Nothrow guarantee."""
log.debug("XendDomainInfo.destroy: domid=%s", str(self.domid))
-
- paths = self._prepare_phantom_paths()
self._cleanupVm()
if self.dompath is not None:
xc.domain_destroy_hook(self.domid)
self.destroyDomain()
- self._cleanup_phantom_devs(paths)
-
if "transient" in self.info["other_config"] \
and bool(self.info["other_config"]["transient"]):
from xen.xend import XendDomain
@@ -1702,8 +1667,6 @@ class XendDomainInfo:
def destroyDomain(self):
log.debug("XendDomainInfo.destroyDomain(%s)", str(self.domid))
-
- paths = self._prepare_phantom_paths()
try:
if self.domid is not None:
@@ -1718,7 +1681,7 @@ class XendDomainInfo:
XendDomain.instance().remove_domain(self)
self.cleanupDomain()
- self._cleanup_phantom_devs(paths)
+
def resumeDomain(self):
@@ -2528,25 +2491,6 @@ class XendDomainInfo:
return dev_uuid
- def create_phantom_vbd_with_vdi(self, xenapi_vbd, vdi_image_path):
- """Create a VBD using a VDI from XendStorageRepository.
-
- @param xenapi_vbd: vbd struct from the Xen API
- @param vdi_image_path: VDI UUID
- @rtype: string
- @return: uuid of the device
- """
- xenapi_vbd['image'] = vdi_image_path
- dev_uuid = self.info.phantom_device_add('tap', cfg_xenapi = xenapi_vbd)
- if not dev_uuid:
- raise XendError('Failed to create device')
-
- if self._stateGet() == XEN_API_VM_POWER_STATE_RUNNING:
- _, config = self.info['devices'][dev_uuid]
- config['devid'] = self.getDeviceController('tap').createDevice(config)
-
- return config['devid']
-
def create_vif(self, xenapi_vif):
"""Create VIF device from the passed struct in Xen API format.
diff -r 1f348e70a5af tools/python/xen/xend/server/BlktapController.py
--- a/tools/python/xen/xend/server/BlktapController.py Tue Jul 10 11:10:38 2007 +0100
+++ b/tools/python/xen/xend/server/BlktapController.py Thu Jul 19 12:58:53 2007 -0400
@@ -2,10 +2,7 @@
from xen.xend.server.blkif import BlkifController
-from xen.xend.XendLogging import log
-phantomDev = 0;
-phantomId = 0;
class BlktapController(BlkifController):
def __init__(self, vm):
@@ -15,62 +12,3 @@ class BlktapController(BlkifController):
"""@see DevController#frontendRoot"""
return "%s/device/vbd" % self.vm.getDomainPath()
-
- def getDeviceDetails(self, config):
- (devid, back, front) = BlkifController.getDeviceDetails(self, config)
-
- phantomDevid = 0
- wrapped = False
-
- try:
- imagetype = self.vm.info['image']['type']
- except:
- imagetype = ""
-
- if imagetype == 'hvm':
- tdevname = back['dev']
- index = ['c', 'd', 'e', 'f', 'g', 'h', 'i', \
- 'j', 'l', 'm', 'n', 'o', 'p']
- while True:
- global phantomDev
- global phantomId
- import os, stat
-
- phantomId = phantomId + 1
- if phantomId == 16:
- if index[phantomDev] == index[-1]:
- if wrapped:
- raise VmError(" No loopback block \
- devices are available. ")
- wrapped = True
- phantomDev = 0
- else:
- phantomDev = phantomDev + 1
- phantomId = 1
- devname = 'xvd%s%d' % (index[phantomDev], phantomId)
- try:
- info = os.stat('/dev/%s' % devname)
- except:
- break
-
- vbd = { 'mode': 'w', 'device': devname }
- fn = 'tap:%s' % back['params']
-
- # recurse ... by creating the vbd, then fallthrough
- # and finish creating the original device
-
- from xen.xend import XendDomain
- dom0 = XendDomain.instance().privilegedDomain()
- phantomDevid = dom0.create_phantom_vbd_with_vdi(vbd, fn)
- # we need to wait for this device at a higher level
- # the vbd that gets created will have a link to us
- # and will let them do it there
-
- # add a hook to point to the phantom device,
- # root path is always the same (dom0 tap)
- if phantomDevid != 0:
- front['phantom_vbd'] = '/local/domain/0/backend/tap/0/%s' \
- % str(phantomDevid)
-
- return (devid, back, front)
-
diff -r 1f348e70a5af tools/python/xen/xend/server/DevController.py
--- a/tools/python/xen/xend/server/DevController.py Tue Jul 10 11:10:38 2007 +0100
+++ b/tools/python/xen/xend/server/DevController.py Thu Jul 19 12:58:53 2007 -0400
@@ -476,19 +476,6 @@ class DevController:
def waitForBackend(self, devid):
frontpath = self.frontendPath(devid)
- # lookup a phantom
- phantomPath = xstransact.Read(frontpath, 'phantom_vbd')
- if phantomPath is not None:
- log.debug("Waiting for %s's phantom %s.", devid, phantomPath)
- statusPath = phantomPath + '/' + HOTPLUG_STATUS_NODE
- ev = Event()
- result = { 'status': Timeout }
- xswatch(statusPath, hotplugStatusCallback, ev, result)
- ev.wait(DEVICE_CREATE_TIMEOUT)
- err = xstransact.Read(statusPath, HOTPLUG_ERROR_NODE)
- if result['status'] != 'Connected':
- return (result['status'], err)
-
backpath = xstransact.Read(frontpath, "backend")
[-- Attachment #4: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: PATCH: Enable QEMU booting of blktap disks
2007-07-19 17:09 PATCH: Enable QEMU booting of blktap disks Daniel P. Berrange
@ 2007-07-19 17:34 ` Andrew Warfield
2007-07-19 18:08 ` Daniel P. Berrange
2007-07-20 10:35 ` Gerd Hoffmann
0 siblings, 2 replies; 9+ messages in thread
From: Andrew Warfield @ 2007-07-19 17:34 UTC (permalink / raw)
To: Daniel P. Berrange; +Cc: xen-devel
So two comments on this:
In the other thread that's currently going on this topic, it sounds
like others are quite successfully using the phantom code. Why is it
broken for you?
As I've said before, I dislike the idea of having separate
implementations of disks -- one in qemu and one in tapdisk. We'd
quite like to encourage people to be able to extend virtual block
devices in the future, and it seems like your approach is going to
force them to do two independent implementations of things. It also
leads to complications if you want to add things like caching, shared
ramdisks, etc. If phantom is broken, why don't we just fix that?
a.
On 7/19/07, Daniel P. Berrange <berrange@redhat.com> wrote:
> This is a re-send of previous patches:
>
> http://lists.xensource.com/archives/html/xen-devel/2007-06/msg01021.html
>
> The only change is that it explicitly looks for the driver type in xenstore
> rather than assuming 'xvd' == 'tap' - this is because tap could be configured
> with 'hd' or 'sd' nodenames too, and we still need to strip the leading
> ':aio' or ':vmdk', etc prefix from the path.
>
> There are two patches:
>
> - xen-revert-phantom-2.patch removes the phantom device code since it
> doesn't work & is redundant if QEMU can process tap devices straight
> from xenstore
>
> - xen-qemu-blktap-2.patch makes QEMU able to handle disks with xvd prefix
> treating them as IDE. Also makes QEMU strip the driver type prefix from
> tap disks since it can auto-guess driver
>
>
> Signed-off-by: Daniel P. Berrange <berrange@redhat.com>
>
> Dan.
> --
> |=- Red Hat, Engineering, Emerging Technologies, Boston. +1 978 392 2496 -=|
> |=- Perl modules: http://search.cpan.org/~danberr/ -=|
> |=- Projects: http://freshmeat.net/~danielpb/ -=|
> |=- GnuPG: 7D3B9505 F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 -=|
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
>
>
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: PATCH: Enable QEMU booting of blktap disks
2007-07-19 17:34 ` Andrew Warfield
@ 2007-07-19 18:08 ` Daniel P. Berrange
2007-07-19 22:45 ` Andrew Warfield
2007-07-19 22:46 ` Andrew Warfield
2007-07-20 10:35 ` Gerd Hoffmann
1 sibling, 2 replies; 9+ messages in thread
From: Daniel P. Berrange @ 2007-07-19 18:08 UTC (permalink / raw)
To: Andrew Warfield; +Cc: xen-devel
On Thu, Jul 19, 2007 at 10:34:12AM -0700, Andrew Warfield wrote:
> So two comments on this:
>
> In the other thread that's currently going on this topic, it sounds
> like others are quite successfully using the phantom code. Why is it
> broken for you?
I really can't see how it works for anybody in 3.1.0 since the code which
sets up phantom devices simply doesn't work
try:
imagetype = self.vm.info['image']['type']
except:
imagetype = ""
if imagetype == 'hvm':
The body of that try: statement is trying to read hash keys which don't
exist, since 'vm.info' isn't a hash. So imagetype is always "" and so
none of the phantom setup code ever gets run. Even once fixing that I
never get any devices appearing and the Vm just immediately shuts down.
It seems to be looking for the /dev/xvd* device nodes in Dom0 rather
than DomU which seems rather wrong.
> As I've said before, I dislike the idea of having separate
> implementations of disks -- one in qemu and one in tapdisk. We'd
> quite like to encourage people to be able to extend virtual block
> devices in the future, and it seems like your approach is going to
> force them to do two independent implementations of things. It also
> leads to complications if you want to add things like caching, shared
> ramdisks, etc. If phantom is broken, why don't we just fix that?
AFAICT with or without my change you need to have two separate impls
of every disk format, since the phantom device stuff is only ever used
by blktap - non blktap disks still get processed directly by QEMU. Now
if we intend to remove all support for file: entirely, and make blktap
compulsory for file backed VMs then I can see the benefit in having
everything go via one codepath. Though now having 2 userspace daemons
in Dom0 per HVM guest seems like it's going in the wrong direction to me.
IMHO the entire design & impl of blktap userspace was broken from the
start because it is duplicating functionality already in the QEMU
codebase. With the benefit of hindsight, I would suggest that it would
be better to have QEMU able to speak the native blktap protocol straight
to the blktap kernel driver. Keep HVM using QEMU for all file backed
disks, since it already handles all the formats just fine, and have a
new machine type in QEMU for paravirt VMs which provided the tap daemon
replacement and also a PVFB daemon replacement. Then you could kill the
entire blktap userspace codebase & most of the PVFB userspace codebase
and the libvncserver requirement.
So there'd only be 1 single daemon in Dom0 per VM, it would be the same
daemon for PV and HVM, and all the open source virt platforms (Xen, KVM,
QEMU, VirtualBox) would all be reaping the benefit of each other's code
improvements to QEMU driver model, in particular for disk format code &
VNC server code, rather than forking & reimplementing private copies.
Of course this isn't a quick job, but if the motivation is reducing
code duplication & alternative I/O paths, then focusing on QEMU for
everything seems like a much more viable idea than more Xen specific
code.
Dan.
--
|=- Red Hat, Engineering, Emerging Technologies, Boston. +1 978 392 2496 -=|
|=- Perl modules: http://search.cpan.org/~danberr/ -=|
|=- Projects: http://freshmeat.net/~danielpb/ -=|
|=- GnuPG: 7D3B9505 F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 -=|
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: PATCH: Enable QEMU booting of blktap disks
2007-07-19 18:08 ` Daniel P. Berrange
@ 2007-07-19 22:45 ` Andrew Warfield
2007-07-20 14:31 ` Daniel P. Berrange
2007-07-19 22:46 ` Andrew Warfield
1 sibling, 1 reply; 9+ messages in thread
From: Andrew Warfield @ 2007-07-19 22:45 UTC (permalink / raw)
To: Daniel P. Berrange; +Cc: xen-devel
> > In the other thread that's currently going on this topic, it sounds
> > like others are quite successfully using the phantom code. Why is it
> > broken for you?
>
> I really can't see how it works for anybody in 3.1.0 since the code which
> sets up phantom devices simply doesn't work
Well let's fix it then. ;)
> > As I've said before, I dislike the idea of having separate
> > implementations of disks -- one in qemu and one in tapdisk. We'd
> > quite like to encourage people to be able to extend virtual block
> > devices in the future, and it seems like your approach is going to
> > force them to do two independent implementations of things. It also
> > leads to complications if you want to add things like caching, shared
> > ramdisks, etc. If phantom is broken, why don't we just fix that?
>
> AFAICT with or without my change you need to have two separate impls
> of every disk format, since the phantom device stuff is only ever used
> by blktap - non blktap disks still get processed directly by QEMU.
My concern is that it's possible to run the VM with it only having to
depend on a single implementation of a virtual disk. If you don't use
PV drivers, the qemu block drivers do this nicely. If you do, the
phantom code lets you do this by ensuring that emulated block requests
are redirected to tapdisk (in an admittedly inefficient, but it
doesn't really matter for the length of time that it happens, way)
until the pv drivers come up.
> IMHO the entire design & impl of blktap userspace was broken from the
> start because it is duplicating functionality already in the QEMU
> codebase.
Blktap was written before there were device emulated guests and before
qemu was capable of processing more than a single outstanding block
request at a time. So the only functionality that it duplicated was
to use e.g. the vmdk and qcow code as a basis for some of the image
file implementations. Vmdk is largely unchanged and I don't know of
anyone who actively uses it, qcow evolved considerably in order to do
asynchronous access and batched request processing.
> With the benefit of hindsight, I would suggest that it would
> be better to have QEMU able to speak the native blktap protocol straight
> to the blktap kernel driver. Keep HVM using QEMU for all file backed
> disks, since it already handles all the formats just fine, and have a
> new machine type in QEMU for paravirt VMs which provided the tap daemon
> replacement and also a PVFB daemon replacement. The you could kill the
> entire blktap userspace codebase & most of the PVFB userspace codebase
> and the libvncserver requirement.
I think a patch that pulled a lot of the tapdisk processing into qemu
would be a very interesting thing to compare overheads for against the
current model.
> So there'd only be 1 single daemon in Dom0 per VM, it would be the same
> daemon for PV and HVM, and all the open source virt platforms (Xen, KVM,
> QEMU, VirtualBox) would all be reaping the benefit of each other's code
> improvements to QEMU driver model, in particular for disk format code &
> VNC server code, rather than forking & reimplementing private copies.
>
> Of course this isn't a quick job, but if the motiviation is reducing
> code duplication & alternative I/O paths, the focusing on QEMU for
> everything seems like a much more viable idea than more Xen specific
> code.
Absolutely. Dan, I completely agree that it would be very good to
have a unified way to implement virtual block devices -- image
formats, interposition, and otherwise. I think that the qemu and
blktap disk interfaces both shared this as an initial design goal. I
agree it's a lot of work and I agree that it would be a very nice
thing -- in the same spirit as Rusty's virtio efforts -- to be able to
share these implementations across hypervisors/emulators/etc. I also
know of some grad students who would be very happy to see virtual
block devices that they are building for blktap apply against
everything else.
The thing is that doing everything in qemu doesn't currently
achieve this -- because PV drivers can't talk directly to qemu and
going through the emulated path results in suckful performance. So
rather than taking a patch that means PV-based HVM domains have to
depend on multiple implementations of disks, I'd much prefer to see us
go in the direction of what you propose.
a.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: PATCH: Enable QEMU booting of blktap disks
2007-07-19 18:08 ` Daniel P. Berrange
2007-07-19 22:45 ` Andrew Warfield
@ 2007-07-19 22:46 ` Andrew Warfield
1 sibling, 0 replies; 9+ messages in thread
From: Andrew Warfield @ 2007-07-19 22:46 UTC (permalink / raw)
To: Daniel P. Berrange; +Cc: xen-devel
> > In the other thread that's currently going on this topic, it sounds
> > like others are quite successfully using the phantom code. Why is it
> > broken for you?
>
> I really can't see how it works for anybody in 3.1.0 since the code which
> sets up phantom devices simply doesn't work
Well let's fix it then. ;)
> > As I've said before, I dislike the idea of having separate
> > implementations of disks -- one in qemu and one in tapdisk. We'd
> > quite like to encourage people to be able to extend virtual block
> > devices in the future, and it seems like your approach is going to
> > force them to do two independent implementations of things. It also
> > leads to complications if you want to add things like caching, shared
> > ramdisks, etc. If phantom is broken, why don't we just fix that?
>
> AFAICT with or without my change you need to have two separate impls
> of every disk format, since the phantom device stuff is only ever used
> by blktap - non blktap disks still get processed directly by QEMU.
My concern is that it's possible to run the VM with it only having to
depend on a single implementation of a virtual disk. If you don't use
PV drivers, the qemu block drivers do this nicely. If you do, the
phantom code lets you do this by ensuring that emulated block requests
are redirected to tapdisk (in an admittedly inefficient, but it
doesn't really matter for the length of time that it happens, way)
until the pv drivers come up.
> IMHO the entire design & impl of blktap userspace was broken from the
> start because it is duplicating functionality already in the QEMU
> codebase.
Blktap was written before there were device emulated guests and before
qemu was capable of processing more than a single outstanding block
request at a time. So the only functionality that it duplicated was
to use e.g. the vmdk and qcow code as a basis for some of the image
file implementations. Vmdk is largely unchanged and I don't know of
anyone who actively uses it, qcow evolved considerably in order to do
asynchronous access and batched request processing.
> With the benefit of hindsight, I would suggest that it would
> be better to have QEMU able to speak the native blktap protocol straight
> to the blktap kernel driver. Keep HVM using QEMU for all file backed
> disks, since it already handles all the formats just fine, and have a
> new machine type in QEMU for paravirt VMs which provided the tap daemon
> replacement and also a PVFB daemon replacement. The you could kill the
> entire blktap userspace codebase & most of the PVFB userspace codebase
> and the libvncserver requirement.
I think a patch that pulled a lot of the tapdisk processing into qemu
would be a very interesting thing to compare overheads for against the
current model.
> So there'd only be 1 single daemon in Dom0 per VM, it would be the same
> daemon for PV and HVM, and all the open source virt platforms (Xen, KVM,
> QEMU, VirtualBox) would all be reaping the benefit of each other's code
> improvements to QEMU driver model, in particular for disk format code &
> VNC server code, rather than forking & reimplementing private copies.
>
> Of course this isn't a quick job, but if the motiviation is reducing
> code duplication & alternative I/O paths, the focusing on QEMU for
> everything seems like a much more viable idea than more Xen specific
> code.
Absolutely. Dan, I completely agree that it would be very good to
have a unified way to implement virtual block devices -- image
formats, interposition, and otherwise. I think that the qemu and
blktap disk interfaces both shared this as an initial design goal. I
agree it's a lot of work and I agree that it would be a very nice
thing -- in the same spirit as Rusty's virtio efforts -- to be able to
share these implementations across hypervisors/emulators/etc. I also
know of some grad students who would be very happy to see virtual
block devices that they are building for blktap apply against
everything else.
The thing is that doing everything in qemu doesn't currently
achieve this -- because PV drivers can't talk directly to qemu and
going through the emulated path results in suckful performance. So
rather than taking a patch that means PV-based HVM domains have to
depend on multiple implementations of disks, I'd much prefer to see us
go in the direction of what you propose.
a.
On 7/19/07, Daniel P. Berrange <berrange@redhat.com> wrote:
> On Thu, Jul 19, 2007 at 10:34:12AM -0700, Andrew Warfield wrote:
> > So two comments on this:
> >
> > In the other thread that's currently going on this topic, it sounds
> > like others are quite successfully using the phantom code. Why is it
> > broken for you?
>
> I really can't see how it works for anybody in 3.1.0 since the code which
> sets up phantom devices simply doesn't work
>
> try:
> imagetype = self.vm.info['image']['type']
> except:
> imagetype = ""
>
> if imagetype == 'hvm':
>
> The body of that try: statement is trying to read hash keys which don't
> exist, since 'vm.info' isn't a hash. So imagetype is always "" and so
> none of the phantom setup code ever gets run. Even once fixing that I
> never get any devices appearing and the Vm just immediately shuts down.
> It seems to be looking for the /dev/xvd* device nodes in Dom0 rather
> than DomU which seems rather wrong.
>
> > As I've said before, I dislike the idea of having separate
> > implementations of disks -- one in qemu and one in tapdisk. We'd
> > quite like to encourage people to be able to extend virtual block
> > devices in the future, and it seems like your approach is going to
> > force them to do two independent implementations of things. It also
> > leads to complications if you want to add things like caching, shared
> > ramdisks, etc. If phantom is broken, why don't we just fix that?
>
> AFAICT with or without my change you need to have two separate impls
> of every disk format, since the phantom device stuff is only ever used
> by blktap - non blktap disks still get processed directly by QEMU. Now
> if we intend to remove all support for file: entirely, and make blktap
> compulsory for file backed VMs then I can see the benefit in having
> everything go via one codepath. Though now having 2 userspace daemons
> in Dom0 per HVM guest seems like its going in wrong direction to me.
>
> IMHO the entire design & impl of blktap userspace was broken from the
> start because it is duplicating functionality already in the QEMU
> codebase. With the benefit of hindsight, I would suggest that it would
> be better to have QEMU able to speak the native blktap protocol straight
> to the blktap kernel driver. Keep HVM using QEMU for all file backed
> disks, since it already handles all the formats just fine, and have a
> new machine type in QEMU for paravirt VMs which provided the tap daemon
> replacement and also a PVFB daemon replacement. The you could kill the
> entire blktap userspace codebase & most of the PVFB userspace codebase
> and the libvncserver requirement.
>
> So there'd only be 1 single daemon in Dom0 per VM, it would be the same
> daemon for PV and HVM, and all the open source virt platforms (Xen, KVM,
> QEMU, VirtualBox) would all be reaping the benefit of each other's code
> improvements to QEMU driver model, in particular for disk format code &
> VNC server code, rather than forking & reimplementing private copies.
>
> Of course this isn't a quick job, but if the motiviation is reducing
> code duplication & alternative I/O paths, the focusing on QEMU for
> everything seems like a much more viable idea than more Xen specific
> code.
>
> Dan.
> --
> |=- Red Hat, Engineering, Emerging Technologies, Boston. +1 978 392 2496 -=|
> |=- Perl modules: http://search.cpan.org/~danberr/ -=|
> |=- Projects: http://freshmeat.net/~danielpb/ -=|
> |=- GnuPG: 7D3B9505 F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 -=|
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: PATCH: Enable QEMU booting of blktap disks
2007-07-19 17:34 ` Andrew Warfield
2007-07-19 18:08 ` Daniel P. Berrange
@ 2007-07-20 10:35 ` Gerd Hoffmann
2007-07-20 13:04 ` Andrew Warfield
1 sibling, 1 reply; 9+ messages in thread
From: Gerd Hoffmann @ 2007-07-20 10:35 UTC (permalink / raw)
To: Andrew Warfield; +Cc: xen-devel, Daniel P. Berrange
Andrew Warfield wrote:
> As I've said before, I dislike the idea of having separate
> implementations of disks -- one in qemu and one in tapdisk.
The qemu one isn't going to go away due to qemu being *the* device model
for any kind of virtualization in Linux. So if you want to have tapdisk
share the code to avoid duplication I see two possible ways to get there:
(a) replace blktapd with qemu
(b) put the bits into a shared library, which then can be used by
qemu & blktapd and other tools (qemu-img, virtual machine
management tools, ...).
cheers,
Gerd
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: PATCH: Enable QEMU booting of blktap disks
2007-07-20 10:35 ` Gerd Hoffmann
@ 2007-07-20 13:04 ` Andrew Warfield
2007-07-20 13:33 ` Gerd Hoffmann
0 siblings, 1 reply; 9+ messages in thread
From: Andrew Warfield @ 2007-07-20 13:04 UTC (permalink / raw)
To: Gerd Hoffmann; +Cc: xen-devel, Daniel P. Berrange
Gerd, don't misunderstand what I'm saying: I'd be delighted to see
blktap and qemu share block device implementations. However, the
blktap patch that I am commenting on achieves exactly the opposite of
that: it *requires* two implementations of any virtual disk type that
you want to use PV drivers on in an HVM guest.
a.
On 7/20/07, Gerd Hoffmann <kraxel@redhat.com> wrote:
> Andrew Warfield wrote:
> > As I've said before, I dislike the idea of having separate
> > implementations of disks -- one in qemu and one in tapdisk.
>
> The qemu one isn't going to go away due to qemu being *the* device model
> for any kind of virtualization in Linux. So if you want to have tapdisk
> share the code to avoid duplication I see two possible ways to get there:
>
> (a) replace blktapd with qemu
> (b) put the bits into a shared library, which then can be used by
> qemu & blktapd and other tools (qemu-img, virtual machine
> management tools, ...).
>
> cheers,
> Gerd
>
>
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: PATCH: Enable QEMU booting of blktap disks
2007-07-20 13:04 ` Andrew Warfield
@ 2007-07-20 13:33 ` Gerd Hoffmann
0 siblings, 0 replies; 9+ messages in thread
From: Gerd Hoffmann @ 2007-07-20 13:33 UTC (permalink / raw)
To: Andrew Warfield; +Cc: xen-devel, Daniel P. Berrange
Andrew Warfield wrote:
> Gerd, don't misunderstand what I'm saying: I'd be delighted to see
> blktap and qemu share block device implementations. However, the
> blktap patch that I am commenting on achieves exactly the opposite of
> that: it *requires* two implementations of any virtual disk type that
> you want to use PV drivers on in an HVM guest.
Well, the code will be in qemu anyway because other people use it, no
matter whether the xenified qemu device model actually uses it or not
(with the phantom device redirection trick). So if you want to get rid
of code duplication either shared libs or redirecting things the other
way around (i.e. use qemu code for both hvm and pv) will work.
cheers,
Gerd
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: PATCH: Enable QEMU booting of blktap disks
2007-07-19 22:45 ` Andrew Warfield
@ 2007-07-20 14:31 ` Daniel P. Berrange
0 siblings, 0 replies; 9+ messages in thread
From: Daniel P. Berrange @ 2007-07-20 14:31 UTC (permalink / raw)
To: Andrew Warfield; +Cc: xen-devel
On Thu, Jul 19, 2007 at 03:45:26PM -0700, Andrew Warfield wrote:
> >> In the other thread that's currently going on this topic, it sounds
> >> like others are quite successfully using the phantom code. Why is it
> >> broken for you?
> >
> >I really can't see how it works for anybody in 3.1.0 since the code which
> >sets up phantom devices simply doesn't work
>
> Well let's fix it then. ;)
Ok, I'll try and figure out what's broken. We still need a patch to make
QEMU watch out for disks named 'xvd*' though, since upstream paravirt
drivers only support xvd* naming.
> >With the benefit of hindsight, I would suggest that it would
> >be better to have QEMU able to speak the native blktap protocol straight
> >to the blktap kernel driver. Keep HVM using QEMU for all file backed
> >disks, since it already handles all the formats just fine, and have a
> >new machine type in QEMU for paravirt VMs which provided the tap daemon
> >replacement and also a PVFB daemon replacement. Then you could kill the
> >entire blktap userspace codebase & most of the PVFB userspace codebase
> >and the libvncserver requirement.
>
> I think a patch that pulled a lot of the tapdisk processing into qemu
> would be a very interesting thing to compare overheads for against the
> current model.
>
> >So there'd only be 1 single daemon in Dom0 per VM, it would be the same
> >daemon for PV and HVM, and all the open source virt platforms (Xen, KVM,
> >QEMU, VirtualBox) would all be reaping the benefit of each other's code
> >improvements to QEMU driver model, in particular for disk format code &
> >VNC server code, rather than forking & reimplementing private copies.
> >
> >Of course this isn't a quick job, but if the motivation is reducing
> >code duplication & alternative I/O paths, then focusing on QEMU for
> >everything seems like a much more viable idea than more Xen specific
> >code.
>
> Absolutely. Dan, I completely agree that it would be very good to
> have a unified way to implement virtual block devices -- image
> formats, interposition, and otherwise. I think that the qemu and
> blktap disk interfaces both shared this as an initial design goal. I
> agree it's a lot of work and I agree that it would be a very nice
> thing -- in the same spirit as Rusty's virtio efforts -- to be able to
> share these implementations across hypervisors/emulators/etc. I also
> know of some grad students who would be very happy to see virtual
> block devices that they are building for blktap apply against
> everything else.
Thinking about it a bit more broadly - considering differences between
paravirt & fullyvirt. With paravirt ops we're nearly able to have a
single kernel image. VirtIO work will hopefully make a single set of
paravirt drivers for disk/network/etc. With this HVM ought to be getting
pretty near parity with paravirt in terms of performance. The primary
compelling thing left in favour of paravirt is no reliance on hardware
support. At the same time paravirt has some really bad downsides, in
particular the terrible bootloader process with hacks of pygrub & pypxeboot.
The PV framebuffer is severely limited compared to Cirrus, and requires an
almost completely duplicated VNC server impl. Blktap is a little more
complicated, but there's general code duplication wrt QEMU we've discussed
above. Then there are things like lack of emulated USB bus, an emulated
CDROM device, and other misc hardware devices that QEMU provides.
I think it would be an interesting project to see if one could make a QEMU
machine which allows Xen paravirt guests to be booted using QEMU (and thus
the regular grub, viewable through the regular graphical VNC server), and
provide the QEMU emulated device model to the guest to enable USB, etc,
though still primarily using paravirt drivers where available for speed of
course. Basically I'd like to have complete parity in device models & boot
process between paravirt & HVM guests, so I stop having to tell users "well
you can do X in HVM, but not with paravirt" & vice versa.
Regards,
Dan.
--
|=- Red Hat, Engineering, Emerging Technologies, Boston. +1 978 392 2496 -=|
|=- Perl modules: http://search.cpan.org/~danberr/ -=|
|=- Projects: http://freshmeat.net/~danielpb/ -=|
|=- GnuPG: 7D3B9505 F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 -=|
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2007-07-20 14:31 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-07-19 17:09 PATCH: Enable QEMU booting of blktap disks Daniel P. Berrange
2007-07-19 17:34 ` Andrew Warfield
2007-07-19 18:08 ` Daniel P. Berrange
2007-07-19 22:45 ` Andrew Warfield
2007-07-20 14:31 ` Daniel P. Berrange
2007-07-19 22:46 ` Andrew Warfield
2007-07-20 10:35 ` Gerd Hoffmann
2007-07-20 13:04 ` Andrew Warfield
2007-07-20 13:33 ` Gerd Hoffmann
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.