All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC][PATCH] Use ioemu block drivers through blktap
@ 2008-03-10 17:03 Kevin Wolf
  2008-03-13 12:25 ` Kevin Wolf
  0 siblings, 1 reply; 13+ messages in thread
From: Kevin Wolf @ 2008-03-10 17:03 UTC (permalink / raw)
  To: xen-devel

[-- Attachment #1: Type: text/plain, Size: 1269 bytes --]

When I submitted the qcow2 patch for blktap, suggestions came up that
the qemu block drivers should be used also for blktap to eliminate the
current code duplication in ioemu and blktap.

The attached patch adds support for a tap:ioemu pseudo driver. Devices
using this driver won't use tapdisk (containing the code duplication)
any more, but will connect to the qemu-dm of the domain. In this way no
working configuration should be broken right now as you can still choose
to use the tapdisk drivers.

If tap:ioemu eventually proves to work well enough as drop-in
replacement for the tapdisk drivers (currently at least qcow is
incompatible between tapdisk and qemu), tapdisk and with it the code
duplication could be dropped in the long term.

Obviously, there are still problems with this first patch. IMO the most
important one is that I needed to change block.c to replace
qemu_aio_wait() by qemu_aio_poll(), otherwise it would hang in sigwait
(called by qemu_aio_wait) sooner or later. I suspect it has something to
do with ioemu being multi-threaded in contrast to single-threaded qemu,
but I wouldn't be too unhappy if someone who knows the code better knew
the exact problem.

Is this the right approach at all or did I make the cut in the wrong place?

Kevin

[-- Attachment #2: blktap-ioemu.patch --]
[-- Type: text/x-patch, Size: 25874 bytes --]

diff -r 7530c4dba8a5 tools/blktap/drivers/blktapctrl.c
--- a/tools/blktap/drivers/blktapctrl.c	Mon Mar  3 15:19:39 2008
+++ b/tools/blktap/drivers/blktapctrl.c	Mon Mar 10 16:58:40 2008
@@ -524,30 +524,59 @@
 		blkif->cookie = next_cookie++;
 
 		if (!exist) {
-			DPRINTF("Process does not exist:\n");
-			if (asprintf(&rdctldev,
-				     "%s/tapctrlread%d", BLKTAP_CTRL_DIR, minor) == -1)
-				goto fail;
-			if (asprintf(&wrctldev,
-				     "%s/tapctrlwrite%d", BLKTAP_CTRL_DIR, minor) == -1) {
+			
+			if (type == DISK_TYPE_IOEMU) {				
+				// Connect to qemu-dm
+				if (asprintf(&rdctldev, BLKTAP_CTRL_DIR "/qemu-read-%d", blkif->domid) < 0)
+					goto fail;				
+
+				if (asprintf(&wrctldev, BLKTAP_CTRL_DIR "/qemu-write-%d", 
+						blkif->domid) < 0) {
+					free(rdctldev);
+					goto fail;
+				}
+
+				DPRINTF("Using qemu blktap pipe: %s\n", rdctldev);
+				
+				blkif->fds[READ] = open_ctrl_socket(wrctldev);
+				blkif->fds[WRITE] = open_ctrl_socket(rdctldev);
+				
+				if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1) 
+					goto fail;
+
+				DPRINTF("Attached to qemu blktap pipes\n");
 				free(rdctldev);
-				goto fail;
+				free(wrctldev);
+			} else {
+				// Lauch tapdisk instance
+				DPRINTF("tapdisk process does not exist:\n");
+				if (asprintf(&rdctldev,
+					     "%s/tapctrlread%d", BLKTAP_CTRL_DIR, minor) == -1)
+					goto fail;
+				if (asprintf(&wrctldev,
+					     "%s/tapctrlwrite%d", BLKTAP_CTRL_DIR, minor) == -1) {
+					free(rdctldev);
+					goto fail;
+				}
+				blkif->fds[READ] = open_ctrl_socket(rdctldev);
+				blkif->fds[WRITE] = open_ctrl_socket(wrctldev);
+				
+				if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1) {
+					DPRINTF("ERROR: fds[READ] = %d, fds[WRTIE] = %d\n",
+						blkif->fds[READ], blkif->fds[WRITE]);
+					goto fail;
+				}
+
+				/*launch the new process*/
+				DPRINTF("Launching process, CMDLINE [tapdisk %s %s]\n",wrctldev, rdctldev);
+				if (launch_tapdisk(wrctldev, rdctldev) == -1) {
+					DPRINTF("Unable to fork, cmdline: [tapdisk %s %s]\n",wrctldev, rdctldev);
+					goto fail;
+				}
+
+				free(rdctldev);
+				free(wrctldev);
 			}
-			blkif->fds[READ] = open_ctrl_socket(rdctldev);
-			blkif->fds[WRITE] = open_ctrl_socket(wrctldev);
-			
-			if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1) 
-				goto fail;
-
-			/*launch the new process*/
- 			DPRINTF("Launching process, CMDLINE [tapdisk %s %s]\n",wrctldev, rdctldev);
- 			if (launch_tapdisk(wrctldev, rdctldev) == -1) {
- 				DPRINTF("Unable to fork, cmdline: [tapdisk %s %s]\n",wrctldev, rdctldev);
-				goto fail;
-			}
-
-			free(rdctldev);
-			free(wrctldev);
 		} else {
 			DPRINTF("Process exists!\n");
 			blkif->fds[READ] = exist->fds[READ];
diff -r 7530c4dba8a5 tools/blktap/drivers/tapdisk.h
--- a/tools/blktap/drivers/tapdisk.h	Mon Mar  3 15:19:39 2008
+++ b/tools/blktap/drivers/tapdisk.h	Mon Mar 10 16:58:40 2008
@@ -167,6 +167,7 @@
 #define DISK_TYPE_RAM      3
 #define DISK_TYPE_QCOW     4
 #define DISK_TYPE_QCOW2    5
+#define DISK_TYPE_IOEMU    6
 
 
 /*Define Individual Disk Parameters here */
@@ -227,6 +228,16 @@
 	0,
 #ifdef TAPDISK
 	&tapdisk_qcow2,
+#endif
+};
+
+static disk_info_t ioemu_disk = {
+	DISK_TYPE_IOEMU,
+	"ioemu disk",
+	"ioemu",
+	0,
+#ifdef TAPDISK
+	NULL
 #endif
 };
 
@@ -238,6 +249,7 @@
 	&ram_disk,
 	&qcow_disk,
 	&qcow2_disk,
+	&ioemu_disk,
 };
 
 typedef struct driver_list_entry {
diff -r 7530c4dba8a5 tools/blktap/lib/blktaplib.h
--- a/tools/blktap/lib/blktaplib.h	Mon Mar  3 15:19:39 2008
+++ b/tools/blktap/lib/blktaplib.h	Mon Mar 10 16:58:40 2008
@@ -221,15 +221,5 @@
      ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * getpagesize()) +    \
      ((_seg) * getpagesize()))
 
-/* Defines that are only used by library clients */
-
-#ifndef __COMPILING_BLKTAP_LIB
-
-static char *blkif_op_name[] = {
-	[BLKIF_OP_READ]       = "READ",
-	[BLKIF_OP_WRITE]      = "WRITE",
-};
-
-#endif /* __COMPILING_BLKTAP_LIB */
 
 #endif /* __BLKTAPLIB_H__ */
diff -r 7530c4dba8a5 tools/ioemu/Makefile.target
--- a/tools/ioemu/Makefile.target	Mon Mar  3 15:19:39 2008
+++ b/tools/ioemu/Makefile.target	Mon Mar 10 16:58:40 2008
@@ -17,6 +17,7 @@
 VPATH=$(SRC_PATH):$(TARGET_PATH):$(SRC_PATH)/hw:$(SRC_PATH)/audio
 CPPFLAGS+=-I. -I.. -I$(TARGET_PATH) -I$(SRC_PATH)
 CPPFLAGS+= -I$(XEN_ROOT)/tools/libxc
+CPPFLAGS+= -I$(XEN_ROOT)/tools/blktap/lib
 CPPFLAGS+= -I$(XEN_ROOT)/tools/xenstore
 CPPFLAGS+= -I$(XEN_ROOT)/tools/include
 ifdef CONFIG_DARWIN_USER
@@ -429,6 +430,7 @@
 VL_OBJS+= usb-uhci.o smbus_eeprom.o
 VL_OBJS+= piix4acpi.o
 VL_OBJS+= xenstore.o
+VL_OBJS+= xen_blktap.o
 VL_OBJS+= xen_platform.o
 VL_OBJS+= xen_machine_fv.o
 VL_OBJS+= xen_machine_pv.o
diff -r 7530c4dba8a5 tools/ioemu/block.c
--- a/tools/ioemu/block.c	Mon Mar  3 15:19:39 2008
+++ b/tools/ioemu/block.c	Mon Mar 10 16:58:40 2008
@@ -1239,7 +1239,8 @@
         return -1;
     }
     while (async_ret == NOT_DONE) {
-        qemu_aio_wait();
+        //qemu_aio_wait();
+        qemu_aio_poll();
     }
     qemu_aio_wait_end();
     return async_ret;
@@ -1260,7 +1261,8 @@
         return -1;
     }
     while (async_ret == NOT_DONE) {
-        qemu_aio_wait();
+        //qemu_aio_wait();
+        qemu_aio_poll();
     }
     qemu_aio_wait_end();
     return async_ret;
diff -r 7530c4dba8a5 tools/ioemu/hw/xen_machine_pv.c
--- a/tools/ioemu/hw/xen_machine_pv.c	Mon Mar  3 15:19:39 2008
+++ b/tools/ioemu/hw/xen_machine_pv.c	Mon Mar 10 16:58:40 2008
@@ -26,6 +26,9 @@
 #include "xen_console.h"
 #include "xenfb.h"
 
+extern void init_blktap(void);
+
+
 /* The Xen PV machine currently provides
  *   - a virtual framebuffer
  *   - ....
@@ -40,6 +43,10 @@
 {
     struct xenfb *xenfb;
     extern int domid;
+
+
+    /* Initialize tapdisk client */
+    init_blktap();
 
     /* Connect to text console */
     if (serial_hds[0]) {
diff -r 7530c4dba8a5 tools/python/xen/xend/server/BlktapController.py
--- a/tools/python/xen/xend/server/BlktapController.py	Mon Mar  3 15:19:39 2008
+++ b/tools/python/xen/xend/server/BlktapController.py	Mon Mar 10 16:58:40 2008
@@ -13,7 +13,9 @@
     'vmdk',
     'ram',
     'qcow',
-    'qcow2'
+    'qcow2',
+
+    'ioemu'
     ]
 
 class BlktapController(BlkifController):
diff -r 7530c4dba8a5 tools/ioemu/hw/xen_blktap.c
--- /dev/null	Mon Mar  3 15:19:39 2008
+++ b/tools/ioemu/hw/xen_blktap.c	Mon Mar 10 16:58:40 2008
@@ -0,0 +1,769 @@
+/* xen_blktap.c
+ *
+ * Interface to blktapctrl to allow use of qemu block drivers with blktap.
+ * This file is based on tools/blktap/drivers/tapdisk.c
+ * 
+ * Copyright (c) 2005 Julian Chesterfield and Andrew Warfield.
+ * Copyright (c) 2008 Kevin Wolf
+ */
+
+/*
+ * There are several communication channels which are used by this interface:
+ *
+ *   - A pair of pipes for receiving and sending general control messages
+ *     (qemu-read-N and qemu-writeN in /var/run/tap, where N is the domain ID).
+ *     These control messages are handled by handle_blktap_ctrlmsg().
+ *
+ *   - One file descriptor per attached disk (/dev/xen/blktapN) for disk
+ *     specific control messages. A callback is triggered on this fd if there
+ *     is a new IO request. The callback function is handle_blktap_iomsg().
+ *
+ *   - A shared ring for each attached disk containing the actual IO requests 
+ *     and responses. Whenever handle_blktap_iomsg() is triggered it processes
+ *     the requests on this ring.
+ */
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#include "vl.h"
+#include "blktaplib.h"
+#include "xen_blktap.h"
+#include "block_int.h"
+
+#define MSG_SIZE 4096
+
+#define BLKTAP_CTRL_DIR "/var/run/tap"
+#define USE_AIO
+
+/* If enabled, print debug messages to stderr */
+#if 1
+#define DPRINTF(_f, _a...) fprintf(stderr, __FILE__ ":%d: " _f, __LINE__, ##_a)
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+#if 1                                                                        
+#define ASSERT(_p) \
+    if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \
+        __LINE__, __FILE__); *(int*)0=0; }
+#else
+#define ASSERT(_p) ((void)0)
+#endif 
+
+
+extern int domid;
+
+int read_fd;
+int write_fd;
+
+static int run = 1;
+
+static pid_t process;
+int connected_disks = 0;
+fd_list_entry_t *fd_start = NULL;
+
+static void handle_blktap_iomsg(void* private);
+
+struct aiocb_info {
+	struct td_state	*s;
+	uint64_t sector;
+	int nr_secs;
+	int idx;
+	long i;
+};
+
+static void unmap_disk(struct td_state *s)
+{
+	tapdev_info_t *info = s->ring_info;
+	fd_list_entry_t *entry;
+	
+	bdrv_close(s->bs);
+
+	if (info != NULL && info->mem > 0)
+	        munmap(info->mem, getpagesize() * BLKTAP_MMAP_REGION_SIZE);
+
+	entry = s->fd_entry;
+	*entry->pprev = entry->next;
+	if (entry->next)
+		entry->next->pprev = entry->pprev;
+
+	close(info->fd);
+
+	free(s->fd_entry);
+	free(s->blkif);
+	free(s->ring_info);
+	free(s);
+
+	return;
+}
+
+void sig_handler(int sig)
+{
+	/*Received signal to close. If no disks are active, we close app.*/
+
+	if (connected_disks < 1) run = 0;	
+}
+
+static inline fd_list_entry_t *add_fd_entry(int tap_fd, struct td_state *s)
+{
+	fd_list_entry_t **pprev, *entry;
+
+	DPRINTF("Adding fd_list_entry\n");
+
+	/*Add to linked list*/
+	s->fd_entry   = entry = malloc(sizeof(fd_list_entry_t));
+	entry->tap_fd = tap_fd;
+	entry->s      = s;
+	entry->next   = NULL;
+
+	pprev = &fd_start;
+	while (*pprev != NULL)
+		pprev = &(*pprev)->next;
+
+	*pprev = entry;
+	entry->pprev = pprev;
+
+	return entry;
+}
+
+static inline struct td_state *get_state(int cookie)
+{
+	fd_list_entry_t *ptr;
+
+	ptr = fd_start;
+	while (ptr != NULL) {
+		if (ptr->cookie == cookie) return ptr->s;
+		ptr = ptr->next;
+	}
+	return NULL;
+}
+
+static struct td_state *state_init(void)
+{
+	int i;
+	struct td_state *s;
+	blkif_t *blkif;
+
+	s = malloc(sizeof(struct td_state));
+	blkif = s->blkif = malloc(sizeof(blkif_t));
+	s->ring_info = calloc(1, sizeof(tapdev_info_t));
+
+	for (i = 0; i < MAX_REQUESTS; i++) {
+		blkif->pending_list[i].secs_pending = 0;
+		blkif->pending_list[i].submitting = 0;
+	}
+
+	return s;
+}
+
+static int map_new_dev(struct td_state *s, int minor)
+{
+	int tap_fd;
+	tapdev_info_t *info = s->ring_info;
+	char *devname;
+	fd_list_entry_t *ptr;
+	int page_size;
+
+	if (asprintf(&devname,"%s/%s%d", BLKTAP_DEV_DIR, BLKTAP_DEV_NAME, minor) == -1)
+		return -1;
+	tap_fd = open(devname, O_RDWR);
+	if (tap_fd == -1) 
+	{
+		DPRINTF("open failed on dev %s!",devname);
+		goto fail;
+	} 
+	info->fd = tap_fd;
+
+	/*Map the shared memory*/
+	page_size = getpagesize();
+	info->mem = mmap(0, page_size * BLKTAP_MMAP_REGION_SIZE, 
+			  PROT_READ | PROT_WRITE, MAP_SHARED, info->fd, 0);
+	if ((long int)info->mem == -1) 
+	{
+		DPRINTF("mmap failed on dev %s!\n",devname);
+		goto fail;
+	}
+
+	/* assign the rings to the mapped memory */ 
+	info->sring = (blkif_sring_t *)((unsigned long)info->mem);
+	BACK_RING_INIT(&info->fe_ring, info->sring, page_size);
+	
+	info->vstart = 
+	        (unsigned long)info->mem + (BLKTAP_RING_PAGES * page_size);
+
+	ioctl(info->fd, BLKTAP_IOCTL_SENDPID, process );
+	ioctl(info->fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE );
+	free(devname);
+
+	/*Update the fd entry*/
+	ptr = fd_start;
+	while (ptr != NULL) {
+		if (s == ptr->s) {
+			ptr->tap_fd = tap_fd;
+
+			// Setup fd_handler for qemu main loop
+			DPRINTF("set tap_fd = %d\n", tap_fd);
+			qemu_set_fd_handler2(tap_fd, NULL, &handle_blktap_iomsg, NULL, s);
+
+			break;
+		}
+		ptr = ptr->next;
+	}	
+
+
+	DPRINTF("map_new_dev = %d\n", minor);
+	return minor;
+
+ fail:
+	free(devname);
+	return -1;
+}
+
+static int open_disk(struct td_state *s, char *path, td_flag_t flags)
+{
+	char *dup;
+	struct disk_id id;
+	BlockDriverState* bs;
+
+	DPRINTF("Opening %s\n", path);
+	bs = calloc(sizeof(*bs), 1);
+
+	dup = strdup(path);
+	if (!dup)
+		return -ENOMEM;
+
+	memset(&id, 0, sizeof(struct disk_id));
+
+	if (bdrv_open(bs, path, 0) != 0) {
+		fprintf(stderr, "Could not open image file %s\n", path);
+		return -ENOMEM;
+	}
+
+	s->bs = bs;
+	s->flags = 0;
+	s->size = bs->total_sectors;
+	s->sector_size = 512;
+
+	s->info = ((flags & TD_RDONLY) ? VDISK_READONLY : 0);
+
+	return 0;
+}
+
+static inline int write_rsp_to_ring(struct td_state *s, blkif_response_t *rsp)
+{
+	tapdev_info_t *info = s->ring_info;
+	blkif_response_t *rsp_d;
+	
+	rsp_d = RING_GET_RESPONSE(&info->fe_ring, info->fe_ring.rsp_prod_pvt);
+	memcpy(rsp_d, rsp, sizeof(blkif_response_t));
+	info->fe_ring.rsp_prod_pvt++;
+	
+	return 0;
+}
+
+static inline void kick_responses(struct td_state *s)
+{
+	tapdev_info_t *info = s->ring_info;
+
+	if (info->fe_ring.rsp_prod_pvt != info->fe_ring.sring->rsp_prod) 
+	{
+		RING_PUSH_RESPONSES(&info->fe_ring);
+		ioctl(info->fd, BLKTAP_IOCTL_KICK_FE);
+	}
+}
+
+static inline uint64_t
+segment_start(blkif_request_t *req, int sidx)
+{
+	int i;
+	uint64_t start = req->sector_number;
+
+	for (i = 0; i < sidx; i++) 
+		start += (req->seg[i].last_sect - req->seg[i].first_sect + 1);
+
+	return start;
+}
+
+static int send_responses(struct td_state *s, int res, 
+		   uint64_t sector, int nr_secs, int idx, void *private)
+{
+	pending_req_t   *preq;
+	blkif_request_t *req;
+	int responses_queued = 0;
+	blkif_t *blkif = s->blkif;
+	int secs_done = nr_secs;
+
+	if ( (idx > MAX_REQUESTS-1) )
+	{
+		DPRINTF("invalid index returned(%u)!\n", idx);
+		return 0;
+	}
+	preq = &blkif->pending_list[idx];
+	req  = &preq->req;
+
+	preq->secs_pending -= secs_done;
+
+	if (res == -EBUSY && preq->submitting) 
+		return -EBUSY;  /* propagate -EBUSY back to higher layers */
+	if (res) 
+		preq->status = BLKIF_RSP_ERROR;
+	
+	if (!preq->submitting && preq->secs_pending == 0) 
+	{
+		blkif_request_t tmp;
+		blkif_response_t *rsp;
+
+		tmp = preq->req;
+		rsp = (blkif_response_t *)req;
+		
+		rsp->id = tmp.id;
+		rsp->operation = tmp.operation;
+		rsp->status = preq->status;
+		
+		write_rsp_to_ring(s, rsp);
+		responses_queued++;
+
+		kick_responses(s);
+	}
+	
+	return responses_queued;
+}
+
+#ifdef USE_AIO
+static void qemu_send_responses(void* opaque, int ret)
+{
+	struct aiocb_info* info = opaque;
+
+	if (ret != 0) {
+		DPRINTF("ERROR: ret = %d (%s)\n", ret, strerror(-ret));
+	}
+
+	send_responses(info->s, ret, info->sector, info->nr_secs, 
+		info->idx, (void*) info->i);
+	free(info);
+}
+#endif
+
+/**
+ * Callback function for the IO message pipe. Reads requests from the ring
+ * and processes them (call qemu read/write functions).
+ *
+ * The private parameter points to the struct td_state representing the
+ * disk the request is targeted at.
+ */
+static void handle_blktap_iomsg(void* private)
+{
+	struct td_state* s = private;
+
+	RING_IDX          rp, j, i;
+	blkif_request_t  *req;
+	int idx, nsects, ret;
+	uint64_t sector_nr;
+	uint8_t *page;
+	blkif_t *blkif = s->blkif;
+	tapdev_info_t *info = s->ring_info;
+	int page_size = getpagesize();
+
+#ifdef USE_AIO	
+	struct aiocb_info *aiocb_info;
+#endif	
+
+	if (!run) {
+		DPRINTF("  !run");
+		return; /*We have received signal to close*/
+	}
+
+	if (info->fe_ring.sring == NULL) {
+		DPRINTF("  sring == NULL, ignoring IO request\n");
+		return;
+	}
+
+	rp = info->fe_ring.sring->req_prod; 
+	xen_rmb();
+
+	for (j = info->fe_ring.req_cons; j != rp; j++)
+	{
+		int start_seg = 0; 
+
+		req = NULL;
+		req = RING_GET_REQUEST(&info->fe_ring, j);
+		++info->fe_ring.req_cons;
+		
+		if (req == NULL) {
+			continue;
+		}
+
+		idx = req->id;
+
+		if (info->busy.req) {
+			/* continue where we left off last time */
+			ASSERT(info->busy.req == req);
+			start_seg = info->busy.seg_idx;
+			sector_nr = segment_start(req, start_seg);
+			info->busy.seg_idx = 0;
+			info->busy.req     = NULL;
+		} else {
+			ASSERT(blkif->pending_list[idx].secs_pending == 0);
+			memcpy(&blkif->pending_list[idx].req, 
+			       req, sizeof(*req));
+			blkif->pending_list[idx].status = BLKIF_RSP_OKAY;
+			blkif->pending_list[idx].submitting = 1;
+			sector_nr = req->sector_number;
+		}
+
+		if ((s->flags & TD_RDONLY) && 
+		    (req->operation == BLKIF_OP_WRITE)) {
+			blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
+			DPRINTF("  BLKIF_RSP_ERROR\n");
+			goto send_response;
+		}
+
+		for (i = start_seg; i < req->nr_segments; i++) {
+			nsects = req->seg[i].last_sect - 
+				 req->seg[i].first_sect + 1;
+	
+			if ((req->seg[i].last_sect >= page_size >> 9) ||
+			    (nsects <= 0))
+			{
+				DPRINTF("  Invalid request\n");
+				continue;
+			}
+
+			page  = (uint8_t*) MMAP_VADDR(info->vstart, 
+						   (unsigned long)req->id, i);
+			page += (req->seg[i].first_sect << SECTOR_SHIFT);
+
+			if (sector_nr >= s->size) {
+				DPRINTF("Sector request failed:\n");
+				DPRINTF("%s request, idx [%d,%d] size [%llu], "
+					"sector [%llu,%llu]\n",
+					(req->operation == BLKIF_OP_WRITE ? 
+					 "WRITE" : "READ"),
+					idx,i,
+					(long long unsigned) 
+						nsects<<SECTOR_SHIFT,
+					(long long unsigned) 
+						sector_nr<<SECTOR_SHIFT,
+					(long long unsigned) sector_nr);
+				continue;
+			}
+
+			blkif->pending_list[idx].secs_pending += nsects;
+
+			switch (req->operation) 
+			{
+			case BLKIF_OP_WRITE:
+				//DPRINTF("  Write request %#" PRIx64 "\n", sector_nr);
+#ifdef USE_AIO
+				aiocb_info = malloc(sizeof(*aiocb_info));
+
+				aiocb_info->s = s;
+				aiocb_info->sector = sector_nr;
+				aiocb_info->nr_secs = nsects;
+				aiocb_info->idx = idx;
+				aiocb_info->i = i;
+
+				ret = (NULL != bdrv_aio_write(s->bs, sector_nr,
+							  page, nsects,
+							  qemu_send_responses,
+							  aiocb_info));
+#else				
+				ret = bdrv_write(s->bs, sector_nr, page, nsects);
+#endif
+				if (ret >= 0) {
+#ifndef USE_AIO
+					send_responses(s, ret, sector_nr, nsects, idx, (void *)(long)i);
+#endif
+				} else if (ret == -EBUSY) {
+					/* put req back on queue */
+					--info->fe_ring.req_cons;
+					info->busy.req     = req;
+					info->busy.seg_idx = i;
+					goto out;
+				} else {
+					blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
+					DPRINTF("ERROR: bdrv_write() == %d\n", ret);
+					goto send_response;
+				}
+				break;
+			case BLKIF_OP_READ:
+				//DPRINTF("  Read request %#" PRIx64 "\n", sector_nr);
+#ifdef USE_AIO
+				aiocb_info = malloc(sizeof(*aiocb_info));
+
+				aiocb_info->s = s;
+				aiocb_info->sector = sector_nr;
+				aiocb_info->nr_secs = nsects;
+				aiocb_info->idx = idx;
+				aiocb_info->i = i;
+
+				ret = (NULL != bdrv_aio_read(s->bs, sector_nr,
+							 page, nsects,
+							 qemu_send_responses,
+							 aiocb_info));
+#else
+				ret = bdrv_read(s->bs, sector_nr, page, nsects);
+#endif							 
+
+				if (ret >= 0) {
+#ifndef USE_AIO					
+					send_responses(s, ret, sector_nr, nsects, idx, (void *)(long)i);
+#endif					
+				} else if (ret == -EBUSY) {
+					/* put req back on queue */
+					--info->fe_ring.req_cons;
+					info->busy.req     = req;
+					info->busy.seg_idx = i;
+					goto out;
+				} else {
+					blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
+					DPRINTF("ERROR: bdrv_read() == %d\n", ret);
+					goto send_response;
+				}
+				break;
+			default:
+				DPRINTF("Unknown block operation\n");
+				break;
+			}
+			sector_nr += nsects;
+		}
+	send_response:
+		blkif->pending_list[idx].submitting = 0;
+
+		/* force write_rsp_to_ring for synchronous case */
+		if (blkif->pending_list[idx].secs_pending == 0)
+			send_responses(s, 0, 0, 0, idx, (void *)(long)0);
+	}
+
+ out:
+	/*Batch done*/
+	bdrv_flush(s->bs);
+	kick_responses(s);
+
+	return;
+}
+
+/**
+ * Callback function for the qemu-read pipe. Reads and processes control 
+ * message from the pipe.
+ *
+ * The parameter private is unused.
+ */
+static void handle_blktap_ctrlmsg(void* private)
+{
+	int length, len, msglen;
+	char *ptr, *path;
+	image_t *img;
+	msg_hdr_t *msg;
+	msg_newdev_t *msg_dev;
+	msg_pid_t *msg_pid;
+	int ret = -1;
+	struct td_state *s = NULL;
+	fd_list_entry_t *entry;
+
+	char buf[MSG_SIZE];
+	memset(buf, 0, MSG_SIZE);
+
+	length = read(read_fd, buf, MSG_SIZE);
+
+	if (length > 0 && length >= sizeof(msg_hdr_t)) 
+	{
+		msg = (msg_hdr_t *)buf;
+		DPRINTF("ioemu: Received msg, len %d, type %d, UID %d\n",
+			length,msg->type,msg->cookie);
+
+		switch (msg->type) {
+		case CTLMSG_PARAMS: 			
+			ptr = buf + sizeof(msg_hdr_t);
+			len = (length - sizeof(msg_hdr_t));
+			path = calloc(1, len);
+			
+			memcpy(path, ptr, len); 
+			DPRINTF("Received CTLMSG_PARAMS: [%s]\n", path);
+
+			/* Allocate the disk structs */
+			s = state_init();
+			if (s == NULL)
+				goto params_done;
+
+			/*Open file*/
+			ret = open_disk(s, path, 
+					((msg->readonly) ? TD_RDONLY : 0));
+			if (ret)
+				goto params_done;
+
+			entry = add_fd_entry(0, s);
+			entry->cookie = msg->cookie;
+			DPRINTF("Entered cookie %d\n", entry->cookie);
+			
+			memset(buf, 0x00, MSG_SIZE); 
+			
+		params_done:
+			if (ret == 0) {
+				msglen = sizeof(msg_hdr_t) + sizeof(image_t);
+				msg->type = CTLMSG_IMG;
+				img = (image_t *)(buf + sizeof(msg_hdr_t));
+				img->size = s->size;
+				img->secsize = s->sector_size;
+				img->info = s->info;
+				DPRINTF("Writing (size, secsize, info) = "
+					"(%#" PRIx64 ", %#" PRIx64 ", %d)\n",
+					s->size, s->sector_size, s->info);
+			} else {
+				msglen = sizeof(msg_hdr_t);
+				msg->type = CTLMSG_IMG_FAIL;
+				msg->len = msglen;
+			}
+			len = write(write_fd, buf, msglen);
+			free(path);
+			break;
+			
+		case CTLMSG_NEWDEV:
+			msg_dev = (msg_newdev_t *)(buf + sizeof(msg_hdr_t));
+
+			s = get_state(msg->cookie);
+			DPRINTF("Retrieving state, cookie %d.....[%s]\n",
+				msg->cookie, (s == NULL ? "FAIL":"OK"));
+			if (s != NULL) {
+				ret = ((map_new_dev(s, msg_dev->devnum) 
+					== msg_dev->devnum ? 0: -1));
+				connected_disks++;
+			}	
+
+			memset(buf, 0x00, MSG_SIZE); 
+			msglen = sizeof(msg_hdr_t);
+			msg->type = (ret == 0 ? CTLMSG_NEWDEV_RSP 
+				              : CTLMSG_NEWDEV_FAIL);
+			msg->len = msglen;
+
+			len = write(write_fd, buf, msglen);
+			break;
+
+		case CTLMSG_CLOSE:
+			s = get_state(msg->cookie);
+			if (s) unmap_disk(s);
+			
+			connected_disks--;
+			sig_handler(SIGINT);
+
+			break;			
+
+		case CTLMSG_PID:
+			memset(buf, 0x00, MSG_SIZE);
+			msglen = sizeof(msg_hdr_t) + sizeof(msg_pid_t);
+			msg->type = CTLMSG_PID_RSP;
+			msg->len = msglen;
+
+			msg_pid = (msg_pid_t *)(buf + sizeof(msg_hdr_t));
+			process = getpid();
+			msg_pid->pid = process;
+
+			len = write(write_fd, buf, msglen);
+			break;
+
+		default:
+			break;
+		}
+	}
+}
+
+/**
+ * Opens a control socket, i.e. a pipe to communicate with blktapctrl.
+ *
+ * Returns the file descriptor number for the pipe; -1 in error case
+ */
+static int open_ctrl_socket(char *devname)
+{
+	int ret;
+	int ipc_fd;
+
+	if (mkdir(BLKTAP_CTRL_DIR, 0755) == 0)
+		DPRINTF("Created %s directory\n", BLKTAP_CTRL_DIR);
+
+	ret = mkfifo(devname,S_IRWXU|S_IRWXG|S_IRWXO);
+	if ( (ret != 0) && (errno != EEXIST) ) {
+		DPRINTF("ERROR: pipe failed (%d)\n", errno);
+		exit(0);
+	}
+
+	ipc_fd = open(devname,O_RDWR|O_NONBLOCK);
+
+	if (ipc_fd < 0) {
+		DPRINTF("FD open failed\n");
+		return -1;
+	}
+
+	return ipc_fd;
+}
+
+/**
+ * Unmaps all disks and closes their pipes
+ */
+void shutdown_blktap(void)
+{
+	fd_list_entry_t *ptr;
+	struct td_state *s;
+
+	DPRINTF("Shutdown blktap\n");
+	ptr = fd_start;
+	while (ptr != NULL) {
+		s = ptr->s;
+		unmap_disk(s);
+		close(ptr->tap_fd);
+		ptr = ptr->next;
+	}
+}
+
+/**
+ * Initialize the blktap interface, i.e. open a pair of pipes in /var/run/tap
+ * and register a fd handler.
+ *
+ * Returns 0 on success.
+ */
+int init_blktap(void)
+{
+	char* devname;	
+
+	DPRINTF("Init blktap pipes\n");
+
+	/* Open the read pipe */
+	if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-read-%d", domid) >= 0) {	
+		read_fd = open_ctrl_socket(devname);		
+		free(devname);
+		
+		if (read_fd == -1) {
+			fprintf(stderr, "Could not open %s/qemu-read-%d\n",
+				BLKTAP_CTRL_DIR, domid);
+			return -1;
+		}
+	}
+	
+	/* Open the write pipe */
+	if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-write-%d", domid) >= 0) {
+		write_fd = open_ctrl_socket(devname);
+		free(devname);
+		
+		if (write_fd == -1) {
+			fprintf(stderr, "Could not open %s/qemu-write-%d\n",
+				BLKTAP_CTRL_DIR, domid);
+			close(read_fd);
+			return -1;
+		}
+	}
+
+	/* Attach a handler to the read pipe (called from qemu main loop */
+	qemu_set_fd_handler2(read_fd, NULL, &handle_blktap_ctrlmsg, NULL, NULL);
+
+	/* TODO Doesn't work - qemu-dm receives SIGKILL */
+	atexit(&shutdown_blktap);
+
+	return 0;
+}
diff -r 7530c4dba8a5 tools/ioemu/hw/xen_blktap.h
--- /dev/null	Mon Mar  3 15:19:39 2008
+++ b/tools/ioemu/hw/xen_blktap.h	Mon Mar 10 16:58:40 2008
@@ -0,0 +1,57 @@
+/* xen_blktap.h
+ *
+ * Generic disk interface for blktap-based image adapters.
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ */
+
+#ifndef XEN_BLKTAP_H_ 
+#define XEN_BLKTAP_H_
+
+#include <stdint.h>
+#include <syslog.h>
+#include <stdio.h>
+
+#include "block_int.h"
+
+/* Things disks need to know about, these should probably be in a higher-level
+ * header. */
+#define MAX_SEGMENTS_PER_REQ    11
+#define SECTOR_SHIFT             9
+#define DEFAULT_SECTOR_SIZE    512
+
+#define MAX_IOFD                 2
+
+#define BLK_NOT_ALLOCATED       99
+#define TD_NO_PARENT             1
+
+typedef uint32_t td_flag_t;
+
+#define TD_RDONLY                1
+
+struct disk_id {
+	char *name;
+	int drivertype;
+};
+
+/* This structure represents the state of an active virtual disk.           */
+struct td_state {
+	BlockDriverState* bs;
+	td_flag_t flags;
+	void *blkif;
+	void *image;
+	void *ring_info;
+	void *fd_entry;
+	uint64_t sector_size;
+	uint64_t size;
+	unsigned int       info;
+};
+
+typedef struct fd_list_entry {
+	int cookie;
+	int  tap_fd;
+	struct td_state *s;
+	struct fd_list_entry **pprev, *next;
+} fd_list_entry_t;
+
+#endif /*XEN_BLKTAP_H_*/

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][PATCH] Use ioemu block drivers through blktap
  2008-03-10 17:03 [RFC][PATCH] Use ioemu block drivers through blktap Kevin Wolf
@ 2008-03-13 12:25 ` Kevin Wolf
  2008-03-13 14:21   ` Konrad Rzeszutek
  2008-03-28 15:22   ` Ian Jackson
  0 siblings, 2 replies; 13+ messages in thread
From: Kevin Wolf @ 2008-03-13 12:25 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: xen-devel

[-- Attachment #1: Type: text/plain, Size: 243 bytes --]

I've reworked and cleaned up the thing, so the attached patch should be
much better. If you agree that my approach is the right one, I think it
could be committed in this (or a similar) shape.

Kevin

Signed-off-by: Kevin Wolf <kwolf@suse.de>

[-- Attachment #2: blktap-ioemu.patch --]
[-- Type: text/x-patch, Size: 25263 bytes --]

diff -r f33328217eee tools/blktap/drivers/blktapctrl.c
--- a/tools/blktap/drivers/blktapctrl.c	Mon Mar 10 22:51:57 2008 +0000
+++ b/tools/blktap/drivers/blktapctrl.c	Thu Mar 13 13:00:18 2008 +0100
@@ -501,6 +501,82 @@ int launch_tapdisk(char *wrctldev, char 
 	return 0;
 }
 
+/* Connect to qemu-dm */
+static int connect_qemu(blkif_t *blkif)
+{
+	char *rdctldev, *wrctldev;
+	
+	if (asprintf(&rdctldev, BLKTAP_CTRL_DIR "/qemu-read-%d", 
+			blkif->domid) < 0)
+		return -1;
+
+	if (asprintf(&wrctldev, BLKTAP_CTRL_DIR "/qemu-write-%d", 
+			blkif->domid) < 0) {
+		free(rdctldev);
+		return -1;
+	}
+
+	DPRINTF("Using qemu blktap pipe: %s\n", rdctldev);
+	
+	blkif->fds[READ] = open_ctrl_socket(wrctldev);
+	blkif->fds[WRITE] = open_ctrl_socket(rdctldev);
+	
+	if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1) {
+		free(rdctldev);
+		free(wrctldev);
+		return -1;
+	}
+
+	DPRINTF("Attached to qemu blktap pipes\n");
+	free(rdctldev);
+	free(wrctldev);
+	return 0;
+}
+
+/* Launch tapdisk instance */
+static int connect_tapdisk(blkif_t *blkif, int minor)
+{
+	char *rdctldev = NULL, *wrctldev = NULL;
+	int ret = -1;
+
+	DPRINTF("tapdisk process does not exist:\n");
+
+	if (asprintf(&rdctldev,
+		     "%s/tapctrlread%d", BLKTAP_CTRL_DIR, minor) == -1)
+		goto fail;
+
+	if (asprintf(&wrctldev,
+		     "%s/tapctrlwrite%d", BLKTAP_CTRL_DIR, minor) == -1)
+		goto fail;
+	
+	blkif->fds[READ] = open_ctrl_socket(rdctldev);
+	blkif->fds[WRITE] = open_ctrl_socket(wrctldev);
+	
+	if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1)
+		goto fail;
+
+	/*launch the new process*/
+	DPRINTF("Launching process, CMDLINE [tapdisk %s %s]\n",
+			wrctldev, rdctldev);
+
+	if (launch_tapdisk(wrctldev, rdctldev) == -1) {
+		DPRINTF("Unable to fork, cmdline: [tapdisk %s %s]\n",
+				wrctldev, rdctldev);
+		goto fail;
+	}
+
+	ret = 0;
+	
+fail:
+	if (rdctldev)
+		free(rdctldev);
+
+	if (wrctldev)
+		free(wrctldev);
+
+	return ret;
+}
+
 int blktapctrl_new_blkif(blkif_t *blkif)
 {
 	blkif_info_t *blk;
@@ -524,30 +600,14 @@ int blktapctrl_new_blkif(blkif_t *blkif)
 		blkif->cookie = next_cookie++;
 
 		if (!exist) {
-			DPRINTF("Process does not exist:\n");
-			if (asprintf(&rdctldev,
-				     "%s/tapctrlread%d", BLKTAP_CTRL_DIR, minor) == -1)
-				goto fail;
-			if (asprintf(&wrctldev,
-				     "%s/tapctrlwrite%d", BLKTAP_CTRL_DIR, minor) == -1) {
-				free(rdctldev);
-				goto fail;
+			if (type == DISK_TYPE_IOEMU) {
+				if (connect_qemu(blkif))
+					goto fail;
+			} else {
+				if (connect_tapdisk(blkif, minor))
+					goto fail;
 			}
-			blkif->fds[READ] = open_ctrl_socket(rdctldev);
-			blkif->fds[WRITE] = open_ctrl_socket(wrctldev);
-			
-			if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1) 
-				goto fail;
-
-			/*launch the new process*/
- 			DPRINTF("Launching process, CMDLINE [tapdisk %s %s]\n",wrctldev, rdctldev);
- 			if (launch_tapdisk(wrctldev, rdctldev) == -1) {
- 				DPRINTF("Unable to fork, cmdline: [tapdisk %s %s]\n",wrctldev, rdctldev);
-				goto fail;
-			}
-
-			free(rdctldev);
-			free(wrctldev);
+
 		} else {
 			DPRINTF("Process exists!\n");
 			blkif->fds[READ] = exist->fds[READ];
diff -r f33328217eee tools/blktap/drivers/tapdisk.h
--- a/tools/blktap/drivers/tapdisk.h	Mon Mar 10 22:51:57 2008 +0000
+++ b/tools/blktap/drivers/tapdisk.h	Thu Mar 13 13:00:18 2008 +0100
@@ -167,6 +167,7 @@ extern struct tap_disk tapdisk_qcow2;
 #define DISK_TYPE_RAM      3
 #define DISK_TYPE_QCOW     4
 #define DISK_TYPE_QCOW2    5
+#define DISK_TYPE_IOEMU    6
 
 
 /*Define Individual Disk Parameters here */
@@ -227,6 +228,16 @@ static disk_info_t qcow2_disk = {
 	0,
 #ifdef TAPDISK
 	&tapdisk_qcow2,
+#endif
+};
+
+static disk_info_t ioemu_disk = {
+	DISK_TYPE_IOEMU,
+	"ioemu disk",
+	"ioemu",
+	0,
+#ifdef TAPDISK
+	NULL
 #endif
 };
 
@@ -238,6 +249,7 @@ static disk_info_t *dtypes[] = {
 	&ram_disk,
 	&qcow_disk,
 	&qcow2_disk,
+	&ioemu_disk,
 };
 
 typedef struct driver_list_entry {
diff -r f33328217eee tools/blktap/lib/blktaplib.h
--- a/tools/blktap/lib/blktaplib.h	Mon Mar 10 22:51:57 2008 +0000
+++ b/tools/blktap/lib/blktaplib.h	Thu Mar 13 13:00:18 2008 +0100
@@ -221,15 +221,5 @@ int xs_fire_next_watch(struct xs_handle 
      ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * getpagesize()) +    \
      ((_seg) * getpagesize()))
 
-/* Defines that are only used by library clients */
-
-#ifndef __COMPILING_BLKTAP_LIB
-
-static char *blkif_op_name[] = {
-	[BLKIF_OP_READ]       = "READ",
-	[BLKIF_OP_WRITE]      = "WRITE",
-};
-
-#endif /* __COMPILING_BLKTAP_LIB */
 
 #endif /* __BLKTAPLIB_H__ */
diff -r f33328217eee tools/ioemu/Makefile.target
--- a/tools/ioemu/Makefile.target	Mon Mar 10 22:51:57 2008 +0000
+++ b/tools/ioemu/Makefile.target	Thu Mar 13 13:00:18 2008 +0100
@@ -17,6 +17,7 @@ VPATH=$(SRC_PATH):$(TARGET_PATH):$(SRC_P
 VPATH=$(SRC_PATH):$(TARGET_PATH):$(SRC_PATH)/hw:$(SRC_PATH)/audio
 CPPFLAGS+=-I. -I.. -I$(TARGET_PATH) -I$(SRC_PATH)
 CPPFLAGS+= -I$(XEN_ROOT)/tools/libxc
+CPPFLAGS+= -I$(XEN_ROOT)/tools/blktap/lib
 CPPFLAGS+= -I$(XEN_ROOT)/tools/xenstore
 CPPFLAGS+= -I$(XEN_ROOT)/tools/include
 ifdef CONFIG_DARWIN_USER
@@ -429,6 +430,7 @@ VL_OBJS+= usb-uhci.o smbus_eeprom.o
 VL_OBJS+= usb-uhci.o smbus_eeprom.o
 VL_OBJS+= piix4acpi.o
 VL_OBJS+= xenstore.o
+VL_OBJS+= xen_blktap.o
 VL_OBJS+= xen_platform.o
 VL_OBJS+= xen_machine_fv.o
 VL_OBJS+= xen_machine_pv.o
diff -r f33328217eee tools/ioemu/hw/xen_machine_pv.c
--- a/tools/ioemu/hw/xen_machine_pv.c	Mon Mar 10 22:51:57 2008 +0000
+++ b/tools/ioemu/hw/xen_machine_pv.c	Thu Mar 13 13:00:18 2008 +0100
@@ -26,6 +26,9 @@
 #include "xen_console.h"
 #include "xenfb.h"
 
+extern void init_blktap(void);
+
+
 /* The Xen PV machine currently provides
  *   - a virtual framebuffer
  *   - ....
@@ -40,6 +43,10 @@ static void xen_init_pv(uint64_t ram_siz
 {
     struct xenfb *xenfb;
     extern int domid;
+
+
+    /* Initialize tapdisk client */
+    init_blktap();
 
     /* Connect to text console */
     if (serial_hds[0]) {
diff -r f33328217eee tools/ioemu/vl.c
--- a/tools/ioemu/vl.c	Mon Mar 10 22:51:57 2008 +0000
+++ b/tools/ioemu/vl.c	Thu Mar 13 13:00:18 2008 +0100
@@ -6270,6 +6270,12 @@ void qemu_system_powerdown_request(void)
     powerdown_requested = 1;
     if (cpu_single_env)
         cpu_interrupt(cpu_single_env, CPU_INTERRUPT_EXIT);
+}
+
+static void qemu_sighup_handler(int signal)
+{
+    fprintf(stderr, "Received SIGHUP, terminating.\n");
+    exit(0);
 }
 
 void main_loop_wait(int timeout)
@@ -7980,7 +7986,7 @@ int main(int argc, char **argv)
 
 #ifndef CONFIG_STUBDOM
     /* Unblock SIGTERM and SIGHUP, which may have been blocked by the caller */
-    signal(SIGHUP, SIG_DFL);
+    signal(SIGHUP, qemu_sighup_handler);
     sigemptyset(&set);
     sigaddset(&set, SIGTERM);
     sigaddset(&set, SIGHUP);
diff -r f33328217eee tools/python/xen/xend/server/BlktapController.py
--- a/tools/python/xen/xend/server/BlktapController.py	Mon Mar 10 22:51:57 2008 +0000
+++ b/tools/python/xen/xend/server/BlktapController.py	Thu Mar 13 13:00:18 2008 +0100
@@ -13,7 +13,9 @@ blktap_disk_types = [
     'vmdk',
     'ram',
     'qcow',
-    'qcow2'
+    'qcow2',
+
+    'ioemu'
     ]
 
 class BlktapController(BlkifController):
diff -r f33328217eee tools/ioemu/hw/xen_blktap.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ioemu/hw/xen_blktap.c	Thu Mar 13 13:00:18 2008 +0100
@@ -0,0 +1,688 @@
+/* xen_blktap.c
+ *
+ * Interface to blktapctrl to allow use of qemu block drivers with blktap.
+ * This file is based on tools/blktap/drivers/tapdisk.c
+ * 
+ * Copyright (c) 2005 Julian Chesterfield and Andrew Warfield.
+ * Copyright (c) 2008 Kevin Wolf
+ */
+
+/*
+ * There are several communication channels which are used by this interface:
+ *
+ *   - A pair of pipes for receiving and sending general control messages
+ *     (qemu-read-N and qemu-writeN in /var/run/tap, where N is the domain ID).
+ *     These control messages are handled by handle_blktap_ctrlmsg().
+ *
+ *   - One file descriptor per attached disk (/dev/xen/blktapN) for disk
+ *     specific control messages. A callback is triggered on this fd if there
+ *     is a new IO request. The callback function is handle_blktap_iomsg().
+ *
+ *   - A shared ring for each attached disk containing the actual IO requests 
+ *     and responses. Whenever handle_blktap_iomsg() is triggered it processes
+ *     the requests on this ring.
+ */
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+
+#include "vl.h"
+#include "blktaplib.h"
+#include "xen_blktap.h"
+#include "block_int.h"
+
+#define MSG_SIZE 4096
+
+#define BLKTAP_CTRL_DIR "/var/run/tap"
+
+/* If enabled, print debug messages to stderr */
+#if 1
+#define DPRINTF(_f, _a...) fprintf(stderr, __FILE__ ":%d: " _f, __LINE__, ##_a)
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+#if 1                                                                        
+#define ASSERT(_p) \
+    if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \
+        __LINE__, __FILE__); *(int*)0=0; }
+#else
+#define ASSERT(_p) ((void)0)
+#endif 
+
+
+extern int domid;
+
+int read_fd;
+int write_fd;
+
+static pid_t process;
+fd_list_entry_t *fd_start = NULL;
+
+static void handle_blktap_iomsg(void* private);
+
+struct aiocb_info {
+	struct td_state	*s;
+	uint64_t sector;
+	int nr_secs;
+	int idx;
+	long i;
+};
+
+static void unmap_disk(struct td_state *s)
+{
+	tapdev_info_t *info = s->ring_info;
+	fd_list_entry_t *entry;
+	
+	bdrv_close(s->bs);
+
+	if (info != NULL && info->mem > 0)
+	        munmap(info->mem, getpagesize() * BLKTAP_MMAP_REGION_SIZE);
+
+	entry = s->fd_entry;
+	*entry->pprev = entry->next;
+	if (entry->next)
+		entry->next->pprev = entry->pprev;
+
+	qemu_set_fd_handler2(info->fd, NULL, NULL, NULL, NULL);
+	close(info->fd);
+
+	free(s->fd_entry);
+	free(s->blkif);
+	free(s->ring_info);
+	free(s);
+
+	return;
+}
+
+static inline fd_list_entry_t *add_fd_entry(int tap_fd, struct td_state *s)
+{
+	fd_list_entry_t **pprev, *entry;
+
+	DPRINTF("Adding fd_list_entry\n");
+
+	/*Add to linked list*/
+	s->fd_entry   = entry = malloc(sizeof(fd_list_entry_t));
+	entry->tap_fd = tap_fd;
+	entry->s      = s;
+	entry->next   = NULL;
+
+	pprev = &fd_start;
+	while (*pprev != NULL)
+		pprev = &(*pprev)->next;
+
+	*pprev = entry;
+	entry->pprev = pprev;
+
+	return entry;
+}
+
+static inline struct td_state *get_state(int cookie)
+{
+	fd_list_entry_t *ptr;
+
+	ptr = fd_start;
+	while (ptr != NULL) {
+		if (ptr->cookie == cookie) return ptr->s;
+		ptr = ptr->next;
+	}
+	return NULL;
+}
+
+static struct td_state *state_init(void)
+{
+	int i;
+	struct td_state *s;
+	blkif_t *blkif;
+
+	s = malloc(sizeof(struct td_state));
+	blkif = s->blkif = malloc(sizeof(blkif_t));
+	s->ring_info = calloc(1, sizeof(tapdev_info_t));
+
+	for (i = 0; i < MAX_REQUESTS; i++) {
+		blkif->pending_list[i].secs_pending = 0;
+		blkif->pending_list[i].submitting = 0;
+	}
+
+	return s;
+}
+
+static int map_new_dev(struct td_state *s, int minor)
+{
+	int tap_fd;
+	tapdev_info_t *info = s->ring_info;
+	char *devname;
+	fd_list_entry_t *ptr;
+	int page_size;
+
+	if (asprintf(&devname,"%s/%s%d", BLKTAP_DEV_DIR, BLKTAP_DEV_NAME, minor) == -1)
+		return -1;
+	tap_fd = open(devname, O_RDWR);
+	if (tap_fd == -1) 
+	{
+		DPRINTF("open failed on dev %s!",devname);
+		goto fail;
+	} 
+	info->fd = tap_fd;
+
+	/*Map the shared memory*/
+	page_size = getpagesize();
+	info->mem = mmap(0, page_size * BLKTAP_MMAP_REGION_SIZE, 
+			  PROT_READ | PROT_WRITE, MAP_SHARED, info->fd, 0);
+	if ((long int)info->mem == -1) 
+	{
+		DPRINTF("mmap failed on dev %s!\n",devname);
+		goto fail;
+	}
+
+	/* assign the rings to the mapped memory */ 
+	info->sring = (blkif_sring_t *)((unsigned long)info->mem);
+	BACK_RING_INIT(&info->fe_ring, info->sring, page_size);
+	
+	info->vstart = 
+	        (unsigned long)info->mem + (BLKTAP_RING_PAGES * page_size);
+
+	ioctl(info->fd, BLKTAP_IOCTL_SENDPID, process );
+	ioctl(info->fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE );
+	free(devname);
+
+	/*Update the fd entry*/
+	ptr = fd_start;
+	while (ptr != NULL) {
+		if (s == ptr->s) {
+			ptr->tap_fd = tap_fd;
+
+			/* Setup fd_handler for qemu main loop */
+			DPRINTF("set tap_fd = %d\n", tap_fd);
+			qemu_set_fd_handler2(tap_fd, NULL, &handle_blktap_iomsg, NULL, s);
+
+			break;
+		}
+		ptr = ptr->next;
+	}	
+
+
+	DPRINTF("map_new_dev = %d\n", minor);
+	return minor;
+
+ fail:
+	free(devname);
+	return -1;
+}
+
+static int open_disk(struct td_state *s, char *path, int readonly)
+{
+	struct disk_id id;
+	BlockDriverState* bs;
+
+	DPRINTF("Opening %s\n", path);
+	bs = calloc(sizeof(*bs), 1);
+
+	memset(&id, 0, sizeof(struct disk_id));
+
+	if (bdrv_open(bs, path, 0) != 0) {
+		fprintf(stderr, "Could not open image file %s\n", path);
+		return -ENOMEM;
+	}
+
+	s->bs = bs;
+	s->flags = readonly ? TD_RDONLY : 0;
+	s->size = bs->total_sectors;
+	s->sector_size = 512;
+
+	s->info = ((s->flags & TD_RDONLY) ? VDISK_READONLY : 0);
+
+	return 0;
+}
+
+static inline int write_rsp_to_ring(struct td_state *s, blkif_response_t *rsp)
+{
+	tapdev_info_t *info = s->ring_info;
+	blkif_response_t *rsp_d;
+	
+	rsp_d = RING_GET_RESPONSE(&info->fe_ring, info->fe_ring.rsp_prod_pvt);
+	memcpy(rsp_d, rsp, sizeof(blkif_response_t));
+	info->fe_ring.rsp_prod_pvt++;
+	
+	return 0;
+}
+
+static inline void kick_responses(struct td_state *s)
+{
+	tapdev_info_t *info = s->ring_info;
+
+	if (info->fe_ring.rsp_prod_pvt != info->fe_ring.sring->rsp_prod) 
+	{
+		RING_PUSH_RESPONSES(&info->fe_ring);
+		ioctl(info->fd, BLKTAP_IOCTL_KICK_FE);
+	}
+}
+
+static int send_responses(struct td_state *s, int res, 
+		   uint64_t sector, int nr_secs, int idx, void *private)
+{
+	pending_req_t   *preq;
+	blkif_request_t *req;
+	int responses_queued = 0;
+	blkif_t *blkif = s->blkif;
+	int secs_done = nr_secs;
+
+	if ( (idx > MAX_REQUESTS-1) )
+	{
+		DPRINTF("invalid index returned(%u)!\n", idx);
+		return 0;
+	}
+	preq = &blkif->pending_list[idx];
+	req  = &preq->req;
+
+	preq->secs_pending -= secs_done;
+
+	if (res == -EBUSY && preq->submitting) 
+		return -EBUSY;  /* propagate -EBUSY back to higher layers */
+	if (res) 
+		preq->status = BLKIF_RSP_ERROR;
+	
+	if (!preq->submitting && preq->secs_pending == 0) 
+	{
+		blkif_request_t tmp;
+		blkif_response_t *rsp;
+
+		tmp = preq->req;
+		rsp = (blkif_response_t *)req;
+		
+		rsp->id = tmp.id;
+		rsp->operation = tmp.operation;
+		rsp->status = preq->status;
+		
+		write_rsp_to_ring(s, rsp);
+		responses_queued++;
+
+		kick_responses(s);
+	}
+	
+	return responses_queued;
+}
+
+static void qemu_send_responses(void* opaque, int ret)
+{
+	struct aiocb_info* info = opaque;
+
+	if (ret != 0) {
+		DPRINTF("ERROR: ret = %d (%s)\n", ret, strerror(-ret));
+	}
+
+	send_responses(info->s, ret, info->sector, info->nr_secs, 
+		info->idx, (void*) info->i);
+	free(info);
+}
+
+/**
+ * Callback function for the IO message pipe. Reads requests from the ring
+ * and processes them (call qemu read/write functions).
+ *
+ * The private parameter points to the struct td_state representing the
+ * disk the request is targeted at.
+ */
+static void handle_blktap_iomsg(void* private)
+{
+	struct td_state* s = private;
+
+	RING_IDX          rp, j, i;
+	blkif_request_t  *req;
+	int idx, nsects, ret;
+	uint64_t sector_nr;
+	uint8_t *page;
+	blkif_t *blkif = s->blkif;
+	tapdev_info_t *info = s->ring_info;
+	int page_size = getpagesize();
+
+	struct aiocb_info *aiocb_info;
+
+	if (info->fe_ring.sring == NULL) {
+		DPRINTF("  sring == NULL, ignoring IO request\n");
+		return;
+	}
+
+	rp = info->fe_ring.sring->req_prod; 
+	xen_rmb();
+
+	for (j = info->fe_ring.req_cons; j != rp; j++)
+	{
+		int start_seg = 0; 
+
+		req = NULL;
+		req = RING_GET_REQUEST(&info->fe_ring, j);
+		++info->fe_ring.req_cons;
+		
+		if (req == NULL)
+			continue;
+
+		idx = req->id;
+
+		ASSERT(blkif->pending_list[idx].secs_pending == 0);
+		memcpy(&blkif->pending_list[idx].req, req, sizeof(*req));
+		blkif->pending_list[idx].status = BLKIF_RSP_OKAY;
+		blkif->pending_list[idx].submitting = 1;
+		sector_nr = req->sector_number;
+
+		/* Don't allow writes on readonly devices */
+		if ((s->flags & TD_RDONLY) && 
+		    (req->operation == BLKIF_OP_WRITE)) {
+			blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
+			goto send_response;
+		}
+
+		for (i = start_seg; i < req->nr_segments; i++) {
+			nsects = req->seg[i].last_sect - 
+				 req->seg[i].first_sect + 1;
+	
+			if ((req->seg[i].last_sect >= page_size >> 9) ||
+					(nsects <= 0))
+				continue;
+
+			page  = (uint8_t*) MMAP_VADDR(info->vstart, 
+						   (unsigned long)req->id, i);
+			page += (req->seg[i].first_sect << SECTOR_SHIFT);
+
+			if (sector_nr >= s->size) {
+				DPRINTF("Sector request failed:\n");
+				DPRINTF("%s request, idx [%d,%d] size [%llu], "
+					"sector [%llu,%llu]\n",
+					(req->operation == BLKIF_OP_WRITE ? 
+					 "WRITE" : "READ"),
+					idx,i,
+					(long long unsigned) 
+						nsects<<SECTOR_SHIFT,
+					(long long unsigned) 
+						sector_nr<<SECTOR_SHIFT,
+					(long long unsigned) sector_nr);
+				continue;
+			}
+
+			blkif->pending_list[idx].secs_pending += nsects;
+
+			switch (req->operation) 
+			{
+			case BLKIF_OP_WRITE:
+				aiocb_info = malloc(sizeof(*aiocb_info));
+
+				aiocb_info->s = s;
+				aiocb_info->sector = sector_nr;
+				aiocb_info->nr_secs = nsects;
+				aiocb_info->idx = idx;
+				aiocb_info->i = i;
+
+				ret = (NULL == bdrv_aio_write(s->bs, sector_nr,
+							  page, nsects,
+							  qemu_send_responses,
+							  aiocb_info));
+
+				if (ret) {
+					blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
+					DPRINTF("ERROR: bdrv_write() == NULL\n");
+					goto send_response;
+				}
+				break;
+
+			case BLKIF_OP_READ:
+				aiocb_info = malloc(sizeof(*aiocb_info));
+
+				aiocb_info->s = s;
+				aiocb_info->sector = sector_nr;
+				aiocb_info->nr_secs = nsects;
+				aiocb_info->idx = idx;
+				aiocb_info->i = i;
+
+				ret = (NULL == bdrv_aio_read(s->bs, sector_nr,
+							 page, nsects,
+							 qemu_send_responses,
+							 aiocb_info));
+
+				if (ret) {
+					blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
+					DPRINTF("ERROR: bdrv_read() == NULL\n");
+					goto send_response;
+				}
+				break;
+
+			default:
+				DPRINTF("Unknown block operation\n");
+				break;
+			}
+			sector_nr += nsects;
+		}
+	send_response:
+		blkif->pending_list[idx].submitting = 0;
+
+		/* force write_rsp_to_ring for synchronous case */
+		if (blkif->pending_list[idx].secs_pending == 0)
+			send_responses(s, 0, 0, 0, idx, (void *)(long)0);
+	}
+}
+
+/**
+ * Callback function for the qemu-read pipe. Reads and processes control 
+ * message from the pipe.
+ *
+ * The parameter private is unused.
+ */
+static void handle_blktap_ctrlmsg(void* private)
+{
+	int length, len, msglen;
+	char *ptr, *path;
+	image_t *img;
+	msg_hdr_t *msg;
+	msg_newdev_t *msg_dev;
+	msg_pid_t *msg_pid;
+	int ret = -1;
+	struct td_state *s = NULL;
+	fd_list_entry_t *entry;
+
+	char buf[MSG_SIZE];
+
+	length = read(read_fd, buf, MSG_SIZE);
+
+	if (length > 0 && length >= sizeof(msg_hdr_t)) 
+	{
+		msg = (msg_hdr_t *)buf;
+		DPRINTF("blktap: Received msg, len %d, type %d, UID %d\n",
+			length,msg->type,msg->cookie);
+
+		switch (msg->type) {
+		case CTLMSG_PARAMS: 			
+			ptr = buf + sizeof(msg_hdr_t);
+			len = (length - sizeof(msg_hdr_t));
+			path = calloc(1, len + 1);
+			
+			memcpy(path, ptr, len); 
+			DPRINTF("Received CTLMSG_PARAMS: [%s]\n", path);
+
+			/* Allocate the disk structs */
+			s = state_init();
+
+			/*Open file*/
+			if (s == NULL || open_disk(s, path, msg->readonly)) {
+				msglen = sizeof(msg_hdr_t);
+				msg->type = CTLMSG_IMG_FAIL;
+				msg->len = msglen;
+			} else {
+				entry = add_fd_entry(0, s);
+				entry->cookie = msg->cookie;
+				DPRINTF("Entered cookie %d\n", entry->cookie);
+				
+				memset(buf, 0x00, MSG_SIZE); 
+			
+				msglen = sizeof(msg_hdr_t) + sizeof(image_t);
+				msg->type = CTLMSG_IMG;
+				img = (image_t *)(buf + sizeof(msg_hdr_t));
+				img->size = s->size;
+				img->secsize = s->sector_size;
+				img->info = s->info;
+				DPRINTF("Writing (size, secsize, info) = "
+					"(%#" PRIx64 ", %#" PRIx64 ", %d)\n",
+					s->size, s->sector_size, s->info);
+			}
+			len = write(write_fd, buf, msglen);
+			free(path);
+			break;
+			
+		case CTLMSG_NEWDEV:
+			msg_dev = (msg_newdev_t *)(buf + sizeof(msg_hdr_t));
+
+			s = get_state(msg->cookie);
+			DPRINTF("Retrieving state, cookie %d.....[%s]\n",
+				msg->cookie, (s == NULL ? "FAIL":"OK"));
+			if (s != NULL) {
+				ret = ((map_new_dev(s, msg_dev->devnum) 
+					== msg_dev->devnum ? 0: -1));
+			}	
+
+			memset(buf, 0x00, MSG_SIZE); 
+			msglen = sizeof(msg_hdr_t);
+			msg->type = (ret == 0 ? CTLMSG_NEWDEV_RSP 
+				              : CTLMSG_NEWDEV_FAIL);
+			msg->len = msglen;
+
+			len = write(write_fd, buf, msglen);
+			break;
+
+		case CTLMSG_CLOSE:
+			s = get_state(msg->cookie);
+			if (s) unmap_disk(s);
+			break;			
+
+		case CTLMSG_PID:
+			memset(buf, 0x00, MSG_SIZE);
+			msglen = sizeof(msg_hdr_t) + sizeof(msg_pid_t);
+			msg->type = CTLMSG_PID_RSP;
+			msg->len = msglen;
+
+			msg_pid = (msg_pid_t *)(buf + sizeof(msg_hdr_t));
+			process = getpid();
+			msg_pid->pid = process;
+
+			len = write(write_fd, buf, msglen);
+			break;
+
+		default:
+			break;
+		}
+	}
+}
+
+/**
+ * Opens a control socket, i.e. a pipe to communicate with blktapctrl.
+ *
+ * Returns the file descriptor number for the pipe; -1 in error case
+ */
+static int open_ctrl_socket(char *devname)
+{
+	int ret;
+	int ipc_fd;
+
+	if (mkdir(BLKTAP_CTRL_DIR, 0755) == 0)
+		DPRINTF("Created %s directory\n", BLKTAP_CTRL_DIR);
+
+	ret = mkfifo(devname,S_IRWXU|S_IRWXG|S_IRWXO);
+	if ( (ret != 0) && (errno != EEXIST) ) {
+		DPRINTF("ERROR: pipe failed (%d)\n", errno);
+		return -1;
+	}
+
+	ipc_fd = open(devname,O_RDWR|O_NONBLOCK);
+
+	if (ipc_fd < 0) {
+		DPRINTF("FD open failed\n");
+		return -1;
+	}
+
+	return ipc_fd;
+}
+
+/**
+ * Unmaps all disks and closes their pipes
+ */
+void shutdown_blktap(void)
+{
+	fd_list_entry_t *ptr;
+	struct td_state *s;
+	char *devname;
+
+	DPRINTF("Shutdown blktap\n");
+
+	/* Unmap all disks */
+	ptr = fd_start;
+	while (ptr != NULL) {
+		s = ptr->s;
+		unmap_disk(s);
+		close(ptr->tap_fd);
+		ptr = ptr->next;
+	}
+
+	/* Delete control pipes */
+	if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-read-%d", domid) >= 0) {
+		DPRINTF("Delete %s\n", devname);
+		if (unlink(devname))
+			DPRINTF("Could not delete: %s\n", strerror(errno));
+		free(devname);
+	}
+	
+	if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-write-%d", domid) >= 0) {	
+		DPRINTF("Delete %s\n", devname);
+		if (unlink(devname))
+			DPRINTF("Could not delete: %s\n", strerror(errno));
+		free(devname);
+	}
+}
+
+/**
+ * Initialize the blktap interface, i.e. open a pair of pipes in /var/run/tap
+ * and register a fd handler.
+ *
+ * Returns 0 on success.
+ */
+int init_blktap(void)
+{
+	char* devname;	
+
+	DPRINTF("Init blktap pipes\n");
+
+	/* Open the read pipe */
+	if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-read-%d", domid) >= 0) {	
+		read_fd = open_ctrl_socket(devname);		
+		free(devname);
+		
+		if (read_fd == -1) {
+			fprintf(stderr, "Could not open %s/qemu-read-%d\n",
+				BLKTAP_CTRL_DIR, domid);
+			return -1;
+		}
+	}
+	
+	/* Open the write pipe */
+	if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-write-%d", domid) >= 0) {
+		write_fd = open_ctrl_socket(devname);
+		free(devname);
+		
+		if (write_fd == -1) {
+			fprintf(stderr, "Could not open %s/qemu-write-%d\n",
+				BLKTAP_CTRL_DIR, domid);
+			close(read_fd);
+			return -1;
+		}
+	}
+
+	/* Attach a handler to the read pipe (called from qemu main loop) */
+	qemu_set_fd_handler2(read_fd, NULL, &handle_blktap_ctrlmsg, NULL, NULL);
+
+	/* Register handler to clean up when the domain is destroyed */
+	atexit(&shutdown_blktap);
+
+	return 0;
+}
diff -r f33328217eee tools/ioemu/hw/xen_blktap.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ioemu/hw/xen_blktap.h	Thu Mar 13 13:00:18 2008 +0100
@@ -0,0 +1,57 @@
+/* xen_blktap.h
+ *
+ * Generic disk interface for blktap-based image adapters.
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ */
+
+#ifndef XEN_BLKTAP_H_ 
+#define XEN_BLKTAP_H_
+
+#include <stdint.h>
+#include <syslog.h>
+#include <stdio.h>
+
+#include "block_int.h"
+
+/* Things disks need to know about, these should probably be in a higher-level
+ * header. */
+#define MAX_SEGMENTS_PER_REQ    11
+#define SECTOR_SHIFT             9
+#define DEFAULT_SECTOR_SIZE    512
+
+#define MAX_IOFD                 2
+
+#define BLK_NOT_ALLOCATED       99
+#define TD_NO_PARENT             1
+
+typedef uint32_t td_flag_t;
+
+#define TD_RDONLY                1
+
+struct disk_id {
+	char *name;
+	int drivertype;
+};
+
+/* This structure represents the state of an active virtual disk.           */
+struct td_state {
+	BlockDriverState* bs;
+	td_flag_t flags;
+	void *blkif;
+	void *image;
+	void *ring_info;
+	void *fd_entry;
+	uint64_t sector_size;
+	uint64_t size;
+	unsigned int       info;
+};
+
+typedef struct fd_list_entry {
+	int cookie;
+	int  tap_fd;
+	struct td_state *s;
+	struct fd_list_entry **pprev, *next;
+} fd_list_entry_t;
+
+#endif /*XEN_BLKTAP_H_*/

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][PATCH] Use ioemu block drivers through blktap
  2008-03-13 12:25 ` Kevin Wolf
@ 2008-03-13 14:21   ` Konrad Rzeszutek
  2008-03-14  9:37     ` Kevin Wolf
  2008-03-28 15:22   ` Ian Jackson
  1 sibling, 1 reply; 13+ messages in thread
From: Konrad Rzeszutek @ 2008-03-13 14:21 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: xen-devel

> +static int connect_qemu(blkif_t *blkif)
> +{
> +	char *rdctldev, *wrctldev;
> +	
> +	if (asprintf(&rdctldev, BLKTAP_CTRL_DIR "/qemu-read-%d", 
> +			blkif->domid) < 0)
> +		return -1;
> +
> +	if (asprintf(&wrctldev, BLKTAP_CTRL_DIR "/qemu-write-%d", 
> +			blkif->domid) < 0) {
> +		free(rdctldev);
> +		return -1;
> +	}
> +
> +	DPRINTF("Using qemu blktap pipe: %s\n", rdctldev);
> +	
> +	blkif->fds[READ] = open_ctrl_socket(wrctldev);
> +	blkif->fds[WRITE] = open_ctrl_socket(rdctldev);

How about freeing the data here once?

> +	
> +	if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1) {
> +		free(rdctldev);
> +		free(wrctldev);

And then this is not needed.

> +		return -1;
> +	}
> +
> +	DPRINTF("Attached to qemu blktap pipes\n");
> +	free(rdctldev);
> +	free(wrctldev);

Nor these two lines above.

Thought looking at the 'connect_tapdisk' you are using 'goto'
there, why not emulate the same behavior in this function?

> +	return 0;
> +}
> +
> +/* Launch tapdisk instance */
> +static int connect_tapdisk(blkif_t *blkif, int minor)
> +{
> +	char *rdctldev = NULL, *wrctldev = NULL;
> +	int ret = -1;
> +
> +	DPRINTF("tapdisk process does not exist:\n");
> +
> +	if (asprintf(&rdctldev,
> +		     "%s/tapctrlread%d", BLKTAP_CTRL_DIR, minor) == -1)
> +		goto fail;
> +
> +	if (asprintf(&wrctldev,
> +		     "%s/tapctrlwrite%d", BLKTAP_CTRL_DIR, minor) == -1)
> +		goto fail;
> +	
> +	blkif->fds[READ] = open_ctrl_socket(rdctldev);
> +	blkif->fds[WRITE] = open_ctrl_socket(wrctldev);
> +	
> +	if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1)
> +		goto fail;
> +
> +	/*launch the new process*/
> +	DPRINTF("Launching process, CMDLINE [tapdisk %s %s]\n",
> +			wrctldev, rdctldev);
> +
> +	if (launch_tapdisk(wrctldev, rdctldev) == -1) {
> +		DPRINTF("Unable to fork, cmdline: [tapdisk %s %s]\n",
> +				wrctldev, rdctldev);
> +		goto fail;
> +	}
> +
> +	ret = 0;
> +	
> +fail:
> +	if (rdctldev)
> +		free(rdctldev);
> +
> +	if (wrctldev)
> +		free(wrctldev);
> +
> +	return ret;
> +}
> +
>  int blktapctrl_new_blkif(blkif_t *blkif)
>  {
>  	blkif_info_t *blk;
> @@ -524,30 +600,14 @@ int blktapctrl_new_blkif(blkif_t *blkif)
>  		blkif->cookie = next_cookie++;
>  
>  		if (!exist) {
> -			DPRINTF("Process does not exist:\n");
> -			if (asprintf(&rdctldev,
> -				     "%s/tapctrlread%d", BLKTAP_CTRL_DIR, minor) == -1)
> -				goto fail;
> -			if (asprintf(&wrctldev,
> -				     "%s/tapctrlwrite%d", BLKTAP_CTRL_DIR, minor) == -1) {
> -				free(rdctldev);
> -				goto fail;
> +			if (type == DISK_TYPE_IOEMU) {
> +				if (connect_qemu(blkif))
> +					goto fail;
> +			} else {
> +				if (connect_tapdisk(blkif, minor))
> +					goto fail;
>  			}
> -			blkif->fds[READ] = open_ctrl_socket(rdctldev);
> -			blkif->fds[WRITE] = open_ctrl_socket(wrctldev);
> -			
> -			if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1) 
> -				goto fail;
> -
> -			/*launch the new process*/
> - 			DPRINTF("Launching process, CMDLINE [tapdisk %s %s]\n",wrctldev, rdctldev);
> - 			if (launch_tapdisk(wrctldev, rdctldev) == -1) {
> - 				DPRINTF("Unable to fork, cmdline: [tapdisk %s %s]\n",wrctldev, rdctldev);
> -				goto fail;
> -			}
> -
> -			free(rdctldev);
> -			free(wrctldev);
> +
>  		} else {
>  			DPRINTF("Process exists!\n");
>  			blkif->fds[READ] = exist->fds[READ];
> diff -r f33328217eee tools/blktap/drivers/tapdisk.h
> --- a/tools/blktap/drivers/tapdisk.h	Mon Mar 10 22:51:57 2008 +0000
> +++ b/tools/blktap/drivers/tapdisk.h	Thu Mar 13 13:00:18 2008 +0100
> @@ -167,6 +167,7 @@ extern struct tap_disk tapdisk_qcow2;
>  #define DISK_TYPE_RAM      3
>  #define DISK_TYPE_QCOW     4
>  #define DISK_TYPE_QCOW2    5
> +#define DISK_TYPE_IOEMU    6
>  
>  
>  /*Define Individual Disk Parameters here */
> @@ -227,6 +228,16 @@ static disk_info_t qcow2_disk = {
>  	0,
>  #ifdef TAPDISK
>  	&tapdisk_qcow2,
> +#endif
> +};
> +
> +static disk_info_t ioemu_disk = {
> +	DISK_TYPE_IOEMU,
> +	"ioemu disk",
> +	"ioemu",
> +	0,
> +#ifdef TAPDISK
> +	NULL
>  #endif
>  };
>  
> @@ -238,6 +249,7 @@ static disk_info_t *dtypes[] = {
>  	&ram_disk,
>  	&qcow_disk,
>  	&qcow2_disk,
> +	&ioemu_disk,
>  };
>  
>  typedef struct driver_list_entry {
> diff -r f33328217eee tools/blktap/lib/blktaplib.h
> --- a/tools/blktap/lib/blktaplib.h	Mon Mar 10 22:51:57 2008 +0000
> +++ b/tools/blktap/lib/blktaplib.h	Thu Mar 13 13:00:18 2008 +0100
> @@ -221,15 +221,5 @@ int xs_fire_next_watch(struct xs_handle 
>       ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * getpagesize()) +    \
>       ((_seg) * getpagesize()))
>  
> -/* Defines that are only used by library clients */
> -
> -#ifndef __COMPILING_BLKTAP_LIB
> -
> -static char *blkif_op_name[] = {
> -	[BLKIF_OP_READ]       = "READ",
> -	[BLKIF_OP_WRITE]      = "WRITE",
> -};
> -
> -#endif /* __COMPILING_BLKTAP_LIB */
>  
>  #endif /* __BLKTAPLIB_H__ */
> diff -r f33328217eee tools/ioemu/Makefile.target
> --- a/tools/ioemu/Makefile.target	Mon Mar 10 22:51:57 2008 +0000
> +++ b/tools/ioemu/Makefile.target	Thu Mar 13 13:00:18 2008 +0100
> @@ -17,6 +17,7 @@ VPATH=$(SRC_PATH):$(TARGET_PATH):$(SRC_P
>  VPATH=$(SRC_PATH):$(TARGET_PATH):$(SRC_PATH)/hw:$(SRC_PATH)/audio
>  CPPFLAGS+=-I. -I.. -I$(TARGET_PATH) -I$(SRC_PATH)
>  CPPFLAGS+= -I$(XEN_ROOT)/tools/libxc
> +CPPFLAGS+= -I$(XEN_ROOT)/tools/blktap/lib
>  CPPFLAGS+= -I$(XEN_ROOT)/tools/xenstore
>  CPPFLAGS+= -I$(XEN_ROOT)/tools/include
>  ifdef CONFIG_DARWIN_USER
> @@ -429,6 +430,7 @@ VL_OBJS+= usb-uhci.o smbus_eeprom.o
>  VL_OBJS+= usb-uhci.o smbus_eeprom.o
>  VL_OBJS+= piix4acpi.o
>  VL_OBJS+= xenstore.o
> +VL_OBJS+= xen_blktap.o
>  VL_OBJS+= xen_platform.o
>  VL_OBJS+= xen_machine_fv.o
>  VL_OBJS+= xen_machine_pv.o
> diff -r f33328217eee tools/ioemu/hw/xen_machine_pv.c
> --- a/tools/ioemu/hw/xen_machine_pv.c	Mon Mar 10 22:51:57 2008 +0000
> +++ b/tools/ioemu/hw/xen_machine_pv.c	Thu Mar 13 13:00:18 2008 +0100
> @@ -26,6 +26,9 @@
>  #include "xen_console.h"
>  #include "xenfb.h"
>  
> +extern void init_blktap(void);
> +
> +
>  /* The Xen PV machine currently provides
>   *   - a virtual framebuffer
>   *   - ....
> @@ -40,6 +43,10 @@ static void xen_init_pv(uint64_t ram_siz
>  {
>      struct xenfb *xenfb;
>      extern int domid;
> +
> +
> +    /* Initialize tapdisk client */
> +    init_blktap();
>  
>      /* Connect to text console */
>      if (serial_hds[0]) {
> diff -r f33328217eee tools/ioemu/vl.c
> --- a/tools/ioemu/vl.c	Mon Mar 10 22:51:57 2008 +0000
> +++ b/tools/ioemu/vl.c	Thu Mar 13 13:00:18 2008 +0100
> @@ -6270,6 +6270,12 @@ void qemu_system_powerdown_request(void)
>      powerdown_requested = 1;
>      if (cpu_single_env)
>          cpu_interrupt(cpu_single_env, CPU_INTERRUPT_EXIT);
> +}
> +
> +static void qemu_sighup_handler(int signal)
> +{
> +    fprintf(stderr, "Received SIGHUP, terminating.\n");
> +    exit(0);
>  }
>  
>  void main_loop_wait(int timeout)
> @@ -7980,7 +7986,7 @@ int main(int argc, char **argv)
>  
>  #ifndef CONFIG_STUBDOM
>      /* Unblock SIGTERM and SIGHUP, which may have been blocked by the caller */
> -    signal(SIGHUP, SIG_DFL);
> +    signal(SIGHUP, qemu_sighup_handler);
>      sigemptyset(&set);
>      sigaddset(&set, SIGTERM);
>      sigaddset(&set, SIGHUP);
> diff -r f33328217eee tools/python/xen/xend/server/BlktapController.py
> --- a/tools/python/xen/xend/server/BlktapController.py	Mon Mar 10 22:51:57 2008 +0000
> +++ b/tools/python/xen/xend/server/BlktapController.py	Thu Mar 13 13:00:18 2008 +0100
> @@ -13,7 +13,9 @@ blktap_disk_types = [
>      'vmdk',
>      'ram',
>      'qcow',
> -    'qcow2'
> +    'qcow2',
> +
> +    'ioemu'

Why add the extra \n ?

>      ]
>  
>  class BlktapController(BlkifController):
> diff -r f33328217eee tools/ioemu/hw/xen_blktap.c
> --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
> +++ b/tools/ioemu/hw/xen_blktap.c	Thu Mar 13 13:00:18 2008 +0100
> @@ -0,0 +1,688 @@
> +/* xen_blktap.c
> + *
> + * Interface to blktapctrl to allow use of qemu block drivers with blktap.
> + * This file is based on tools/blktap/drivers/tapdisk.c
> + * 
> + * Copyright (c) 2005 Julian Chesterfield and Andrew Warfield.
> + * Copyright (c) 2008 Kevin Wolf
> + */
> +
> +/*
> + * There are several communication channels which are used by this interface:
> + *
> + *   - A pair of pipes for receiving and sending general control messages
> + *     (qemu-read-N and qemu-writeN in /var/run/tap, where N is the domain ID).
> + *     These control messages are handled by handle_blktap_ctrlmsg().
> + *
> + *   - One file descriptor per attached disk (/dev/xen/blktapN) for disk
> + *     specific control messages. A callback is triggered on this fd if there
> + *     is a new IO request. The callback function is handle_blktap_iomsg().
> + *
> + *   - A shared ring for each attached disk containing the actual IO requests 
> + *     and responses. Whenever handle_blktap_iomsg() is triggered it processes
> + *     the requests on this ring.
> + */
> +
> +#include <sys/stat.h>
> +#include <sys/types.h>
> +#include <sys/mman.h>
> +#include <sys/ioctl.h>
> +#include <fcntl.h>
> +#include <stdio.h>
> +#include <errno.h>
> +#include <stdlib.h>
> +
> +#include "vl.h"
> +#include "blktaplib.h"
> +#include "xen_blktap.h"
> +#include "block_int.h"
> +
> +#define MSG_SIZE 4096
> +
> +#define BLKTAP_CTRL_DIR "/var/run/tap"
> +
> +/* If enabled, print debug messages to stderr */
> +#if 1
> +#define DPRINTF(_f, _a...) fprintf(stderr, __FILE__ ":%d: " _f, __LINE__, ##_a)
> +#else
> +#define DPRINTF(_f, _a...) ((void)0)
> +#endif
> +
> +#if 1                                                                        
> +#define ASSERT(_p) \
> +    if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \

You probably want \n at the end.

> +        __LINE__, __FILE__); *(int*)0=0; }
> +#else
> +#define ASSERT(_p) ((void)0)
> +#endif 
> +
> +
> +extern int domid;
> +
> +int read_fd;
> +int write_fd;
> +
> +static pid_t process;
> +fd_list_entry_t *fd_start = NULL;
> +
> +static void handle_blktap_iomsg(void* private);
> +
> +struct aiocb_info {
> +	struct td_state	*s;
> +	uint64_t sector;
> +	int nr_secs;
> +	int idx;
> +	long i;
> +};
> +
> +static void unmap_disk(struct td_state *s)
> +{
> +	tapdev_info_t *info = s->ring_info;
> +	fd_list_entry_t *entry;
> +	
> +	bdrv_close(s->bs);
> +
> +	if (info != NULL && info->mem > 0)
> +	        munmap(info->mem, getpagesize() * BLKTAP_MMAP_REGION_SIZE);
> +
> +	entry = s->fd_entry;
> +	*entry->pprev = entry->next;
> +	if (entry->next)
> +		entry->next->pprev = entry->pprev;
> +
> +	qemu_set_fd_handler2(info->fd, NULL, NULL, NULL, NULL);
> +	close(info->fd);
> +
> +	free(s->fd_entry);
> +	free(s->blkif);
> +	free(s->ring_info);
> +	free(s);
> +
> +	return;
> +}
> +
> +static inline fd_list_entry_t *add_fd_entry(int tap_fd, struct td_state *s)
> +{
> +	fd_list_entry_t **pprev, *entry;
> +
> +	DPRINTF("Adding fd_list_entry\n");
> +
> +	/*Add to linked list*/
> +	s->fd_entry   = entry = malloc(sizeof(fd_list_entry_t));
> +	entry->tap_fd = tap_fd;
> +	entry->s      = s;
> +	entry->next   = NULL;
> +
> +	pprev = &fd_start;
> +	while (*pprev != NULL)
> +		pprev = &(*pprev)->next;
> +
> +	*pprev = entry;
> +	entry->pprev = pprev;
> +
> +	return entry;
> +}
> +
> +static inline struct td_state *get_state(int cookie)
> +{
> +	fd_list_entry_t *ptr;
> +
> +	ptr = fd_start;
> +	while (ptr != NULL) {
> +		if (ptr->cookie == cookie) return ptr->s;
> +		ptr = ptr->next;
> +	}
> +	return NULL;
> +}
> +
> +static struct td_state *state_init(void)
> +{
> +	int i;
> +	struct td_state *s;
> +	blkif_t *blkif;
> +
> +	s = malloc(sizeof(struct td_state));

Would it make sense to zero out the allocated memory?

> +	blkif = s->blkif = malloc(sizeof(blkif_t));
> +	s->ring_info = calloc(1, sizeof(tapdev_info_t));
> +
> +	for (i = 0; i < MAX_REQUESTS; i++) {
> +		blkif->pending_list[i].secs_pending = 0;
> +		blkif->pending_list[i].submitting = 0;
> +	}
> +
> +	return s;
> +}
> +
> +static int map_new_dev(struct td_state *s, int minor)
> +{
> +	int tap_fd;
> +	tapdev_info_t *info = s->ring_info;
> +	char *devname;
> +	fd_list_entry_t *ptr;
> +	int page_size;
> +
> +	if (asprintf(&devname,"%s/%s%d", BLKTAP_DEV_DIR, BLKTAP_DEV_NAME, minor) == -1)
> +		return -1;
> +	tap_fd = open(devname, O_RDWR);
> +	if (tap_fd == -1) 
> +	{
> +		DPRINTF("open failed on dev %s!",devname);

You forgot to include \n above.

> +		goto fail;
> +	} 
> +	info->fd = tap_fd;
> +
> +	/*Map the shared memory*/
> +	page_size = getpagesize();
> +	info->mem = mmap(0, page_size * BLKTAP_MMAP_REGION_SIZE, 
> +			  PROT_READ | PROT_WRITE, MAP_SHARED, info->fd, 0);
> +	if ((long int)info->mem == -1) 
> +	{
> +		DPRINTF("mmap failed on dev %s!\n",devname);
> +		goto fail;
> +	}
> +
> +	/* assign the rings to the mapped memory */ 
> +	info->sring = (blkif_sring_t *)((unsigned long)info->mem);
> +	BACK_RING_INIT(&info->fe_ring, info->sring, page_size);
> +	
> +	info->vstart = 
> +	        (unsigned long)info->mem + (BLKTAP_RING_PAGES * page_size);
> +
> +	ioctl(info->fd, BLKTAP_IOCTL_SENDPID, process );
> +	ioctl(info->fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE );
> +	free(devname);
> +
> +	/*Update the fd entry*/
> +	ptr = fd_start;
> +	while (ptr != NULL) {
> +		if (s == ptr->s) {
> +			ptr->tap_fd = tap_fd;
> +
> +			/* Setup fd_handler for qemu main loop */
> +			DPRINTF("set tap_fd = %d\n", tap_fd);
> +			qemu_set_fd_handler2(tap_fd, NULL, &handle_blktap_iomsg, NULL, s);
> +
> +			break;
> +		}
> +		ptr = ptr->next;
> +	}	
> +
> +
> +	DPRINTF("map_new_dev = %d\n", minor);
> +	return minor;
> +
> + fail:
> +	free(devname);
> +	return -1;
> +}
> +
> +static int open_disk(struct td_state *s, char *path, int readonly)
> +{
> +	struct disk_id id;
> +	BlockDriverState* bs;
> +
> +	DPRINTF("Opening %s\n", path);
> +	bs = calloc(sizeof(*bs), 1);

Are the arguments swapped?

> +
> +	memset(&id, 0, sizeof(struct disk_id));
> +
> +	if (bdrv_open(bs, path, 0) != 0) {
> +		fprintf(stderr, "Could not open image file %s\n", path);
> +		return -ENOMEM;
> +	}
> +
> +	s->bs = bs;
> +	s->flags = readonly ? TD_RDONLY : 0;
> +	s->size = bs->total_sectors;
> +	s->sector_size = 512;
> +
> +	s->info = ((s->flags & TD_RDONLY) ? VDISK_READONLY : 0);
> +
> +	return 0;
> +}
> +
> +static inline int write_rsp_to_ring(struct td_state *s, blkif_response_t *rsp)

Why bother with a return of int when you always return 0?

> +{
> +	tapdev_info_t *info = s->ring_info;
> +	blkif_response_t *rsp_d;
> +	
> +	rsp_d = RING_GET_RESPONSE(&info->fe_ring, info->fe_ring.rsp_prod_pvt);
> +	memcpy(rsp_d, rsp, sizeof(blkif_response_t));
> +	info->fe_ring.rsp_prod_pvt++;
> +	
> +	return 0;
> +}
> +
> +static inline void kick_responses(struct td_state *s)
> +{
> +	tapdev_info_t *info = s->ring_info;
> +
> +	if (info->fe_ring.rsp_prod_pvt != info->fe_ring.sring->rsp_prod) 
> +	{
> +		RING_PUSH_RESPONSES(&info->fe_ring);
> +		ioctl(info->fd, BLKTAP_IOCTL_KICK_FE);
> +	}
> +}
> +
> +static int send_responses(struct td_state *s, int res, 
> +		   uint64_t sector, int nr_secs, int idx, void *private)
> +{
> +	pending_req_t   *preq;
> +	blkif_request_t *req;
> +	int responses_queued = 0;
> +	blkif_t *blkif = s->blkif;
> +	int secs_done = nr_secs;
> +
> +	if ( (idx > MAX_REQUESTS-1) )
> +	{
> +		DPRINTF("invalid index returned(%u)!\n", idx);
> +		return 0;
> +	}
> +	preq = &blkif->pending_list[idx];
> +	req  = &preq->req;
> +
> +	preq->secs_pending -= secs_done;
> +
> +	if (res == -EBUSY && preq->submitting) 
> +		return -EBUSY;  /* propagate -EBUSY back to higher layers */
> +	if (res) 
> +		preq->status = BLKIF_RSP_ERROR;
> +	
> +	if (!preq->submitting && preq->secs_pending == 0) 
> +	{
> +		blkif_request_t tmp;
> +		blkif_response_t *rsp;
> +
> +		tmp = preq->req;
> +		rsp = (blkif_response_t *)req;
> +		
> +		rsp->id = tmp.id;
> +		rsp->operation = tmp.operation;
> +		rsp->status = preq->status;
> +		
> +		write_rsp_to_ring(s, rsp);
> +		responses_queued++;
> +
> +		kick_responses(s);
> +	}
> +	
> +	return responses_queued;
> +}
> +
> +static void qemu_send_responses(void* opaque, int ret)
> +{
> +	struct aiocb_info* info = opaque;
> +
> +	if (ret != 0) {
> +		DPRINTF("ERROR: ret = %d (%s)\n", ret, strerror(-ret));
> +	}
> +
> +	send_responses(info->s, ret, info->sector, info->nr_secs, 
> +		info->idx, (void*) info->i);
> +	free(info);
> +}
> +
> +/**
> + * Callback function for the IO message pipe. Reads requests from the ring
> + * and processes them (call qemu read/write functions).
> + *
> + * The private parameter points to the struct td_state representing the
> + * disk the request is targeted at.
> + */
> +static void handle_blktap_iomsg(void* private)
> +{
> +	struct td_state* s = private;
> +
> +	RING_IDX          rp, j, i;
> +	blkif_request_t  *req;
> +	int idx, nsects, ret;
> +	uint64_t sector_nr;
> +	uint8_t *page;
> +	blkif_t *blkif = s->blkif;
> +	tapdev_info_t *info = s->ring_info;
> +	int page_size = getpagesize();
> +
> +	struct aiocb_info *aiocb_info;
> +
> +	if (info->fe_ring.sring == NULL) {
> +		DPRINTF("  sring == NULL, ignoring IO request\n");
> +		return;
> +	}
> +
> +	rp = info->fe_ring.sring->req_prod; 
> +	xen_rmb();
> +
> +	for (j = info->fe_ring.req_cons; j != rp; j++)
> +	{
> +		int start_seg = 0; 
> +
> +		req = NULL;
> +		req = RING_GET_REQUEST(&info->fe_ring, j);
> +		++info->fe_ring.req_cons;
> +		
> +		if (req == NULL)
> +			continue;
> +
> +		idx = req->id;
> +
> +		ASSERT(blkif->pending_list[idx].secs_pending == 0);
> +		memcpy(&blkif->pending_list[idx].req, req, sizeof(*req));
> +		blkif->pending_list[idx].status = BLKIF_RSP_OKAY;
> +		blkif->pending_list[idx].submitting = 1;
> +		sector_nr = req->sector_number;
> +
> +		/* Don't allow writes on readonly devices */
> +		if ((s->flags & TD_RDONLY) && 
> +		    (req->operation == BLKIF_OP_WRITE)) {
> +			blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
> +			goto send_response;
> +		}
> +
> +		for (i = start_seg; i < req->nr_segments; i++) {
> +			nsects = req->seg[i].last_sect - 
> +				 req->seg[i].first_sect + 1;
> +	
> +			if ((req->seg[i].last_sect >= page_size >> 9) ||
> +					(nsects <= 0))
> +				continue;
> +
> +			page  = (uint8_t*) MMAP_VADDR(info->vstart, 
> +						   (unsigned long)req->id, i);
> +			page += (req->seg[i].first_sect << SECTOR_SHIFT);
> +
> +			if (sector_nr >= s->size) {
> +				DPRINTF("Sector request failed:\n");
> +				DPRINTF("%s request, idx [%d,%d] size [%llu], "
> +					"sector [%llu,%llu]\n",
> +					(req->operation == BLKIF_OP_WRITE ? 
> +					 "WRITE" : "READ"),
> +					idx,i,
> +					(long long unsigned) 
> +						nsects<<SECTOR_SHIFT,
> +					(long long unsigned) 
> +						sector_nr<<SECTOR_SHIFT,
> +					(long long unsigned) sector_nr);
> +				continue;
> +			}
> +
> +			blkif->pending_list[idx].secs_pending += nsects;
> +
> +			switch (req->operation) 
> +			{
> +			case BLKIF_OP_WRITE:
> +				aiocb_info = malloc(sizeof(*aiocb_info));
> +
> +				aiocb_info->s = s;
> +				aiocb_info->sector = sector_nr;
> +				aiocb_info->nr_secs = nsects;
> +				aiocb_info->idx = idx;
> +				aiocb_info->i = i;
> +
> +				ret = (NULL == bdrv_aio_write(s->bs, sector_nr,
> +							  page, nsects,
> +							  qemu_send_responses,
> +							  aiocb_info));

Who de-allocates aiocb_info?

> +
> +				if (ret) {
> +					blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
> +					DPRINTF("ERROR: bdrv_write() == NULL\n");
> +					goto send_response;
> +				}
> +				break;
> +
> +			case BLKIF_OP_READ:
> +				aiocb_info = malloc(sizeof(*aiocb_info));
> +
> +				aiocb_info->s = s;
> +				aiocb_info->sector = sector_nr;
> +				aiocb_info->nr_secs = nsects;
> +				aiocb_info->idx = idx;
> +				aiocb_info->i = i;
> +
> +				ret = (NULL == bdrv_aio_read(s->bs, sector_nr,
> +							 page, nsects,
> +							 qemu_send_responses,
> +							 aiocb_info));

Ditto.

> +
> +				if (ret) {
> +					blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
> +					DPRINTF("ERROR: bdrv_read() == NULL\n");
> +					goto send_response;
> +				}
> +				break;
> +
> +			default:
> +				DPRINTF("Unknown block operation\n");
> +				break;
> +			}
> +			sector_nr += nsects;
> +		}
> +	send_response:
> +		blkif->pending_list[idx].submitting = 0;
> +
> +		/* force write_rsp_to_ring for synchronous case */
> +		if (blkif->pending_list[idx].secs_pending == 0)
> +			send_responses(s, 0, 0, 0, idx, (void *)(long)0);
> +	}
> +}
> +
> +/**
> + * Callback function for the qemu-read pipe. Reads and processes control 
> + * message from the pipe.
> + *
> + * The parameter private is unused.
> + */
> +static void handle_blktap_ctrlmsg(void* private)
> +{
> +	int length, len, msglen;
> +	char *ptr, *path;
> +	image_t *img;
> +	msg_hdr_t *msg;
> +	msg_newdev_t *msg_dev;
> +	msg_pid_t *msg_pid;
> +	int ret = -1;
> +	struct td_state *s = NULL;
> +	fd_list_entry_t *entry;
> +
> +	char buf[MSG_SIZE];
> +
> +	length = read(read_fd, buf, MSG_SIZE);
> +
> +	if (length > 0 && length >= sizeof(msg_hdr_t)) 
> +	{
> +		msg = (msg_hdr_t *)buf;
> +		DPRINTF("blktap: Received msg, len %d, type %d, UID %d\n",
> +			length,msg->type,msg->cookie);
> +
> +		switch (msg->type) {
> +		case CTLMSG_PARAMS: 			
> +			ptr = buf + sizeof(msg_hdr_t);
> +			len = (length - sizeof(msg_hdr_t));
> +			path = calloc(1, len + 1);
> +			
> +			memcpy(path, ptr, len); 
> +			DPRINTF("Received CTLMSG_PARAMS: [%s]\n", path);
> +
> +			/* Allocate the disk structs */
> +			s = state_init();
> +
> +			/*Open file*/
> +			if (s == NULL || open_disk(s, path, msg->readonly)) {
> +				msglen = sizeof(msg_hdr_t);
> +				msg->type = CTLMSG_IMG_FAIL;
> +				msg->len = msglen;
> +			} else {
> +				entry = add_fd_entry(0, s);
> +				entry->cookie = msg->cookie;
> +				DPRINTF("Entered cookie %d\n", entry->cookie);
> +				
> +				memset(buf, 0x00, MSG_SIZE); 
> +			
> +				msglen = sizeof(msg_hdr_t) + sizeof(image_t);
> +				msg->type = CTLMSG_IMG;
> +				img = (image_t *)(buf + sizeof(msg_hdr_t));
> +				img->size = s->size;
> +				img->secsize = s->sector_size;
> +				img->info = s->info;
> +				DPRINTF("Writing (size, secsize, info) = "
> +					"(%#" PRIx64 ", %#" PRIx64 ", %d)\n",
> +					s->size, s->sector_size, s->info);
> +			}
> +			len = write(write_fd, buf, msglen);
> +			free(path);
> +			break;
> +			
> +		case CTLMSG_NEWDEV:
> +			msg_dev = (msg_newdev_t *)(buf + sizeof(msg_hdr_t));
> +
> +			s = get_state(msg->cookie);
> +			DPRINTF("Retrieving state, cookie %d.....[%s]\n",
> +				msg->cookie, (s == NULL ? "FAIL":"OK"));
> +			if (s != NULL) {
> +				ret = ((map_new_dev(s, msg_dev->devnum) 
> +					== msg_dev->devnum ? 0: -1));
> +			}	
> +
> +			memset(buf, 0x00, MSG_SIZE); 
> +			msglen = sizeof(msg_hdr_t);
> +			msg->type = (ret == 0 ? CTLMSG_NEWDEV_RSP 
> +				              : CTLMSG_NEWDEV_FAIL);
> +			msg->len = msglen;
> +
> +			len = write(write_fd, buf, msglen);
> +			break;
> +
> +		case CTLMSG_CLOSE:
> +			s = get_state(msg->cookie);
> +			if (s) unmap_disk(s);
> +			break;			
> +
> +		case CTLMSG_PID:
> +			memset(buf, 0x00, MSG_SIZE);
> +			msglen = sizeof(msg_hdr_t) + sizeof(msg_pid_t);
> +			msg->type = CTLMSG_PID_RSP;
> +			msg->len = msglen;
> +
> +			msg_pid = (msg_pid_t *)(buf + sizeof(msg_hdr_t));
> +			process = getpid();
> +			msg_pid->pid = process;
> +
> +			len = write(write_fd, buf, msglen);
> +			break;
> +
> +		default:
> +			break;
> +		}
> +	}
> +}
> +
> +/**
> + * Opens a control socket, i.e. a pipe to communicate with blktapctrl.
> + *
> + * Returns the file descriptor number for the pipe; -1 in error case
> + */
> +static int open_ctrl_socket(char *devname)
> +{
> +	int ret;
> +	int ipc_fd;
> +
> +	if (mkdir(BLKTAP_CTRL_DIR, 0755) == 0)
> +		DPRINTF("Created %s directory\n", BLKTAP_CTRL_DIR);
> +
> +	ret = mkfifo(devname,S_IRWXU|S_IRWXG|S_IRWXO);
> +	if ( (ret != 0) && (errno != EEXIST) ) {
> +		DPRINTF("ERROR: pipe failed (%d)\n", errno);
> +		return -1;
> +	}
> +
> +	ipc_fd = open(devname,O_RDWR|O_NONBLOCK);
> +
> +	if (ipc_fd < 0) {
> +		DPRINTF("FD open failed\n");
> +		return -1;
> +	}
> +
> +	return ipc_fd;
> +}
> +
> +/**
> + * Unmaps all disks and closes their pipes
> + */
> +void shutdown_blktap(void)
> +{
> +	fd_list_entry_t *ptr;
> +	struct td_state *s;
> +	char *devname;
> +
> +	DPRINTF("Shutdown blktap\n");
> +
> +	/* Unmap all disks */
> +	ptr = fd_start;
> +	while (ptr != NULL) {
> +		s = ptr->s;
> +		unmap_disk(s);
> +		close(ptr->tap_fd);
> +		ptr = ptr->next;
> +	}
> +
> +	/* Delete control pipes */
> +	if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-read-%d", domid) >= 0) {
> +		DPRINTF("Delete %s\n", devname);
> +		if (unlink(devname))
> +			DPRINTF("Could not delete: %s\n", strerror(errno));
> +		free(devname);
> +	}
> +	
> +	if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-write-%d", domid) >= 0) {	
> +		DPRINTF("Delete %s\n", devname);
> +		if (unlink(devname))
> +			DPRINTF("Could not delete: %s\n", strerror(errno));
> +		free(devname);
> +	}
> +}
> +
> +/**
> + * Initialize the blktap interface, i.e. open a pair of pipes in /var/run/tap
> + * and register a fd handler.
> + *
> + * Returns 0 on success.
> + */
> +int init_blktap(void)
> +{
> +	char* devname;	
> +
> +	DPRINTF("Init blktap pipes\n");
> +
> +	/* Open the read pipe */
> +	if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-read-%d", domid) >= 0) {	
> +		read_fd = open_ctrl_socket(devname);		
> +		free(devname);
> +		
> +		if (read_fd == -1) {
> +			fprintf(stderr, "Could not open %s/qemu-read-%d\n",
> +				BLKTAP_CTRL_DIR, domid);
> +			return -1;
> +		}
> +	}
> +	
> +	/* Open the write pipe */
> +	if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-write-%d", domid) >= 0) {
> +		write_fd = open_ctrl_socket(devname);
> +		free(devname);
> +		
> +		if (write_fd == -1) {
> +			fprintf(stderr, "Could not open %s/qemu-write-%d\n",
> +				BLKTAP_CTRL_DIR, domid);
> +			close(read_fd);
> +			return -1;
> +		}
> +	}
> +
> +	/* Attach a handler to the read pipe (called from qemu main loop) */
> +	qemu_set_fd_handler2(read_fd, NULL, &handle_blktap_ctrlmsg, NULL, NULL);
> +
> +	/* Register handler to clean up when the domain is destroyed */
> +	atexit(&shutdown_blktap);
> +
> +	return 0;
> +}
> diff -r f33328217eee tools/ioemu/hw/xen_blktap.h
> --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
> +++ b/tools/ioemu/hw/xen_blktap.h	Thu Mar 13 13:00:18 2008 +0100
> @@ -0,0 +1,57 @@
> +/* xen_blktap.h
> + *
> + * Generic disk interface for blktap-based image adapters.
> + *
> + * (c) 2006 Andrew Warfield and Julian Chesterfield
> + */
> +
> +#ifndef XEN_BLKTAP_H_ 
> +#define XEN_BLKTAP_H_
> +
> +#include <stdint.h>
> +#include <syslog.h>
> +#include <stdio.h>
> +
> +#include "block_int.h"
> +
> +/* Things disks need to know about, these should probably be in a higher-level
> + * header. */
> +#define MAX_SEGMENTS_PER_REQ    11
> +#define SECTOR_SHIFT             9
> +#define DEFAULT_SECTOR_SIZE    512
> +
> +#define MAX_IOFD                 2
> +
> +#define BLK_NOT_ALLOCATED       99
> +#define TD_NO_PARENT             1
> +
> +typedef uint32_t td_flag_t;
> +
> +#define TD_RDONLY                1
> +
> +struct disk_id {
> +	char *name;
> +	int drivertype;
> +};
> +
> +/* This structure represents the state of an active virtual disk.           */
> +struct td_state {
> +	BlockDriverState* bs;
> +	td_flag_t flags;
> +	void *blkif;
> +	void *image;
> +	void *ring_info;
> +	void *fd_entry;
> +	uint64_t sector_size;
> +	uint64_t size;
> +	unsigned int       info;
> +};
> +
> +typedef struct fd_list_entry {
> +	int cookie;
> +	int  tap_fd;
> +	struct td_state *s;
> +	struct fd_list_entry **pprev, *next;
> +} fd_list_entry_t;
> +
> +#endif /*XEN_BLKTAP_H_*/

> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][PATCH] Use ioemu block drivers through blktap
  2008-03-13 14:21   ` Konrad Rzeszutek
@ 2008-03-14  9:37     ` Kevin Wolf
  2008-03-17 14:12       ` Konrad Rzeszutek
  0 siblings, 1 reply; 13+ messages in thread
From: Kevin Wolf @ 2008-03-14  9:37 UTC (permalink / raw)
  To: Konrad Rzeszutek; +Cc: xen-devel

[-- Attachment #1: Type: text/plain, Size: 2230 bytes --]

Hi Konrad,

first of all, thank you for your review. You noticed quite a few points
I never really looked at because I inherited them from the current
tapdisk code. But probably I should fix these issues as well. ;-)

Konrad Rzeszutek schrieb:
>> +	blkif->fds[READ] = open_ctrl_socket(wrctldev);
>> +	blkif->fds[WRITE] = open_ctrl_socket(rdctldev);
> 
> How about freeing the data here once?
> 
>> +	
>> +	if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1) {
>> +		free(rdctldev);
>> +		free(wrctldev);
> 
> And then this is not needed.
> 
>> +		return -1;
>> +	}
>> +
>> +	DPRINTF("Attached to qemu blktap pipes\n");
>> +	free(rdctldev);
>> +	free(wrctldev);
> 
> Nor these two lines above.

Hmm, good point. This code looks a bit silly... Will move the free to
the place you suggested.

>> --- a/tools/python/xen/xend/server/BlktapController.py	Mon Mar 10 22:51:57 2008 +0000
>> +++ b/tools/python/xen/xend/server/BlktapController.py	Thu Mar 13 13:00:18 2008 +0100
>> @@ -13,7 +13,9 @@ blktap_disk_types = [
>>      'vmdk',
>>      'ram',
>>      'qcow',
>> -    'qcow2'
>> +    'qcow2',
>> +
>> +    'ioemu'
> 
> Why add the extra \n ?

I wanted to separate the ioemu pseudo driver (which is the only one that
doesn't go through tapdisk) from the "real" tapdisk drivers.

>> +static struct td_state *state_init(void)
>> +{
>> +	int i;
>> +	struct td_state *s;
>> +	blkif_t *blkif;
>> +
>> +	s = malloc(sizeof(struct td_state));
> 
> Would it make sense to zero out the allocated memory?

This code comes directly from tapdisk and it worked there. On the other
hand, it certainly wouldn't hurt.

>> +			switch (req->operation) 
>> +			{
>> +			case BLKIF_OP_WRITE:
>> +				aiocb_info = malloc(sizeof(*aiocb_info));
>> +
>> +				aiocb_info->s = s;
>> +				aiocb_info->sector = sector_nr;
>> +				aiocb_info->nr_secs = nsects;
>> +				aiocb_info->idx = idx;
>> +				aiocb_info->i = i;
>> +
>> +				ret = (NULL == bdrv_aio_write(s->bs, sector_nr,
>> +							  page, nsects,
>> +							  qemu_send_responses,
>> +							  aiocb_info));
> 
> Who de-allocates aiocb_info?

qemu_send_responses is a callback function which gets aiocb_info as
parameter and frees it when it's done.

I've attached a new version of the patch.

Kevin

[-- Attachment #2: blktap-ioemu.patch --]
[-- Type: text/x-patch, Size: 24663 bytes --]

diff -r 7530c4dba8a5 tools/blktap/drivers/blktapctrl.c
--- a/tools/blktap/drivers/blktapctrl.c	Mon Mar  3 15:19:39 2008
+++ b/tools/blktap/drivers/blktapctrl.c	Fri Mar 14 11:14:10 2008
@@ -501,6 +501,80 @@
 	return 0;
 }
 
+/* Connect to qemu-dm */
+static int connect_qemu(blkif_t *blkif)
+{
+	char *rdctldev, *wrctldev;
+	
+	if (asprintf(&rdctldev, BLKTAP_CTRL_DIR "/qemu-read-%d", 
+			blkif->domid) < 0)
+		return -1;
+
+	if (asprintf(&wrctldev, BLKTAP_CTRL_DIR "/qemu-write-%d", 
+			blkif->domid) < 0) {
+		free(rdctldev);
+		return -1;
+	}
+
+	DPRINTF("Using qemu blktap pipe: %s\n", rdctldev);
+	
+	blkif->fds[READ] = open_ctrl_socket(wrctldev);
+	blkif->fds[WRITE] = open_ctrl_socket(rdctldev);
+	
+	free(rdctldev);
+	free(wrctldev);
+	
+	if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1)
+		return -1;
+
+	DPRINTF("Attached to qemu blktap pipes\n");
+	return 0;
+}
+
+/* Launch tapdisk instance */
+static int connect_tapdisk(blkif_t *blkif, int minor)
+{
+	char *rdctldev = NULL, *wrctldev = NULL;
+	int ret = -1;
+
+	DPRINTF("tapdisk process does not exist:\n");
+
+	if (asprintf(&rdctldev,
+		     "%s/tapctrlread%d", BLKTAP_CTRL_DIR, minor) == -1)
+		goto fail;
+
+	if (asprintf(&wrctldev,
+		     "%s/tapctrlwrite%d", BLKTAP_CTRL_DIR, minor) == -1)
+		goto fail;
+	
+	blkif->fds[READ] = open_ctrl_socket(rdctldev);
+	blkif->fds[WRITE] = open_ctrl_socket(wrctldev);
+	
+	if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1)
+		goto fail;
+
+	/*launch the new process*/
+	DPRINTF("Launching process, CMDLINE [tapdisk %s %s]\n",
+			wrctldev, rdctldev);
+
+	if (launch_tapdisk(wrctldev, rdctldev) == -1) {
+		DPRINTF("Unable to fork, cmdline: [tapdisk %s %s]\n",
+				wrctldev, rdctldev);
+		goto fail;
+	}
+
+	ret = 0;
+	
+fail:
+	if (rdctldev)
+		free(rdctldev);
+
+	if (wrctldev)
+		free(wrctldev);
+
+	return ret;
+}
+
 int blktapctrl_new_blkif(blkif_t *blkif)
 {
 	blkif_info_t *blk;
@@ -524,30 +598,14 @@
 		blkif->cookie = next_cookie++;
 
 		if (!exist) {
-			DPRINTF("Process does not exist:\n");
-			if (asprintf(&rdctldev,
-				     "%s/tapctrlread%d", BLKTAP_CTRL_DIR, minor) == -1)
-				goto fail;
-			if (asprintf(&wrctldev,
-				     "%s/tapctrlwrite%d", BLKTAP_CTRL_DIR, minor) == -1) {
-				free(rdctldev);
-				goto fail;
+			if (type == DISK_TYPE_IOEMU) {
+				if (connect_qemu(blkif))
+					goto fail;
+			} else {
+				if (connect_tapdisk(blkif, minor))
+					goto fail;
 			}
-			blkif->fds[READ] = open_ctrl_socket(rdctldev);
-			blkif->fds[WRITE] = open_ctrl_socket(wrctldev);
-			
-			if (blkif->fds[READ] == -1 || blkif->fds[WRITE] == -1) 
-				goto fail;
-
-			/*launch the new process*/
- 			DPRINTF("Launching process, CMDLINE [tapdisk %s %s]\n",wrctldev, rdctldev);
- 			if (launch_tapdisk(wrctldev, rdctldev) == -1) {
- 				DPRINTF("Unable to fork, cmdline: [tapdisk %s %s]\n",wrctldev, rdctldev);
-				goto fail;
-			}
-
-			free(rdctldev);
-			free(wrctldev);
+
 		} else {
 			DPRINTF("Process exists!\n");
 			blkif->fds[READ] = exist->fds[READ];
diff -r 7530c4dba8a5 tools/blktap/drivers/tapdisk.h
--- a/tools/blktap/drivers/tapdisk.h	Mon Mar  3 15:19:39 2008
+++ b/tools/blktap/drivers/tapdisk.h	Fri Mar 14 11:14:10 2008
@@ -167,6 +167,7 @@
 #define DISK_TYPE_RAM      3
 #define DISK_TYPE_QCOW     4
 #define DISK_TYPE_QCOW2    5
+#define DISK_TYPE_IOEMU    6
 
 
 /*Define Individual Disk Parameters here */
@@ -227,6 +228,16 @@
 	0,
 #ifdef TAPDISK
 	&tapdisk_qcow2,
+#endif
+};
+
+static disk_info_t ioemu_disk = {
+	DISK_TYPE_IOEMU,
+	"ioemu disk",
+	"ioemu",
+	0,
+#ifdef TAPDISK
+	NULL
 #endif
 };
 
@@ -238,6 +249,7 @@
 	&ram_disk,
 	&qcow_disk,
 	&qcow2_disk,
+	&ioemu_disk,
 };
 
 typedef struct driver_list_entry {
diff -r 7530c4dba8a5 tools/blktap/lib/blktaplib.h
--- a/tools/blktap/lib/blktaplib.h	Mon Mar  3 15:19:39 2008
+++ b/tools/blktap/lib/blktaplib.h	Fri Mar 14 11:14:10 2008
@@ -221,15 +221,5 @@
      ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * getpagesize()) +    \
      ((_seg) * getpagesize()))
 
-/* Defines that are only used by library clients */
-
-#ifndef __COMPILING_BLKTAP_LIB
-
-static char *blkif_op_name[] = {
-	[BLKIF_OP_READ]       = "READ",
-	[BLKIF_OP_WRITE]      = "WRITE",
-};
-
-#endif /* __COMPILING_BLKTAP_LIB */
 
 #endif /* __BLKTAPLIB_H__ */
diff -r 7530c4dba8a5 tools/ioemu/Makefile.target
--- a/tools/ioemu/Makefile.target	Mon Mar  3 15:19:39 2008
+++ b/tools/ioemu/Makefile.target	Fri Mar 14 11:14:10 2008
@@ -17,6 +17,7 @@
 VPATH=$(SRC_PATH):$(TARGET_PATH):$(SRC_PATH)/hw:$(SRC_PATH)/audio
 CPPFLAGS+=-I. -I.. -I$(TARGET_PATH) -I$(SRC_PATH)
 CPPFLAGS+= -I$(XEN_ROOT)/tools/libxc
+CPPFLAGS+= -I$(XEN_ROOT)/tools/blktap/lib
 CPPFLAGS+= -I$(XEN_ROOT)/tools/xenstore
 CPPFLAGS+= -I$(XEN_ROOT)/tools/include
 ifdef CONFIG_DARWIN_USER
@@ -429,6 +430,7 @@
 VL_OBJS+= usb-uhci.o smbus_eeprom.o
 VL_OBJS+= piix4acpi.o
 VL_OBJS+= xenstore.o
+VL_OBJS+= xen_blktap.o
 VL_OBJS+= xen_platform.o
 VL_OBJS+= xen_machine_fv.o
 VL_OBJS+= xen_machine_pv.o
diff -r 7530c4dba8a5 tools/ioemu/hw/xen_machine_pv.c
--- a/tools/ioemu/hw/xen_machine_pv.c	Mon Mar  3 15:19:39 2008
+++ b/tools/ioemu/hw/xen_machine_pv.c	Fri Mar 14 11:14:10 2008
@@ -26,6 +26,9 @@
 #include "xen_console.h"
 #include "xenfb.h"
 
+extern void init_blktap(void);
+
+
 /* The Xen PV machine currently provides
  *   - a virtual framebuffer
  *   - ....
@@ -40,6 +43,10 @@
 {
     struct xenfb *xenfb;
     extern int domid;
+
+
+    /* Initialize tapdisk client */
+    init_blktap();
 
     /* Connect to text console */
     if (serial_hds[0]) {
diff -r 7530c4dba8a5 tools/ioemu/vl.c
--- a/tools/ioemu/vl.c	Mon Mar  3 15:19:39 2008
+++ b/tools/ioemu/vl.c	Fri Mar 14 11:14:10 2008
@@ -6266,6 +6266,12 @@
     powerdown_requested = 1;
     if (cpu_single_env)
         cpu_interrupt(cpu_single_env, CPU_INTERRUPT_EXIT);
+}
+
+static void qemu_sighup_handler(int signal)
+{
+    fprintf(stderr, "Received SIGHUP, terminating.\n");
+    exit(0);
 }
 
 void main_loop_wait(int timeout)
@@ -7976,7 +7982,7 @@
 
 #ifndef CONFIG_STUBDOM
     /* Unblock SIGTERM and SIGHUP, which may have been blocked by the caller */
-    signal(SIGHUP, SIG_DFL);
+    signal(SIGHUP, qemu_sighup_handler);
     sigemptyset(&set);
     sigaddset(&set, SIGTERM);
     sigaddset(&set, SIGHUP);
diff -r 7530c4dba8a5 tools/python/xen/xend/server/BlktapController.py
--- a/tools/python/xen/xend/server/BlktapController.py	Mon Mar  3 15:19:39 2008
+++ b/tools/python/xen/xend/server/BlktapController.py	Fri Mar 14 11:14:10 2008
@@ -13,7 +13,9 @@
     'vmdk',
     'ram',
     'qcow',
-    'qcow2'
+    'qcow2',
+
+    'ioemu'
     ]
 
 class BlktapController(BlkifController):
diff -r 7530c4dba8a5 tools/ioemu/hw/xen_blktap.c
--- /dev/null	Mon Mar  3 15:19:39 2008
+++ b/tools/ioemu/hw/xen_blktap.c	Fri Mar 14 11:14:10 2008
@@ -0,0 +1,686 @@
+/* xen_blktap.c
+ *
+ * Interface to blktapctrl to allow use of qemu block drivers with blktap.
+ * This file is based on tools/blktap/drivers/tapdisk.c
+ * 
+ * Copyright (c) 2005 Julian Chesterfield and Andrew Warfield.
+ * Copyright (c) 2008 Kevin Wolf
+ */
+
+/*
+ * There are several communication channels which are used by this interface:
+ *
+ *   - A pair of pipes for receiving and sending general control messages
+ *     (qemu-read-N and qemu-writeN in /var/run/tap, where N is the domain ID).
+ *     These control messages are handled by handle_blktap_ctrlmsg().
+ *
+ *   - One file descriptor per attached disk (/dev/xen/blktapN) for disk
+ *     specific control messages. A callback is triggered on this fd if there
+ *     is a new IO request. The callback function is handle_blktap_iomsg().
+ *
+ *   - A shared ring for each attached disk containing the actual IO requests 
+ *     and responses. Whenever handle_blktap_iomsg() is triggered it processes
+ *     the requests on this ring.
+ */
+
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+
+#include "vl.h"
+#include "blktaplib.h"
+#include "xen_blktap.h"
+#include "block_int.h"
+
+#define MSG_SIZE 4096
+
+#define BLKTAP_CTRL_DIR "/var/run/tap"
+
+/* If enabled, print debug messages to stderr */
+#if 1
+#define DPRINTF(_f, _a...) fprintf(stderr, __FILE__ ":%d: " _f, __LINE__, ##_a)
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+#if 1                                                                        
+#define ASSERT(_p) \
+    if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s\n", #_p , \
+        __LINE__, __FILE__); *(int*)0=0; }
+#else
+#define ASSERT(_p) ((void)0)
+#endif 
+
+
+extern int domid;
+
+int read_fd;
+int write_fd;
+
+static pid_t process;
+fd_list_entry_t *fd_start = NULL;
+
+static void handle_blktap_iomsg(void* private);
+
+struct aiocb_info {
+	struct td_state	*s;
+	uint64_t sector;
+	int nr_secs;
+	int idx;
+	long i;
+};
+
+static void unmap_disk(struct td_state *s)
+{
+	tapdev_info_t *info = s->ring_info;
+	fd_list_entry_t *entry;
+	
+	bdrv_close(s->bs);
+
+	if (info != NULL && info->mem > 0)
+	        munmap(info->mem, getpagesize() * BLKTAP_MMAP_REGION_SIZE);
+
+	entry = s->fd_entry;
+	*entry->pprev = entry->next;
+	if (entry->next)
+		entry->next->pprev = entry->pprev;
+
+	qemu_set_fd_handler2(info->fd, NULL, NULL, NULL, NULL);
+	close(info->fd);
+
+	free(s->fd_entry);
+	free(s->blkif);
+	free(s->ring_info);
+	free(s);
+
+	return;
+}
+
+static inline fd_list_entry_t *add_fd_entry(int tap_fd, struct td_state *s)
+{
+	fd_list_entry_t **pprev, *entry;
+
+	DPRINTF("Adding fd_list_entry\n");
+
+	/*Add to linked list*/
+	s->fd_entry   = entry = malloc(sizeof(fd_list_entry_t));
+	entry->tap_fd = tap_fd;
+	entry->s      = s;
+	entry->next   = NULL;
+
+	pprev = &fd_start;
+	while (*pprev != NULL)
+		pprev = &(*pprev)->next;
+
+	*pprev = entry;
+	entry->pprev = pprev;
+
+	return entry;
+}
+
+static inline struct td_state *get_state(int cookie)
+{
+	fd_list_entry_t *ptr;
+
+	ptr = fd_start;
+	while (ptr != NULL) {
+		if (ptr->cookie == cookie) return ptr->s;
+		ptr = ptr->next;
+	}
+	return NULL;
+}
+
+static struct td_state *state_init(void)
+{
+	int i;
+	struct td_state *s;
+	blkif_t *blkif;
+
+	s = malloc(sizeof(struct td_state));
+	blkif = s->blkif = malloc(sizeof(blkif_t));
+	s->ring_info = calloc(1, sizeof(tapdev_info_t));
+
+	for (i = 0; i < MAX_REQUESTS; i++) {
+		blkif->pending_list[i].secs_pending = 0;
+		blkif->pending_list[i].submitting = 0;
+	}
+
+	return s;
+}
+
+static int map_new_dev(struct td_state *s, int minor)
+{
+	int tap_fd;
+	tapdev_info_t *info = s->ring_info;
+	char *devname;
+	fd_list_entry_t *ptr;
+	int page_size;
+
+	if (asprintf(&devname,"%s/%s%d", BLKTAP_DEV_DIR, BLKTAP_DEV_NAME, minor) == -1)
+		return -1;
+	tap_fd = open(devname, O_RDWR);
+	if (tap_fd == -1) 
+	{
+		DPRINTF("open failed on dev %s!\n",devname);
+		goto fail;
+	} 
+	info->fd = tap_fd;
+
+	/*Map the shared memory*/
+	page_size = getpagesize();
+	info->mem = mmap(0, page_size * BLKTAP_MMAP_REGION_SIZE, 
+			  PROT_READ | PROT_WRITE, MAP_SHARED, info->fd, 0);
+	if ((long int)info->mem == -1) 
+	{
+		DPRINTF("mmap failed on dev %s!\n",devname);
+		goto fail;
+	}
+
+	/* assign the rings to the mapped memory */ 
+	info->sring = (blkif_sring_t *)((unsigned long)info->mem);
+	BACK_RING_INIT(&info->fe_ring, info->sring, page_size);
+	
+	info->vstart = 
+	        (unsigned long)info->mem + (BLKTAP_RING_PAGES * page_size);
+
+	ioctl(info->fd, BLKTAP_IOCTL_SENDPID, process );
+	ioctl(info->fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE );
+	free(devname);
+
+	/*Update the fd entry*/
+	ptr = fd_start;
+	while (ptr != NULL) {
+		if (s == ptr->s) {
+			ptr->tap_fd = tap_fd;
+
+			/* Setup fd_handler for qemu main loop */
+			DPRINTF("set tap_fd = %d\n", tap_fd);
+			qemu_set_fd_handler2(tap_fd, NULL, &handle_blktap_iomsg, NULL, s);
+
+			break;
+		}
+		ptr = ptr->next;
+	}	
+
+
+	DPRINTF("map_new_dev = %d\n", minor);
+	return minor;
+
+ fail:
+	free(devname);
+	return -1;
+}
+
+static int open_disk(struct td_state *s, char *path, int readonly)
+{
+	struct disk_id id;
+	BlockDriverState* bs;
+
+	DPRINTF("Opening %s\n", path);
+	bs = calloc(1, sizeof(*bs));
+
+	memset(&id, 0, sizeof(struct disk_id));
+
+	if (bdrv_open(bs, path, 0) != 0) {
+		fprintf(stderr, "Could not open image file %s\n", path);
+		return -ENOMEM;
+	}
+
+	s->bs = bs;
+	s->flags = readonly ? TD_RDONLY : 0;
+	s->size = bs->total_sectors;
+	s->sector_size = 512;
+
+	s->info = ((s->flags & TD_RDONLY) ? VDISK_READONLY : 0);
+
+	return 0;
+}
+
+static inline void write_rsp_to_ring(struct td_state *s, blkif_response_t *rsp)
+{
+	tapdev_info_t *info = s->ring_info;
+	blkif_response_t *rsp_d;
+	
+	rsp_d = RING_GET_RESPONSE(&info->fe_ring, info->fe_ring.rsp_prod_pvt);
+	memcpy(rsp_d, rsp, sizeof(blkif_response_t));
+	info->fe_ring.rsp_prod_pvt++;
+}
+
+static inline void kick_responses(struct td_state *s)
+{
+	tapdev_info_t *info = s->ring_info;
+
+	if (info->fe_ring.rsp_prod_pvt != info->fe_ring.sring->rsp_prod) 
+	{
+		RING_PUSH_RESPONSES(&info->fe_ring);
+		ioctl(info->fd, BLKTAP_IOCTL_KICK_FE);
+	}
+}
+
+static int send_responses(struct td_state *s, int res, 
+		   uint64_t sector, int nr_secs, int idx, void *private)
+{
+	pending_req_t   *preq;
+	blkif_request_t *req;
+	int responses_queued = 0;
+	blkif_t *blkif = s->blkif;
+	int secs_done = nr_secs;
+
+	if ( (idx > MAX_REQUESTS-1) )
+	{
+		DPRINTF("invalid index returned(%u)!\n", idx);
+		return 0;
+	}
+	preq = &blkif->pending_list[idx];
+	req  = &preq->req;
+
+	preq->secs_pending -= secs_done;
+
+	if (res == -EBUSY && preq->submitting) 
+		return -EBUSY;  /* propagate -EBUSY back to higher layers */
+	if (res) 
+		preq->status = BLKIF_RSP_ERROR;
+	
+	if (!preq->submitting && preq->secs_pending == 0) 
+	{
+		blkif_request_t tmp;
+		blkif_response_t *rsp;
+
+		tmp = preq->req;
+		rsp = (blkif_response_t *)req;
+		
+		rsp->id = tmp.id;
+		rsp->operation = tmp.operation;
+		rsp->status = preq->status;
+		
+		write_rsp_to_ring(s, rsp);
+		responses_queued++;
+
+		kick_responses(s);
+	}
+	
+	return responses_queued;
+}
+
+static void qemu_send_responses(void* opaque, int ret)
+{
+	struct aiocb_info* info = opaque;
+
+	if (ret != 0) {
+		DPRINTF("ERROR: ret = %d (%s)\n", ret, strerror(-ret));
+	}
+
+	send_responses(info->s, ret, info->sector, info->nr_secs, 
+		info->idx, (void*) info->i);
+	free(info);
+}
+
+/**
+ * Callback function for the IO message pipe. Reads requests from the ring
+ * and processes them (call qemu read/write functions).
+ *
+ * The private parameter points to the struct td_state representing the
+ * disk the request is targeted at.
+ */
+static void handle_blktap_iomsg(void* private)
+{
+	struct td_state* s = private;
+
+	RING_IDX          rp, j, i;
+	blkif_request_t  *req;
+	int idx, nsects, ret;
+	uint64_t sector_nr;
+	uint8_t *page;
+	blkif_t *blkif = s->blkif;
+	tapdev_info_t *info = s->ring_info;
+	int page_size = getpagesize();
+
+	struct aiocb_info *aiocb_info;
+
+	if (info->fe_ring.sring == NULL) {
+		DPRINTF("  sring == NULL, ignoring IO request\n");
+		return;
+	}
+
+	rp = info->fe_ring.sring->req_prod; 
+	xen_rmb();
+
+	for (j = info->fe_ring.req_cons; j != rp; j++)
+	{
+		int start_seg = 0; 
+
+		req = NULL;
+		req = RING_GET_REQUEST(&info->fe_ring, j);
+		++info->fe_ring.req_cons;
+		
+		if (req == NULL)
+			continue;
+
+		idx = req->id;
+
+		ASSERT(blkif->pending_list[idx].secs_pending == 0);
+		memcpy(&blkif->pending_list[idx].req, req, sizeof(*req));
+		blkif->pending_list[idx].status = BLKIF_RSP_OKAY;
+		blkif->pending_list[idx].submitting = 1;
+		sector_nr = req->sector_number;
+
+		/* Don't allow writes on readonly devices */
+		if ((s->flags & TD_RDONLY) && 
+		    (req->operation == BLKIF_OP_WRITE)) {
+			blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
+			goto send_response;
+		}
+
+		for (i = start_seg; i < req->nr_segments; i++) {
+			nsects = req->seg[i].last_sect - 
+				 req->seg[i].first_sect + 1;
+	
+			if ((req->seg[i].last_sect >= page_size >> 9) ||
+					(nsects <= 0))
+				continue;
+
+			page  = (uint8_t*) MMAP_VADDR(info->vstart, 
+						   (unsigned long)req->id, i);
+			page += (req->seg[i].first_sect << SECTOR_SHIFT);
+
+			if (sector_nr >= s->size) {
+				DPRINTF("Sector request failed:\n");
+				DPRINTF("%s request, idx [%d,%d] size [%llu], "
+					"sector [%llu,%llu]\n",
+					(req->operation == BLKIF_OP_WRITE ? 
+					 "WRITE" : "READ"),
+					idx,i,
+					(long long unsigned) 
+						nsects<<SECTOR_SHIFT,
+					(long long unsigned) 
+						sector_nr<<SECTOR_SHIFT,
+					(long long unsigned) sector_nr);
+				continue;
+			}
+
+			blkif->pending_list[idx].secs_pending += nsects;
+
+			switch (req->operation) 
+			{
+			case BLKIF_OP_WRITE:
+				aiocb_info = malloc(sizeof(*aiocb_info));
+
+				aiocb_info->s = s;
+				aiocb_info->sector = sector_nr;
+				aiocb_info->nr_secs = nsects;
+				aiocb_info->idx = idx;
+				aiocb_info->i = i;
+
+				ret = (NULL == bdrv_aio_write(s->bs, sector_nr,
+							  page, nsects,
+							  qemu_send_responses,
+							  aiocb_info));
+
+				if (ret) {
+					blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
+					DPRINTF("ERROR: bdrv_write() == NULL\n");
+					goto send_response;
+				}
+				break;
+
+			case BLKIF_OP_READ:
+				aiocb_info = malloc(sizeof(*aiocb_info));
+
+				aiocb_info->s = s;
+				aiocb_info->sector = sector_nr;
+				aiocb_info->nr_secs = nsects;
+				aiocb_info->idx = idx;
+				aiocb_info->i = i;
+
+				ret = (NULL == bdrv_aio_read(s->bs, sector_nr,
+							 page, nsects,
+							 qemu_send_responses,
+							 aiocb_info));
+
+				if (ret) {
+					blkif->pending_list[idx].status = BLKIF_RSP_ERROR;
+					DPRINTF("ERROR: bdrv_read() == NULL\n");
+					goto send_response;
+				}
+				break;
+
+			default:
+				DPRINTF("Unknown block operation\n");
+				break;
+			}
+			sector_nr += nsects;
+		}
+	send_response:
+		blkif->pending_list[idx].submitting = 0;
+
+		/* force write_rsp_to_ring for synchronous case */
+		if (blkif->pending_list[idx].secs_pending == 0)
+			send_responses(s, 0, 0, 0, idx, (void *)(long)0);
+	}
+}
+
+/**
+ * Callback function for the qemu-read pipe. Reads and processes control 
+ * message from the pipe.
+ *
+ * The parameter private is unused.
+ */
+static void handle_blktap_ctrlmsg(void* private)
+{
+	int length, len, msglen;
+	char *ptr, *path;
+	image_t *img;
+	msg_hdr_t *msg;
+	msg_newdev_t *msg_dev;
+	msg_pid_t *msg_pid;
+	int ret = -1;
+	struct td_state *s = NULL;
+	fd_list_entry_t *entry;
+
+	char buf[MSG_SIZE];
+
+	length = read(read_fd, buf, MSG_SIZE);
+
+	if (length > 0 && length >= sizeof(msg_hdr_t)) 
+	{
+		msg = (msg_hdr_t *)buf;
+		DPRINTF("blktap: Received msg, len %d, type %d, UID %d\n",
+			length,msg->type,msg->cookie);
+
+		switch (msg->type) {
+		case CTLMSG_PARAMS: 			
+			ptr = buf + sizeof(msg_hdr_t);
+			len = (length - sizeof(msg_hdr_t));
+			path = calloc(1, len + 1);
+			
+			memcpy(path, ptr, len); 
+			DPRINTF("Received CTLMSG_PARAMS: [%s]\n", path);
+
+			/* Allocate the disk structs */
+			s = state_init();
+
+			/*Open file*/
+			if (s == NULL || open_disk(s, path, msg->readonly)) {
+				msglen = sizeof(msg_hdr_t);
+				msg->type = CTLMSG_IMG_FAIL;
+				msg->len = msglen;
+			} else {
+				entry = add_fd_entry(0, s);
+				entry->cookie = msg->cookie;
+				DPRINTF("Entered cookie %d\n", entry->cookie);
+				
+				memset(buf, 0x00, MSG_SIZE); 
+			
+				msglen = sizeof(msg_hdr_t) + sizeof(image_t);
+				msg->type = CTLMSG_IMG;
+				img = (image_t *)(buf + sizeof(msg_hdr_t));
+				img->size = s->size;
+				img->secsize = s->sector_size;
+				img->info = s->info;
+				DPRINTF("Writing (size, secsize, info) = "
+					"(%#" PRIx64 ", %#" PRIx64 ", %d)\n",
+					s->size, s->sector_size, s->info);
+			}
+			len = write(write_fd, buf, msglen);
+			free(path);
+			break;
+			
+		case CTLMSG_NEWDEV:
+			msg_dev = (msg_newdev_t *)(buf + sizeof(msg_hdr_t));
+
+			s = get_state(msg->cookie);
+			DPRINTF("Retrieving state, cookie %d.....[%s]\n",
+				msg->cookie, (s == NULL ? "FAIL":"OK"));
+			if (s != NULL) {
+				ret = ((map_new_dev(s, msg_dev->devnum) 
+					== msg_dev->devnum ? 0: -1));
+			}	
+
+			memset(buf, 0x00, MSG_SIZE); 
+			msglen = sizeof(msg_hdr_t);
+			msg->type = (ret == 0 ? CTLMSG_NEWDEV_RSP 
+				              : CTLMSG_NEWDEV_FAIL);
+			msg->len = msglen;
+
+			len = write(write_fd, buf, msglen);
+			break;
+
+		case CTLMSG_CLOSE:
+			s = get_state(msg->cookie);
+			if (s) unmap_disk(s);
+			break;			
+
+		case CTLMSG_PID:
+			memset(buf, 0x00, MSG_SIZE);
+			msglen = sizeof(msg_hdr_t) + sizeof(msg_pid_t);
+			msg->type = CTLMSG_PID_RSP;
+			msg->len = msglen;
+
+			msg_pid = (msg_pid_t *)(buf + sizeof(msg_hdr_t));
+			process = getpid();
+			msg_pid->pid = process;
+
+			len = write(write_fd, buf, msglen);
+			break;
+
+		default:
+			break;
+		}
+	}
+}
+
+/**
+ * Opens a control socket, i.e. a pipe to communicate with blktapctrl.
+ *
+ * Returns the file descriptor number for the pipe; -1 in error case
+ */
+static int open_ctrl_socket(char *devname)
+{
+	int ret;
+	int ipc_fd;
+
+	if (mkdir(BLKTAP_CTRL_DIR, 0755) == 0)
+		DPRINTF("Created %s directory\n", BLKTAP_CTRL_DIR);
+
+	ret = mkfifo(devname,S_IRWXU|S_IRWXG|S_IRWXO);
+	if ( (ret != 0) && (errno != EEXIST) ) {
+		DPRINTF("ERROR: pipe failed (%d)\n", errno);
+		return -1;
+	}
+
+	ipc_fd = open(devname,O_RDWR|O_NONBLOCK);
+
+	if (ipc_fd < 0) {
+		DPRINTF("FD open failed\n");
+		return -1;
+	}
+
+	return ipc_fd;
+}
+
+/**
+ * Unmaps all disks and closes their pipes
+ */
+void shutdown_blktap(void)
+{
+	fd_list_entry_t *ptr;
+	struct td_state *s;
+	char *devname;
+
+	DPRINTF("Shutdown blktap\n");
+
+	/* Unmap all disks */
+	ptr = fd_start;
+	while (ptr != NULL) {
+		s = ptr->s;
+		unmap_disk(s);
+		close(ptr->tap_fd);
+		ptr = ptr->next;
+	}
+
+	/* Delete control pipes */
+	if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-read-%d", domid) >= 0) {
+		DPRINTF("Delete %s\n", devname);
+		if (unlink(devname))
+			DPRINTF("Could not delete: %s\n", strerror(errno));
+		free(devname);
+	}
+	
+	if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-write-%d", domid) >= 0) {	
+		DPRINTF("Delete %s\n", devname);
+		if (unlink(devname))
+			DPRINTF("Could not delete: %s\n", strerror(errno));
+		free(devname);
+	}
+}
+
+/**
+ * Initialize the blktap interface, i.e. open a pair of pipes in /var/run/tap
+ * and register a fd handler.
+ *
+ * Returns 0 on success.
+ */
+int init_blktap(void)
+{
+	char* devname;	
+
+	DPRINTF("Init blktap pipes\n");
+
+	/* Open the read pipe */
+	if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-read-%d", domid) >= 0) {	
+		read_fd = open_ctrl_socket(devname);		
+		free(devname);
+		
+		if (read_fd == -1) {
+			fprintf(stderr, "Could not open %s/qemu-read-%d\n",
+				BLKTAP_CTRL_DIR, domid);
+			return -1;
+		}
+	}
+	
+	/* Open the write pipe */
+	if (asprintf(&devname, BLKTAP_CTRL_DIR "/qemu-write-%d", domid) >= 0) {
+		write_fd = open_ctrl_socket(devname);
+		free(devname);
+		
+		if (write_fd == -1) {
+			fprintf(stderr, "Could not open %s/qemu-write-%d\n",
+				BLKTAP_CTRL_DIR, domid);
+			close(read_fd);
+			return -1;
+		}
+	}
+
+	/* Attach a handler to the read pipe (called from qemu main loop) */
+	qemu_set_fd_handler2(read_fd, NULL, &handle_blktap_ctrlmsg, NULL, NULL);
+
+	/* Register handler to clean up when the domain is destroyed */
+	atexit(&shutdown_blktap);
+
+	return 0;
+}
diff -r 7530c4dba8a5 tools/ioemu/hw/xen_blktap.h
--- /dev/null	Mon Mar  3 15:19:39 2008
+++ b/tools/ioemu/hw/xen_blktap.h	Fri Mar 14 11:14:10 2008
@@ -0,0 +1,57 @@
+/* xen_blktap.h
+ *
+ * Generic disk interface for blktap-based image adapters.
+ *
+ * (c) 2006 Andrew Warfield and Julian Chesterfield
+ */
+
+#ifndef XEN_BLKTAP_H_ 
+#define XEN_BLKTAP_H_
+
+#include <stdint.h>
+#include <syslog.h>
+#include <stdio.h>
+
+#include "block_int.h"
+
+/* Things disks need to know about, these should probably be in a higher-level
+ * header. */
+#define MAX_SEGMENTS_PER_REQ    11
+#define SECTOR_SHIFT             9
+#define DEFAULT_SECTOR_SIZE    512
+
+#define MAX_IOFD                 2
+
+#define BLK_NOT_ALLOCATED       99
+#define TD_NO_PARENT             1
+
+typedef uint32_t td_flag_t;
+
+#define TD_RDONLY                1
+
+struct disk_id {
+	char *name;
+	int drivertype;
+};
+
+/* This structure represents the state of an active virtual disk.           */
+struct td_state {
+	BlockDriverState* bs;
+	td_flag_t flags;
+	void *blkif;
+	void *image;
+	void *ring_info;
+	void *fd_entry;
+	uint64_t sector_size;
+	uint64_t size;
+	unsigned int       info;
+};
+
+typedef struct fd_list_entry {
+	int cookie;
+	int  tap_fd;
+	struct td_state *s;
+	struct fd_list_entry **pprev, *next;
+} fd_list_entry_t;
+
+#endif /*XEN_BLKTAP_H_*/

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][PATCH] Use ioemu block drivers through blktap
  2008-03-14  9:37     ` Kevin Wolf
@ 2008-03-17 14:12       ` Konrad Rzeszutek
  0 siblings, 0 replies; 13+ messages in thread
From: Konrad Rzeszutek @ 2008-03-17 14:12 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: xen-devel

On Fri, Mar 14, 2008 at 10:37:48AM +0100, Kevin Wolf wrote:
> Hi Konrad,
> 
> first of all, thank you for your review. You noticed quite a few points

You are welcome.

> I never really looked at because I inherited them from the current
> tapdisk code. But probably I should fix these issues as well. ;-)

Yes :-)


.. snip ..
> >> +
> >> +    'ioemu'
> > 
> > Why add the extra \n ?
> 
> I wanted to separate the ioemu pseudo driver (which is the only one that
> doesn't go through tapdisk) from the "real" tapdisk drivers.

OK. How about you add a comment saying exactly what you just said
in that line?

> 
> >> +static struct td_state *state_init(void)
> >> +{
> >> +	int i;
> >> +	struct td_state *s;
> >> +	blkif_t *blkif;
> >> +
> >> +	s = malloc(sizeof(struct td_state));
> > 
> > Would it make sense to zero out the allocated memory?
> 
> This code comes directly from tapdisk and it worked there. On the other
> hand, it certainly wouldn't hurt.

I am thinking that in the future the struct td_state might be expanded and
not initializing all of the variables to something could lead to corrupt
pointers.

> 
> >> +			switch (req->operation) 
> >> +			{
> >> +			case BLKIF_OP_WRITE:
> >> +				aiocb_info = malloc(sizeof(*aiocb_info));
> >> +
> >> +				aiocb_info->s = s;
> >> +				aiocb_info->sector = sector_nr;
> >> +				aiocb_info->nr_secs = nsects;
> >> +				aiocb_info->idx = idx;
> >> +				aiocb_info->i = i;
> >> +
> >> +				ret = (NULL == bdrv_aio_write(s->bs, sector_nr,
> >> +							  page, nsects,
> >> +							  qemu_send_responses,
> >> +							  aiocb_info));
> > 
> > Who de-allocates aiocb_info?
> 
> qemu_send_responses is a callback function which gets aiocb_info as
> parameter and frees it when it's done.

Aha! I thought so but wasn't sure. Would it make sense to include
your explanation as comment?

> 
> I've attached a new version of the patch.

Thanks. Didn't spot anything wrong.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][PATCH] Use ioemu block drivers through blktap
  2008-03-13 12:25 ` Kevin Wolf
  2008-03-13 14:21   ` Konrad Rzeszutek
@ 2008-03-28 15:22   ` Ian Jackson
  2008-03-31  9:10     ` Kevin Wolf
  1 sibling, 1 reply; 13+ messages in thread
From: Ian Jackson @ 2008-03-28 15:22 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: xen-devel

Kevin Wolf writes ("Re: [Xen-devel] [RFC][PATCH] Use ioemu block drivers through blktap"):
> I've reworked and cleaned up the thing, so the attached patch should be
> much better. If you agree that my approach is the right one, I think it
> could be committed in this (or a similar) shape.

Sorry for picking this up now, but I'm trying to reconcile ioemu with
qemu and I wondered what the purpose of these changes was:

> +static void qemu_sighup_handler(int signal)
> +{
> +    fprintf(stderr, "Received SIGHUP, terminating.\n");
> +    exit(0);
> +}
> +

> -    signal(SIGHUP, SIG_DFL);
> +    signal(SIGHUP, qemu_sighup_handler);

Thanks,
Ian.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][PATCH] Use ioemu block drivers through blktap
  2008-03-28 15:22   ` Ian Jackson
@ 2008-03-31  9:10     ` Kevin Wolf
  2008-03-31  9:36       ` Ian Jackson
  0 siblings, 1 reply; 13+ messages in thread
From: Kevin Wolf @ 2008-03-31  9:10 UTC (permalink / raw)
  To: Ian Jackson; +Cc: xen-devel

Ian Jackson schrieb:
> Sorry for picking this up now, but I'm trying to reconcile ioemu with
> qemu and I wondered what the purpose of these changes was:
> 
>> +static void qemu_sighup_handler(int signal)
>> +{
>> +    fprintf(stderr, "Received SIGHUP, terminating.\n");
>> +    exit(0);
>> +}
>> +
> 
>> -    signal(SIGHUP, SIG_DFL);
>> +    signal(SIGHUP, qemu_sighup_handler);

I needed a shutdown handler to properly close the blktap connection and
delete the named pipes associated with the qemu-dm instance. When a
domain is destroyed, qemu-dm receives a SIGHUP. However, with the
default SIGHUP handler no atexit functions are called, so I had to add
this handler which performs a clean exit.

Kevin

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][PATCH] Use ioemu block drivers through blktap
  2008-03-31  9:10     ` Kevin Wolf
@ 2008-03-31  9:36       ` Ian Jackson
  2008-03-31 11:11         ` Kevin Wolf
  0 siblings, 1 reply; 13+ messages in thread
From: Ian Jackson @ 2008-03-31  9:36 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: xen-devel

Kevin Wolf writes ("Re: [Xen-devel] [RFC][PATCH] Use ioemu block drivers through blktap"):
> I needed a shutdown handler to properly close the blktap connection and
> delete the named pipes associated with the qemu-dm instance. When a
> domain is destroyed, qemu-dm receives a SIGHUP. However, with the
> default SIGHUP handler no atexit functions are called, so I had to add
> this handler which performs a clean exit.

Sadly in that case using a signal hanlder is no good, because qemu-dm
might die for some other reason than SIGHUP, skipping the cleanup.
(For example, it might have a bug and segfault, or the kernel's OOM
killer might send it a SIGKILL, or ...)

If this cleanup is really necessary, then that will cause problems for
future runs.  If not then the cleanup isn't really necessary :-).
Leftover named pipes, for example, aren't really a big problem and can
just be garbage collected at some point (provided their names are
sufficiently unique).

I haven't eyeballed all of the new qemu blktap arrangements in detail.
What processes are there and why can't the process at the other end of
the pipe do the cleanup ?

Ian.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][PATCH] Use ioemu block drivers through blktap
  2008-03-31  9:36       ` Ian Jackson
@ 2008-03-31 11:11         ` Kevin Wolf
  2008-03-31 13:23           ` Ian Jackson
  0 siblings, 1 reply; 13+ messages in thread
From: Kevin Wolf @ 2008-03-31 11:11 UTC (permalink / raw)
  To: Ian Jackson; +Cc: xen-devel

Ian Jackson schrieb:
> If this cleanup is really necessary, then that will cause problems for
> future runs.  If not then the cleanup isn't really necessary :-).

I think in your classification they are "not really necessary" cleanups.
But that's no reason not to do them, right?

> Leftover named pipes, for example, aren't really a big problem and can
> just be garbage collected at some point (provided their names are
> sufficiently unique).
> 
> I haven't eyeballed all of the new qemu blktap arrangements in detail.
> What processes are there and why can't the process at the other end of
> the pipe do the cleanup ?

As for uniqueness, they have the domain ID as part of their filename. If
a pipe already exists it is reused, so no real problem here, it's just
not nice to leave too many of them lying around IMHO.

Think of qemu-dm as a server providing access to disk images. If you
want the process at the other end to do the cleanup, you have the client
cleaning up server pipes. Even though this sounds a bit odd, it might
work for us in principle as the client is a single blktapctrl process
(or even xend?). One problem would be that qemu-dm could provide a pipe
which is not used by any client (if you don't use tap:ioemu block
devices) and thus won't be cleaned up.

Kevin

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][PATCH] Use ioemu block drivers through blktap
  2008-03-31 11:11         ` Kevin Wolf
@ 2008-03-31 13:23           ` Ian Jackson
  2008-03-31 13:31             ` Keir Fraser
  2008-03-31 13:37             ` Kevin Wolf
  0 siblings, 2 replies; 13+ messages in thread
From: Ian Jackson @ 2008-03-31 13:23 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: xen-devel

Kevin Wolf writes ("Re: [Xen-devel] [RFC][PATCH] Use ioemu block drivers through blktap"):
> Ian Jackson schrieb:
> > If this cleanup is really necessary, then that will cause problems for
> > future runs.  If not then the cleanup isn't really necessary :-).
> 
> I think in your classification they are "not really necessary" cleanups.
> But that's no reason not to do them, right?

Yes, but to avoid gradually accumulating an ever increasing amount of
cruft, that cleanup mechanism ought to be capable of cleaning up even
`lost' garbage of one kind or another.  And once you've got such a
garbage collecting cleanup arrangement, you don't need any separate
atexit-like arrangement which cleans up specifically after the current
run.

It is best just to run the garbage collector at a suitable point.  Do
you have a garbage collector which deletes pipes from stale domains ?

> As for uniqueness, they have the domain ID as part of their filename. If
> a pipe already exists it is reused, so no real problem here, it's just
> not nice to leave too many of them lying around IMHO.

The cost of a handful of leftover pipes for a short period of time is
tiny: a few inodes and a miniscule increase in directory search time.

OTOH if you write special-case code like this then you have the costs
of writing extra code.  Code costs effort to maintain and typically
has a nonzero bug rate, causing actual problems when it goes wrong.

Ian.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][PATCH] Use ioemu block drivers through blktap
  2008-03-31 13:23           ` Ian Jackson
@ 2008-03-31 13:31             ` Keir Fraser
  2008-03-31 13:37               ` Ian Jackson
  2008-03-31 13:37             ` Kevin Wolf
  1 sibling, 1 reply; 13+ messages in thread
From: Keir Fraser @ 2008-03-31 13:31 UTC (permalink / raw)
  To: Ian Jackson, Kevin Wolf; +Cc: xen-devel

On 31/3/08 14:23, "Ian Jackson" <Ian.Jackson@eu.citrix.com> wrote:

>> I think in your classification they are "not really necessary" cleanups.
>> But that's no reason not to do them, right?
> 
> Yes, but to avoid gradually accumulating an ever increasing amount of
> cruft, that cleanup mechanism ought to be capable of cleaning up even
> `lost' garbage of one kind or another.  And once you've got such a
> garbage collecting cleanup arrangement, you don't need any separate
> atexit-like arrangement which cleans up specifically after the current
> run.

The SIGHUP was only introduced in the first place to allow cleanup when
using stub domains. In that case the SIGHUP goes to a qemu-dm 'proxy script'
which then kills the actual stub domain. That's another case where the
mechanism for tearing down such pervasive state really belongs in xend, in
my opinion. I would be very much up for seeing us going back to SIGKILL of
qemu-dm processes.

 -- Keir

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][PATCH] Use ioemu block drivers through blktap
  2008-03-31 13:23           ` Ian Jackson
  2008-03-31 13:31             ` Keir Fraser
@ 2008-03-31 13:37             ` Kevin Wolf
  1 sibling, 0 replies; 13+ messages in thread
From: Kevin Wolf @ 2008-03-31 13:37 UTC (permalink / raw)
  To: Ian Jackson; +Cc: xen-devel

Ian Jackson schrieb:
> Yes, but to avoid gradually accumulating an ever increasing amount of
> cruft, that cleanup mechanism ought to be capable of cleaning up even
> `lost' garbage of one kind or another.  And once you've got such a
> garbage collecting cleanup arrangement, you don't need any separate
> atexit-like arrangement which cleans up specifically after the current
> run.

Well then, feel free to implement it. ;-)

> It is best just to run the garbage collector at a suitable point.  Do
> you have a garbage collector which deletes pipes from stale domains ?

Not directly, no. When a domain gets the same domain ID, the pipe is
reused and then freed when the domain is destroyed. But you don't want
to reboot your Dom0 too often, so you won't get the same domain ID too
early...

> OTOH if you write special-case code like this then you have the costs
> of writing extra code.  Code costs effort to maintain and typically
> has a nonzero bug rate, causing actual problems when it goes wrong.

If you think we were better off just dropping this code, for my part we
can do so. I just thought I'd port also the shutdown handlers from
tapdisk while I'm at it.

Kevin

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [RFC][PATCH] Use ioemu block drivers through blktap
  2008-03-31 13:31             ` Keir Fraser
@ 2008-03-31 13:37               ` Ian Jackson
  0 siblings, 0 replies; 13+ messages in thread
From: Ian Jackson @ 2008-03-31 13:37 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel, Kevin Wolf

Keir Fraser writes ("Re: [Xen-devel] [RFC][PATCH] Use ioemu block drivers through blktap"):
> The SIGHUP was only introduced in the first place to allow cleanup when
> using stub domains. In that case the SIGHUP goes to a qemu-dm 'proxy script'
> which then kills the actual stub domain. That's another case where the
> mechanism for tearing down such pervasive state really belongs in xend, in
> my opinion. I would be very much up for seeing us going back to SIGKILL of
> qemu-dm processes.

Using SIGKILL is indeed much better because it guarantees that any
necessary cleanup happens even if qemu-dm (or some associated process)
is terminated abruptly - because it makes that be the usual case.

Ian.

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2008-03-31 13:37 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-03-10 17:03 [RFC][PATCH] Use ioemu block drivers through blktap Kevin Wolf
2008-03-13 12:25 ` Kevin Wolf
2008-03-13 14:21   ` Konrad Rzeszutek
2008-03-14  9:37     ` Kevin Wolf
2008-03-17 14:12       ` Konrad Rzeszutek
2008-03-28 15:22   ` Ian Jackson
2008-03-31  9:10     ` Kevin Wolf
2008-03-31  9:36       ` Ian Jackson
2008-03-31 11:11         ` Kevin Wolf
2008-03-31 13:23           ` Ian Jackson
2008-03-31 13:31             ` Keir Fraser
2008-03-31 13:37               ` Ian Jackson
2008-03-31 13:37             ` Kevin Wolf

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.