[patch] CFQ for xen domains

All of lore.kernel.org
 help / color / mirror / Atom feed

* [patch] CFQ for xen domains
       [not found]               ` <efbd0c82d427f99326de12f394c2c90b@cl.cam.ac.uk>
@ 2005-11-09 13:29                 ` Gerd Knorr
  2005-11-15 17:51                   ` Gerd Knorr
  0 siblings, 1 reply; 5+ messages in thread
From: Gerd Knorr @ 2005-11-09 13:29 UTC (permalink / raw)
  To: Keir Fraser; +Cc: Ian Pratt, xen-devel, Kurt Garloff, Chris Mason, Jens Axboe

[-- Attachment #1: Type: text/plain, Size: 1127 bytes --]

> Does 'xm save / xm restore' work with this patch (at least as well as it 
> currently does ;-)?

xm save/restore still doesn't work for me, either with the sparse tree 
or with the linux-2.6 repository, so I can't try it.  I can't see any 
reason why it should become worse with that patch, though.

I've resynced the blkback threading patch with the latest sparse tree, 
here we are.  Changes:

   * One thread per blkif.  The I/O scheduler can do a better job that
     way, and you can also use ionice on the blkback threads to adjust
     the block I/O priorities for the domain.
   * Various stuff has been moved from global variables into blkif_t.
   * The scary allocation ring for pending_req's is gone and has been
     replaced by a free list.
   * made dispatch_rw_block_io() reentrant.
   * general linux coding style cleanup, at least for the code I've
     touched anyway.
   * number of outstanding requests is runtime-configurable now.
   * made the ia64 #ifdefs smaller and dropped one.  It should still
     work on ia64 in theory, but it would be great if the ia64 folks
     could have a look ...

cheers,

   Gerd


[-- Attachment #2: blkback-7.diff --]
[-- Type: text/x-patch, Size: 22872 bytes --]

diff -r abbe3df33774 linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c	Tue Nov  8 17:39:58 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c	Wed Nov  9 13:45:37 2005
@@ -12,6 +12,8 @@
  */
 
 #include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
 #include <asm-xen/balloon.h>
 #include <asm/hypervisor.h>
 #include "common.h"
@@ -21,26 +23,21 @@
  * pulled from a communication ring are quite likely to end up being part of
  * the same scatter/gather request at the disc.
  * 
- * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
+ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
+ * 
  * This will increase the chances of being able to write whole tracks.
  * 64 should be enough to keep us competitive with Linux.
  */
-#define MAX_PENDING_REQS 64
-#define BATCH_PER_DOMAIN 16
-
-static unsigned long mmap_vstart;
-#define MMAP_PAGES						\
-	(MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
-#ifdef __ia64__
-static void *pending_vaddrs[MMAP_PAGES];
-#define MMAP_VADDR(_idx, _i) \
-	(unsigned long)(pending_vaddrs[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#else
-#define MMAP_VADDR(_req,_seg)						\
-	(mmap_vstart +							\
-	 ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +	\
-	 ((_seg) * PAGE_SIZE))
-#endif
+static int blkif_reqs = 64;
+static int mmap_pages;
+
+static int __init set_blkif_reqs(char *str)
+{
+	get_option(&str, &blkif_reqs);
+	return 1;
+}
+__setup("blkif_reqs=", set_blkif_reqs);
+
 
 /*
  * Each outstanding request that we've passed to the lower device layers has a 
@@ -55,43 +52,38 @@
 	atomic_t       pendcnt;
 	unsigned short operation;
 	int            status;
+	struct list_head free_list;
 } pending_req_t;
 
-/*
- * We can't allocate pending_req's in order, since they may complete out of 
- * order. We therefore maintain an allocation ring. This ring also indicates 
- * when enough work has been passed down -- at that point the allocation ring 
- * will be empty.
- */
-static pending_req_t pending_reqs[MAX_PENDING_REQS];
-static unsigned char pending_ring[MAX_PENDING_REQS];
-static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
-/* NB. We use a different index type to differentiate from shared blk rings. */
-typedef unsigned int PEND_RING_IDX;
-#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
-static PEND_RING_IDX pending_prod, pending_cons;
-#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
-
-static request_queue_t *plugged_queue;
-static inline void flush_plugged_queue(void)
-{
-	request_queue_t *q = plugged_queue;
-	if (q != NULL) {
-		if ( q->unplug_fn != NULL )
-			q->unplug_fn(q);
-		blk_put_queue(q);
-		plugged_queue = NULL;
-	}
-}
+static pending_req_t *pending_reqs;
+static struct list_head pending_free;
+static spinlock_t pending_free_lock = SPIN_LOCK_UNLOCKED;
+static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
+
+#define BLKBACK_INVALID_HANDLE (0xFFFF)
+
+static unsigned long mmap_vstart;
+static unsigned long *pending_vaddrs;
+static u16 *pending_grant_handles;
+
+static inline int vaddr_pagenr(pending_req_t *req, int seg)
+{
+	return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
+}
+
+static inline unsigned long vaddr(pending_req_t *req, int seg)
+{
+	return pending_vaddrs[vaddr_pagenr(req, seg)];
+}
+
+#define pending_handle(_req, _seg) \
+	(pending_grant_handles[vaddr_pagenr(_req, _seg)])
+
 
 /* When using grant tables to map a frame for device access then the
  * handle returned must be used to unmap the frame. This is needed to
  * drop the ref count on the frame.
  */
-static u16 pending_grant_handles[MMAP_PAGES];
-#define pending_handle(_idx, _i) \
-    (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#define BLKBACK_INVALID_HANDLE (0xFFFF)
 
 #ifdef CONFIG_XEN_BLKDEV_TAP_BE
 /*
@@ -105,26 +97,79 @@
 static inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
 #endif
 
-static int do_block_io_op(blkif_t *blkif, int max_to_do);
-static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
+static int do_block_io_op(blkif_t *blkif);
+static void dispatch_rw_block_io(blkif_t *blkif,
+				 blkif_request_t *req,
+				 pending_req_t *pending_req);
 static void make_response(blkif_t *blkif, unsigned long id, 
                           unsigned short op, int st);
 
-static void fast_flush_area(int idx, int nr_pages)
+/******************************************************************
+ * misc small helpers
+ */
+static pending_req_t* alloc_req(void)
+{
+	pending_req_t *req = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pending_free_lock, flags);
+	if (!list_empty(&pending_free)) {
+		req = list_entry(pending_free.next, pending_req_t, free_list);
+		list_del(&req->free_list);
+	}
+	spin_unlock_irqrestore(&pending_free_lock, flags);
+	return req;
+}
+
+static void free_req(pending_req_t *req)
+{
+	unsigned long flags;
+	int was_empty;
+
+	spin_lock_irqsave(&pending_free_lock, flags);
+	was_empty = list_empty(&pending_free);
+	list_add(&req->free_list, &pending_free);
+	spin_unlock_irqrestore(&pending_free_lock, flags);
+	if (was_empty)
+		wake_up(&pending_free_wq);
+}
+
+static void unplug_queue(blkif_t *blkif)
+{
+	if (NULL == blkif->plug)
+		return;
+	if (blkif->plug->unplug_fn)
+		blkif->plug->unplug_fn(blkif->plug);
+	blk_put_queue(blkif->plug);
+	blkif->plug = NULL;
+}
+
+static void plug_queue(blkif_t *blkif, struct bio *bio)
+{
+	request_queue_t *q = bdev_get_queue(bio->bi_bdev);
+
+	if (q == blkif->plug)
+		return;
+	unplug_queue(blkif);
+	blk_get_queue(q);
+	blkif->plug = q;
+}
+
+static void fast_flush_area(pending_req_t *req)
 {
 	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	unsigned int i, invcount = 0;
 	u16 handle;
 	int ret;
 
-	for (i = 0; i < nr_pages; i++) {
-		handle = pending_handle(idx, i);
+	for (i = 0; i < req->nr_pages; i++) {
+		handle = pending_handle(req, i);
 		if (handle == BLKBACK_INVALID_HANDLE)
 			continue;
-		unmap[invcount].host_addr    = MMAP_VADDR(idx, i);
+		unmap[invcount].host_addr    = vaddr(req, i);
 		unmap[invcount].dev_bus_addr = 0;
 		unmap[invcount].handle       = handle;
-		pending_handle(idx, i) = BLKBACK_INVALID_HANDLE;
+		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
 		invcount++;
 	}
 
@@ -133,109 +178,56 @@
 	BUG_ON(ret);
 }
 
-
-/******************************************************************
- * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
- */
-
-static struct list_head blkio_schedule_list;
-static spinlock_t blkio_schedule_list_lock;
-
-static int __on_blkdev_list(blkif_t *blkif)
-{
-	return blkif->blkdev_list.next != NULL;
-}
-
-static void remove_from_blkdev_list(blkif_t *blkif)
-{
-	unsigned long flags;
-
-	if (!__on_blkdev_list(blkif))
-		return;
-
-	spin_lock_irqsave(&blkio_schedule_list_lock, flags);
-	if (__on_blkdev_list(blkif)) {
-		list_del(&blkif->blkdev_list);
-		blkif->blkdev_list.next = NULL;
-		blkif_put(blkif);
-	}
-	spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-static void add_to_blkdev_list_tail(blkif_t *blkif)
-{
-	unsigned long flags;
-
-	if (__on_blkdev_list(blkif))
-		return;
-
-	spin_lock_irqsave(&blkio_schedule_list_lock, flags);
-	if (!__on_blkdev_list(blkif) && (blkif->status == CONNECTED)) {
-		list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
-		blkif_get(blkif);
-	}
-	spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-
 /******************************************************************
  * SCHEDULER FUNCTIONS
  */
 
-static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
-
-static int blkio_schedule(void *arg)
-{
-	DECLARE_WAITQUEUE(wq, current);
-
-	blkif_t          *blkif;
-	struct list_head *ent;
-
-	daemonize("xenblkd");
-
+int blkif_schedule(void *arg)
+{
+	blkif_t          *blkif = arg;
+
+	blkif_get(blkif);
+	printk(KERN_DEBUG "%s: started\n", current->comm);
 	for (;;) {
-		/* Wait for work to do. */
-		add_wait_queue(&blkio_schedule_wait, &wq);
-		set_current_state(TASK_INTERRUPTIBLE);
-		if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || 
-		     list_empty(&blkio_schedule_list) )
-			schedule();
-		__set_current_state(TASK_RUNNING);
-		remove_wait_queue(&blkio_schedule_wait, &wq);
-
-		/* Queue up a batch of requests. */
-		while ((NR_PENDING_REQS < MAX_PENDING_REQS) &&
-		       !list_empty(&blkio_schedule_list)) {
-			ent = blkio_schedule_list.next;
-			blkif = list_entry(ent, blkif_t, blkdev_list);
-			blkif_get(blkif);
-			remove_from_blkdev_list(blkif);
-			if (do_block_io_op(blkif, BATCH_PER_DOMAIN))
-				add_to_blkdev_list_tail(blkif);
-			blkif_put(blkif);
-		}
-
-		/* Push the batch through to disc. */
-		flush_plugged_queue();
-	}
-}
-
-static void maybe_trigger_blkio_schedule(void)
-{
-	/*
-	 * Needed so that two processes, which together make the following
-	 * predicate true, don't both read stale values and evaluate the
-	 * predicate incorrectly. Incredibly unlikely to stall the scheduler
-	 * on x86, but...
-	 */
-	smp_mb();
-
-	if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
-	    !list_empty(&blkio_schedule_list))
-		wake_up(&blkio_schedule_wait);
-}
-
-
+		if (!atomic_read(&blkif->io_pending)) {
+			/* Wait for work to do or requests to exit. */
+			if (kthread_should_stop())
+				break;
+			wait_event_interruptible(blkif->wq,
+						 atomic_read(&blkif->io_pending) ||
+						 kthread_should_stop());
+		} else if (list_empty(&pending_free)) {
+			/* Wait for pending_req becoming available. */
+			wait_event_interruptible(pending_free_wq,
+						 !list_empty(&pending_free));
+		}
+
+		/* Schedule I/O */
+		atomic_set(&blkif->io_pending, 0);
+		if (do_block_io_op(blkif))
+			atomic_inc(&blkif->io_pending);
+		unplug_queue(blkif);
+
+#if 0
+		/* Print stats for performance debugging. */
+		if (time_after(jiffies, blkif->st_print)) {
+			printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
+			       current->comm, blkif->st_oo_req,
+			       blkif->st_rd_req, blkif->st_wr_req);
+			blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+			blkif->st_rd_req = 0;
+			blkif->st_wr_req = 0;
+			blkif->st_oo_req = 0;
+		}
+#endif
+	}
+
+	/* bye folks, and thanks for all the fish ;) */
+	printk(KERN_DEBUG "%s: exiting\n", current->comm);
+	blkif->xenblkd = NULL;
+	blkif_put(blkif);
+	return 0;
+}
 
 /******************************************************************
  * COMPLETION CALLBACK -- Called as bh->b_end_io()
@@ -243,8 +235,6 @@
 
 static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
 {
-	unsigned long flags;
-
 	/* An error fails the entire request. */
 	if (!uptodate) {
 		DPRINTK("Buffer not up-to-date at end of operation\n");
@@ -252,15 +242,11 @@
 	}
 
 	if (atomic_dec_and_test(&pending_req->pendcnt)) {
-		int pending_idx = pending_req - pending_reqs;
-		fast_flush_area(pending_idx, pending_req->nr_pages);
+		fast_flush_area(pending_req);
 		make_response(pending_req->blkif, pending_req->id,
 			      pending_req->operation, pending_req->status);
 		blkif_put(pending_req->blkif);
-		spin_lock_irqsave(&pend_prod_lock, flags);
-		pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
-		spin_unlock_irqrestore(&pend_prod_lock, flags);
-		maybe_trigger_blkio_schedule();
+		free_req(pending_req);
 	}
 }
 
@@ -281,8 +267,10 @@
 irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
 {
 	blkif_t *blkif = dev_id;
-	add_to_blkdev_list_tail(blkif);
-	maybe_trigger_blkio_schedule();
+
+	atomic_inc(&blkif->io_pending);
+	if (blkif->status == CONNECTED)
+		wake_up(&blkif->wq);
 	return IRQ_HANDLED;
 }
 
@@ -292,10 +280,11 @@
  * DOWNWARD CALLS -- These interface with the block-device layer proper.
  */
 
-static int do_block_io_op(blkif_t *blkif, int max_to_do)
+static int do_block_io_op(blkif_t *blkif)
 {
 	blkif_back_ring_t *blk_ring = &blkif->blk_ring;
 	blkif_request_t *req;
+	pending_req_t *pending_req;
 	RING_IDX i, rp;
 	int more_to_do = 0;
 
@@ -305,24 +294,30 @@
 	for (i = blk_ring->req_cons; 
 	     (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i);
 	     i++) {
-		if ((max_to_do-- == 0) ||
-		    (NR_PENDING_REQS == MAX_PENDING_REQS)) {
+
+		pending_req = alloc_req();
+		if (NULL == pending_req) {
+			blkif->st_oo_req++;
 			more_to_do = 1;
 			break;
 		}
-        
+
 		req = RING_GET_REQUEST(blk_ring, i);
 		switch (req->operation) {
 		case BLKIF_OP_READ:
+			blkif->st_rd_req++;
+			dispatch_rw_block_io(blkif, req, pending_req);
+			break;
 		case BLKIF_OP_WRITE:
-			dispatch_rw_block_io(blkif, req);
+			blkif->st_wr_req++;
+			dispatch_rw_block_io(blkif, req, pending_req);
 			break;
-
 		default:
 			DPRINTK("error: unknown block io operation [%d]\n",
 				req->operation);
 			make_response(blkif, req->id, req->operation,
 				      BLKIF_RSP_ERROR);
+			free_req(pending_req);
 			break;
 		}
 	}
@@ -331,13 +326,13 @@
 	return more_to_do;
 }
 
-static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
+static void dispatch_rw_block_io(blkif_t *blkif,
+				 blkif_request_t *req,
+				 pending_req_t *pending_req)
 {
 	extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
 	int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
 	unsigned long fas = 0;
-	int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
-	pending_req_t *pending_req;
 	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	struct phys_req preq;
 	struct { 
@@ -345,31 +340,35 @@
 	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	unsigned int nseg;
 	struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-	int nbio = 0;
-	request_queue_t *q;
-	int ret, errors = 0;
+	int ret, i, nbio = 0;
 
 	/* Check that number of segments is sane. */
 	nseg = req->nr_segments;
 	if (unlikely(nseg == 0) || 
 	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
 		DPRINTK("Bad number of segments in request (%d)\n", nseg);
-		goto bad_descriptor;
+		goto fail_response;
 	}
 
 	preq.dev           = req->handle;
 	preq.sector_number = req->sector_number;
 	preq.nr_sects      = 0;
 
+	pending_req->blkif     = blkif;
+	pending_req->id        = req->id;
+	pending_req->operation = operation;
+	pending_req->status    = BLKIF_RSP_OKAY;
+	pending_req->nr_pages  = nseg;
+
 	for (i = 0; i < nseg; i++) {
 		fas         = req->frame_and_sects[i];
 		seg[i].nsec = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;
 
 		if (seg[i].nsec <= 0)
-			goto bad_descriptor;
+			goto fail_response;
 		preq.nr_sects += seg[i].nsec;
 
-		map[i].host_addr = MMAP_VADDR(pending_idx, i);
+		map[i].host_addr = vaddr(pending_req, i);
 		map[i].dom = blkif->domid;
 		map[i].ref = blkif_gref_from_fas(fas);
 		map[i].flags = GNTMAP_host_map;
@@ -381,27 +380,23 @@
 	BUG_ON(ret);
 
 	for (i = 0; i < nseg; i++) {
-		if (likely(map[i].handle >= 0)) {
-			pending_handle(pending_idx, i) = map[i].handle;
+		if (unlikely(map[i].handle < 0)) {
+			DPRINTK("invalid buffer -- could not remap it\n");
+			goto fail_flush;
+		}
+
+		pending_handle(pending_req, i) = map[i].handle;
 #ifdef __ia64__
-			MMAP_VADDR(pending_idx,i) = gnttab_map_vaddr(map[i]);
+		pending_vaddrs[vaddr_pagenr(req, seg)] =
+			= gnttab_map_vaddr(map[i]);
 #else
-			phys_to_machine_mapping[__pa(MMAP_VADDR(
-				pending_idx, i)) >> PAGE_SHIFT] =
-				FOREIGN_FRAME(map[i].dev_bus_addr>>PAGE_SHIFT);
+		phys_to_machine_mapping[__pa(vaddr(
+			pending_req, i)) >> PAGE_SHIFT] =
+			FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT);
 #endif
-			fas        = req->frame_and_sects[i];
-			seg[i].buf = map[i].dev_bus_addr | 
-				(blkif_first_sect(fas) << 9);
-		} else {
-			errors++;
-		}
-	}
-
-	if (errors) {
-		DPRINTK("invalid buffer -- could not remap it\n");
-		fast_flush_area(pending_idx, nseg);
-		goto bad_descriptor;
+		fas         = req->frame_and_sects[i];
+		seg[i].buf  = map[i].dev_bus_addr | 
+			(blkif_first_sect(fas) << 9);
 	}
 
 	if (vbd_translate(&preq, blkif, operation) != 0) {
@@ -409,37 +404,25 @@
 			operation == READ ? "read" : "write",
 			preq.sector_number,
 			preq.sector_number + preq.nr_sects, preq.dev); 
-		goto bad_descriptor;
-	}
-
-	pending_req = &pending_reqs[pending_idx];
-	pending_req->blkif     = blkif;
-	pending_req->id        = req->id;
-	pending_req->operation = operation;
-	pending_req->status    = BLKIF_RSP_OKAY;
-	pending_req->nr_pages  = nseg;
+		goto fail_flush;
+	}
 
 	for (i = 0; i < nseg; i++) {
 		if (((int)preq.sector_number|(int)seg[i].nsec) &
 		    ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
 			DPRINTK("Misaligned I/O request from domain %d",
 				blkif->domid);
-			goto cleanup_and_fail;
+			goto fail_put_bio;
 		}
 
 		while ((bio == NULL) ||
 		       (bio_add_page(bio,
-				     virt_to_page(MMAP_VADDR(pending_idx, i)),
+				     virt_to_page(vaddr(pending_req, i)),
 				     seg[i].nsec << 9,
 				     seg[i].buf & ~PAGE_MASK) == 0)) {
 			bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
-			if (unlikely(bio == NULL)) {
-			cleanup_and_fail:
-				for (i = 0; i < (nbio-1); i++)
-					bio_put(biolist[i]);
-				fast_flush_area(pending_idx, nseg);
-				goto bad_descriptor;
-			}
+			if (unlikely(bio == NULL))
+				goto fail_put_bio;
                 
 			bio->bi_bdev    = preq.bdev;
 			bio->bi_private = pending_req;
@@ -450,14 +433,8 @@
 		preq.sector_number += seg[i].nsec;
 	}
 
-	if ((q = bdev_get_queue(bio->bi_bdev)) != plugged_queue) {
-		flush_plugged_queue();
-		blk_get_queue(q);
-		plugged_queue = q;
-	}
-
+	plug_queue(blkif, bio);
 	atomic_set(&pending_req->pendcnt, nbio);
-	pending_cons++;
 	blkif_get(blkif);
 
 	for (i = 0; i < nbio; i++)
@@ -465,8 +442,14 @@
 
 	return;
 
- bad_descriptor:
+ fail_put_bio:
+	for (i = 0; i < (nbio-1); i++)
+		bio_put(biolist[i]);
+ fail_flush:
+	fast_flush_area(pending_req);
+ fail_response:
 	make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+	free_req(pending_req);
 } 
 
 
@@ -498,56 +481,47 @@
 	notify_remote_via_irq(blkif->irq);
 }
 
-void blkif_deschedule(blkif_t *blkif)
-{
-	remove_from_blkdev_list(blkif);
-}
-
 static int __init blkif_init(void)
 {
+	struct page *page;
 	int i;
-	struct page *page;
-	int ret;
-
-	for (i = 0; i < MMAP_PAGES; i++)
-		pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
-
-	if (xen_init() < 0)
-		return -ENODEV;
+
+	mmap_pages            = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	pending_reqs          = kmalloc(sizeof(pending_reqs[0]) *
+					blkif_reqs, GFP_KERNEL);
+	pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
+					mmap_pages, GFP_KERNEL);
+	pending_vaddrs        = kmalloc(sizeof(pending_vaddrs[0]) *
+					mmap_pages, GFP_KERNEL);
+	if (!pending_reqs || !pending_grant_handles || !pending_vaddrs) {
+		printk("%s: out of memory\n", __FUNCTION__);
+		return -1;
+	}
 
 	blkif_interface_init();
-
+	
 #ifdef __ia64__
-    {
 	extern unsigned long alloc_empty_foreign_map_page_range(unsigned long pages);
-	int i;
-
-	mmap_vstart =  alloc_empty_foreign_map_page_range(MMAP_PAGES);
-	printk("Allocated mmap_vstart: 0x%lx\n", mmap_vstart);
-	for(i = 0; i < MMAP_PAGES; i++)
-	    pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
-	BUG_ON(mmap_vstart == NULL);
-    }
-#else
-	page = balloon_alloc_empty_page_range(MMAP_PAGES);
+	mmap_vstart = (unsigned long)alloc_empty_foreign_map_page_range(mmap_pages);
+#else /* ! ia64 */
+	page = balloon_alloc_empty_page_range(mmap_pages);
 	BUG_ON(page == NULL);
 	mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
 #endif
-
-	pending_cons = 0;
-	pending_prod = MAX_PENDING_REQS;
+	printk("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n",
+	       __FUNCTION__, blkif_reqs, mmap_pages, mmap_vstart);
+	BUG_ON(mmap_vstart == 0);
+	for (i = 0; i < mmap_pages; i++)
+		pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
+
+	memset(pending_grant_handles,  BLKBACK_INVALID_HANDLE, mmap_pages);
 	memset(pending_reqs, 0, sizeof(pending_reqs));
-	for (i = 0; i < MAX_PENDING_REQS; i++)
-		pending_ring[i] = i;
+	INIT_LIST_HEAD(&pending_free);
+
+	for (i = 0; i < blkif_reqs; i++)
+		list_add_tail(&pending_reqs[i].free_list, &pending_free);
     
-	spin_lock_init(&blkio_schedule_list_lock);
-	INIT_LIST_HEAD(&blkio_schedule_list);
-
-	ret = kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES);
-	BUG_ON(ret < 0);
-
 	blkif_xenbus_init();
-
 	return 0;
 }
 
diff -r abbe3df33774 linux-2.6-xen-sparse/drivers/xen/blkback/common.h
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/common.h	Tue Nov  8 17:39:58 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/common.h	Wed Nov  9 13:45:37 2005
@@ -56,9 +56,19 @@
 	/* Is this a blktap frontend */
 	unsigned int     is_blktap;
 #endif
-	struct list_head blkdev_list;
 	spinlock_t       blk_ring_lock;
 	atomic_t         refcnt;
+
+	wait_queue_head_t   wq;
+	struct task_struct  *xenblkd;
+	atomic_t            io_pending;
+	request_queue_t     *plug;
+
+	/* statistics */
+	unsigned long       st_print;
+	int                 st_rd_req;
+	int                 st_wr_req;
+	int                 st_oo_req;
 
 	struct work_struct free_work;
 
@@ -97,11 +107,10 @@
 
 void blkif_interface_init(void);
 
-void blkif_deschedule(blkif_t *blkif);
-
 void blkif_xenbus_init(void);
 
 irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+int blkif_schedule(void *arg);
 
 #endif /* __BLKIF__BACKEND__COMMON_H__ */
 
diff -r abbe3df33774 linux-2.6-xen-sparse/drivers/xen/blkback/interface.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c	Tue Nov  8 17:39:58 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c	Wed Nov  9 13:45:37 2005
@@ -24,6 +24,8 @@
 	blkif->status = DISCONNECTED;
 	spin_lock_init(&blkif->blk_ring_lock);
 	atomic_set(&blkif->refcnt, 1);
+	init_waitqueue_head(&blkif->wq);
+	blkif->st_print = jiffies;
 
 	return blkif;
 }
diff -r abbe3df33774 linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c	Tue Nov  8 17:39:58 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c	Wed Nov  9 13:45:37 2005
@@ -17,6 +17,7 @@
 */
 #include <stdarg.h>
 #include <linux/module.h>
+#include <linux/kthread.h>
 #include <asm-xen/xenbus.h>
 #include "common.h"
 
@@ -46,8 +47,11 @@
 	if (be->watch.node)
 		unregister_xenbus_watch(&be->watch);
 	unregister_xenbus_watch(&be->backend_watch);
-	if (be->blkif)
+	if (be->blkif) {
+		if (be->blkif->xenblkd)
+			kthread_stop(be->blkif->xenblkd);
 		blkif_put(be->blkif);
+	}
 	if (be->frontpath)
 		kfree(be->frontpath);
 	kfree(be);
@@ -198,6 +202,16 @@
 			be->blkif = NULL;
 			xenbus_dev_error(dev, err,
 					 "creating vbd structure");
+			return;
+		}
+
+		be->blkif->xenblkd = kthread_run(blkif_schedule, be->blkif,
+						 "xenblkd %d/%04lx",
+						 be->blkif->domid, be->pdev);
+		if (IS_ERR(be->blkif->xenblkd)) {
+			err = PTR_ERR(be->blkif->xenblkd);
+			be->blkif->xenblkd = NULL;
+			xenbus_dev_error(dev, err, "start xenblkd");
 			return;
 		}
 

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [patch] CFQ for xen domains
  2005-11-09 13:29                 ` [patch] CFQ for xen domains Gerd Knorr
@ 2005-11-15 17:51                   ` Gerd Knorr
  2005-11-24 11:31                     ` Gerd Knorr
  0 siblings, 1 reply; 5+ messages in thread
From: Gerd Knorr @ 2005-11-15 17:51 UTC (permalink / raw)
  To: Gerd Knorr; +Cc: Ian Pratt, xen-devel, Chris Mason, Kurt Garloff, Jens Axboe

[-- Attachment #1: Type: text/plain, Size: 1063 bytes --]

> I've resynced the blkback threading patch with the latest sparse tree, 
> here we are.  Changes:
> 
>   * One thread per blkif.  The I/O scheduler can do a better job that
>     way, also you can use ionice on the blkback threads to adjust the
>     block I/O priorities for the domain.
>   * Various stuff has been moved from global variables into blkif_t.
>   * The scary allocation ring for pending_req's is gone and has been
>     replaced by a free list.
>   * made dispatch_rw_block_io() reentrant.
>   * general linux coding style cleanup, at least for the code I've
>     touched anyway.
>   * number of outstanding requests is runtime-configurable now.
>   * made the ia64 #ifdefs smaller and dropped one.  It should still
>     work on ia64 in theory, but would be great if the ia64 folks
>     can have a look ...

Next version of that patch, with those additional changes:

   * re-added the xen_init() which got lost by mistake (pointed
     out by the ia64 guys).
   * adapted to the driver architecture changes merged recently.

cheers,

   Gerd

[-- Attachment #2: blkback-7793-11.diff --]
[-- Type: text/x-patch, Size: 23348 bytes --]

diff -r 090e44133d40 linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c	Mon Nov 14 17:13:38 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c	Tue Nov 15 18:17:26 2005
@@ -12,6 +12,8 @@
  */
 
 #include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
 #include <asm-xen/balloon.h>
 #include <asm/hypervisor.h>
 #include "common.h"
@@ -21,26 +23,21 @@
  * pulled from a communication ring are quite likely to end up being part of
  * the same scatter/gather request at the disc.
  * 
- * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
+ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
+ * 
  * This will increase the chances of being able to write whole tracks.
  * 64 should be enough to keep us competitive with Linux.
  */
-#define MAX_PENDING_REQS 64
-#define BATCH_PER_DOMAIN 16
-
-static unsigned long mmap_vstart;
-#define MMAP_PAGES						\
-	(MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
-#ifdef __ia64__
-static void *pending_vaddrs[MMAP_PAGES];
-#define MMAP_VADDR(_idx, _i) \
-	(unsigned long)(pending_vaddrs[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#else
-#define MMAP_VADDR(_req,_seg)						\
-	(mmap_vstart +							\
-	 ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +	\
-	 ((_seg) * PAGE_SIZE))
-#endif
+static int blkif_reqs = 64;
+static int mmap_pages;
+
+static int __init set_blkif_reqs(char *str)
+{
+	get_option(&str, &blkif_reqs);
+	return 1;
+}
+__setup("blkif_reqs=", set_blkif_reqs);
+
 
 /*
  * Each outstanding request that we've passed to the lower device layers has a 
@@ -55,43 +52,38 @@
 	atomic_t       pendcnt;
 	unsigned short operation;
 	int            status;
+	struct list_head free_list;
 } pending_req_t;
 
-/*
- * We can't allocate pending_req's in order, since they may complete out of 
- * order. We therefore maintain an allocation ring. This ring also indicates 
- * when enough work has been passed down -- at that point the allocation ring 
- * will be empty.
- */
-static pending_req_t pending_reqs[MAX_PENDING_REQS];
-static unsigned char pending_ring[MAX_PENDING_REQS];
-static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
-/* NB. We use a different index type to differentiate from shared blk rings. */
-typedef unsigned int PEND_RING_IDX;
-#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
-static PEND_RING_IDX pending_prod, pending_cons;
-#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
-
-static request_queue_t *plugged_queue;
-static inline void flush_plugged_queue(void)
-{
-	request_queue_t *q = plugged_queue;
-	if (q != NULL) {
-		if ( q->unplug_fn != NULL )
-			q->unplug_fn(q);
-		blk_put_queue(q);
-		plugged_queue = NULL;
-	}
-}
+static pending_req_t *pending_reqs;
+static struct list_head pending_free;
+static spinlock_t pending_free_lock = SPIN_LOCK_UNLOCKED;
+static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
+
+#define BLKBACK_INVALID_HANDLE (0xFFFF)
+
+static unsigned long mmap_vstart;
+static unsigned long *pending_vaddrs;
+static u16 *pending_grant_handles;
+
+static inline int vaddr_pagenr(pending_req_t *req, int seg)
+{
+	return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
+}
+
+static inline unsigned long vaddr(pending_req_t *req, int seg)
+{
+	return pending_vaddrs[vaddr_pagenr(req, seg)];
+}
+
+#define pending_handle(_req, _seg) \
+	(pending_grant_handles[vaddr_pagenr(_req, _seg)])
+
 
 /* When using grant tables to map a frame for device access then the
  * handle returned must be used to unmap the frame. This is needed to
  * drop the ref count on the frame.
  */
-static u16 pending_grant_handles[MMAP_PAGES];
-#define pending_handle(_idx, _i) \
-    (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#define BLKBACK_INVALID_HANDLE (0xFFFF)
 
 #ifdef CONFIG_XEN_BLKDEV_TAP_BE
 /*
@@ -105,26 +97,79 @@
 static inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
 #endif
 
-static int do_block_io_op(blkif_t *blkif, int max_to_do);
-static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
+static int do_block_io_op(blkif_t *blkif);
+static void dispatch_rw_block_io(blkif_t *blkif,
+				 blkif_request_t *req,
+				 pending_req_t *pending_req);
 static void make_response(blkif_t *blkif, unsigned long id, 
                           unsigned short op, int st);
 
-static void fast_flush_area(int idx, int nr_pages)
+/******************************************************************
+ * misc small helpers
+ */
+static pending_req_t* alloc_req(void)
+{
+	pending_req_t *req = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pending_free_lock, flags);
+	if (!list_empty(&pending_free)) {
+		req = list_entry(pending_free.next, pending_req_t, free_list);
+		list_del(&req->free_list);
+	}
+	spin_unlock_irqrestore(&pending_free_lock, flags);
+	return req;
+}
+
+static void free_req(pending_req_t *req)
+{
+	unsigned long flags;
+	int was_empty;
+
+	spin_lock_irqsave(&pending_free_lock, flags);
+	was_empty = list_empty(&pending_free);
+	list_add(&req->free_list, &pending_free);
+	spin_unlock_irqrestore(&pending_free_lock, flags);
+	if (was_empty)
+		wake_up(&pending_free_wq);
+}
+
+static void unplug_queue(blkif_t *blkif)
+{
+	if (NULL == blkif->plug)
+		return;
+	if (blkif->plug->unplug_fn)
+		blkif->plug->unplug_fn(blkif->plug);
+	blk_put_queue(blkif->plug);
+	blkif->plug = NULL;
+}
+
+static void plug_queue(blkif_t *blkif, struct bio *bio)
+{
+	request_queue_t *q = bdev_get_queue(bio->bi_bdev);
+
+	if (q == blkif->plug)
+		return;
+	unplug_queue(blkif);
+	blk_get_queue(q);
+	blkif->plug = q;
+}
+
+static void fast_flush_area(pending_req_t *req)
 {
 	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	unsigned int i, invcount = 0;
 	u16 handle;
 	int ret;
 
-	for (i = 0; i < nr_pages; i++) {
-		handle = pending_handle(idx, i);
+	for (i = 0; i < req->nr_pages; i++) {
+		handle = pending_handle(req, i);
 		if (handle == BLKBACK_INVALID_HANDLE)
 			continue;
-		unmap[invcount].host_addr    = MMAP_VADDR(idx, i);
+		unmap[invcount].host_addr    = vaddr(req, i);
 		unmap[invcount].dev_bus_addr = 0;
 		unmap[invcount].handle       = handle;
-		pending_handle(idx, i) = BLKBACK_INVALID_HANDLE;
+		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
 		invcount++;
 	}
 
@@ -133,109 +178,72 @@
 	BUG_ON(ret);
 }
 
-
-/******************************************************************
- * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
- */
-
-static struct list_head blkio_schedule_list;
-static spinlock_t blkio_schedule_list_lock;
-
-static int __on_blkdev_list(blkif_t *blkif)
-{
-	return blkif->blkdev_list.next != NULL;
-}
-
-static void remove_from_blkdev_list(blkif_t *blkif)
-{
-	unsigned long flags;
-
-	if (!__on_blkdev_list(blkif))
-		return;
-
-	spin_lock_irqsave(&blkio_schedule_list_lock, flags);
-	if (__on_blkdev_list(blkif)) {
-		list_del(&blkif->blkdev_list);
-		blkif->blkdev_list.next = NULL;
-		blkif_put(blkif);
-	}
-	spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-static void add_to_blkdev_list_tail(blkif_t *blkif)
-{
-	unsigned long flags;
-
-	if (__on_blkdev_list(blkif))
-		return;
-
-	spin_lock_irqsave(&blkio_schedule_list_lock, flags);
-	if (!__on_blkdev_list(blkif) && (blkif->status == CONNECTED)) {
-		list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
-		blkif_get(blkif);
-	}
-	spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-
 /******************************************************************
  * SCHEDULER FUNCTIONS
  */
 
-static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
-
-static int blkio_schedule(void *arg)
-{
-	DECLARE_WAITQUEUE(wq, current);
-
-	blkif_t          *blkif;
-	struct list_head *ent;
-
-	daemonize("xenblkd");
-
+int blkif_schedule(void *arg)
+{
+	blkif_t          *blkif = arg;
+
+	blkif_get(blkif);
+	printk(KERN_DEBUG "%s: started\n", current->comm);
 	for (;;) {
-		/* Wait for work to do. */
-		add_wait_queue(&blkio_schedule_wait, &wq);
-		set_current_state(TASK_INTERRUPTIBLE);
-		if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || 
-		     list_empty(&blkio_schedule_list) )
-			schedule();
-		__set_current_state(TASK_RUNNING);
-		remove_wait_queue(&blkio_schedule_wait, &wq);
-
-		/* Queue up a batch of requests. */
-		while ((NR_PENDING_REQS < MAX_PENDING_REQS) &&
-		       !list_empty(&blkio_schedule_list)) {
-			ent = blkio_schedule_list.next;
-			blkif = list_entry(ent, blkif_t, blkdev_list);
-			blkif_get(blkif);
-			remove_from_blkdev_list(blkif);
-			if (do_block_io_op(blkif, BATCH_PER_DOMAIN))
-				add_to_blkdev_list_tail(blkif);
-			blkif_put(blkif);
-		}
-
-		/* Push the batch through to disc. */
-		flush_plugged_queue();
-	}
-}
-
-static void maybe_trigger_blkio_schedule(void)
-{
-	/*
-	 * Needed so that two processes, which together make the following
-	 * predicate true, don't both read stale values and evaluate the
-	 * predicate incorrectly. Incredibly unlikely to stall the scheduler
-	 * on x86, but...
-	 */
-	smp_mb();
-
-	if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
-	    !list_empty(&blkio_schedule_list))
-		wake_up(&blkio_schedule_wait);
-}
-
-
+		if (kthread_should_stop()) {
+			/* asked to quit? */
+			if (!atomic_read(&blkif->io_pending))
+				break;
+			printk(KERN_DEBUG "%s: I/O pending, delaying exit\n",
+			       current->comm);
+		}
+
+		if (!atomic_read(&blkif->io_pending)) {
+			/* Wait for work to do. */
+			wait_event_interruptible(blkif->wq,
+						 atomic_read(&blkif->io_pending) ||
+						 kthread_should_stop());
+		} else if (list_empty(&pending_free)) {
+			/* Wait for pending_req becoming available. */
+			wait_event_interruptible(pending_free_wq,
+						 !list_empty(&pending_free));
+		}
+
+		if (blkif->status != CONNECTED) {
+			/* make sure we are connected */
+			printk(KERN_DEBUG "%s: not connected (%d pending)\n",
+			       current->comm, atomic_read(&blkif->io_pending));
+			wait_event_interruptible(blkif->wq,
+						 blkif->status != CONNECTED ||
+						 kthread_should_stop());
+			continue;
+		}
+
+		/* Schedule I/O */
+		atomic_set(&blkif->io_pending, 0);
+		if (do_block_io_op(blkif))
+			atomic_inc(&blkif->io_pending);
+		unplug_queue(blkif);
+
+#if 0
+		/* Print stats for performance debugging. */
+		if (time_after(jiffies, blkif->st_print)) {
+			printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
+			       current->comm, blkif->st_oo_req,
+			       blkif->st_rd_req, blkif->st_wr_req);
+			blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+			blkif->st_rd_req = 0;
+			blkif->st_wr_req = 0;
+			blkif->st_oo_req = 0;
+		}
+#endif
+	}
+
+	/* bye folks, and thanks for all the fish ;) */
+	printk(KERN_DEBUG "%s: exiting\n", current->comm);
+	blkif->xenblkd = NULL;
+	blkif_put(blkif);
+	return 0;
+}
 
 /******************************************************************
  * COMPLETION CALLBACK -- Called as bh->b_end_io()
@@ -243,8 +251,6 @@
 
 static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
 {
-	unsigned long flags;
-
 	/* An error fails the entire request. */
 	if (!uptodate) {
 		DPRINTK("Buffer not up-to-date at end of operation\n");
@@ -252,15 +258,11 @@
 	}
 
 	if (atomic_dec_and_test(&pending_req->pendcnt)) {
-		int pending_idx = pending_req - pending_reqs;
-		fast_flush_area(pending_idx, pending_req->nr_pages);
+		fast_flush_area(pending_req);
 		make_response(pending_req->blkif, pending_req->id,
 			      pending_req->operation, pending_req->status);
 		blkif_put(pending_req->blkif);
-		spin_lock_irqsave(&pend_prod_lock, flags);
-		pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
-		spin_unlock_irqrestore(&pend_prod_lock, flags);
-		maybe_trigger_blkio_schedule();
+		free_req(pending_req);
 	}
 }
 
@@ -281,8 +283,9 @@
 irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
 {
 	blkif_t *blkif = dev_id;
-	add_to_blkdev_list_tail(blkif);
-	maybe_trigger_blkio_schedule();
+
+	atomic_inc(&blkif->io_pending);
+	wake_up(&blkif->wq);
 	return IRQ_HANDLED;
 }
 
@@ -292,10 +295,11 @@
  * DOWNWARD CALLS -- These interface with the block-device layer proper.
  */
 
-static int do_block_io_op(blkif_t *blkif, int max_to_do)
+static int do_block_io_op(blkif_t *blkif)
 {
 	blkif_back_ring_t *blk_ring = &blkif->blk_ring;
 	blkif_request_t *req;
+	pending_req_t *pending_req;
 	RING_IDX i, rp;
 	int more_to_do = 0;
 
@@ -305,24 +309,30 @@
 	for (i = blk_ring->req_cons; 
 	     (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i);
 	     i++) {
-		if ((max_to_do-- == 0) ||
-		    (NR_PENDING_REQS == MAX_PENDING_REQS)) {
+
+		pending_req = alloc_req();
+		if (NULL == pending_req) {
+			blkif->st_oo_req++;
 			more_to_do = 1;
 			break;
 		}
-        
+
 		req = RING_GET_REQUEST(blk_ring, i);
 		switch (req->operation) {
 		case BLKIF_OP_READ:
+			blkif->st_rd_req++;
+			dispatch_rw_block_io(blkif, req, pending_req);
+			break;
 		case BLKIF_OP_WRITE:
-			dispatch_rw_block_io(blkif, req);
+			blkif->st_wr_req++;
+			dispatch_rw_block_io(blkif, req, pending_req);
 			break;
-
 		default:
 			DPRINTK("error: unknown block io operation [%d]\n",
 				req->operation);
 			make_response(blkif, req->id, req->operation,
 				      BLKIF_RSP_ERROR);
+			free_req(pending_req);
 			break;
 		}
 	}
@@ -331,13 +341,13 @@
 	return more_to_do;
 }
 
-static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
+static void dispatch_rw_block_io(blkif_t *blkif,
+				 blkif_request_t *req,
+				 pending_req_t *pending_req)
 {
 	extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
 	int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
 	unsigned long fas = 0;
-	int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
-	pending_req_t *pending_req;
 	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	struct phys_req preq;
 	struct { 
@@ -345,31 +355,35 @@
 	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	unsigned int nseg;
 	struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-	int nbio = 0;
-	request_queue_t *q;
-	int ret, errors = 0;
+	int ret, i, nbio = 0;
 
 	/* Check that number of segments is sane. */
 	nseg = req->nr_segments;
 	if (unlikely(nseg == 0) || 
 	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
 		DPRINTK("Bad number of segments in request (%d)\n", nseg);
-		goto bad_descriptor;
+		goto fail_response;
 	}
 
 	preq.dev           = req->handle;
 	preq.sector_number = req->sector_number;
 	preq.nr_sects      = 0;
 
+	pending_req->blkif     = blkif;
+	pending_req->id        = req->id;
+	pending_req->operation = operation;
+	pending_req->status    = BLKIF_RSP_OKAY;
+	pending_req->nr_pages  = nseg;
+
 	for (i = 0; i < nseg; i++) {
 		fas         = req->frame_and_sects[i];
 		seg[i].nsec = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;
 
 		if (seg[i].nsec <= 0)
-			goto bad_descriptor;
+			goto fail_response;
 		preq.nr_sects += seg[i].nsec;
 
-		map[i].host_addr = MMAP_VADDR(pending_idx, i);
+		map[i].host_addr = vaddr(pending_req, i);
 		map[i].dom = blkif->domid;
 		map[i].ref = blkif_gref_from_fas(fas);
 		map[i].flags = GNTMAP_host_map;
@@ -381,27 +395,23 @@
 	BUG_ON(ret);
 
 	for (i = 0; i < nseg; i++) {
-		if (likely(map[i].handle >= 0)) {
-			pending_handle(pending_idx, i) = map[i].handle;
+		if (unlikely(map[i].handle < 0)) {
+			DPRINTK("invalid buffer -- could not remap it\n");
+			goto fail_flush;
+		}
+
+		pending_handle(pending_req, i) = map[i].handle;
 #ifdef __ia64__
-			MMAP_VADDR(pending_idx,i) = gnttab_map_vaddr(map[i]);
+		pending_vaddrs[vaddr_pagenr(req, seg)] =
+			= gnttab_map_vaddr(map[i]);
 #else
-			set_phys_to_machine(__pa(MMAP_VADDR(
-				pending_idx, i)) >> PAGE_SHIFT,
-				FOREIGN_FRAME(map[i].dev_bus_addr>>PAGE_SHIFT));
+		set_phys_to_machine(__pa(vaddr(
+			pending_req, i)) >> PAGE_SHIFT,
+			FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
 #endif
-			fas        = req->frame_and_sects[i];
-			seg[i].buf = map[i].dev_bus_addr | 
-				(blkif_first_sect(fas) << 9);
-		} else {
-			errors++;
-		}
-	}
-
-	if (errors) {
-		DPRINTK("invalid buffer -- could not remap it\n");
-		fast_flush_area(pending_idx, nseg);
-		goto bad_descriptor;
+		fas         = req->frame_and_sects[i];
+		seg[i].buf  = map[i].dev_bus_addr | 
+			(blkif_first_sect(fas) << 9);
 	}
 
 	if (vbd_translate(&preq, blkif, operation) != 0) {
@@ -409,37 +419,25 @@
 			operation == READ ? "read" : "write",
 			preq.sector_number,
 			preq.sector_number + preq.nr_sects, preq.dev); 
-		goto bad_descriptor;
-	}
-
-	pending_req = &pending_reqs[pending_idx];
-	pending_req->blkif     = blkif;
-	pending_req->id        = req->id;
-	pending_req->operation = operation;
-	pending_req->status    = BLKIF_RSP_OKAY;
-	pending_req->nr_pages  = nseg;
+		goto fail_flush;
+	}
 
 	for (i = 0; i < nseg; i++) {
 		if (((int)preq.sector_number|(int)seg[i].nsec) &
 		    ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
 			DPRINTK("Misaligned I/O request from domain %d",
 				blkif->domid);
-			goto cleanup_and_fail;
+			goto fail_put_bio;
 		}
 
 		while ((bio == NULL) ||
 		       (bio_add_page(bio,
-				     virt_to_page(MMAP_VADDR(pending_idx, i)),
+				     virt_to_page(vaddr(pending_req, i)),
 				     seg[i].nsec << 9,
 				     seg[i].buf & ~PAGE_MASK) == 0)) {
 			bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
-			if (unlikely(bio == NULL)) {
-			cleanup_and_fail:
-				for (i = 0; i < (nbio-1); i++)
-					bio_put(biolist[i]);
-				fast_flush_area(pending_idx, nseg);
-				goto bad_descriptor;
-			}
+			if (unlikely(bio == NULL))
+				goto fail_put_bio;
                 
 			bio->bi_bdev    = preq.bdev;
 			bio->bi_private = pending_req;
@@ -450,14 +448,8 @@
 		preq.sector_number += seg[i].nsec;
 	}
 
-	if ((q = bdev_get_queue(bio->bi_bdev)) != plugged_queue) {
-		flush_plugged_queue();
-		blk_get_queue(q);
-		plugged_queue = q;
-	}
-
+	plug_queue(blkif, bio);
 	atomic_set(&pending_req->pendcnt, nbio);
-	pending_cons++;
 	blkif_get(blkif);
 
 	for (i = 0; i < nbio; i++)
@@ -465,8 +457,14 @@
 
 	return;
 
- bad_descriptor:
+ fail_put_bio:
+	for (i = 0; i < (nbio-1); i++)
+		bio_put(biolist[i]);
+ fail_flush:
+	fast_flush_area(pending_req);
+ fail_response:
 	make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+	free_req(pending_req);
 } 
 
 
@@ -498,56 +496,50 @@
 	notify_remote_via_irq(blkif->irq);
 }
 
-void blkif_deschedule(blkif_t *blkif)
-{
-	remove_from_blkdev_list(blkif);
-}
-
 static int __init blkif_init(void)
 {
+	struct page *page;
 	int i;
-	struct page *page;
-	int ret;
-
-	for (i = 0; i < MMAP_PAGES; i++)
-		pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
 
 	if (xen_init() < 0)
 		return -ENODEV;
 
+	mmap_pages            = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	pending_reqs          = kmalloc(sizeof(pending_reqs[0]) *
+					blkif_reqs, GFP_KERNEL);
+	pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
+					mmap_pages, GFP_KERNEL);
+	pending_vaddrs        = kmalloc(sizeof(pending_vaddrs[0]) *
+					mmap_pages, GFP_KERNEL);
+	if (!pending_reqs || !pending_grant_handles || !pending_vaddrs) {
+		printk("%s: out of memory\n", __FUNCTION__);
+		return -1;
+	}
+
 	blkif_interface_init();
-
+	
 #ifdef __ia64__
-    {
 	extern unsigned long alloc_empty_foreign_map_page_range(unsigned long pages);
-	int i;
-
-	mmap_vstart =  alloc_empty_foreign_map_page_range(MMAP_PAGES);
-	printk("Allocated mmap_vstart: 0x%lx\n", mmap_vstart);
-	for(i = 0; i < MMAP_PAGES; i++)
-	    pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
-	BUG_ON(mmap_vstart == NULL);
-    }
-#else
-	page = balloon_alloc_empty_page_range(MMAP_PAGES);
+	mmap_vstart = (unsigned long)alloc_empty_foreign_map_page_range(mmap_pages);
+#else /* ! ia64 */
+	page = balloon_alloc_empty_page_range(mmap_pages);
 	BUG_ON(page == NULL);
 	mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
 #endif
-
-	pending_cons = 0;
-	pending_prod = MAX_PENDING_REQS;
+	printk("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n",
+	       __FUNCTION__, blkif_reqs, mmap_pages, mmap_vstart);
+	BUG_ON(mmap_vstart == 0);
+	for (i = 0; i < mmap_pages; i++)
+		pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
+
+	memset(pending_grant_handles,  BLKBACK_INVALID_HANDLE, mmap_pages);
 	memset(pending_reqs, 0, sizeof(pending_reqs));
-	for (i = 0; i < MAX_PENDING_REQS; i++)
-		pending_ring[i] = i;
+	INIT_LIST_HEAD(&pending_free);
+
+	for (i = 0; i < blkif_reqs; i++)
+		list_add_tail(&pending_reqs[i].free_list, &pending_free);
     
-	spin_lock_init(&blkio_schedule_list_lock);
-	INIT_LIST_HEAD(&blkio_schedule_list);
-
-	ret = kernel_thread(blkio_schedule, 0, CLONE_FS | CLONE_FILES);
-	BUG_ON(ret < 0);
-
 	blkif_xenbus_init();
-
 	return 0;
 }
 
diff -r 090e44133d40 linux-2.6-xen-sparse/drivers/xen/blkback/common.h
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/common.h	Mon Nov 14 17:13:38 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/common.h	Tue Nov 15 18:17:26 2005
@@ -56,9 +56,19 @@
 	/* Is this a blktap frontend */
 	unsigned int     is_blktap;
 #endif
-	struct list_head blkdev_list;
 	spinlock_t       blk_ring_lock;
 	atomic_t         refcnt;
+
+	wait_queue_head_t   wq;
+	struct task_struct  *xenblkd;
+	atomic_t            io_pending;
+	request_queue_t     *plug;
+
+	/* statistics */
+	unsigned long       st_print;
+	int                 st_rd_req;
+	int                 st_wr_req;
+	int                 st_oo_req;
 
 	struct work_struct free_work;
 
@@ -97,11 +107,10 @@
 
 void blkif_interface_init(void);
 
-void blkif_deschedule(blkif_t *blkif);
-
 void blkif_xenbus_init(void);
 
 irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+int blkif_schedule(void *arg);
 
 #endif /* __BLKIF__BACKEND__COMMON_H__ */
 
diff -r 090e44133d40 linux-2.6-xen-sparse/drivers/xen/blkback/interface.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c	Mon Nov 14 17:13:38 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c	Tue Nov 15 18:17:26 2005
@@ -24,6 +24,8 @@
 	blkif->status = DISCONNECTED;
 	spin_lock_init(&blkif->blk_ring_lock);
 	atomic_set(&blkif->refcnt, 1);
+	init_waitqueue_head(&blkif->wq);
+	blkif->st_print = jiffies;
 
 	return blkif;
 }
@@ -113,6 +115,7 @@
 	blkif->irq = bind_evtchn_to_irqhandler(
 		blkif->evtchn, blkif_be_int, 0, "blkif-backend", blkif);
 	blkif->status = CONNECTED;
+	wake_up(&blkif->wq);
 
 	return 0;
 }
diff -r 090e44133d40 linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c	Mon Nov 14 17:13:38 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c	Tue Nov 15 18:17:26 2005
@@ -20,6 +20,7 @@
 
 #include <stdarg.h>
 #include <linux/module.h>
+#include <linux/kthread.h>
 #include <asm-xen/xenbus.h>
 #include "common.h"
 
@@ -61,6 +62,8 @@
 		be->backend_watch.node = NULL;
 	}
 	if (be->blkif) {
+		if (be->blkif->xenblkd)
+			kthread_stop(be->blkif->xenblkd);
 		blkif_put(be->blkif);
 		be->blkif = NULL;
 	}
@@ -175,6 +178,16 @@
 			be->pdev = 0L;
 			xenbus_dev_fatal(dev, err,
 					 "creating vbd structure");
+			return;
+		}
+
+		be->blkif->xenblkd = kthread_run(blkif_schedule, be->blkif,
+						 "xenblkd %d/%04lx",
+						 be->blkif->domid, be->pdev);
+		if (IS_ERR(be->blkif->xenblkd)) {
+			err = PTR_ERR(be->blkif->xenblkd);
+			be->blkif->xenblkd = NULL;
+			xenbus_dev_error(dev, err, "start xenblkd");
 			return;
 		}
 

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [patch] CFQ for xen domains
  2005-11-15 17:51                   ` Gerd Knorr
@ 2005-11-24 11:31                     ` Gerd Knorr
  2005-11-24 23:00                       ` Keir Fraser
  2005-12-06 15:47                       ` Gerd Knorr
  0 siblings, 2 replies; 5+ messages in thread
From: Gerd Knorr @ 2005-11-24 11:31 UTC (permalink / raw)
  To: Gerd Knorr; +Cc: Ian Pratt, xen-devel, Chris Mason, Kurt Garloff, Jens Axboe

[-- Attachment #1: Type: text/plain, Size: 1212 bytes --]

   Hi folks,

New version of the patch, adapted to apply cleanly against the latest
unstable tree.  Debug output is runtime-switchable now; otherwise there
are no major changes from the last version.  I've been running my xen
machine all day with that patch without any issues showing up, and
save/restore works.  Any chance to get it merged?

Full list of changes:

   * One thread per blkif.  The I/O scheduler can do a better job that
     way, and you can also use ionice on the blkback threads to adjust
     the block I/O priorities for the domain.
   * Various stuff has been moved from global variables into blkif_t.
   * The scary allocation ring for pending_req's is gone and has been
     replaced by a free list.
   * made dispatch_rw_block_io() reentrant.
   * general linux coding style cleanup, at least for the code I've
     touched anyway.
   * number of outstanding requests is runtime-configurable now.
   * made the ia64 #ifdefs smaller and dropped one.  It should still
     work on ia64 in theory, but it would be great if the ia64 folks
     could have a look ...
   * re-added the xen_init() call, which got lost by mistake (pointed
     out by the ia64 guys).
   * runtime-switchable stats and debug output.

cheers,

   Gerd

[-- Attachment #2: blkback-7922-16.diff --]
[-- Type: text/x-patch, Size: 23686 bytes --]

diff -r 6a666940fa04 linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c	Sun Nov 20 09:19:38 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c	Mon Nov 21 11:21:46 2005
@@ -12,6 +12,8 @@
  */
 
 #include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
 #include <asm-xen/balloon.h>
 #include <asm/hypervisor.h>
 #include "common.h"
@@ -21,26 +23,26 @@
  * pulled from a communication ring are quite likely to end up being part of
  * the same scatter/gather request at the disc.
  * 
- * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
+ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
+ * 
  * This will increase the chances of being able to write whole tracks.
  * 64 should be enough to keep us competitive with Linux.
  */
-#define MAX_PENDING_REQS 64
-#define BATCH_PER_DOMAIN 16
-
-static unsigned long mmap_vstart;
-#define MMAP_PAGES						\
-	(MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
-#ifdef __ia64__
-static void *pending_vaddrs[MMAP_PAGES];
-#define MMAP_VADDR(_idx, _i) \
-	(unsigned long)(pending_vaddrs[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#else
-#define MMAP_VADDR(_req,_seg)						\
-	(mmap_vstart +							\
-	 ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +	\
-	 ((_seg) * PAGE_SIZE))
-#endif
+static int blkif_reqs = 64;
+static int mmap_pages;
+
+static int __init set_blkif_reqs(char *str)
+{
+	get_option(&str, &blkif_reqs);
+	return 1;
+}
+__setup("blkif_reqs=", set_blkif_reqs);
+
+/* runtime-switchable, check /sys/module/blkback/parameters/ ;) */
+static unsigned int log_stats = 0;
+static unsigned int debug_lvl = 0;
+module_param(log_stats, int, 0644);
+module_param(debug_lvl, int, 0644);
 
 /*
  * Each outstanding request that we've passed to the lower device layers has a 
@@ -55,43 +57,38 @@
 	atomic_t       pendcnt;
 	unsigned short operation;
 	int            status;
+	struct list_head free_list;
 } pending_req_t;
 
-/*
- * We can't allocate pending_req's in order, since they may complete out of 
- * order. We therefore maintain an allocation ring. This ring also indicates 
- * when enough work has been passed down -- at that point the allocation ring 
- * will be empty.
- */
-static pending_req_t pending_reqs[MAX_PENDING_REQS];
-static unsigned char pending_ring[MAX_PENDING_REQS];
-static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
-/* NB. We use a different index type to differentiate from shared blk rings. */
-typedef unsigned int PEND_RING_IDX;
-#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
-static PEND_RING_IDX pending_prod, pending_cons;
-#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
-
-static request_queue_t *plugged_queue;
-static inline void flush_plugged_queue(void)
-{
-	request_queue_t *q = plugged_queue;
-	if (q != NULL) {
-		if ( q->unplug_fn != NULL )
-			q->unplug_fn(q);
-		blk_put_queue(q);
-		plugged_queue = NULL;
-	}
-}
+static pending_req_t *pending_reqs;
+static struct list_head pending_free;
+static spinlock_t pending_free_lock = SPIN_LOCK_UNLOCKED;
+static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
+
+#define BLKBACK_INVALID_HANDLE (0xFFFF)
+
+static unsigned long mmap_vstart;
+static unsigned long *pending_vaddrs;
+static u16 *pending_grant_handles;
+
+static inline int vaddr_pagenr(pending_req_t *req, int seg)
+{
+	return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
+}
+
+static inline unsigned long vaddr(pending_req_t *req, int seg)
+{
+	return pending_vaddrs[vaddr_pagenr(req, seg)];
+}
+
+#define pending_handle(_req, _seg) \
+	(pending_grant_handles[vaddr_pagenr(_req, _seg)])
+
 
 /* When using grant tables to map a frame for device access then the
  * handle returned must be used to unmap the frame. This is needed to
  * drop the ref count on the frame.
  */
-static u16 pending_grant_handles[MMAP_PAGES];
-#define pending_handle(_idx, _i) \
-    (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#define BLKBACK_INVALID_HANDLE (0xFFFF)
 
 #ifdef CONFIG_XEN_BLKDEV_TAP_BE
 /*
@@ -105,26 +102,79 @@
 static inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
 #endif
 
-static int do_block_io_op(blkif_t *blkif, int max_to_do);
-static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
+static int do_block_io_op(blkif_t *blkif);
+static void dispatch_rw_block_io(blkif_t *blkif,
+				 blkif_request_t *req,
+				 pending_req_t *pending_req);
 static void make_response(blkif_t *blkif, unsigned long id, 
                           unsigned short op, int st);
 
-static void fast_flush_area(int idx, int nr_pages)
+/******************************************************************
+ * misc small helpers
+ */
+static pending_req_t* alloc_req(void)
+{
+	pending_req_t *req = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pending_free_lock, flags);
+	if (!list_empty(&pending_free)) {
+		req = list_entry(pending_free.next, pending_req_t, free_list);
+		list_del(&req->free_list);
+	}
+	spin_unlock_irqrestore(&pending_free_lock, flags);
+	return req;
+}
+
+static void free_req(pending_req_t *req)
+{
+	unsigned long flags;
+	int was_empty;
+
+	spin_lock_irqsave(&pending_free_lock, flags);
+	was_empty = list_empty(&pending_free);
+	list_add(&req->free_list, &pending_free);
+	spin_unlock_irqrestore(&pending_free_lock, flags);
+	if (was_empty)
+		wake_up(&pending_free_wq);
+}
+
+static void unplug_queue(blkif_t *blkif)
+{
+	if (NULL == blkif->plug)
+		return;
+	if (blkif->plug->unplug_fn)
+		blkif->plug->unplug_fn(blkif->plug);
+	blk_put_queue(blkif->plug);
+	blkif->plug = NULL;
+}
+
+static void plug_queue(blkif_t *blkif, struct bio *bio)
+{
+	request_queue_t *q = bdev_get_queue(bio->bi_bdev);
+
+	if (q == blkif->plug)
+		return;
+	unplug_queue(blkif);
+	blk_get_queue(q);
+	blkif->plug = q;
+}
+
+static void fast_flush_area(pending_req_t *req)
 {
 	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	unsigned int i, invcount = 0;
 	u16 handle;
 	int ret;
 
-	for (i = 0; i < nr_pages; i++) {
-		handle = pending_handle(idx, i);
+	for (i = 0; i < req->nr_pages; i++) {
+		handle = pending_handle(req, i);
 		if (handle == BLKBACK_INVALID_HANDLE)
 			continue;
-		unmap[invcount].host_addr    = MMAP_VADDR(idx, i);
+		unmap[invcount].host_addr    = vaddr(req, i);
 		unmap[invcount].dev_bus_addr = 0;
 		unmap[invcount].handle       = handle;
-		pending_handle(idx, i) = BLKBACK_INVALID_HANDLE;
+		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
 		invcount++;
 	}
 
@@ -133,109 +183,79 @@
 	BUG_ON(ret);
 }
 
-
-/******************************************************************
- * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
- */
-
-static struct list_head blkio_schedule_list;
-static spinlock_t blkio_schedule_list_lock;
-
-static int __on_blkdev_list(blkif_t *blkif)
-{
-	return blkif->blkdev_list.next != NULL;
-}
-
-static void remove_from_blkdev_list(blkif_t *blkif)
-{
-	unsigned long flags;
-
-	if (!__on_blkdev_list(blkif))
-		return;
-
-	spin_lock_irqsave(&blkio_schedule_list_lock, flags);
-	if (__on_blkdev_list(blkif)) {
-		list_del(&blkif->blkdev_list);
-		blkif->blkdev_list.next = NULL;
-		blkif_put(blkif);
-	}
-	spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-static void add_to_blkdev_list_tail(blkif_t *blkif)
-{
-	unsigned long flags;
-
-	if (__on_blkdev_list(blkif))
-		return;
-
-	spin_lock_irqsave(&blkio_schedule_list_lock, flags);
-	if (!__on_blkdev_list(blkif) && (blkif->status == CONNECTED)) {
-		list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
-		blkif_get(blkif);
-	}
-	spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-
 /******************************************************************
  * SCHEDULER FUNCTIONS
  */
 
-static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
-
-static int blkio_schedule(void *arg)
-{
-	DECLARE_WAITQUEUE(wq, current);
-
-	blkif_t          *blkif;
-	struct list_head *ent;
-
-	daemonize("xenblkd");
-
+static void print_stats(blkif_t *blkif)
+{
+	printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
+	       current->comm, blkif->st_oo_req,
+	       blkif->st_rd_req, blkif->st_wr_req);
+	blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+	blkif->st_rd_req = 0;
+	blkif->st_wr_req = 0;
+	blkif->st_oo_req = 0;
+}
+
+int blkif_schedule(void *arg)
+{
+	blkif_t          *blkif = arg;
+
+	blkif_get(blkif);
+	if (debug_lvl)
+		printk(KERN_DEBUG "%s: started\n", current->comm);
 	for (;;) {
-		/* Wait for work to do. */
-		add_wait_queue(&blkio_schedule_wait, &wq);
-		set_current_state(TASK_INTERRUPTIBLE);
-		if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || 
-		     list_empty(&blkio_schedule_list) )
-			schedule();
-		__set_current_state(TASK_RUNNING);
-		remove_wait_queue(&blkio_schedule_wait, &wq);
-
-		/* Queue up a batch of requests. */
-		while ((NR_PENDING_REQS < MAX_PENDING_REQS) &&
-		       !list_empty(&blkio_schedule_list)) {
-			ent = blkio_schedule_list.next;
-			blkif = list_entry(ent, blkif_t, blkdev_list);
-			blkif_get(blkif);
-			remove_from_blkdev_list(blkif);
-			if (do_block_io_op(blkif, BATCH_PER_DOMAIN))
-				add_to_blkdev_list_tail(blkif);
-			blkif_put(blkif);
-		}
-
-		/* Push the batch through to disc. */
-		flush_plugged_queue();
-	}
-}
-
-static void maybe_trigger_blkio_schedule(void)
-{
-	/*
-	 * Needed so that two processes, which together make the following
-	 * predicate true, don't both read stale values and evaluate the
-	 * predicate incorrectly. Incredibly unlikely to stall the scheduler
-	 * on x86, but...
-	 */
-	smp_mb();
-
-	if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
-	    !list_empty(&blkio_schedule_list))
-		wake_up(&blkio_schedule_wait);
-}
-
-
+		if (kthread_should_stop()) {
+			/* asked to quit? */
+			if (!atomic_read(&blkif->io_pending))
+				break;
+			if (debug_lvl)
+				printk(KERN_DEBUG "%s: I/O pending, delaying exit\n",
+				       current->comm);
+		}
+
+		if (!atomic_read(&blkif->io_pending)) {
+			/* Wait for work to do. */
+			wait_event_interruptible(blkif->wq,
+						 atomic_read(&blkif->io_pending) ||
+						 kthread_should_stop());
+		} else if (list_empty(&pending_free)) {
+			/* Wait for pending_req becoming available. */
+			wait_event_interruptible(pending_free_wq,
+						 !list_empty(&pending_free));
+		}
+
+		if (blkif->status != CONNECTED) {
+			/* make sure we are connected */
+			if (debug_lvl)
+				printk(KERN_DEBUG "%s: not connected (%d pending)\n",
+				       current->comm, atomic_read(&blkif->io_pending));
+			wait_event_interruptible(blkif->wq,
+						 blkif->status != CONNECTED ||
+						 kthread_should_stop());
+			continue;
+		}
+
+		/* Schedule I/O */
+		atomic_set(&blkif->io_pending, 0);
+		if (do_block_io_op(blkif))
+			atomic_inc(&blkif->io_pending);
+		unplug_queue(blkif);
+
+		if (log_stats && time_after(jiffies, blkif->st_print))
+			print_stats(blkif);
+	}
+
+	/* bye folks, and thanks for all the fish ;) */
+	if (log_stats)
+		print_stats(blkif);
+	if (debug_lvl)
+		printk(KERN_DEBUG "%s: exiting\n", current->comm);
+	blkif->xenblkd = NULL;
+	blkif_put(blkif);
+	return 0;
+}
 
 /******************************************************************
  * COMPLETION CALLBACK -- Called as bh->b_end_io()
@@ -243,8 +263,6 @@
 
 static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
 {
-	unsigned long flags;
-
 	/* An error fails the entire request. */
 	if (!uptodate) {
 		DPRINTK("Buffer not up-to-date at end of operation\n");
@@ -252,15 +270,11 @@
 	}
 
 	if (atomic_dec_and_test(&pending_req->pendcnt)) {
-		int pending_idx = pending_req - pending_reqs;
-		fast_flush_area(pending_idx, pending_req->nr_pages);
+		fast_flush_area(pending_req);
 		make_response(pending_req->blkif, pending_req->id,
 			      pending_req->operation, pending_req->status);
 		blkif_put(pending_req->blkif);
-		spin_lock_irqsave(&pend_prod_lock, flags);
-		pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
-		spin_unlock_irqrestore(&pend_prod_lock, flags);
-		maybe_trigger_blkio_schedule();
+		free_req(pending_req);
 	}
 }
 
@@ -281,8 +295,9 @@
 irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
 {
 	blkif_t *blkif = dev_id;
-	add_to_blkdev_list_tail(blkif);
-	maybe_trigger_blkio_schedule();
+
+	atomic_inc(&blkif->io_pending);
+	wake_up(&blkif->wq);
 	return IRQ_HANDLED;
 }
 
@@ -292,10 +307,11 @@
  * DOWNWARD CALLS -- These interface with the block-device layer proper.
  */
 
-static int do_block_io_op(blkif_t *blkif, int max_to_do)
+static int do_block_io_op(blkif_t *blkif)
 {
 	blkif_back_ring_t *blk_ring = &blkif->blk_ring;
 	blkif_request_t *req;
+	pending_req_t *pending_req;
 	RING_IDX i, rp;
 	int more_to_do = 0;
 
@@ -305,24 +321,30 @@
 	for (i = blk_ring->req_cons; 
 	     (i != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, i);
 	     i++) {
-		if ((max_to_do-- == 0) ||
-		    (NR_PENDING_REQS == MAX_PENDING_REQS)) {
+
+		pending_req = alloc_req();
+		if (NULL == pending_req) {
+			blkif->st_oo_req++;
 			more_to_do = 1;
 			break;
 		}
-        
+
 		req = RING_GET_REQUEST(blk_ring, i);
 		switch (req->operation) {
 		case BLKIF_OP_READ:
+			blkif->st_rd_req++;
+			dispatch_rw_block_io(blkif, req, pending_req);
+			break;
 		case BLKIF_OP_WRITE:
-			dispatch_rw_block_io(blkif, req);
+			blkif->st_wr_req++;
+			dispatch_rw_block_io(blkif, req, pending_req);
 			break;
-
 		default:
 			DPRINTK("error: unknown block io operation [%d]\n",
 				req->operation);
 			make_response(blkif, req->id, req->operation,
 				      BLKIF_RSP_ERROR);
+			free_req(pending_req);
 			break;
 		}
 	}
@@ -331,13 +353,13 @@
 	return more_to_do;
 }
 
-static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
+static void dispatch_rw_block_io(blkif_t *blkif,
+				 blkif_request_t *req,
+				 pending_req_t *pending_req)
 {
 	extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
 	int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
 	unsigned long fas = 0;
-	int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
-	pending_req_t *pending_req;
 	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	struct phys_req preq;
 	struct { 
@@ -345,31 +367,35 @@
 	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	unsigned int nseg;
 	struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-	int nbio = 0;
-	request_queue_t *q;
-	int ret, errors = 0;
+	int ret, i, nbio = 0;
 
 	/* Check that number of segments is sane. */
 	nseg = req->nr_segments;
 	if (unlikely(nseg == 0) || 
 	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
 		DPRINTK("Bad number of segments in request (%d)\n", nseg);
-		goto bad_descriptor;
+		goto fail_response;
 	}
 
 	preq.dev           = req->handle;
 	preq.sector_number = req->sector_number;
 	preq.nr_sects      = 0;
 
+	pending_req->blkif     = blkif;
+	pending_req->id        = req->id;
+	pending_req->operation = operation;
+	pending_req->status    = BLKIF_RSP_OKAY;
+	pending_req->nr_pages  = nseg;
+
 	for (i = 0; i < nseg; i++) {
 		fas         = req->frame_and_sects[i];
 		seg[i].nsec = blkif_last_sect(fas) - blkif_first_sect(fas) + 1;
 
 		if (seg[i].nsec <= 0)
-			goto bad_descriptor;
+			goto fail_response;
 		preq.nr_sects += seg[i].nsec;
 
-		map[i].host_addr = MMAP_VADDR(pending_idx, i);
+		map[i].host_addr = vaddr(pending_req, i);
 		map[i].dom = blkif->domid;
 		map[i].ref = blkif_gref_from_fas(fas);
 		map[i].flags = GNTMAP_host_map;
@@ -381,27 +407,23 @@
 	BUG_ON(ret);
 
 	for (i = 0; i < nseg; i++) {
-		if (likely(map[i].handle >= 0)) {
-			pending_handle(pending_idx, i) = map[i].handle;
+		if (unlikely(map[i].handle < 0)) {
+			DPRINTK("invalid buffer -- could not remap it\n");
+			goto fail_flush;
+		}
+
+		pending_handle(pending_req, i) = map[i].handle;
 #ifdef __ia64__
-			MMAP_VADDR(pending_idx,i) = gnttab_map_vaddr(map[i]);
+		pending_vaddrs[vaddr_pagenr(req, seg)] =
+			= gnttab_map_vaddr(map[i]);
 #else
-			set_phys_to_machine(__pa(MMAP_VADDR(
-				pending_idx, i)) >> PAGE_SHIFT,
-				FOREIGN_FRAME(map[i].dev_bus_addr>>PAGE_SHIFT));
+		set_phys_to_machine(__pa(vaddr(
+			pending_req, i)) >> PAGE_SHIFT,
+			FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
 #endif
-			fas        = req->frame_and_sects[i];
-			seg[i].buf = map[i].dev_bus_addr | 
-				(blkif_first_sect(fas) << 9);
-		} else {
-			errors++;
-		}
-	}
-
-	if (errors) {
-		DPRINTK("invalid buffer -- could not remap it\n");
-		fast_flush_area(pending_idx, nseg);
-		goto bad_descriptor;
+		fas         = req->frame_and_sects[i];
+		seg[i].buf  = map[i].dev_bus_addr | 
+			(blkif_first_sect(fas) << 9);
 	}
 
 	if (vbd_translate(&preq, blkif, operation) != 0) {
@@ -409,37 +431,25 @@
 			operation == READ ? "read" : "write",
 			preq.sector_number,
 			preq.sector_number + preq.nr_sects, preq.dev); 
-		goto bad_descriptor;
-	}
-
-	pending_req = &pending_reqs[pending_idx];
-	pending_req->blkif     = blkif;
-	pending_req->id        = req->id;
-	pending_req->operation = operation;
-	pending_req->status    = BLKIF_RSP_OKAY;
-	pending_req->nr_pages  = nseg;
+		goto fail_flush;
+	}
 
 	for (i = 0; i < nseg; i++) {
 		if (((int)preq.sector_number|(int)seg[i].nsec) &
 		    ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
 			DPRINTK("Misaligned I/O request from domain %d",
 				blkif->domid);
-			goto cleanup_and_fail;
+			goto fail_put_bio;
 		}
 
 		while ((bio == NULL) ||
 		       (bio_add_page(bio,
-				     virt_to_page(MMAP_VADDR(pending_idx, i)),
+				     virt_to_page(vaddr(pending_req, i)),
 				     seg[i].nsec << 9,
 				     seg[i].buf & ~PAGE_MASK) == 0)) {
 			bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
-			if (unlikely(bio == NULL)) {
-			cleanup_and_fail:
-				for (i = 0; i < (nbio-1); i++)
-					bio_put(biolist[i]);
-				fast_flush_area(pending_idx, nseg);
-				goto bad_descriptor;
-			}
+			if (unlikely(bio == NULL))
+				goto fail_put_bio;
                 
 			bio->bi_bdev    = preq.bdev;
 			bio->bi_private = pending_req;
@@ -450,14 +460,8 @@
 		preq.sector_number += seg[i].nsec;
 	}
 
-	if ((q = bdev_get_queue(bio->bi_bdev)) != plugged_queue) {
-		flush_plugged_queue();
-		blk_get_queue(q);
-		plugged_queue = q;
-	}
-
+	plug_queue(blkif, bio);
 	atomic_set(&pending_req->pendcnt, nbio);
-	pending_cons++;
 	blkif_get(blkif);
 
 	for (i = 0; i < nbio; i++)
@@ -465,8 +469,14 @@
 
 	return;
 
- bad_descriptor:
+ fail_put_bio:
+	for (i = 0; i < (nbio-1); i++)
+		bio_put(biolist[i]);
+ fail_flush:
+	fast_flush_area(pending_req);
+ fail_response:
 	make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+	free_req(pending_req);
 } 
 
 
@@ -498,56 +508,50 @@
 	notify_remote_via_irq(blkif->irq);
 }
 
-void blkif_deschedule(blkif_t *blkif)
-{
-	remove_from_blkdev_list(blkif);
-}
-
 static int __init blkif_init(void)
 {
+	struct page *page;
 	int i;
-	struct page *page;
-	int ret;
-
-	for (i = 0; i < MMAP_PAGES; i++)
-		pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
 
 	if (xen_init() < 0)
 		return -ENODEV;
 
+	mmap_pages            = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	pending_reqs          = kmalloc(sizeof(pending_reqs[0]) *
+					blkif_reqs, GFP_KERNEL);
+	pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
+					mmap_pages, GFP_KERNEL);
+	pending_vaddrs        = kmalloc(sizeof(pending_vaddrs[0]) *
+					mmap_pages, GFP_KERNEL);
+	if (!pending_reqs || !pending_grant_handles || !pending_vaddrs) {
+		printk("%s: out of memory\n", __FUNCTION__);
+		return -1;
+	}
+
 	blkif_interface_init();
-
+	
 #ifdef __ia64__
-    {
 	extern unsigned long alloc_empty_foreign_map_page_range(unsigned long pages);
-	int i;
-
-	mmap_vstart =  alloc_empty_foreign_map_page_range(MMAP_PAGES);
-	printk("Allocated mmap_vstart: 0x%lx\n", mmap_vstart);
-	for(i = 0; i < MMAP_PAGES; i++)
-	    pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
-	BUG_ON(mmap_vstart == NULL);
-    }
-#else
-	page = balloon_alloc_empty_page_range(MMAP_PAGES);
+	mmap_vstart = (unsigned long)alloc_empty_foreign_map_page_range(mmap_pages);
+#else /* ! ia64 */
+	page = balloon_alloc_empty_page_range(mmap_pages);
 	BUG_ON(page == NULL);
 	mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
 #endif
-
-	pending_cons = 0;
-	pending_prod = MAX_PENDING_REQS;
+	printk("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n",
+	       __FUNCTION__, blkif_reqs, mmap_pages, mmap_vstart);
+	BUG_ON(mmap_vstart == 0);
+	for (i = 0; i < mmap_pages; i++)
+		pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
+
+	memset(pending_grant_handles,  BLKBACK_INVALID_HANDLE, mmap_pages);
 	memset(pending_reqs, 0, sizeof(pending_reqs));
-	for (i = 0; i < MAX_PENDING_REQS; i++)
-		pending_ring[i] = i;
+	INIT_LIST_HEAD(&pending_free);
+
+	for (i = 0; i < blkif_reqs; i++)
+		list_add_tail(&pending_reqs[i].free_list, &pending_free);
     
-	spin_lock_init(&blkio_schedule_list_lock);
-	INIT_LIST_HEAD(&blkio_schedule_list);
-
-	ret = kernel_thread(blkio_schedule, NULL, CLONE_FS | CLONE_FILES);
-	BUG_ON(ret < 0);
-
 	blkif_xenbus_init();
-
 	return 0;
 }
 
diff -r 6a666940fa04 linux-2.6-xen-sparse/drivers/xen/blkback/common.h
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/common.h	Sun Nov 20 09:19:38 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/common.h	Mon Nov 21 11:21:46 2005
@@ -56,9 +56,19 @@
 	/* Is this a blktap frontend */
 	unsigned int     is_blktap;
 #endif
-	struct list_head blkdev_list;
 	spinlock_t       blk_ring_lock;
 	atomic_t         refcnt;
+
+	wait_queue_head_t   wq;
+	struct task_struct  *xenblkd;
+	atomic_t            io_pending;
+	request_queue_t     *plug;
+
+	/* statistics */
+	unsigned long       st_print;
+	int                 st_rd_req;
+	int                 st_wr_req;
+	int                 st_oo_req;
 
 	struct work_struct free_work;
 
@@ -97,11 +107,10 @@
 
 void blkif_interface_init(void);
 
-void blkif_deschedule(blkif_t *blkif);
-
 void blkif_xenbus_init(void);
 
 irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+int blkif_schedule(void *arg);
 
 #endif /* __BLKIF__BACKEND__COMMON_H__ */
 
diff -r 6a666940fa04 linux-2.6-xen-sparse/drivers/xen/blkback/interface.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c	Sun Nov 20 09:19:38 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c	Mon Nov 21 11:21:46 2005
@@ -24,6 +24,8 @@
 	blkif->status = DISCONNECTED;
 	spin_lock_init(&blkif->blk_ring_lock);
 	atomic_set(&blkif->refcnt, 1);
+	init_waitqueue_head(&blkif->wq);
+	blkif->st_print = jiffies;
 
 	return blkif;
 }
@@ -113,6 +115,7 @@
 	blkif->irq = bind_evtchn_to_irqhandler(
 		blkif->evtchn, blkif_be_int, 0, "blkif-backend", blkif);
 	blkif->status = CONNECTED;
+	wake_up(&blkif->wq);
 
 	return 0;
 }
diff -r 6a666940fa04 linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c	Sun Nov 20 09:19:38 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c	Mon Nov 21 11:21:46 2005
@@ -20,6 +20,7 @@
 
 #include <stdarg.h>
 #include <linux/module.h>
+#include <linux/kthread.h>
 #include <asm-xen/xenbus.h>
 #include "common.h"
 
@@ -61,6 +62,8 @@
 		be->backend_watch.node = NULL;
 	}
 	if (be->blkif) {
+		if (be->blkif->xenblkd)
+			kthread_stop(be->blkif->xenblkd);
 		blkif_put(be->blkif);
 		be->blkif = NULL;
 	}
@@ -175,6 +178,16 @@
 			be->pdev = 0L;
 			xenbus_dev_fatal(dev, err,
 					 "creating vbd structure");
+			return;
+		}
+
+		be->blkif->xenblkd = kthread_run(blkif_schedule, be->blkif,
+						 "xenblkd %d/%04lx",
+						 be->blkif->domid, be->pdev);
+		if (IS_ERR(be->blkif->xenblkd)) {
+			err = PTR_ERR(be->blkif->xenblkd);
+			be->blkif->xenblkd = NULL;
+			xenbus_dev_error(dev, err, "start xenblkd");
 			return;
 		}
 

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [patch] CFQ for xen domains
  2005-11-24 11:31                     ` Gerd Knorr
@ 2005-11-24 23:00                       ` Keir Fraser
  2005-12-06 15:47                       ` Gerd Knorr
  1 sibling, 0 replies; 5+ messages in thread
From: Keir Fraser @ 2005-11-24 23:00 UTC (permalink / raw)
  To: Gerd Knorr; +Cc: Ian Pratt, xen-devel, Kurt Garloff, Chris Mason, Jens Axboe

On 24 Nov 2005, at 11:31, Gerd Knorr wrote:

> New version of the patch, adapted to apply cleanly against latest 
> unstable, also debug output is runtime-switchable now, no major 
> changes from last version.  I'm running my xen machine all day with 
> that patch, without any issues showing up, including working 
> save/restore.  Any chance to get it merged?

The patch is looking good, but we really want to limit patches to bug 
fixes right now, to rein in 3.0.0. The risk of small regressions and 
bugs creeping in, and the time required to thoroughly review the patch, 
are things we'd like to avoid for the next week or so. For example, I 
see that one of the recent fixes (initialisation of the 
pending_grant_handles array) isn't included in the current patch.

We'll be much more receptive after 3.0.0 is out the door. Improved QoS 
control is one of the targets for 3.0.x.

  -- Keir

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [patch] CFQ for xen domains
  2005-11-24 11:31                     ` Gerd Knorr
  2005-11-24 23:00                       ` Keir Fraser
@ 2005-12-06 15:47                       ` Gerd Knorr
  1 sibling, 0 replies; 5+ messages in thread
From: Gerd Knorr @ 2005-12-06 15:47 UTC (permalink / raw)
  To: Gerd Knorr; +Cc: Ian Pratt, xen-devel, Chris Mason, Kurt Garloff, Jens Axboe

[-- Attachment #1: Type: text/plain, Size: 173 bytes --]

Gerd Knorr wrote:
>   Hi folks,
> 
> New version of the patch, adapted to apply cleanly against latest 

One more version, this time against 3.0-final ;)

cheers,

   Gerd


[-- Attachment #2: blkback-8241-18.diff --]
[-- Type: text/x-patch, Size: 24591 bytes --]

diff -r 0255f48b757f linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c	Sun Dec  4 19:12:00 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c	Tue Dec  6 15:29:06 2005
@@ -12,6 +12,8 @@
  */
 
 #include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
 #include <asm-xen/balloon.h>
 #include <asm/hypervisor.h>
 #include "common.h"
@@ -21,26 +23,26 @@
  * pulled from a communication ring are quite likely to end up being part of
  * the same scatter/gather request at the disc.
  * 
- * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
+ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
+ * 
  * This will increase the chances of being able to write whole tracks.
  * 64 should be enough to keep us competitive with Linux.
  */
-#define MAX_PENDING_REQS 64
-#define BATCH_PER_DOMAIN 16
-
-static unsigned long mmap_vstart;
-#define MMAP_PAGES						\
-	(MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
-#ifdef __ia64__
-static void *pending_vaddrs[MMAP_PAGES];
-#define MMAP_VADDR(_idx, _i) \
-	(unsigned long)(pending_vaddrs[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#else
-#define MMAP_VADDR(_req,_seg)						\
-	(mmap_vstart +							\
-	 ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) +	\
-	 ((_seg) * PAGE_SIZE))
-#endif
+static int blkif_reqs = 64;
+static int mmap_pages;
+
+static int __init set_blkif_reqs(char *str)
+{
+	get_option(&str, &blkif_reqs);
+	return 1;
+}
+__setup("blkif_reqs=", set_blkif_reqs);
+
+/* runtime-switchable, check /sys/module/blkback/parameters/ ;) */
+static unsigned int log_stats = 0;
+static unsigned int debug_lvl = 0;
+module_param(log_stats, int, 0644);
+module_param(debug_lvl, int, 0644);
 
 /*
  * Each outstanding request that we've passed to the lower device layers has a 
@@ -55,43 +57,38 @@
 	atomic_t       pendcnt;
 	unsigned short operation;
 	int            status;
+	struct list_head free_list;
 } pending_req_t;
 
-/*
- * We can't allocate pending_req's in order, since they may complete out of 
- * order. We therefore maintain an allocation ring. This ring also indicates 
- * when enough work has been passed down -- at that point the allocation ring 
- * will be empty.
- */
-static pending_req_t pending_reqs[MAX_PENDING_REQS];
-static unsigned char pending_ring[MAX_PENDING_REQS];
-static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
-/* NB. We use a different index type to differentiate from shared blk rings. */
-typedef unsigned int PEND_RING_IDX;
-#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
-static PEND_RING_IDX pending_prod, pending_cons;
-#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
-
-static request_queue_t *plugged_queue;
-static inline void flush_plugged_queue(void)
-{
-	request_queue_t *q = plugged_queue;
-	if (q != NULL) {
-		if ( q->unplug_fn != NULL )
-			q->unplug_fn(q);
-		blk_put_queue(q);
-		plugged_queue = NULL;
-	}
-}
+static pending_req_t *pending_reqs;
+static struct list_head pending_free;
+static spinlock_t pending_free_lock = SPIN_LOCK_UNLOCKED;
+static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
+
+#define BLKBACK_INVALID_HANDLE (~0)
+
+static unsigned long mmap_vstart;
+static void **pending_vaddrs;
+static grant_handle_t *pending_grant_handles;
+
+static inline int vaddr_pagenr(pending_req_t *req, int seg)
+{
+	return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
+}
+
+static inline unsigned long vaddr(pending_req_t *req, int seg)
+{
+	return pending_vaddrs[vaddr_pagenr(req, seg)];
+}
+
+#define pending_handle(_req, _seg) \
+	(pending_grant_handles[vaddr_pagenr(_req, _seg)])
+
 
 /* When using grant tables to map a frame for device access then the
  * handle returned must be used to unmap the frame. This is needed to
  * drop the ref count on the frame.
  */
-static grant_handle_t pending_grant_handles[MMAP_PAGES];
-#define pending_handle(_idx, _i) \
-    (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#define BLKBACK_INVALID_HANDLE (~0)
 
 #ifdef CONFIG_XEN_BLKDEV_TAP_BE
 /*
@@ -105,26 +102,79 @@
 static inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
 #endif
 
-static int do_block_io_op(blkif_t *blkif, int max_to_do);
-static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
+static int do_block_io_op(blkif_t *blkif);
+static void dispatch_rw_block_io(blkif_t *blkif,
+				 blkif_request_t *req,
+				 pending_req_t *pending_req);
 static void make_response(blkif_t *blkif, unsigned long id, 
                           unsigned short op, int st);
 
-static void fast_flush_area(int idx, int nr_pages)
+/******************************************************************
+ * misc small helpers
+ */
+static pending_req_t* alloc_req(void)
+{
+	pending_req_t *req = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pending_free_lock, flags);
+	if (!list_empty(&pending_free)) {
+		req = list_entry(pending_free.next, pending_req_t, free_list);
+		list_del(&req->free_list);
+	}
+	spin_unlock_irqrestore(&pending_free_lock, flags);
+	return req;
+}
+
+static void free_req(pending_req_t *req)
+{
+	unsigned long flags;
+	int was_empty;
+
+	spin_lock_irqsave(&pending_free_lock, flags);
+	was_empty = list_empty(&pending_free);
+	list_add(&req->free_list, &pending_free);
+	spin_unlock_irqrestore(&pending_free_lock, flags);
+	if (was_empty)
+		wake_up(&pending_free_wq);
+}
+
+static void unplug_queue(blkif_t *blkif)
+{
+	if (NULL == blkif->plug)
+		return;
+	if (blkif->plug->unplug_fn)
+		blkif->plug->unplug_fn(blkif->plug);
+	blk_put_queue(blkif->plug);
+	blkif->plug = NULL;
+}
+
+static void plug_queue(blkif_t *blkif, struct bio *bio)
+{
+	request_queue_t *q = bdev_get_queue(bio->bi_bdev);
+
+	if (q == blkif->plug)
+		return;
+	unplug_queue(blkif);
+	blk_get_queue(q);
+	blkif->plug = q;
+}
+
+static void fast_flush_area(pending_req_t *req)
 {
 	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	unsigned int i, invcount = 0;
 	grant_handle_t handle;
 	int ret;
 
-	for (i = 0; i < nr_pages; i++) {
-		handle = pending_handle(idx, i);
+	for (i = 0; i < req->nr_pages; i++) {
+		handle = pending_handle(req, i);
 		if (handle == BLKBACK_INVALID_HANDLE)
 			continue;
-		unmap[invcount].host_addr    = MMAP_VADDR(idx, i);
+		unmap[invcount].host_addr    = vaddr(req, i);
 		unmap[invcount].dev_bus_addr = 0;
 		unmap[invcount].handle       = handle;
-		pending_handle(idx, i) = BLKBACK_INVALID_HANDLE;
+		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
 		invcount++;
 	}
 
@@ -133,109 +183,79 @@
 	BUG_ON(ret);
 }
 
-
-/******************************************************************
- * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
- */
-
-static struct list_head blkio_schedule_list;
-static spinlock_t blkio_schedule_list_lock;
-
-static int __on_blkdev_list(blkif_t *blkif)
-{
-	return blkif->blkdev_list.next != NULL;
-}
-
-static void remove_from_blkdev_list(blkif_t *blkif)
-{
-	unsigned long flags;
-
-	if (!__on_blkdev_list(blkif))
-		return;
-
-	spin_lock_irqsave(&blkio_schedule_list_lock, flags);
-	if (__on_blkdev_list(blkif)) {
-		list_del(&blkif->blkdev_list);
-		blkif->blkdev_list.next = NULL;
-		blkif_put(blkif);
-	}
-	spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-static void add_to_blkdev_list_tail(blkif_t *blkif)
-{
-	unsigned long flags;
-
-	if (__on_blkdev_list(blkif))
-		return;
-
-	spin_lock_irqsave(&blkio_schedule_list_lock, flags);
-	if (!__on_blkdev_list(blkif) && (blkif->status == CONNECTED)) {
-		list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
-		blkif_get(blkif);
-	}
-	spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-
 /******************************************************************
  * SCHEDULER FUNCTIONS
  */
 
-static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
-
-static int blkio_schedule(void *arg)
-{
-	DECLARE_WAITQUEUE(wq, current);
-
-	blkif_t          *blkif;
-	struct list_head *ent;
-
-	daemonize("xenblkd");
-
+static void print_stats(blkif_t *blkif)
+{
+	printk(KERN_DEBUG "%s: oo %3d  |  rd %4d  |  wr %4d\n",
+	       current->comm, blkif->st_oo_req,
+	       blkif->st_rd_req, blkif->st_wr_req);
+	blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+	blkif->st_rd_req = 0;
+	blkif->st_wr_req = 0;
+	blkif->st_oo_req = 0;
+}
+
+int blkif_schedule(void *arg)
+{
+	blkif_t          *blkif = arg;
+
+	blkif_get(blkif);
+	if (debug_lvl)
+		printk(KERN_DEBUG "%s: started\n", current->comm);
 	for (;;) {
-		/* Wait for work to do. */
-		add_wait_queue(&blkio_schedule_wait, &wq);
-		set_current_state(TASK_INTERRUPTIBLE);
-		if ( (NR_PENDING_REQS == MAX_PENDING_REQS) || 
-		     list_empty(&blkio_schedule_list) )
-			schedule();
-		__set_current_state(TASK_RUNNING);
-		remove_wait_queue(&blkio_schedule_wait, &wq);
-
-		/* Queue up a batch of requests. */
-		while ((NR_PENDING_REQS < MAX_PENDING_REQS) &&
-		       !list_empty(&blkio_schedule_list)) {
-			ent = blkio_schedule_list.next;
-			blkif = list_entry(ent, blkif_t, blkdev_list);
-			blkif_get(blkif);
-			remove_from_blkdev_list(blkif);
-			if (do_block_io_op(blkif, BATCH_PER_DOMAIN))
-				add_to_blkdev_list_tail(blkif);
-			blkif_put(blkif);
-		}
-
-		/* Push the batch through to disc. */
-		flush_plugged_queue();
-	}
-}
-
-static void maybe_trigger_blkio_schedule(void)
-{
-	/*
-	 * Needed so that two processes, which together make the following
-	 * predicate true, don't both read stale values and evaluate the
-	 * predicate incorrectly. Incredibly unlikely to stall the scheduler
-	 * on x86, but...
-	 */
-	smp_mb();
-
-	if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
-	    !list_empty(&blkio_schedule_list))
-		wake_up(&blkio_schedule_wait);
-}
-
-
+		if (kthread_should_stop()) {
+			/* asked to quit? */
+			if (!atomic_read(&blkif->io_pending))
+				break;
+			if (debug_lvl)
+				printk(KERN_DEBUG "%s: I/O pending, delaying exit\n",
+				       current->comm);
+		}
+
+		if (!atomic_read(&blkif->io_pending)) {
+			/* Wait for work to do. */
+			wait_event_interruptible(blkif->wq,
+						 atomic_read(&blkif->io_pending) ||
+						 kthread_should_stop());
+		} else if (list_empty(&pending_free)) {
+			/* Wait for pending_req becoming available. */
+			wait_event_interruptible(pending_free_wq,
+						 !list_empty(&pending_free));
+		}
+
+		if (blkif->status != CONNECTED) {
+			/* make sure we are connected */
+			if (debug_lvl)
+				printk(KERN_DEBUG "%s: not connected (%d pending)\n",
+				       current->comm, atomic_read(&blkif->io_pending));
+			wait_event_interruptible(blkif->wq,
+						 blkif->status != CONNECTED ||
+						 kthread_should_stop());
+			continue;
+		}
+
+		/* Schedule I/O */
+		atomic_set(&blkif->io_pending, 0);
+		if (do_block_io_op(blkif))
+			atomic_inc(&blkif->io_pending);
+		unplug_queue(blkif);
+
+		if (log_stats && time_after(jiffies, blkif->st_print))
+			print_stats(blkif);
+	}
+
+	/* bye folks, and thanks for all the fish ;) */
+	if (log_stats)
+		print_stats(blkif);
+	if (debug_lvl)
+		printk(KERN_DEBUG "%s: exiting\n", current->comm);
+	blkif->xenblkd = NULL;
+	blkif_put(blkif);
+	return 0;
+}
 
 /******************************************************************
  * COMPLETION CALLBACK -- Called as bh->b_end_io()
@@ -243,8 +263,6 @@
 
 static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
 {
-	unsigned long flags;
-
 	/* An error fails the entire request. */
 	if (!uptodate) {
 		DPRINTK("Buffer not up-to-date at end of operation\n");
@@ -252,15 +270,11 @@
 	}
 
 	if (atomic_dec_and_test(&pending_req->pendcnt)) {
-		int pending_idx = pending_req - pending_reqs;
-		fast_flush_area(pending_idx, pending_req->nr_pages);
+		fast_flush_area(pending_req);
 		make_response(pending_req->blkif, pending_req->id,
 			      pending_req->operation, pending_req->status);
 		blkif_put(pending_req->blkif);
-		spin_lock_irqsave(&pend_prod_lock, flags);
-		pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
-		spin_unlock_irqrestore(&pend_prod_lock, flags);
-		maybe_trigger_blkio_schedule();
+		free_req(pending_req);
 	}
 }
 
@@ -281,8 +295,9 @@
 irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
 {
 	blkif_t *blkif = dev_id;
-	add_to_blkdev_list_tail(blkif);
-	maybe_trigger_blkio_schedule();
+
+	atomic_inc(&blkif->io_pending);
+	wake_up(&blkif->wq);
 	return IRQ_HANDLED;
 }
 
@@ -292,10 +307,11 @@
  * DOWNWARD CALLS -- These interface with the block-device layer proper.
  */
 
-static int do_block_io_op(blkif_t *blkif, int max_to_do)
+static int do_block_io_op(blkif_t *blkif)
 {
 	blkif_back_ring_t *blk_ring = &blkif->blk_ring;
 	blkif_request_t *req;
+	pending_req_t *pending_req;
 	RING_IDX rc, rp;
 	int more_to_do = 0;
 
@@ -304,8 +320,10 @@
 	rmb(); /* Ensure we see queued requests up to 'rp'. */
 
 	while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
-		if ((max_to_do-- == 0) ||
-		    (NR_PENDING_REQS == MAX_PENDING_REQS)) {
+
+		pending_req = alloc_req();
+		if (NULL == pending_req) {
+			blkif->st_oo_req++;
 			more_to_do = 1;
 			break;
 		}
@@ -315,28 +333,31 @@
 
 		switch (req->operation) {
 		case BLKIF_OP_READ:
+			blkif->st_rd_req++;
+			dispatch_rw_block_io(blkif, req, pending_req);
+			break;
 		case BLKIF_OP_WRITE:
-			dispatch_rw_block_io(blkif, req);
+			blkif->st_wr_req++;
+			dispatch_rw_block_io(blkif, req, pending_req);
 			break;
-
 		default:
 			DPRINTK("error: unknown block io operation [%d]\n",
 				req->operation);
 			make_response(blkif, req->id, req->operation,
 				      BLKIF_RSP_ERROR);
+			free_req(pending_req);
 			break;
 		}
 	}
-
 	return more_to_do;
 }
 
-static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
+static void dispatch_rw_block_io(blkif_t *blkif,
+				 blkif_request_t *req,
+				 pending_req_t *pending_req)
 {
 	extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); 
 	int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
-	int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
-	pending_req_t *pending_req;
 	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	struct phys_req preq;
 	struct { 
@@ -344,32 +365,36 @@
 	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	unsigned int nseg;
 	struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-	int nbio = 0;
-	request_queue_t *q;
-	int ret, errors = 0;
+	int ret, i, nbio = 0;
 
 	/* Check that number of segments is sane. */
 	nseg = req->nr_segments;
 	if (unlikely(nseg == 0) || 
 	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
 		DPRINTK("Bad number of segments in request (%d)\n", nseg);
-		goto bad_descriptor;
+		goto fail_response;
 	}
 
 	preq.dev           = req->handle;
 	preq.sector_number = req->sector_number;
 	preq.nr_sects      = 0;
 
+	pending_req->blkif     = blkif;
+	pending_req->id        = req->id;
+	pending_req->operation = operation;
+	pending_req->status    = BLKIF_RSP_OKAY;
+	pending_req->nr_pages  = nseg;
+
 	for (i = 0; i < nseg; i++) {
 		seg[i].nsec = req->seg[i].last_sect -
 			req->seg[i].first_sect + 1;
 
 		if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
 		    (seg[i].nsec <= 0))
-			goto bad_descriptor;
+			goto fail_response;
 		preq.nr_sects += seg[i].nsec;
 
-		map[i].host_addr = MMAP_VADDR(pending_idx, i);
+		map[i].host_addr = vaddr(pending_req, i);
 		map[i].dom = blkif->domid;
 		map[i].ref = req->seg[i].gref;
 		map[i].flags = GNTMAP_host_map;
@@ -381,26 +406,22 @@
 	BUG_ON(ret);
 
 	for (i = 0; i < nseg; i++) {
-		if (likely(map[i].status == 0)) {
-			pending_handle(pending_idx, i) = map[i].handle;
+		if (unlikely(map[i].status != 0)) {
+			DPRINTK("invalid buffer -- could not remap it\n");
+			goto fail_flush;
+		}
+
+		pending_handle(pending_req, i) = map[i].handle;
 #ifdef __ia64__
-			MMAP_VADDR(pending_idx,i) = gnttab_map_vaddr(map[i]);
+		pending_vaddrs[vaddr_pagenr(req, seg)] =
+			= gnttab_map_vaddr(map[i]);
 #else
-			set_phys_to_machine(__pa(MMAP_VADDR(
-				pending_idx, i)) >> PAGE_SHIFT,
-				FOREIGN_FRAME(map[i].dev_bus_addr>>PAGE_SHIFT));
+		set_phys_to_machine(__pa(vaddr(
+			pending_req, i)) >> PAGE_SHIFT,
+			FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
 #endif
-			seg[i].buf = map[i].dev_bus_addr |
-				(req->seg[i].first_sect << 9);
-		} else {
-			errors++;
-		}
-	}
-
-	if (errors) {
-		DPRINTK("invalid buffer -- could not remap it\n");
-		fast_flush_area(pending_idx, nseg);
-		goto bad_descriptor;
+		seg[i].buf  = map[i].dev_bus_addr | 
+			(req->seg[i].first_sect << 9);
 	}
 
 	if (vbd_translate(&preq, blkif, operation) != 0) {
@@ -408,37 +429,25 @@
 			operation == READ ? "read" : "write",
 			preq.sector_number,
 			preq.sector_number + preq.nr_sects, preq.dev); 
-		goto bad_descriptor;
-	}
-
-	pending_req = &pending_reqs[pending_idx];
-	pending_req->blkif     = blkif;
-	pending_req->id        = req->id;
-	pending_req->operation = operation;
-	pending_req->status    = BLKIF_RSP_OKAY;
-	pending_req->nr_pages  = nseg;
+		goto fail_flush;
+	}
 
 	for (i = 0; i < nseg; i++) {
 		if (((int)preq.sector_number|(int)seg[i].nsec) &
 		    ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
 			DPRINTK("Misaligned I/O request from domain %d",
 				blkif->domid);
-			goto cleanup_and_fail;
+			goto fail_put_bio;
 		}
 
 		while ((bio == NULL) ||
 		       (bio_add_page(bio,
-				     virt_to_page(MMAP_VADDR(pending_idx, i)),
+				     virt_to_page(vaddr(pending_req, i)),
 				     seg[i].nsec << 9,
 				     seg[i].buf & ~PAGE_MASK) == 0)) {
 			bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
-			if (unlikely(bio == NULL)) {
-			cleanup_and_fail:
-				for (i = 0; i < (nbio-1); i++)
-					bio_put(biolist[i]);
-				fast_flush_area(pending_idx, nseg);
-				goto bad_descriptor;
-			}
+			if (unlikely(bio == NULL))
+				goto fail_put_bio;
                 
 			bio->bi_bdev    = preq.bdev;
 			bio->bi_private = pending_req;
@@ -449,14 +458,8 @@
 		preq.sector_number += seg[i].nsec;
 	}
 
-	if ((q = bdev_get_queue(bio->bi_bdev)) != plugged_queue) {
-		flush_plugged_queue();
-		blk_get_queue(q);
-		plugged_queue = q;
-	}
-
+	plug_queue(blkif, bio);
 	atomic_set(&pending_req->pendcnt, nbio);
-	pending_cons++;
 	blkif_get(blkif);
 
 	for (i = 0; i < nbio; i++)
@@ -464,8 +467,14 @@
 
 	return;
 
- bad_descriptor:
+ fail_put_bio:
+	for (i = 0; i < (nbio-1); i++)
+		bio_put(biolist[i]);
+ fail_flush:
+	fast_flush_area(pending_req);
+ fail_response:
 	make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+	free_req(pending_req);
 } 
 
 
@@ -481,6 +490,7 @@
 	blkif_response_t *resp;
 	unsigned long     flags;
 	blkif_back_ring_t *blk_ring = &blkif->blk_ring;
+	int more_to_do = 0;
 	int notify;
 
 	spin_lock_irqsave(&blkif->blk_ring_lock, flags);
@@ -499,76 +509,67 @@
 		 * notifications if requests are already in flight (lower
 		 * overheads and promotes batching).
 		 */
-		int more_to_do;
 		RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
-		if (more_to_do) {
-			add_to_blkdev_list_tail(blkif);
-			maybe_trigger_blkio_schedule();
-		}
-	}
-	else if (!__on_blkdev_list(blkif)
-		 && RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
-		/* Keep pulling requests as they become available... */
-		add_to_blkdev_list_tail(blkif);
-		maybe_trigger_blkio_schedule();
-	}
-
+
+	} else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
+		more_to_do = 1;
+
+	}
 	spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
 
+	if (more_to_do) {
+		atomic_inc(&blkif->io_pending);
+		wake_up(&blkif->wq);
+	}
 	if (notify)
 		notify_remote_via_irq(blkif->irq);
 }
 
-void blkif_deschedule(blkif_t *blkif)
-{
-	remove_from_blkdev_list(blkif);
-}
-
 static int __init blkif_init(void)
 {
+	struct page *page;
 	int i;
-	struct page *page;
-	int ret;
-
-	for (i = 0; i < MMAP_PAGES; i++)
-		pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
 
 	if (xen_init() < 0)
 		return -ENODEV;
 
+	mmap_pages            = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	pending_reqs          = kmalloc(sizeof(pending_reqs[0]) *
+					blkif_reqs, GFP_KERNEL);
+	pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
+					mmap_pages, GFP_KERNEL);
+	pending_vaddrs        = kmalloc(sizeof(pending_vaddrs[0]) *
+					mmap_pages, GFP_KERNEL);
+	if (!pending_reqs || !pending_grant_handles || !pending_vaddrs) {
+		printk("%s: out of memory\n", __FUNCTION__);
+		return -1;
+	}
+
 	blkif_interface_init();
-
+	
 #ifdef __ia64__
-    {
 	extern unsigned long alloc_empty_foreign_map_page_range(unsigned long pages);
-	int i;
-
-	mmap_vstart =  alloc_empty_foreign_map_page_range(MMAP_PAGES);
-	printk("Allocated mmap_vstart: 0x%lx\n", mmap_vstart);
-	for(i = 0; i < MMAP_PAGES; i++)
-	    pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
-	BUG_ON(mmap_vstart == NULL);
-    }
-#else
-	page = balloon_alloc_empty_page_range(MMAP_PAGES);
+	mmap_vstart = (unsigned long)alloc_empty_foreign_map_page_range(mmap_pages);
+#else /* ! ia64 */
+	page = balloon_alloc_empty_page_range(mmap_pages);
 	BUG_ON(page == NULL);
 	mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
 #endif
-
-	pending_cons = 0;
-	pending_prod = MAX_PENDING_REQS;
+	printk("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n",
+	       __FUNCTION__, blkif_reqs, mmap_pages, mmap_vstart);
+	BUG_ON(mmap_vstart == 0);
+	for (i = 0; i < mmap_pages; i++) {
+		pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
+		pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
+	}
+
 	memset(pending_reqs, 0, sizeof(pending_reqs));
-	for (i = 0; i < MAX_PENDING_REQS; i++)
-		pending_ring[i] = i;
+	INIT_LIST_HEAD(&pending_free);
+
+	for (i = 0; i < blkif_reqs; i++)
+		list_add_tail(&pending_reqs[i].free_list, &pending_free);
     
-	spin_lock_init(&blkio_schedule_list_lock);
-	INIT_LIST_HEAD(&blkio_schedule_list);
-
-	ret = kernel_thread(blkio_schedule, NULL, CLONE_FS | CLONE_FILES);
-	BUG_ON(ret < 0);
-
 	blkif_xenbus_init();
-
 	return 0;
 }
 
diff -r 0255f48b757f linux-2.6-xen-sparse/drivers/xen/blkback/common.h
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/common.h	Sun Dec  4 19:12:00 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/common.h	Tue Dec  6 15:29:06 2005
@@ -60,9 +60,19 @@
 	/* Is this a blktap frontend */
 	unsigned int     is_blktap;
 #endif
-	struct list_head blkdev_list;
 	spinlock_t       blk_ring_lock;
 	atomic_t         refcnt;
+
+	wait_queue_head_t   wq;
+	struct task_struct  *xenblkd;
+	atomic_t            io_pending;
+	request_queue_t     *plug;
+
+	/* statistics */
+	unsigned long       st_print;
+	int                 st_rd_req;
+	int                 st_wr_req;
+	int                 st_oo_req;
 
 	struct work_struct free_work;
 
@@ -101,11 +111,10 @@
 
 void blkif_interface_init(void);
 
-void blkif_deschedule(blkif_t *blkif);
-
 void blkif_xenbus_init(void);
 
 irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+int blkif_schedule(void *arg);
 
 void update_blkif_status(blkif_t *blkif); 
 
diff -r 0255f48b757f linux-2.6-xen-sparse/drivers/xen/blkback/interface.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c	Sun Dec  4 19:12:00 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c	Tue Dec  6 15:29:06 2005
@@ -24,6 +24,8 @@
 	blkif->status = DISCONNECTED;
 	spin_lock_init(&blkif->blk_ring_lock);
 	atomic_set(&blkif->refcnt, 1);
+	init_waitqueue_head(&blkif->wq);
+	blkif->st_print = jiffies;
 
 	return blkif;
 }
@@ -111,6 +113,7 @@
 
 	blkif->irq = bind_evtchn_to_irqhandler(
 		blkif->evtchn, blkif_be_int, 0, "blkif-backend", blkif);
+	wake_up(&blkif->wq);
 
 	/* We're potentially connected now */
 	update_blkif_status(blkif); 
diff -r 0255f48b757f linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c	Sun Dec  4 19:12:00 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c	Tue Dec  6 15:29:06 2005
@@ -20,6 +20,7 @@
 
 #include <stdarg.h>
 #include <linux/module.h>
+#include <linux/kthread.h>
 #include <asm-xen/xenbus.h>
 #include "common.h"
 
@@ -92,6 +93,8 @@
 	}
 	if (be->blkif) {
 		be->blkif->status = DISCONNECTED; 
+		if (be->blkif->xenblkd)
+			kthread_stop(be->blkif->xenblkd);
 		blkif_put(be->blkif);
 		be->blkif = NULL;
 	}
@@ -217,6 +220,17 @@
 			be->major = 0;
 			be->minor = 0;
 			xenbus_dev_fatal(dev, err, "creating vbd structure");
+			return;
+		}
+
+		be->blkif->xenblkd = kthread_run(blkif_schedule, be->blkif,
+						 "xvd %d %02x:%02x",
+						 be->blkif->domid,
+						 be->major, be->minor);
+		if (IS_ERR(be->blkif->xenblkd)) {
+			err = PTR_ERR(be->blkif->xenblkd);
+			be->blkif->xenblkd = NULL;
+			xenbus_dev_error(dev, err, "start xenblkd");
 			return;
 		}
 

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2005-12-06 15:47 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20051019150719.GC10710@tpkurt.pmc.nue.novell.com>
     [not found] ` <20051020085107.GA2811@suse.de>
     [not found]   ` <435E56B6.2060707@suse.de>
     [not found]     ` <20051025171505.GG4774@suse.de>
     [not found]       ` <9f07b56a8f036cecd2a0687e040a417b@cl.cam.ac.uk>
     [not found]         ` <4360ABE5.1010705@suse.de>
     [not found]           ` <b943edf9af93016dc65d8090323ed520@cl.cam.ac.uk>
     [not found]             ` <4360CA79.40605@suse.de>
     [not found]               ` <efbd0c82d427f99326de12f394c2c90b@cl.cam.ac.uk>
2005-11-09 13:29                 ` [patch] CFQ for xen domains Gerd Knorr
2005-11-15 17:51                   ` Gerd Knorr
2005-11-24 11:31                     ` Gerd Knorr
2005-11-24 23:00                       ` Keir Fraser
2005-12-06 15:47                       ` Gerd Knorr

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.