[PATCH][RFC] Use bio markers for request callback

public inbox for linux-scsi@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH][RFC] Use bio markers for request callback
@ 2007-09-11 10:44 Hannes Reinecke
  2007-09-11 11:11 ` Jens Axboe
  2007-09-11 16:49 ` Kiyoshi Ueda
  0 siblings, 2 replies; 4+ messages in thread
From: Hannes Reinecke @ 2007-09-11 10:44 UTC (permalink / raw)
  To: device-mapper development, linux-scsi; +Cc: michaelc, jens.axboe

[-- Attachment #1: Type: text/plain, Size: 859 bytes --]

Hi all,

this is a proposal for a different implementation of request  
callbacks. The existing ->endio callback of a request is actually a  
destructor function, to be called to terminate a request and free all  
structures.

However, on certain occasions (like request-based multipathing) it is  
desirable to have a callback function for a request which is called  
right after the request is finished, ie in end_that_request_first()  
before any bio->bi_endio callback is called.

So a simple solution for this is to clone the request and add a new  
'marker' bio in front of the bio list of the request. This callback  
will be attached a structure in bi_private which keeps a pointer to  
the cloned and the original request, thus serving as a callback for  
the request itself.

Proposed patch attached. As usual comments are welcome.

Cheers,

Hannes

[-- Attachment #2: rq-marker-bio --]
[-- Type: text/x-patch, Size: 39471 bytes --]

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index a15845c..c286f05 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -40,7 +40,6 @@ static void blk_unplug_work(struct work_struct *work);
 static void blk_unplug_timeout(unsigned long data);
 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
 static void init_request_from_bio(struct request *req, struct bio *bio);
-static int __make_request(struct request_queue *q, struct bio *bio);
 static struct io_context *current_io_context(gfp_t gfp_flags, int node);
 
 /*
@@ -2912,7 +2911,7 @@ static void init_request_from_bio(struct request *req, struct bio *bio)
 	req->start_time = jiffies;
 }
 
-static int __make_request(struct request_queue *q, struct bio *bio)
+int __make_request(struct request_queue *q, struct bio *bio)
 {
 	struct request *req;
 	int el_ret, nr_sectors, barrier, err;
@@ -3030,6 +3029,7 @@ end_io:
 	bio_endio(bio, nr_sectors << 9, err);
 	return 0;
 }
+EXPORT_SYMBOL(__make_request);
 
 /*
  * If bio->bi_dev is a partition, remap the location
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index d6ca9d0..76acda2 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (C) 2003 Sistina Software Limited.
  * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2007 NEC Corporation.
  *
  * This file is released under the GPL.
  */
@@ -8,8 +9,7 @@
 #include "dm.h"
 #include "dm-path-selector.h"
 #include "dm-hw-handler.h"
-#include "dm-bio-list.h"
-#include "dm-bio-record.h"
+#include "dm-rq-record.h"
 
 #include <linux/ctype.h>
 #include <linux/init.h>
@@ -77,7 +77,7 @@ struct multipath {
 	unsigned saved_queue_if_no_path;/* Saved state during suspension */
 
 	struct work_struct process_queued_ios;
-	struct bio_list queued_ios;
+	struct list_head queued_ios;
 	unsigned queue_size;
 
 	struct work_struct trigger_event;
@@ -94,7 +94,7 @@ struct multipath {
  */
 struct dm_mpath_io {
 	struct pgpath *pgpath;
-	struct dm_bio_details details;
+	struct dm_rq_details details;
 };
 
 typedef int (*action_fn) (struct pgpath *pgpath);
@@ -171,6 +171,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
 	m = kzalloc(sizeof(*m), GFP_KERNEL);
 	if (m) {
 		INIT_LIST_HEAD(&m->priority_groups);
+		INIT_LIST_HEAD(&m->queued_ios);
 		spin_lock_init(&m->lock);
 		m->queue_io = 1;
 		INIT_WORK(&m->process_queued_ios, process_queued_ios);
@@ -180,6 +181,9 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
 			kfree(m);
 			return NULL;
 		}
+
+		dm_table_enable_elevator(ti->table);
+
 		m->ti = ti;
 		ti->private = m;
 	}
@@ -202,6 +206,8 @@ static void free_multipath(struct multipath *m)
 		dm_put_hw_handler(hwh->type);
 	}
 
+	dm_table_disable_elevator(m->ti->table);
+
 	mempool_destroy(m->mpio_pool);
 	kfree(m);
 }
@@ -299,7 +305,7 @@ static int __must_push_back(struct multipath *m)
 		dm_noflush_suspending(m->ti));
 }
 
-static int map_io(struct multipath *m, struct bio *bio,
+static int map_io(struct multipath *m, struct request *clone,
 		  struct dm_mpath_io *mpio, unsigned was_queued)
 {
 	int r = DM_MAPIO_REMAPPED;
@@ -321,22 +327,30 @@ static int map_io(struct multipath *m, struct bio *bio,
 	if ((pgpath && m->queue_io) ||
 	    (!pgpath && m->queue_if_no_path)) {
 		/* Queue for the daemon to resubmit */
-		bio_list_add(&m->queued_ios, bio);
+		list_add_tail(&clone->queuelist, &m->queued_ios);
 		m->queue_size++;
 		if ((m->pg_init_required && !m->pg_init_in_progress) ||
 		    !m->queue_io)
 			queue_work(kmultipathd, &m->process_queued_ios);
 		pgpath = NULL;
+		clone->q = NULL;
+		clone->rq_disk = NULL;
 		r = DM_MAPIO_SUBMITTED;
-	} else if (pgpath)
-		bio->bi_bdev = pgpath->path.dev->bdev;
-	else if (__must_push_back(m))
-		r = DM_MAPIO_REQUEUE;
-	else
+	} else if (pgpath) {
+		clone->q = bdev_get_queue(pgpath->path.dev->bdev);
+		clone->rq_disk = pgpath->path.dev->bdev->bd_disk;
+	} else {
+		printk(KERN_INFO "request %p: failed to map\n", clone);
+		clone->q = NULL;
+		clone->rq_disk = NULL;
 		r = -EIO;	/* Failed */
+	}
 
 	mpio->pgpath = pgpath;
 
+	if (r == DM_MAPIO_REMAPPED && m->current_pg->ps.type->start_io)
+		m->current_pg->ps.type->start_io(&m->current_pg->ps,
+						 &pgpath->path, clone);
 	spin_unlock_irqrestore(&m->lock, flags);
 
 	return r;
@@ -373,30 +387,38 @@ static void dispatch_queued_ios(struct multipath *m)
 {
 	int r;
 	unsigned long flags;
-	struct bio *bio = NULL, *next;
 	struct dm_mpath_io *mpio;
 	union map_info *info;
+	struct request *clone, *n;
+	LIST_HEAD(cl);
 
 	spin_lock_irqsave(&m->lock, flags);
-	bio = bio_list_get(&m->queued_ios);
+	list_splice_init(&m->queued_ios, &cl);
 	spin_unlock_irqrestore(&m->lock, flags);
 
-	while (bio) {
-		next = bio->bi_next;
-		bio->bi_next = NULL;
+	list_for_each_entry_safe(clone, n, &cl, queuelist) {
+		struct bio *bio = clone->bio;
+		list_del(&clone->queuelist);
+
+		BUG_ON(!bio);
 
-		info = dm_get_mapinfo(bio);
+		info = dm_get_mapinfo(clone->bio);
 		mpio = info->ptr;
 
-		r = map_io(m, bio, mpio, 1);
-		if (r < 0)
+		r = map_io(m, clone, mpio, 1);
+		if (r < 0) {
+			printk(KERN_INFO "%s: map clone %p failed\n",
+			       __FUNCTION__, clone);
+			/* FIXME */
 			bio_endio(bio, bio->bi_size, r);
-		else if (r == DM_MAPIO_REMAPPED)
-			generic_make_request(bio);
-		else if (r == DM_MAPIO_REQUEUE)
+		} else if (r == DM_MAPIO_REMAPPED)
+			dm_dispatch_request(clone->q, clone);
+		else if (r == DM_MAPIO_REQUEUE) {
+			printk(KERN_INFO "%s: requeue map clone %p\n",
+			       __FUNCTION__, clone);
+			/* FIXME */
 			bio_endio(bio, bio->bi_size, -EIO);
-
-		bio = next;
+		}
 	}
 }
 
@@ -789,23 +811,29 @@ static void multipath_dtr(struct dm_target *ti)
 }
 
 /*
- * Map bios, recording original fields for later in case we have to resubmit
+ * Map cloned requests
  */
-static int multipath_map(struct dm_target *ti, struct bio *bio,
+static int multipath_map(struct dm_target *ti, struct request *clone,
 			 union map_info *map_context)
 {
 	int r;
 	struct dm_mpath_io *mpio;
 	struct multipath *m = (struct multipath *) ti->private;
 
+	if (blk_barrier_rq(clone))
+		return -EOPNOTSUPP;
+
 	mpio = mempool_alloc(m->mpio_pool, GFP_NOIO);
-	dm_bio_record(&mpio->details, bio);
+	memset(mpio, 0, sizeof(mpio));
 
+	clone->cmd_flags |= REQ_FAILFAST;
+	dm_rq_record(&mpio->details, clone);
 	map_context->ptr = mpio;
-	bio->bi_rw |= (1 << BIO_RW_FAILFAST);
-	r = map_io(m, bio, mpio, 0);
-	if (r < 0 || r == DM_MAPIO_REQUEUE)
+	r = map_io(m, clone, mpio, 0);
+	if (r < 0 || r == DM_MAPIO_REQUEUE) {
 		mempool_free(mpio, m->mpio_pool);
+		map_context->ptr = NULL;
+	}
 
 	return r;
 }
@@ -1007,25 +1035,37 @@ void dm_pg_init_complete(struct dm_path *path, unsigned err_flags)
 	spin_unlock_irqrestore(&m->lock, flags);
 }
 
+static void multipath_queue_in_tgt(struct dm_target *ti, struct request *clone,
+				   union map_info *map_context)
+{
+	struct multipath *m = (struct multipath *) ti->private;
+	struct dm_mpath_io *mpio = (struct dm_mpath_io *) map_context->ptr;
+	unsigned long flags = 0UL;
+
+	dm_rq_restore(&mpio->details, clone);
+
+	/* queue for the daemon to resubmit or fail */
+	spin_lock_irqsave(&m->lock, flags);
+	list_add_tail(&clone->queuelist, &m->queued_ios);
+	m->queue_size++;
+	if (!m->queue_io)
+		queue_work(kmultipathd, &m->process_queued_ios);
+	spin_unlock_irqrestore(&m->lock, flags);
+}
+
 /*
  * end_io handling
  */
-static int do_end_io(struct multipath *m, struct bio *bio,
+static int do_end_io(struct multipath *m, struct request *clone,
 		     int error, struct dm_mpath_io *mpio)
 {
 	struct hw_handler *hwh = &m->hw_handler;
 	unsigned err_flags = MP_FAIL_PATH;	/* Default behavior */
-	unsigned long flags;
+	unsigned long flags = 0UL;
 
-	if (!error)
+	if (!clone->errors && !error)
 		return 0;	/* I/O complete */
 
-	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
-		return error;
-
-	if (error == -EOPNOTSUPP)
-		return error;
-
 	spin_lock_irqsave(&m->lock, flags);
 	if (!m->nr_valid_paths) {
 		if (__must_push_back(m)) {
@@ -1041,9 +1081,6 @@ static int do_end_io(struct multipath *m, struct bio *bio,
 	}
 	spin_unlock_irqrestore(&m->lock, flags);
 
-	if (hwh->type && hwh->type->error)
-		err_flags = hwh->type->error(hwh, bio);
-
 	if (mpio->pgpath) {
 		if (err_flags & MP_FAIL_PATH)
 			fail_path(mpio->pgpath);
@@ -1056,21 +1093,14 @@ static int do_end_io(struct multipath *m, struct bio *bio,
 		return -EIO;
 
       requeue:
-	dm_bio_restore(&mpio->details, bio);
-
-	/* queue for the daemon to resubmit or fail */
-	spin_lock_irqsave(&m->lock, flags);
-	bio_list_add(&m->queued_ios, bio);
-	m->queue_size++;
-	if (!m->queue_io)
-		queue_work(kmultipathd, &m->process_queued_ios);
-	spin_unlock_irqrestore(&m->lock, flags);
-
 	return DM_ENDIO_INCOMPLETE;	/* io not complete */
 }
 
-static int multipath_end_io(struct dm_target *ti, struct bio *bio,
-			    int error, union map_info *map_context)
+/*
+ * clone->q's lock must be held
+ */
+static int multipath_end_io_first(struct dm_target *ti, struct request *clone,
+				  int error, union map_info *map_context)
 {
 	struct multipath *m = ti->private;
 	struct dm_mpath_io *mpio = map_context->ptr;
@@ -1078,18 +1108,34 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio,
 	struct path_selector *ps;
 	int r;
 
-	r  = do_end_io(m, bio, error, mpio);
+	r  = do_end_io(m, clone, error, mpio);
 	if (pgpath) {
 		ps = &pgpath->pg->ps;
 		if (ps->type->end_io)
-			ps->type->end_io(ps, &pgpath->path);
+			ps->type->end_io(ps, &pgpath->path, clone);
 	}
-	if (r != DM_ENDIO_INCOMPLETE)
+	if (r != DM_ENDIO_INCOMPLETE && mpio) {
 		mempool_free(mpio, m->mpio_pool);
+		map_context->ptr = NULL;
+	}
 
 	return r;
 }
 
+static int multipath_end_io(struct dm_target *ti, struct request *clone,
+			    int error, union map_info *map_context)
+{
+	struct multipath *m = (struct multipath *) ti->private;
+	struct dm_mpath_io *mpio = (struct dm_mpath_io *) map_context->ptr;
+
+	if (mpio) {
+		mempool_free(mpio, m->mpio_pool);
+		map_context->ptr = NULL;
+	}
+
+	return 0;
+}
+
 /*
  * Suspend can't complete until all the I/O is processed so if
  * the last path fails we must error any remaining I/O.
@@ -1329,8 +1375,10 @@ static struct target_type multipath_target = {
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.dtr = multipath_dtr,
-	.map = multipath_map,
-	.end_io = multipath_end_io,
+	.map_rq = multipath_map,
+	.rq_end_io_first = multipath_end_io_first,
+	.rq_end_io = multipath_end_io,
+	.queue_in_tgt = multipath_queue_in_tgt,
 	.presuspend = multipath_presuspend,
 	.resume = multipath_resume,
 	.status = multipath_status,
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index 27357b8..089a979 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -75,7 +75,10 @@ struct path_selector_type {
 	int (*status) (struct path_selector *ps, struct dm_path *path,
 		       status_type_t type, char *result, unsigned int maxlen);
 
-	int (*end_io) (struct path_selector *ps, struct dm_path *path);
+	int (*start_io) (struct path_selector *ps, struct dm_path *path,
+			struct request *clone);
+	int (*end_io) (struct path_selector *ps, struct dm_path *path,
+			struct request *clone);
 };
 
 /* Register a path selector */
diff --git a/drivers/md/dm-rq-record.h b/drivers/md/dm-rq-record.h
new file mode 100644
index 0000000..4e09598
--- /dev/null
+++ b/drivers/md/dm-rq-record.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2007 NEC Corporation.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_RQ_RECORD_H
+#define DM_RQ_RECORD_H
+
+#include <linux/blkdev.h>
+
+/*
+ * There are lots of mutable fields in the request struct that get
+ * changed by the lower levels of the block layer.  Some targets,
+ * such as multipath, may wish to resubmit a request on error.  The
+ * functions in this file help the target record and restore the
+ * original request state.
+ */
+struct dm_rq_details {
+	unsigned int cmd_flags;
+	int ref_count;
+};
+
+static inline void dm_rq_record(struct dm_rq_details *rd, struct request *rq)
+{
+	rd->cmd_flags = rq->cmd_flags;
+	rd->ref_count = rq->ref_count;
+}
+
+static inline void dm_rq_restore(struct dm_rq_details *rd, struct request *rq)
+{
+	rq->cmd_flags = rd->cmd_flags;
+	rq->ref_count = rd->ref_count;
+}
+
+#endif
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 2bcde57..ad0c85e 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1033,6 +1033,28 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
 	return t->md;
 }
 
+int dm_table_enable_elevator(struct dm_table *t)
+{
+	struct mapped_device *md = dm_table_get_md(t);
+
+	if (!md)
+		return 1;
+
+	if (dm_enable_elevator(md)) {
+		dm_put(md);
+		return 1;
+	}
+	dm_put(md);
+
+	return 0;
+}
+
+void dm_table_disable_elevator(struct dm_table *t)
+{
+	/* No reference taken */
+	dm_disable_elevator(t->md);
+}
+
 EXPORT_SYMBOL(dm_vcalloc);
 EXPORT_SYMBOL(dm_get_device);
 EXPORT_SYMBOL(dm_put_device);
@@ -1044,3 +1066,5 @@ EXPORT_SYMBOL(dm_table_put);
 EXPORT_SYMBOL(dm_table_get);
 EXPORT_SYMBOL(dm_table_unplug_all);
 EXPORT_SYMBOL(dm_table_flush_all);
+EXPORT_SYMBOL(dm_table_enable_elevator);
+EXPORT_SYMBOL(dm_table_disable_elevator);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 2120155..7ac9527 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -24,6 +24,8 @@
 
 #define DM_MSG_PREFIX "core"
 
+#define DM_DEBUG
+
 static const char *_name = DM_NAME;
 
 static unsigned int major = 0;
@@ -31,13 +33,19 @@ static unsigned int _major = 0;
 
 static DEFINE_SPINLOCK(_minor_lock);
 /*
- * One of these is allocated per bio.
+ * One of these is allocated per bio / request.
  */
 struct dm_io {
 	struct mapped_device *md;
 	int error;
-	struct bio *bio;
-	atomic_t io_count;
+	union {
+		struct bio *bio;
+		struct request *req;
+	} data;
+	union {
+		atomic_t io_count;
+		struct request *orig;
+	} parm;
 	unsigned long start_time;
 };
 
@@ -48,6 +56,7 @@ struct dm_io {
 struct dm_target_io {
 	struct dm_io *io;
 	struct dm_target *ti;
+	struct dm_table *map;
 	union map_info info;
 };
 
@@ -103,6 +112,7 @@ struct mapped_device {
 	 * io objects are allocated from here.
 	 */
 	mempool_t *io_pool;
+	mempool_t *rq_pool;
 	mempool_t *tio_pool;
 
 	struct bio_set *bs;
@@ -125,6 +135,7 @@ struct mapped_device {
 
 #define MIN_IOS 256
 static struct kmem_cache *_io_cache;
+static struct kmem_cache *_rq_cache;
 static struct kmem_cache *_tio_cache;
 
 static int __init local_init(void)
@@ -143,6 +154,14 @@ static int __init local_init(void)
 		return -ENOMEM;
 	}
 
+	_rq_cache = kmem_cache_create("dm_rq", sizeof(struct request),
+				      0, 0, NULL);
+	if (!_rq_cache) {
+		kmem_cache_destroy(_tio_cache);
+		kmem_cache_destroy(_io_cache);
+		return -ENOMEM;
+	}
+
 	_major = major;
 	r = register_blkdev(_major, _name);
 	if (r < 0) {
@@ -160,6 +179,7 @@ static int __init local_init(void)
 static void local_exit(void)
 {
 	kmem_cache_destroy(_tio_cache);
+	kmem_cache_destroy(_rq_cache);
 	kmem_cache_destroy(_io_cache);
 	unregister_blkdev(_major, _name);
 
@@ -341,6 +361,16 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 	mempool_free(tio, md->tio_pool);
 }
 
+static inline struct request *alloc_rq(struct mapped_device *md)
+{
+	return mempool_alloc(md->rq_pool, GFP_NOIO);
+}
+
+static inline void free_rq(struct mapped_device *md, struct request *rq)
+{
+	mempool_free(rq, md->rq_pool);
+}
+
 static void start_io_acct(struct dm_io *io)
 {
 	struct mapped_device *md = io->md;
@@ -356,7 +386,7 @@ static void start_io_acct(struct dm_io *io)
 static int end_io_acct(struct dm_io *io)
 {
 	struct mapped_device *md = io->md;
-	struct bio *bio = io->bio;
+	struct bio *bio = io->data.bio;
 	unsigned long duration = jiffies - io->start_time;
 	int pending;
 	int rw = bio_data_dir(bio);
@@ -434,6 +464,94 @@ int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 	return 0;
 }
 
+/*
+ * The queue is only valid as long as you have a reference
+ * count on 'md'.
+ */
+struct request_queue *dm_get_queue(struct mapped_device *md)
+{
+	if (blk_get_queue(md->queue))
+		return NULL;
+
+	return md->queue;
+}
+EXPORT_SYMBOL_GPL(dm_get_queue);
+
+void dm_put_queue(struct request_queue *q)
+{
+	blk_put_queue(q);
+}
+EXPORT_SYMBOL_GPL(dm_put_queue);
+
+void dm_stop_queue(struct request_queue *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	blk_stop_queue(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_stop_queue);
+
+void dm_start_queue(struct request_queue *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (blk_queue_stopped(q))
+		blk_start_queue(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_start_queue);
+
+void __dm_requeue_request(struct request_queue *q, struct request *rq)
+{
+	if (elv_queue_empty(q))
+		blk_plug_device(q);
+	blk_requeue_request(q, rq);
+}
+EXPORT_SYMBOL_GPL(__dm_requeue_request);
+
+void dm_requeue_request(struct request_queue *q, struct request *rq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	__dm_requeue_request(q, rq);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_requeue_request);
+
+void dm_dispatch_request(struct request_queue *q, struct request *rq)
+{
+	int where = ELEVATOR_INSERT_BACK;
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	/* Initialize statistic stuff */
+	disk_round_stats(rq->rq_disk);
+	rq->start_time = jiffies;
+	rq->rq_disk->in_flight++;
+
+	__elv_add_request(q, rq, where, 0);
+
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_dispatch_request);
+
+static void kill_request(struct request *rq)
+{
+	int nr_bytes = rq->hard_nr_sectors << 9;
+
+	if (!nr_bytes)
+		nr_bytes = rq->data_len;
+
+	rq->cmd_flags |= REQ_QUIET;
+	end_that_request_first(rq, 0, nr_bytes);
+	end_that_request_last(rq, 0);
+}
+
 /*-----------------------------------------------------------------
  * CRUD START:
  *   A more elegant soln is in the works that uses the queue
@@ -460,7 +578,7 @@ static void dec_pending(struct dm_io *io, int error)
 	if (error && !(io->error > 0 && __noflush_suspending(io->md)))
 		io->error = error;
 
-	if (atomic_dec_and_test(&io->io_count)) {
+	if (atomic_dec_and_test(&io->parm.io_count)) {
 		if (io->error == DM_ENDIO_REQUEUE) {
 			/*
 			 * Target requested pushing back the I/O.
@@ -469,7 +587,7 @@ static void dec_pending(struct dm_io *io, int error)
 			 */
 			spin_lock_irqsave(&io->md->pushback_lock, flags);
 			if (__noflush_suspending(io->md))
-				bio_list_add(&io->md->pushback, io->bio);
+				bio_list_add(&io->md->pushback, io->data.bio);
 			else
 				/* noflush suspend was interrupted. */
 				io->error = -EIO;
@@ -481,10 +599,10 @@ static void dec_pending(struct dm_io *io, int error)
 			wake_up(&io->md->wait);
 
 		if (io->error != DM_ENDIO_REQUEUE) {
-			blk_add_trace_bio(io->md->queue, io->bio,
+			blk_add_trace_bio(io->md->queue, io->data.bio,
 					  BLK_TA_COMPLETE);
 
-			bio_endio(io->bio, io->bio->bi_size, io->error);
+			bio_endio(io->data.bio, io->data.bio->bi_size, io->error);
 		}
 
 		free_io(io->md, io);
@@ -533,6 +651,30 @@ static int clone_endio(struct bio *bio, unsigned int done, int error)
 	return r;
 }
 
+static void blk_update_cloned_rq(struct request *rq, struct request *clone)
+{
+	clone->nr_phys_segments = rq->nr_phys_segments;
+	clone->nr_hw_segments = rq->nr_hw_segments;
+	clone->current_nr_sectors = rq->current_nr_sectors;
+	clone->hard_cur_sectors = rq->hard_cur_sectors;
+	clone->hard_nr_sectors = rq->hard_nr_sectors;
+	clone->nr_sectors = rq->nr_sectors;
+	clone->hard_sector = rq->hard_sector;
+	clone->sector = rq->sector;
+	clone->data_len = rq->data_len;
+	clone->buffer = rq->buffer;
+	clone->data = rq->data;
+	clone->bio = rq->bio;
+	clone->biotail = rq->biotail;
+}
+
+static void dec_rq_pending(struct dm_target_io *tio)
+{
+	if (!atomic_dec_return(&tio->io->md->pending))
+		/* nudge anyone waiting on suspend queue */
+		wake_up(&tio->io->md->wait);
+}
+
 static sector_t max_io_len(struct mapped_device *md,
 			   sector_t sector, struct dm_target *ti)
 {
@@ -573,14 +715,14 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 	 * anything, the target has assumed ownership of
 	 * this io.
 	 */
-	atomic_inc(&tio->io->io_count);
+	atomic_inc(&tio->io->parm.io_count);
 	sector = clone->bi_sector;
 	r = ti->type->map(ti, clone, &tio->info);
 	if (r == DM_MAPIO_REMAPPED) {
 		/* the bio has been remapped so dispatch it */
 
 		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
-				    tio->io->bio->bi_bdev->bd_dev,
+				    tio->io->data.bio->bi_bdev->bd_dev,
 				    clone->bi_sector, sector);
 
 		generic_make_request(clone);
@@ -613,7 +755,10 @@ struct clone_info {
 static void dm_bio_destructor(struct bio *bio)
 {
 	struct bio_set *bs = bio->bi_private;
-
+#ifdef DM_DEBUG
+	if (bio_rq_callback(bio))
+		printk(KERN_INFO "%s: bio %p\n", __FUNCTION__, bio);
+#endif
 	bio_free(bio, bs);
 }
 
@@ -686,6 +831,7 @@ static void __clone_and_map(struct clone_info *ci)
 		clone = clone_bio(bio, ci->sector, ci->idx,
 				  bio->bi_vcnt - ci->idx, ci->sector_count,
 				  ci->md->bs);
+
 		__map_bio(ti, clone, tio);
 		ci->sector_count = 0;
 
@@ -740,7 +886,6 @@ static void __clone_and_map(struct clone_info *ci)
 			clone = split_bvec(bio, ci->sector, ci->idx,
 					   bv->bv_offset + offset, len,
 					   ci->md->bs);
-
 			__map_bio(ti, clone, tio);
 
 			ci->sector += len;
@@ -769,8 +914,8 @@ static void __split_bio(struct mapped_device *md, struct bio *bio)
 	ci.bio = bio;
 	ci.io = alloc_io(md);
 	ci.io->error = 0;
-	atomic_set(&ci.io->io_count, 1);
-	ci.io->bio = bio;
+	atomic_set(&ci.io->parm.io_count, 1);
+	ci.io->data.bio = bio;
 	ci.io->md = md;
 	ci.sector = bio->bi_sector;
 	ci.sector_count = bio_sectors(bio);
@@ -784,6 +929,272 @@ static void __split_bio(struct mapped_device *md, struct bio *bio)
 	dec_pending(ci.io, 0);
 	dm_table_put(ci.map);
 }
+
+/*
+ * request cloning
+ */
+static void clone_end_request(struct request *rq, int error)
+{
+	struct dm_target_io *tio = (struct dm_target_io *)rq->end_io_data;
+	dm_request_endio_fn endio = tio->ti->type->rq_end_io;
+	struct request_queue *q = rq->q;
+	struct request *orig = tio->io->parm.orig;
+	struct mapped_device *md = tio->io->md;
+	int uptodate;
+
+	/* Safeguard */
+	rq->end_io = NULL;
+	rq->end_io_data = NULL;
+
+	if (endio)
+		endio(tio->ti, rq, error, &tio->info);
+
+	__blk_put_request(q, rq);
+
+	if (orig) {
+		spin_unlock_irq(q->queue_lock);
+		uptodate = (error < 0)? error: !error;
+		end_that_request_last(orig, uptodate);
+		spin_lock_irq(q->queue_lock);
+	}
+
+	dec_rq_pending(tio);
+	free_io(md, tio->io);
+	free_tio(md, tio);
+	free_rq(md, rq);
+}
+
+static int clone_rq_endio(struct bio *bio, unsigned int bytes, int error)
+{
+	struct dm_target_io *tio = (struct dm_target_io *)bio->bi_private;
+	struct request *clone = tio->io->data.req;
+	struct request *orig = tio->io->parm.orig;
+	struct mapped_device *md = tio->io->md;
+	int r = 0, uptodate;
+	dm_request_endio_first_fn endio = tio->ti->type->rq_end_io_first;
+	dm_request_queue_in_tgt_fn queue_in_tgt = tio->ti->type->queue_in_tgt;
+	struct request_queue *q = clone->q, *q_orig = orig->q;
+
+	if (bio->bi_size)
+		return 1;
+
+	if (endio) {
+		r = endio(tio->ti, clone, error, &tio->info);
+		switch (r) {
+		case 0:
+#ifdef DM_DEBUG
+			printk(KERN_INFO "clone %p bio %p endio done\n",
+			       clone, bio);
+#endif
+			/* Destroy sequencer bio */
+			bio->bi_private = md->bs;
+			bio_put(bio);
+			/* succeeded */
+			break;
+		case DM_ENDIO_INCOMPLETE:
+			/* FIXME */
+			printk(KERN_INFO "clone %p bio %p endio incomplete\n",
+			       clone, bio);
+			/* the target wants to handle the io without unmap */
+			__blk_put_request(q, clone);
+			clone->special = NULL;
+			clone->errors = 0;
+
+			if (!queue_in_tgt) {
+				DMERR("queue_in_tgt isn't implemented.");
+				BUG();
+			}
+			queue_in_tgt(tio->ti, clone, &tio->info);
+			blk_run_queue(q_orig);
+
+			return 0;
+		case DM_ENDIO_REQUEUE:
+			/* FIXME */
+			printk(KERN_INFO "clone %p bio %p endio requeue\n",
+			       clone, bio);
+			/*
+			 * The target wants to requeue the original request,
+			 * unthread sequencer bio and stop bio processing.
+			 */
+			clone->bio = NULL;
+			clone->biotail = clone->bio;
+			/* Destroy sequencer bio */
+			bio->bi_private = md->bs;
+			bio_put(bio);
+			/* Requeue original request */
+			dm_requeue_request(q_orig, orig);
+			break;
+		default:
+			/* FIXME */
+			printk(KERN_INFO "clone %p bio %p endio return %d\n",
+			       clone, bio, r);
+			if (r >= 0) {
+				DMWARN("unimplemented target endio return value: %d", r);
+				BUG();
+			}
+			/* Destroy sequencer bio */
+			bio->bi_private = md->bs;
+			bio_put(bio);
+			break;
+		}
+	}
+
+	/*
+	 * Finish original request
+	 */
+	uptodate = (error < 0)? error: !error;
+	r = end_that_request_first(orig, uptodate,
+				   orig->hard_cur_sectors);
+	/*
+	 * Recopy the original request fields that were updated
+	 * in blk_end_request() to the clone.
+	 */
+	blk_update_cloned_rq(orig, clone);
+
+	return r?1:0;
+}
+
+static struct bio *alloc_seq_bio(struct request *req, struct bio_set *bs)
+{
+	struct bio *clone;
+
+	clone = bio_alloc_bioset(GFP_NOIO, 0, bs);
+	clone->bi_sector = 0;
+	clone->bi_flags |= 1 << BIO_CLONED;
+	clone->bi_rw |= (1 << BIO_RW_RQCALLBACK);
+	clone->bi_vcnt = 0;
+	clone->bi_size = 0;
+	clone->bi_idx = 0;
+	clone->bi_end_io = clone_rq_endio;
+	bio_phys_segments(req->q, clone);
+	bio_hw_segments(req->q, clone);
+	clone->bi_private = req;
+	clone->bi_destructor = dm_bio_destructor;
+	clone->bi_flags &= ~(1 << BIO_SEG_VALID);
+#ifdef DM_DEBUG
+	printk(KERN_INFO "%s: bio %p\n", __FUNCTION__, clone);
+#endif
+	return clone;
+}
+
+static void setup_clone(struct request *clone, struct request *rq)
+{
+	memset(clone, 0, sizeof(struct request));
+	INIT_LIST_HEAD(&clone->queuelist);
+	INIT_LIST_HEAD(&clone->donelist);
+	clone->cmd_flags = (rq_data_dir(rq) | REQ_NOMERGE);
+	clone->cmd_type = rq->cmd_type;
+	clone->sector = rq->sector;
+	clone->hard_sector = rq->hard_sector;
+	clone->nr_sectors = rq->nr_sectors;
+	clone->hard_nr_sectors = rq->hard_nr_sectors;
+	clone->current_nr_sectors = rq->current_nr_sectors;
+	clone->hard_cur_sectors = rq->hard_cur_sectors;
+	clone->bio = rq->bio;
+	clone->biotail = rq->biotail;
+	INIT_HLIST_NODE(&clone->hash);
+	clone->start_time = jiffies;
+	clone->nr_phys_segments = rq->nr_phys_segments;
+	clone->nr_hw_segments = rq->nr_hw_segments;
+	clone->ioprio = rq->ioprio;
+	clone->buffer = rq->buffer;
+	clone->tag = -1;
+	clone->ref_count = 1;
+	clone->cmd_len = rq->cmd_len;
+	memcpy(clone->cmd, rq->cmd, sizeof(rq->cmd));
+	clone->data_len = rq->data_len;
+	clone->sense_len = rq->sense_len;
+	clone->data = rq->data;
+	clone->sense = rq->sense;
+}
+
+static int clone_and_map_request(struct dm_target *ti, struct request *rq,
+				 struct mapped_device *md)
+{
+	int r;
+	struct request *clone;
+	struct dm_target_io *tio;
+
+	tio = alloc_tio(md); /* only one for each original request */
+	if (!tio)
+		/* -ENOMEM, requeue */
+		return 1;
+	tio->io = alloc_io(md);
+	if (!tio->io) {
+		/* -ENOMEM, requeue */
+		free_tio(md, tio);
+		return 1;
+	}
+	tio->io->md = md;
+	tio->io->error = 0;
+	tio->ti = ti;
+	memset(&tio->info, 0, sizeof(tio->info));
+
+	clone = alloc_rq(md);
+	if (!clone) {
+		/* -ENOMEM, requeue */
+		free_io(md, tio->io);
+		free_tio(md, tio);
+		return 1;
+	}
+	setup_clone(clone, rq);
+	clone->bio = alloc_seq_bio(clone, md->bs);
+	clone->end_io = clone_end_request;
+	clone->end_io_data = tio;
+	clone->bio->bi_end_io = clone_rq_endio;
+	clone->bio->bi_private = tio;
+	clone->bio->bi_next = rq->bio;
+	tio->io->data.req = clone;
+	tio->io->parm.orig = rq;
+
+	atomic_inc(&md->pending);
+	r = ti->type->map_rq(ti, clone, &tio->info);
+	switch (r) {
+	case DM_MAPIO_REMAPPED:
+#ifdef DM_DEBUG
+		printk(KERN_INFO "clone %p remapped\n", clone);
+#endif
+		/* the clone has been remapped so dispatch it */
+		dm_dispatch_request(clone->q, clone);
+		break;
+	case DM_MAPIO_SUBMITTED:
+#ifdef DM_DEBUG
+		printk(KERN_INFO "clone %p submitted\n", clone);
+#endif
+		/* the target has taken the request to submit by itself */
+		break;
+	case DM_MAPIO_REQUEUE:
+		printk(KERN_INFO "clone %p removed\n", clone);
+		/* the target has requested to requeue the original request */
+		clone->end_io = NULL;
+		clone->end_io_data = NULL;
+		dec_rq_pending(tio);
+		free_io(md, tio->io);
+		free_tio(md, tio);
+		free_rq(md, clone);
+		printk(KERN_INFO "requeue request %p\n", rq);
+		return 1;
+	default:
+		printk(KERN_INFO "clone %p failure %d\n", clone, r);
+		if (r >= 0) {
+			DMWARN("unimplemented target map return value: %d", r);
+			BUG();
+		}
+		clone->end_io = NULL;
+		clone->end_io_data = NULL;
+
+		dec_rq_pending(tio);
+		free_io(md, tio->io);
+		free_tio(md, tio);
+		free_rq(md, clone);
+		printk(KERN_INFO "kill request %p\n", rq);
+		kill_request(rq);
+		break;
+	}
+
+	return 0;
+}
+
 /*-----------------------------------------------------------------
  * CRUD END
  *---------------------------------------------------------------*/
@@ -844,6 +1255,41 @@ static int dm_request(struct request_queue *q, struct bio *bio)
 	return 0;
 }
 
+/*
+ * q->request_fn for request based dm.
+ * called with q->queue_lock held
+ */
+static void dm_request_fn(struct request_queue *q)
+{
+	int r;
+	struct mapped_device *md = (struct mapped_device *)q->queuedata;
+	struct dm_table *map = dm_get_table(md);
+	struct dm_target *ti;
+	struct request *rq;
+
+	while (!blk_queue_plugged(q)) {
+		rq = elv_next_request(q);
+		if (!rq)
+			break;
+#ifdef DM_DEBUG
+		printk(KERN_INFO "got request %p\n", rq);
+#endif
+		ti = dm_table_find_target(map, rq->sector);
+
+		blkdev_dequeue_request(rq);
+		spin_unlock_irq(q->queue_lock);
+		r = clone_and_map_request(ti, rq, md);
+		spin_lock_irq(q->queue_lock);
+
+		if (r)
+			__dm_requeue_request(q, rq);
+	}
+
+	dm_table_put(map);
+
+	return;
+}
+
 static int dm_flush_all(struct request_queue *q, struct gendisk *disk,
 			sector_t *error_sector)
 {
@@ -870,6 +1316,30 @@ static void dm_unplug_all(struct request_queue *q)
 	}
 }
 
+static int dm_make_request(struct request_queue *q, struct bio *bio)
+{
+	/*
+	 * There is no use in forwarding any barrier request since we can't
+	 * guarantee it is (or can be) handled by the targets correctly.
+	 */
+	if (unlikely(bio_barrier(bio))) {
+		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
+		return 0;
+	}
+
+	return __make_request(q, bio);
+}
+
+static int dm_prep_rq (struct request_queue *q, struct request *req)
+{
+	struct mapped_device *md = q->queuedata;
+
+	if (test_bit(DMF_BLOCK_IO, &md->flags))
+		return BLKPREP_DEFER;
+
+	return BLKPREP_OK;
+}
+
 static int dm_any_congested(void *congested_data, int bdi_bits)
 {
 	int r;
@@ -997,10 +1467,14 @@ static struct mapped_device *alloc_dev(int minor)
 	atomic_set(&md->open_count, 0);
 	atomic_set(&md->event_nr, 0);
 
-	md->queue = blk_alloc_queue(GFP_KERNEL);
+	md->queue = blk_init_queue(dm_request_fn, NULL);
 	if (!md->queue)
 		goto bad1_free_minor;
 
+	md->queue->nr_requests = BLKDEV_MAX_RQ;
+	md->queue->nr_batching = 32;
+	md->queue->unplug_thresh = 4;
+	md->queue->unplug_delay = (3 * HZ) / 1000;
 	md->queue->queuedata = md;
 	md->queue->backing_dev_info.congested_fn = dm_any_congested;
 	md->queue->backing_dev_info.congested_data = md;
@@ -1013,9 +1487,13 @@ static struct mapped_device *alloc_dev(int minor)
 	if (!md->io_pool)
 		goto bad2;
 
+	md->rq_pool = mempool_create_slab_pool(MIN_IOS, _rq_cache);
+	if (!md->rq_pool)
+		goto bad3;
+
 	md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
 	if (!md->tio_pool)
-		goto bad3;
+		goto bad4;
 
 	md->bs = bioset_create(16, 16);
 	if (!md->bs)
@@ -1023,7 +1501,7 @@ static struct mapped_device *alloc_dev(int minor)
 
 	md->disk = alloc_disk(1);
 	if (!md->disk)
-		goto bad4;
+		goto bad5;
 
 	atomic_set(&md->pending, 0);
 	init_waitqueue_head(&md->wait);
@@ -1047,10 +1525,12 @@ static struct mapped_device *alloc_dev(int minor)
 
 	return md;
 
- bad4:
+ bad5:
 	bioset_free(md->bs);
  bad_no_bioset:
 	mempool_destroy(md->tio_pool);
+ bad4:
+	mempool_destroy(md->rq_pool);
  bad3:
 	mempool_destroy(md->io_pool);
  bad2:
@@ -1073,6 +1553,7 @@ static void free_dev(struct mapped_device *md)
 		bdput(md->suspended_bdev);
 	}
 	mempool_destroy(md->tio_pool);
+	mempool_destroy(md->rq_pool);
 	mempool_destroy(md->io_pool);
 	bioset_free(md->bs);
 	del_gendisk(md->disk);
@@ -1508,6 +1989,40 @@ out:
 	return r;
 }
 
+/*
+ * Functions to enable and disable elevator support for
+ * individual devices.
+ */
+int dm_enable_elevator(struct mapped_device *md)
+{
+	if (md->queue->make_request_fn == dm_make_request)
+		return 0;
+
+	blk_queue_make_request(md->queue, dm_make_request);
+	blk_queue_prep_rq(md->queue, dm_prep_rq);
+	md->queue->unplug_fn = generic_unplug_device;
+	md->queue->backing_dev_info.congested_fn = NULL;
+	md->queue->backing_dev_info.congested_data = NULL;
+	set_bit(QUEUE_FLAG_CLUSTER, &md->queue->queue_flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(dm_enable_elevator);
+
+void dm_disable_elevator(struct mapped_device *md)
+{
+	if (md->queue->make_request_fn == dm_request)
+		return;
+
+	blk_queue_make_request(md->queue, dm_request);
+	blk_queue_prep_rq(md->queue, NULL);
+	md->queue->unplug_fn = dm_unplug_all;
+	md->queue->backing_dev_info.congested_fn = dm_any_congested;
+	md->queue->backing_dev_info.congested_data = md;
+	clear_bit(QUEUE_FLAG_CLUSTER, &md->queue->queue_flags);
+}
+EXPORT_SYMBOL(dm_disable_elevator);
+
 /*-----------------------------------------------------------------
  * Event notification.
  *---------------------------------------------------------------*/
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 462ee65..57e7edb 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -112,6 +112,8 @@ int dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
 void dm_table_unplug_all(struct dm_table *t);
 int dm_table_flush_all(struct dm_table *t);
+int dm_table_enable_elevator(struct dm_table *t);
+void dm_table_disable_elevator(struct dm_table *t);
 
 /*-----------------------------------------------------------------
  * A registry of target types.
@@ -182,5 +184,10 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
 union map_info *dm_get_mapinfo(struct bio *bio);
 int dm_open_count(struct mapped_device *md);
 int dm_lock_for_deletion(struct mapped_device *md);
+void dm_dispatch_request(struct request_queue *q, struct request *rq);
+int dm_enable_elevator(struct mapped_device *md);
+void dm_disable_elevator(struct mapped_device *md);
+struct request_queue *dm_get_queue(struct mapped_device *md);
+void dm_put_queue(struct request_queue *q);
 
 #endif
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 1ddef34..cb46e30 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -152,6 +152,7 @@ struct bio {
 #define BIO_RW_FAILFAST	3
 #define BIO_RW_SYNC	4
 #define BIO_RW_META	5
+#define BIO_RW_RQCALLBACK	6	/* Request callback */
 
 /*
  * upper 16 bits of bi_rw define the io priority of this bio
@@ -183,6 +184,7 @@ struct bio {
 #define bio_failfast(bio)	((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
 #define bio_rw_ahead(bio)	((bio)->bi_rw & (1 << BIO_RW_AHEAD))
 #define bio_rw_meta(bio)	((bio)->bi_rw & (1 << BIO_RW_META))
+#define bio_rq_callback(bio)	((bio)->bi_rw & (1 << BIO_RW_RQCALLBACK))
 
 /*
  * will die
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b126c6f..7f72ea6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -647,6 +647,7 @@ extern void register_disk(struct gendisk *dev);
 extern void generic_make_request(struct bio *bio);
 extern void blk_put_request(struct request *);
 extern void __blk_put_request(struct request_queue *, struct request *);
+extern int __make_request(struct request_queue *q, struct bio *bio);
 extern void blk_end_sync_rq(struct request *rq, int error);
 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
 extern void blk_insert_request(struct request_queue *, struct request *, int, void *);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 499f537..1e7748a 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -14,6 +14,7 @@ struct dm_target;
 struct dm_table;
 struct dm_dev;
 struct mapped_device;
+struct request;
 
 typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
 
@@ -45,6 +46,9 @@ typedef void (*dm_dtr_fn) (struct dm_target *ti);
 typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio,
 			  union map_info *map_context);
 
+typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone,
+				  union map_info *map_context);
+
 /*
  * Returns:
  * < 0 : error (currently ignored)
@@ -57,6 +61,18 @@ typedef int (*dm_endio_fn) (struct dm_target *ti,
 			    struct bio *bio, int error,
 			    union map_info *map_context);
 
+typedef int (*dm_request_endio_first_fn) (struct dm_target *ti,
+					  struct request *clone, int error,
+					  union map_info *map_context);
+
+typedef int (*dm_request_endio_fn) (struct dm_target *ti,
+				    struct request *clone, int error,
+				    union map_info *map_context);
+
+typedef void (*dm_request_queue_in_tgt_fn) (struct dm_target *ti,
+					    struct request *clone,
+					    union map_info *map_context);
+
 typedef void (*dm_flush_fn) (struct dm_target *ti);
 typedef void (*dm_presuspend_fn) (struct dm_target *ti);
 typedef void (*dm_postsuspend_fn) (struct dm_target *ti);
@@ -98,7 +114,11 @@ struct target_type {
 	dm_ctr_fn ctr;
 	dm_dtr_fn dtr;
 	dm_map_fn map;
+	dm_map_request_fn map_rq;
 	dm_endio_fn end_io;
+	dm_request_endio_first_fn rq_end_io_first;
+	dm_request_endio_fn rq_end_io;
+	dm_request_queue_in_tgt_fn queue_in_tgt;
 	dm_flush_fn flush;
 	dm_presuspend_fn presuspend;
 	dm_postsuspend_fn postsuspend;

[-- Attachment #3: Type: text/plain, Size: 0 bytes --]



^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH][RFC] Use bio markers for request callback
  2007-09-11 10:44 [PATCH][RFC] Use bio markers for request callback Hannes Reinecke
@ 2007-09-11 11:11 ` Jens Axboe
  2007-09-11 14:27   ` Hannes Reinecke
  2007-09-11 16:49 ` Kiyoshi Ueda
  1 sibling, 1 reply; 4+ messages in thread
From: Jens Axboe @ 2007-09-11 11:11 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: device-mapper development, linux-scsi, michaelc, k-ueda

On Tue, Sep 11 2007, Hannes Reinecke wrote:
> Hi all,
>
> this is a proposal for a different implementation of request callbacks. The 
> existing ->endio callback of a request is actually a destructor function, 
> to be called to terminate a request and free all structures.
>
> However, on certain occasions (like request-based multipathing) it is 
> desirable to have a callback function for a request which is called right 
> after the request is finished, ie in end_that_request_first() before any 
> bio->bi_endio callback is called.
>
> So a simple solution for this is to clone the request and add a new 
> 'marker' bio in front of the bio list of the request. This callback will be 
> attached a structure in bi_private which keeps a pointer to the cloned and 
> the original request, thus serving as a callback for the request itself.
>
> Proposed patch attached. As usual comments are welcome.

Honestly, I think this approach is a much worse design than the NEC
callback option. Exporting __make_request() (which is an INTERNAL
function) aside, this looks like one big hack with pseudo bio attached
to front of list, the enable/disable elevator stuff (does that even work
on stacked devices?).

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH][RFC] Use bio markers for request callback
  2007-09-11 11:11 ` Jens Axboe
@ 2007-09-11 14:27   ` Hannes Reinecke
  0 siblings, 0 replies; 4+ messages in thread
From: Hannes Reinecke @ 2007-09-11 14:27 UTC (permalink / raw)
  To: Jens Axboe; +Cc: device-mapper development, linux-scsi, michaelc, k-ueda

Am Tue 11 Sep 2007 01:11:34 PM CEST schrieb Jens Axboe  
<jens.axboe@oracle.com>:

> On Tue, Sep 11 2007, Hannes Reinecke wrote:
>> Hi all,
>>
>> this is a proposal for a different implementation of request callbacks. The
>> existing ->endio callback of a request is actually a destructor function,
>> to be called to terminate a request and free all structures.
>>
>> However, on certain occasions (like request-based multipathing) it is
>> desirable to have a callback function for a request which is called right
>> after the request is finished, ie in end_that_request_first() before any
>> bio->bi_endio callback is called.
>>
>> So a simple solution for this is to clone the request and add a new
>> 'marker' bio in front of the bio list of the request. This callback will be
>> attached a structure in bi_private which keeps a pointer to the cloned and
>> the original request, thus serving as a callback for the request itself.
>>
>> Proposed patch attached. As usual comments are welcome.
>
> Honestly, I think this approach is a much worse design than the NEC
> callback option. Exporting __make_request() (which is an INTERNAL
> function) aside, this looks like one big hack with pseudo bio attached
> to front of list, the enable/disable elevator stuff (does that even work
> on stacked devices?).
>
Hmm; agreed. Main reason for the patch was to show an alternative.
Having callbacks within callbacks is not the most straightforward way.
But if you're okay with it: you're the maintainer, you get to decide.

Exporting __make_request() is just for enabling elevator support based
on the target type. Of course this can be done differently.

Main point here was the marker bio idea. If that's rejected everything else
is moot to discuss.

Cheers,

Hannes
---
No .sig today, my .message went away ...

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH][RFC] Use bio markers for request callback
  2007-09-11 10:44 [PATCH][RFC] Use bio markers for request callback Hannes Reinecke
  2007-09-11 11:11 ` Jens Axboe
@ 2007-09-11 16:49 ` Kiyoshi Ueda
  1 sibling, 0 replies; 4+ messages in thread
From: Kiyoshi Ueda @ 2007-09-11 16:49 UTC (permalink / raw)
  To: hare; +Cc: michaelc, linux-scsi, dm-devel, jens.axboe

Hi Hannes,

On Tue, 11 Sep 2007 12:44:46 +0200, Hannes Reinecke <hare@suse.de> wrote:
> this is a proposal for a different implementation of request  
> callbacks. The existing ->endio callback of a request is actually a  
> destructor function, to be called to terminate a request and free all  
> structures.
> 
> However, on certain occasions (like request-based multipathing) it is  
> desirable to have a callback function for a request which is called  
> right after the request is finished, ie in end_that_request_first()  
> before any bio->bi_endio callback is called.
> 
> So a simple solution for this is to clone the request and add a new  
> 'marker' bio in front of the bio list of the request. This callback  
> will be attached a structure in bi_private which keeps a pointer to  
> the cloned and the original request, thus serving as a callback for  
> the request itself.

Thank you for another idea for request-based multipath.

However, I disagree with the idea.
I think the design (bio->bi_end_io() completes a request) is complex
a little bit and is breaking layer stacking.
Also, I think that the design of completion handling by 2 hooks,
bio->bi_end_io() and rq->end_io(), makes error handing in dm-multipath
complex.
e.g. Even if we detect an error on the first bio->bi_end_io() hook,
     the request can't be retried until the second rq->end_io() hook
     is called, because the ownership of the request is still on
     low level drivers until the low level driver calls
     end_that_request_last().


Although this is just implementation issue, I think that the patch
can't handle leftover.
SCSI, scsi_end_request(), may have some leftovers in the request.
Then, the marker bio is freed in bio->bi_end_io() and the request
is resubmitted in SCSI level.  So the resubmitted request
doesn't have the marker bio any more.

Thanks,
Kiyoshi Ueda

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2007-09-11 16:49 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-09-11 10:44 [PATCH][RFC] Use bio markers for request callback Hannes Reinecke
2007-09-11 11:11 ` Jens Axboe
2007-09-11 14:27   ` Hannes Reinecke
2007-09-11 16:49 ` Kiyoshi Ueda

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox