Linux virtualization list
 help / color / mirror / Atom feed
* [PATCH v2 0/3] Improve virtio-blk performance
From: Asias He @ 2012-06-18  6:53 UTC (permalink / raw)
  To: kvm, linux-kernel, virtualization

This patchset implements bio-based IO path for virito-blk to improve
performance.

Fio test shows it gives, 28%, 24%, 21%, 16% IOPS boost and 32%, 17%, 21%, 16%
latency improvement for sequential read/write, random read/write respectively.

Asias He (3):
  block: Introduce __blk_segment_map_sg() helper
  block: Add blk_bio_map_sg() helper
  virtio-blk: Add bio-based IO path for virtio-blk

 block/blk-merge.c          |  117 +++++++++++++++++--------
 drivers/block/virtio_blk.c |  203 +++++++++++++++++++++++++++++++++++---------
 include/linux/blkdev.h     |    2 +
 3 files changed, 247 insertions(+), 75 deletions(-)

-- 
1.7.10.2

^ permalink raw reply

* [PATCH 1/3] block: Introduce __blk_segment_map_sg() helper
From: Asias He @ 2012-06-18  6:53 UTC (permalink / raw)
  To: kvm, linux-kernel, virtualization; +Cc: Jens Axboe, Tejun Heo, Shaohua Li
In-Reply-To: <1340002390-3950-1-git-send-email-asias@redhat.com>

Split the mapping code in blk_rq_map_sg() to a helper
__blk_segment_map_sg(), so that other mapping function, e.g.
blk_bio_map_sg(), can share the code.

Cc: Jens Axboe <axboe@kernel.dk>
Cc: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: linux-kernel@vger.kernel.org
Suggested-by: Tejun Heo <tj@kernel.org>
Suggested-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Asias He <asias@redhat.com>
---
 block/blk-merge.c |   80 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 45 insertions(+), 35 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 160035f..576b68e 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -110,6 +110,49 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 	return 0;
 }
 
+static void
+__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
+		     struct scatterlist *sglist, struct bio_vec **bvprv,
+		     struct scatterlist **sg, int *nsegs, int *cluster)
+{
+
+	int nbytes = bvec->bv_len;
+
+	if (*bvprv && *cluster) {
+		if ((*sg)->length + nbytes > queue_max_segment_size(q))
+			goto new_segment;
+
+		if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec))
+			goto new_segment;
+		if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec))
+			goto new_segment;
+
+		(*sg)->length += nbytes;
+	} else {
+new_segment:
+		if (!*sg)
+			*sg = sglist;
+		else {
+			/*
+			 * If the driver previously mapped a shorter
+			 * list, we could see a termination bit
+			 * prematurely unless it fully inits the sg
+			 * table on each mapping. We KNOW that there
+			 * must be more entries here or the driver
+			 * would be buggy, so force clear the
+			 * termination bit to avoid doing a full
+			 * sg_init_table() in drivers for each command.
+			 */
+			(*sg)->page_link &= ~0x02;
+			*sg = sg_next(*sg);
+		}
+
+		sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
+		(*nsegs)++;
+	}
+	*bvprv = bvec;
+}
+
 /*
  * map a request to scatterlist, return number of sg entries setup. Caller
  * must make sure sg can hold rq->nr_phys_segments entries
@@ -131,41 +174,8 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 	bvprv = NULL;
 	sg = NULL;
 	rq_for_each_segment(bvec, rq, iter) {
-		int nbytes = bvec->bv_len;
-
-		if (bvprv && cluster) {
-			if (sg->length + nbytes > queue_max_segment_size(q))
-				goto new_segment;
-
-			if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
-				goto new_segment;
-			if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
-				goto new_segment;
-
-			sg->length += nbytes;
-		} else {
-new_segment:
-			if (!sg)
-				sg = sglist;
-			else {
-				/*
-				 * If the driver previously mapped a shorter
-				 * list, we could see a termination bit
-				 * prematurely unless it fully inits the sg
-				 * table on each mapping. We KNOW that there
-				 * must be more entries here or the driver
-				 * would be buggy, so force clear the
-				 * termination bit to avoid doing a full
-				 * sg_init_table() in drivers for each command.
-				 */
-				sg->page_link &= ~0x02;
-				sg = sg_next(sg);
-			}
-
-			sg_set_page(sg, bvec->bv_page, nbytes, bvec->bv_offset);
-			nsegs++;
-		}
-		bvprv = bvec;
+		__blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg,
+				     &nsegs, &cluster);
 	} /* segments in rq */
 
 
-- 
1.7.10.2

^ permalink raw reply related

* [PATCH v2 2/3] block: Add blk_bio_map_sg() helper
From: Asias He @ 2012-06-18  6:53 UTC (permalink / raw)
  To: kvm, linux-kernel, virtualization
  Cc: Jens Axboe, Tejun Heo, Shaohua Li, Christoph Hellwig
In-Reply-To: <1340002390-3950-1-git-send-email-asias@redhat.com>

Add a helper to map a bio to a scatterlist, modelled after
blk_rq_map_sg.

This helper is useful for any driver that wants to create
a scatterlist from its ->make_request_fn method.

Changes in v2:
 - Use __blk_segment_map_sg to avoid duplicated code
 - Add cocbook style function comment

Cc: Jens Axboe <axboe@kernel.dk>
Cc: Tejun Heo <tj@kernel.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Asias He <asias@redhat.com>
---
 block/blk-merge.c      |   37 +++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |    2 ++
 2 files changed, 39 insertions(+)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 576b68e..e76279e 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -209,6 +209,43 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 }
 EXPORT_SYMBOL(blk_rq_map_sg);
 
+/**
+ * blk_bio_map_sg - map a bio to a scatterlist
+ * @q: request_queue in question
+ * @bio: bio being mapped
+ * @sglist: scatterlist being mapped
+ *
+ * Note:
+ *    Caller must make sure sg can hold bio->bi_phys_segments entries
+ *
+ * Will return the number of sg entries setup
+ */
+int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
+		   struct scatterlist *sglist)
+{
+	struct bio_vec *bvec, *bvprv;
+	struct scatterlist *sg;
+	int nsegs, cluster;
+	unsigned long i;
+
+	nsegs = 0;
+	cluster = blk_queue_cluster(q);
+
+	bvprv = NULL;
+	sg = NULL;
+	bio_for_each_segment(bvec, bio, i) {
+		__blk_segment_map_sg(q, bvec, sglist, &bvprv, &sg,
+				     &nsegs, &cluster);
+	} /* segments in bio */
+
+	if (sg)
+		sg_mark_end(sg);
+
+	BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments);
+	return nsegs;
+}
+EXPORT_SYMBOL(blk_bio_map_sg);
+
 static inline int ll_new_hw_segment(struct request_queue *q,
 				    struct request *req,
 				    struct bio *bio)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ba43f40..46916af 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -884,6 +884,8 @@ extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
+extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
+			  struct scatterlist *sglist);
 extern void blk_dump_rq_flags(struct request *, char *);
 extern long nr_blockdev_pages(void);
 
-- 
1.7.10.2

^ permalink raw reply related

* [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Asias He @ 2012-06-18  6:53 UTC (permalink / raw)
  To: kvm, linux-kernel, virtualization; +Cc: Christoph Hellwig, Michael S. Tsirkin
In-Reply-To: <1340002390-3950-1-git-send-email-asias@redhat.com>

This patch introduces bio-based IO path for virtio-blk.

Compared to request-based IO path, bio-based IO path uses driver
provided ->make_request_fn() method to bypasses the IO scheduler. It
handles the bio to device directly without allocating a request in block
layer. This reduces the IO path in guest kernel to achieve high IOPS
and lower latency. The downside is that guest can not use the IO
scheduler to merge and sort requests. However, this is not a big problem
if the backend disk in host side uses faster disk device.

When the bio-based IO path is not enabled, virtio-blk still uses the
original request-based IO path, no performance difference is observed.

Performance evaluation:
-----------------------------
Fio test is performed in a 8 vcpu guest with ramdisk based guest using
kvm tool.

Short version:
 With bio-based IO path, sequential read/write, random read/write
 IOPS boost         : 28%, 24%, 21%, 16%
 Latency improvement: 32%, 17%, 21%, 16%

Long version:
 With bio-based IO path:
  seq-read  : io=2048.0MB, bw=116996KB/s, iops=233991 , runt= 17925msec
  seq-write : io=2048.0MB, bw=100829KB/s, iops=201658 , runt= 20799msec
  rand-read : io=3095.7MB, bw=112134KB/s, iops=224268 , runt= 28269msec
  rand-write: io=3095.7MB, bw=96198KB/s,  iops=192396 , runt= 32952msec
    clat (usec): min=0 , max=2631.6K, avg=58716.99, stdev=191377.30
    clat (usec): min=0 , max=1753.2K, avg=66423.25, stdev=81774.35
    clat (usec): min=0 , max=2915.5K, avg=61685.70, stdev=120598.39
    clat (usec): min=0 , max=1933.4K, avg=76935.12, stdev=96603.45
  cpu : usr=74.08%, sys=703.84%, ctx=29661403, majf=21354, minf=22460954
  cpu : usr=70.92%, sys=702.81%, ctx=77219828, majf=13980, minf=27713137
  cpu : usr=72.23%, sys=695.37%, ctx=88081059, majf=18475, minf=28177648
  cpu : usr=69.69%, sys=654.13%, ctx=145476035, majf=15867, minf=26176375
 With request-based IO path:
  seq-read  : io=2048.0MB, bw=91074KB/s, iops=182147 , runt= 23027msec
  seq-write : io=2048.0MB, bw=80725KB/s, iops=161449 , runt= 25979msec
  rand-read : io=3095.7MB, bw=92106KB/s, iops=184211 , runt= 34416msec
  rand-write: io=3095.7MB, bw=82815KB/s, iops=165630 , runt= 38277msec
    clat (usec): min=0 , max=1932.4K, avg=77824.17, stdev=170339.49
    clat (usec): min=0 , max=2510.2K, avg=78023.96, stdev=146949.15
    clat (usec): min=0 , max=3037.2K, avg=74746.53, stdev=128498.27
    clat (usec): min=0 , max=1363.4K, avg=89830.75, stdev=114279.68
  cpu : usr=53.28%, sys=724.19%, ctx=37988895, majf=17531, minf=23577622
  cpu : usr=49.03%, sys=633.20%, ctx=205935380, majf=18197, minf=27288959
  cpu : usr=55.78%, sys=722.40%, ctx=101525058, majf=19273, minf=28067082
  cpu : usr=56.55%, sys=690.83%, ctx=228205022, majf=18039, minf=26551985

How to use:
-----------------------------
Add 'virtio_blk.use_bio=1' to kernel cmdline or 'modprobe virtio_blk
use_bio=1' to enable ->make_request_fn() based I/O path.

Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: virtualization@lists.linux-foundation.org
Cc: kvm@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Asias He <asias@redhat.com>
---
 drivers/block/virtio_blk.c |  203 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 163 insertions(+), 40 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 774c31d..b2d5002 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -14,6 +14,9 @@
 
 #define PART_BITS 4
 
+static bool use_bio;
+module_param(use_bio, bool, S_IRUGO);
+
 static int major;
 static DEFINE_IDA(vd_index_ida);
 
@@ -23,6 +26,7 @@ struct virtio_blk
 {
 	struct virtio_device *vdev;
 	struct virtqueue *vq;
+	wait_queue_head_t queue_wait;
 
 	/* The disk structure for the kernel. */
 	struct gendisk *disk;
@@ -51,53 +55,87 @@ struct virtio_blk
 struct virtblk_req
 {
 	struct request *req;
+	struct bio *bio;
 	struct virtio_blk_outhdr out_hdr;
 	struct virtio_scsi_inhdr in_hdr;
 	u8 status;
+	struct scatterlist sg[];
 };
 
-static void blk_done(struct virtqueue *vq)
+static inline int virtblk_result(struct virtblk_req *vbr)
+{
+	switch (vbr->status) {
+	case VIRTIO_BLK_S_OK:
+		return 0;
+	case VIRTIO_BLK_S_UNSUPP:
+		return -ENOTTY;
+	default:
+		return -EIO;
+	}
+}
+
+static inline void virtblk_request_done(struct virtio_blk *vblk,
+					struct virtblk_req *vbr)
+{
+	struct request *req = vbr->req;
+	int error = virtblk_result(vbr);
+
+	if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
+		req->resid_len = vbr->in_hdr.residual;
+		req->sense_len = vbr->in_hdr.sense_len;
+		req->errors = vbr->in_hdr.errors;
+	} else if (req->cmd_type == REQ_TYPE_SPECIAL) {
+		req->errors = (error != 0);
+	}
+
+	__blk_end_request_all(req, error);
+	mempool_free(vbr, vblk->pool);
+}
+
+static inline void virtblk_bio_done(struct virtio_blk *vblk,
+				    struct virtblk_req *vbr)
+{
+	bio_endio(vbr->bio, virtblk_result(vbr));
+	mempool_free(vbr, vblk->pool);
+}
+
+static void virtblk_done(struct virtqueue *vq)
 {
 	struct virtio_blk *vblk = vq->vdev->priv;
+	unsigned long bio_done = 0, req_done = 0;
 	struct virtblk_req *vbr;
-	unsigned int len;
 	unsigned long flags;
+	unsigned int len;
 
 	spin_lock_irqsave(vblk->disk->queue->queue_lock, flags);
 	while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
-		int error;
-
-		switch (vbr->status) {
-		case VIRTIO_BLK_S_OK:
-			error = 0;
-			break;
-		case VIRTIO_BLK_S_UNSUPP:
-			error = -ENOTTY;
-			break;
-		default:
-			error = -EIO;
-			break;
-		}
-
-		switch (vbr->req->cmd_type) {
-		case REQ_TYPE_BLOCK_PC:
-			vbr->req->resid_len = vbr->in_hdr.residual;
-			vbr->req->sense_len = vbr->in_hdr.sense_len;
-			vbr->req->errors = vbr->in_hdr.errors;
-			break;
-		case REQ_TYPE_SPECIAL:
-			vbr->req->errors = (error != 0);
-			break;
-		default:
-			break;
+		if (vbr->bio) {
+			virtblk_bio_done(vblk, vbr);
+			bio_done++;
+		} else {
+			virtblk_request_done(vblk, vbr);
+			req_done++;
 		}
-
-		__blk_end_request_all(vbr->req, error);
-		mempool_free(vbr, vblk->pool);
 	}
 	/* In case queue is stopped waiting for more buffers. */
-	blk_start_queue(vblk->disk->queue);
+	if (req_done)
+		blk_start_queue(vblk->disk->queue);
 	spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags);
+
+	if (bio_done)
+		wake_up(&vblk->queue_wait);
+}
+
+static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
+						    gfp_t gfp_mask)
+{
+	struct virtblk_req *vbr;
+
+	vbr = mempool_alloc(vblk->pool, gfp_mask);
+	if (vbr && use_bio)
+		sg_init_table(vbr->sg, vblk->sg_elems);
+
+	return vbr;
 }
 
 static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
@@ -106,13 +144,13 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 	unsigned long num, out = 0, in = 0;
 	struct virtblk_req *vbr;
 
-	vbr = mempool_alloc(vblk->pool, GFP_ATOMIC);
+	vbr = virtblk_alloc_req(vblk, GFP_ATOMIC);
 	if (!vbr)
 		/* When another request finishes we'll try again. */
 		return false;
 
 	vbr->req = req;
-
+	vbr->bio = NULL;
 	if (req->cmd_flags & REQ_FLUSH) {
 		vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
 		vbr->out_hdr.sector = 0;
@@ -172,7 +210,8 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 		}
 	}
 
-	if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, GFP_ATOMIC)<0) {
+	if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr,
+			      GFP_ATOMIC) < 0) {
 		mempool_free(vbr, vblk->pool);
 		return false;
 	}
@@ -180,7 +219,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 	return true;
 }
 
-static void do_virtblk_request(struct request_queue *q)
+static void virtblk_request(struct request_queue *q)
 {
 	struct virtio_blk *vblk = q->queuedata;
 	struct request *req;
@@ -203,6 +242,82 @@ static void do_virtblk_request(struct request_queue *q)
 		virtqueue_kick(vblk->vq);
 }
 
+static void virtblk_add_buf_wait(struct virtio_blk *vblk,
+				 struct virtblk_req *vbr,
+				 unsigned long out,
+				 unsigned long in)
+{
+	DEFINE_WAIT(wait);
+
+	for (;;) {
+		prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
+					  TASK_UNINTERRUPTIBLE);
+
+		spin_lock_irq(vblk->disk->queue->queue_lock);
+		if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
+				      GFP_ATOMIC) < 0) {
+			spin_unlock_irq(vblk->disk->queue->queue_lock);
+			io_schedule();
+		} else {
+			virtqueue_kick(vblk->vq);
+			spin_unlock_irq(vblk->disk->queue->queue_lock);
+			break;
+		}
+
+	}
+
+	finish_wait(&vblk->queue_wait, &wait);
+}
+
+static void virtblk_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct virtio_blk *vblk = q->queuedata;
+	unsigned int num, out = 0, in = 0;
+	struct virtblk_req *vbr;
+
+	BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems);
+	BUG_ON(bio->bi_rw & (REQ_FLUSH | REQ_FUA));
+
+	vbr = virtblk_alloc_req(vblk, GFP_NOIO);
+	if (!vbr) {
+		bio_endio(bio, -ENOMEM);
+		return;
+	}
+
+	vbr->bio = bio;
+	vbr->req = NULL;
+	vbr->out_hdr.type = 0;
+	vbr->out_hdr.sector = bio->bi_sector;
+	vbr->out_hdr.ioprio = bio_prio(bio);
+
+	sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
+
+	num = blk_bio_map_sg(q, bio, vbr->sg + out);
+
+	sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
+		   sizeof(vbr->status));
+
+	if (num) {
+		if (bio->bi_rw & REQ_WRITE) {
+			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
+			out += num;
+		} else {
+			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
+			in += num;
+		}
+	}
+
+	spin_lock_irq(vblk->disk->queue->queue_lock);
+	if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
+			      GFP_ATOMIC) < 0) {
+		spin_unlock_irq(vblk->disk->queue->queue_lock);
+		virtblk_add_buf_wait(vblk, vbr, out, in);
+	} else {
+		virtqueue_kick(vblk->vq);
+		spin_unlock_irq(vblk->disk->queue->queue_lock);
+	}
+}
+
 /* return id (s/n) string for *disk to *id_str
  */
 static int virtblk_get_id(struct gendisk *disk, char *id_str)
@@ -360,7 +475,7 @@ static int init_vq(struct virtio_blk *vblk)
 	int err = 0;
 
 	/* We expect one virtqueue, for output. */
-	vblk->vq = virtio_find_single_vq(vblk->vdev, blk_done, "requests");
+	vblk->vq = virtio_find_single_vq(vblk->vdev, virtblk_done, "requests");
 	if (IS_ERR(vblk->vq))
 		err = PTR_ERR(vblk->vq);
 
@@ -400,6 +515,8 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 	struct virtio_blk *vblk;
 	struct request_queue *q;
 	int err, index;
+	int pool_size;
+
 	u64 cap;
 	u32 v, blk_size, sg_elems, opt_io_size;
 	u16 min_io_size;
@@ -429,10 +546,12 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 		goto out_free_index;
 	}
 
+	init_waitqueue_head(&vblk->queue_wait);
 	vblk->vdev = vdev;
 	vblk->sg_elems = sg_elems;
 	sg_init_table(vblk->sg, vblk->sg_elems);
 	mutex_init(&vblk->config_lock);
+
 	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
 	vblk->config_enable = true;
 
@@ -440,7 +559,10 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 	if (err)
 		goto out_free_vblk;
 
-	vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req));
+	pool_size = sizeof(struct virtblk_req);
+	if (use_bio)
+		pool_size += sizeof(struct scatterlist) * sg_elems;
+	vblk->pool = mempool_create_kmalloc_pool(1, pool_size);
 	if (!vblk->pool) {
 		err = -ENOMEM;
 		goto out_free_vq;
@@ -453,12 +575,14 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 		goto out_mempool;
 	}
 
-	q = vblk->disk->queue = blk_init_queue(do_virtblk_request, NULL);
+	q = vblk->disk->queue = blk_init_queue(virtblk_request, NULL);
 	if (!q) {
 		err = -ENOMEM;
 		goto out_put_disk;
 	}
 
+	if (use_bio)
+		blk_queue_make_request(q, virtblk_make_request);
 	q->queuedata = vblk;
 
 	virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
@@ -471,7 +595,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 	vblk->index = index;
 
 	/* configure queue flush support */
-	if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH) && !use_bio)
 		blk_queue_flush(q, REQ_FLUSH);
 
 	/* If disk is read-only in the host, the guest should obey */
@@ -544,7 +668,6 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 	if (!err && opt_io_size)
 		blk_queue_io_opt(q, blk_size * opt_io_size);
 
-
 	add_disk(vblk->disk);
 	err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial);
 	if (err)
-- 
1.7.10.2

^ permalink raw reply related

* Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Rusty Russell @ 2012-06-18  7:46 UTC (permalink / raw)
  To: Asias He, kvm, linux-kernel, virtualization
  Cc: Christoph Hellwig, Michael S. Tsirkin
In-Reply-To: <1340002390-3950-4-git-send-email-asias@redhat.com>

On Mon, 18 Jun 2012 14:53:10 +0800, Asias He <asias@redhat.com> wrote:
> This patch introduces bio-based IO path for virtio-blk.

Why make it optional?

Thanks,
Rusty.

^ permalink raw reply

* Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Asias He @ 2012-06-18  8:03 UTC (permalink / raw)
  To: Rusty Russell
  Cc: kvm, Michael S. Tsirkin, linux-kernel, virtualization,
	Christoph Hellwig
In-Reply-To: <87hau9yse7.fsf@rustcorp.com.au>

On 06/18/2012 03:46 PM, Rusty Russell wrote:
> On Mon, 18 Jun 2012 14:53:10 +0800, Asias He <asias@redhat.com> wrote:
>> This patch introduces bio-based IO path for virtio-blk.
>
> Why make it optional?

request-based IO path is useful for users who do not want to bypass the 
IO scheduler in guest kernel, e.g. users using spinning disk. For users 
using fast disk device, e.g. SSD device, they can use bio-based IO path.

-- 
Asias

^ permalink raw reply

* Re: [PATCH v2 0/3] Improve virtio-blk performance
From: Stefan Hajnoczi @ 2012-06-18  9:14 UTC (permalink / raw)
  To: Asias He; +Cc: linux-kernel, kvm, virtualization
In-Reply-To: <1340002390-3950-1-git-send-email-asias@redhat.com>

On Mon, Jun 18, 2012 at 7:53 AM, Asias He <asias@redhat.com> wrote:
> Fio test shows it gives, 28%, 24%, 21%, 16% IOPS boost and 32%, 17%, 21%, 16%
> latency improvement for sequential read/write, random read/write respectively.

Sounds great.  What storage configuration did you use (single spinning
disk, SSD, storage array) and are these numbers for parallel I/O or
sequential I/O?

What changed since Minchan worked on this?  I remember he wasn't
satisfied that this was a clear win.  Your numbers are strong so
either you fixed something important or you are looking at different
benchmark configurations.

Stefan

^ permalink raw reply

* Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Stefan Hajnoczi @ 2012-06-18  9:37 UTC (permalink / raw)
  To: Asias He
  Cc: Michael S. Tsirkin, Christoph Hellwig, linux-kernel, kvm,
	virtualization
In-Reply-To: <1340002390-3950-4-git-send-email-asias@redhat.com>

On Mon, Jun 18, 2012 at 7:53 AM, Asias He <asias@redhat.com> wrote:
> +static void virtblk_add_buf_wait(struct virtio_blk *vblk,
> +                                struct virtblk_req *vbr,
> +                                unsigned long out,
> +                                unsigned long in)
> +{
> +       DEFINE_WAIT(wait);
> +
> +       for (;;) {
> +               prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
> +                                         TASK_UNINTERRUPTIBLE);
> +
> +               spin_lock_irq(vblk->disk->queue->queue_lock);
> +               if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
> +                                     GFP_ATOMIC) < 0) {
> +                       spin_unlock_irq(vblk->disk->queue->queue_lock);
> +                       io_schedule();
> +               } else {
> +                       virtqueue_kick(vblk->vq);
> +                       spin_unlock_irq(vblk->disk->queue->queue_lock);
> +                       break;
> +               }
> +
> +       }

Is there a meaningful condition where we would cancel this request
(e.g. signal)?  If the vring fills up for some reason and we get stuck
here the user might wish to kill hung processes.

Stefan

^ permalink raw reply

* Re: [PATCH v2 0/3] Improve virtio-blk performance
From: Asias He @ 2012-06-18  9:39 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: linux-kernel, kvm, virtualization
In-Reply-To: <CAJSP0QXOn-5Krm6hSMmm=zRYkG43k9F4Xe6P+ENMFtgMhTupFA@mail.gmail.com>

On 06/18/2012 05:14 PM, Stefan Hajnoczi wrote:
> On Mon, Jun 18, 2012 at 7:53 AM, Asias He <asias@redhat.com> wrote:
>> Fio test shows it gives, 28%, 24%, 21%, 16% IOPS boost and 32%, 17%, 21%, 16%
>> latency improvement for sequential read/write, random read/write respectively.
>
> Sounds great.  What storage configuration did you use (single spinning
> disk, SSD, storage array) and are these numbers for parallel I/O or
> sequential I/O?

I used ramdisk as the backend storage.

> What changed since Minchan worked on this?  I remember he wasn't
> satisfied that this was a clear win.  Your numbers are strong so
> either you fixed something important or you are looking at different
> benchmark configurations.

I am using kvm tool instead qemu. He wasn't satisfied the poor 
sequential performance. I removed the plug and unplug operation and bio 
completion batching. You can grab Michan's patch and make a diff to see 
the details.

Here is the fio's config file.

[global]
exec_prerun="echo 3 > /proc/sys/vm/drop_caches"
group_reporting
norandommap
ioscheduler=noop
thread
bs=512
size=4MB
direct=1
filename=/dev/vdb
numjobs=256
ioengine=aio
iodepth=64
loops=3

[seq-read]
stonewall
rw=read

[seq-write]
stonewall
rw=write

[rnd-read]
stonewall
rw=randread

[rnd-write]
stonewall
rw=randwrite



-- 
Asias

^ permalink raw reply

* Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Rusty Russell @ 2012-06-18 10:05 UTC (permalink / raw)
  To: Asias He
  Cc: kvm, Michael S. Tsirkin, linux-kernel, virtualization,
	Christoph Hellwig
In-Reply-To: <4FDEE0CB.1030505@redhat.com>

On Mon, 18 Jun 2012 16:03:23 +0800, Asias He <asias@redhat.com> wrote:
> On 06/18/2012 03:46 PM, Rusty Russell wrote:
> > On Mon, 18 Jun 2012 14:53:10 +0800, Asias He <asias@redhat.com> wrote:
> >> This patch introduces bio-based IO path for virtio-blk.
> >
> > Why make it optional?
> 
> request-based IO path is useful for users who do not want to bypass the 
> IO scheduler in guest kernel, e.g. users using spinning disk. For users 
> using fast disk device, e.g. SSD device, they can use bio-based IO path.

Users using a spinning disk still get IO scheduling in the host though.
What benefit is there in doing it in the guest as well?

Cheers,
Rusty.

^ permalink raw reply

* Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Michael S. Tsirkin @ 2012-06-18 10:13 UTC (permalink / raw)
  To: Asias He; +Cc: kvm, linux-kernel, virtualization, Christoph Hellwig
In-Reply-To: <4FDEE0CB.1030505@redhat.com>

On Mon, Jun 18, 2012 at 04:03:23PM +0800, Asias He wrote:
> On 06/18/2012 03:46 PM, Rusty Russell wrote:
> >On Mon, 18 Jun 2012 14:53:10 +0800, Asias He <asias@redhat.com> wrote:
> >>This patch introduces bio-based IO path for virtio-blk.
> >
> >Why make it optional?
> 
> request-based IO path is useful for users who do not want to bypass
> the IO scheduler in guest kernel, e.g. users using spinning disk.
> For users using fast disk device, e.g. SSD device, they can use
> bio-based IO path.

OK I guess but then it should be per-device. There could be
a mix of slow and fast disks :)

> -- 
> Asias
> 

^ permalink raw reply

* Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Michael S. Tsirkin @ 2012-06-18 10:21 UTC (permalink / raw)
  To: Asias He; +Cc: kvm, linux-kernel, virtualization, Christoph Hellwig
In-Reply-To: <1340002390-3950-4-git-send-email-asias@redhat.com>

On Mon, Jun 18, 2012 at 02:53:10PM +0800, Asias He wrote:
> +static void virtblk_make_request(struct request_queue *q, struct bio *bio)
> +{
> +	struct virtio_blk *vblk = q->queuedata;
> +	unsigned int num, out = 0, in = 0;
> +	struct virtblk_req *vbr;
> +
> +	BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems);
> +	BUG_ON(bio->bi_rw & (REQ_FLUSH | REQ_FUA));
> +
> +	vbr = virtblk_alloc_req(vblk, GFP_NOIO);
> +	if (!vbr) {
> +		bio_endio(bio, -ENOMEM);
> +		return;
> +	}
> +
> +	vbr->bio = bio;
> +	vbr->req = NULL;
> +	vbr->out_hdr.type = 0;
> +	vbr->out_hdr.sector = bio->bi_sector;
> +	vbr->out_hdr.ioprio = bio_prio(bio);
> +
> +	sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
> +
> +	num = blk_bio_map_sg(q, bio, vbr->sg + out);
> +
> +	sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
> +		   sizeof(vbr->status));
> +
> +	if (num) {
> +		if (bio->bi_rw & REQ_WRITE) {
> +			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
> +			out += num;
> +		} else {
> +			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
> +			in += num;
> +		}
> +	}
> +
> +	spin_lock_irq(vblk->disk->queue->queue_lock);
> +	if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
> +			      GFP_ATOMIC) < 0) {
> +		spin_unlock_irq(vblk->disk->queue->queue_lock);

Any implications of dropping lock like that?
E.g. for suspend. like we are still discussing with
unlocked kick?

> +		virtblk_add_buf_wait(vblk, vbr, out, in);
> +	} else {
> +		virtqueue_kick(vblk->vq);

Why special case the first call? task state manipulation so expensive?

> +		spin_unlock_irq(vblk->disk->queue->queue_lock);
> +	}
> +}
> +

^ permalink raw reply

* Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Stefan Hajnoczi @ 2012-06-18 10:45 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: kvm, linux-kernel, virtualization, Christoph Hellwig
In-Reply-To: <20120618102108.GD23134@redhat.com>

On Mon, Jun 18, 2012 at 11:21 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> On Mon, Jun 18, 2012 at 02:53:10PM +0800, Asias He wrote:
>> +static void virtblk_make_request(struct request_queue *q, struct bio *bio)
>> +{
>> +     struct virtio_blk *vblk = q->queuedata;
>> +     unsigned int num, out = 0, in = 0;
>> +     struct virtblk_req *vbr;
>> +
>> +     BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems);
>> +     BUG_ON(bio->bi_rw & (REQ_FLUSH | REQ_FUA));
>> +
>> +     vbr = virtblk_alloc_req(vblk, GFP_NOIO);
>> +     if (!vbr) {
>> +             bio_endio(bio, -ENOMEM);
>> +             return;
>> +     }
>> +
>> +     vbr->bio = bio;
>> +     vbr->req = NULL;
>> +     vbr->out_hdr.type = 0;
>> +     vbr->out_hdr.sector = bio->bi_sector;
>> +     vbr->out_hdr.ioprio = bio_prio(bio);
>> +
>> +     sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
>> +
>> +     num = blk_bio_map_sg(q, bio, vbr->sg + out);
>> +
>> +     sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
>> +                sizeof(vbr->status));
>> +
>> +     if (num) {
>> +             if (bio->bi_rw & REQ_WRITE) {
>> +                     vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
>> +                     out += num;
>> +             } else {
>> +                     vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
>> +                     in += num;
>> +             }
>> +     }
>> +
>> +     spin_lock_irq(vblk->disk->queue->queue_lock);
>> +     if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
>> +                           GFP_ATOMIC) < 0) {
>> +             spin_unlock_irq(vblk->disk->queue->queue_lock);
>
> Any implications of dropping lock like that?
> E.g. for suspend. like we are still discussing with
> unlocked kick?

Since we aquired the lock in this function there should be no problem.
 Whatever protects against vblk or vblk->disk disappearing upon
entering this function also protects after unlocking queue_lock.
Otherwise all .make_request_fn() functions would be broken.

I'd still like to understand the details though.

Stefan

^ permalink raw reply

* Re: [PATCH v2 0/3] Improve virtio-blk performance
From: Stefan Hajnoczi @ 2012-06-18 10:58 UTC (permalink / raw)
  To: Asias He; +Cc: linux-kernel, kvm, virtualization
In-Reply-To: <4FDEF73E.3010501@redhat.com>

On Mon, Jun 18, 2012 at 10:39 AM, Asias He <asias@redhat.com> wrote:
> On 06/18/2012 05:14 PM, Stefan Hajnoczi wrote:
>>
>> On Mon, Jun 18, 2012 at 7:53 AM, Asias He <asias@redhat.com> wrote:
>>>
>>> Fio test shows it gives, 28%, 24%, 21%, 16% IOPS boost and 32%, 17%, 21%,
>>> 16%
>>> latency improvement for sequential read/write, random read/write
>>> respectively.
>>
>>
>> Sounds great.  What storage configuration did you use (single spinning
>> disk, SSD, storage array) and are these numbers for parallel I/O or
>> sequential I/O?
>
>
> I used ramdisk as the backend storage.

As long as the latency is decreasing that's good.  But It's worth
keeping in mind that these percentages are probably wildly different
on real storage devices and/or qemu-kvm.  What we don't know here is
whether this bottleneck matters in real environments - results with
real storage and with qemu-kvm would be interesting.

Stefan

^ permalink raw reply

* Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Dor Laor @ 2012-06-18 11:14 UTC (permalink / raw)
  To: Rusty Russell
  Cc: kvm, Michael S. Tsirkin, linux-kernel, virtualization,
	Christoph Hellwig
In-Reply-To: <87zk81x7dp.fsf@rustcorp.com.au>

On 06/18/2012 01:05 PM, Rusty Russell wrote:
> On Mon, 18 Jun 2012 16:03:23 +0800, Asias He<asias@redhat.com>  wrote:
>> On 06/18/2012 03:46 PM, Rusty Russell wrote:
>>> On Mon, 18 Jun 2012 14:53:10 +0800, Asias He<asias@redhat.com>  wrote:
>>>> This patch introduces bio-based IO path for virtio-blk.
>>>
>>> Why make it optional?
>>
>> request-based IO path is useful for users who do not want to bypass the
>> IO scheduler in guest kernel, e.g. users using spinning disk. For users
>> using fast disk device, e.g. SSD device, they can use bio-based IO path.
>
> Users using a spinning disk still get IO scheduling in the host though.
> What benefit is there in doing it in the guest as well?

The io scheduler waits for requests to merge and thus batch IOs 
together. It's not important w.r.t spinning disks since the host can do 
it but it causes much less vmexits which is the key issue for VMs.

>
> Cheers,
> Rusty.
> _______________________________________________
> Virtualization mailing list
> Virtualization@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Sasha Levin @ 2012-06-18 11:39 UTC (permalink / raw)
  To: dlaor
  Cc: kvm, Michael S. Tsirkin, linux-kernel, virtualization,
	Christoph Hellwig
In-Reply-To: <4FDF0DA7.40604@redhat.com>

On Mon, 2012-06-18 at 14:14 +0300, Dor Laor wrote:
> On 06/18/2012 01:05 PM, Rusty Russell wrote:
> > On Mon, 18 Jun 2012 16:03:23 +0800, Asias He<asias@redhat.com>  wrote:
> >> On 06/18/2012 03:46 PM, Rusty Russell wrote:
> >>> On Mon, 18 Jun 2012 14:53:10 +0800, Asias He<asias@redhat.com>  wrote:
> >>>> This patch introduces bio-based IO path for virtio-blk.
> >>>
> >>> Why make it optional?
> >>
> >> request-based IO path is useful for users who do not want to bypass the
> >> IO scheduler in guest kernel, e.g. users using spinning disk. For users
> >> using fast disk device, e.g. SSD device, they can use bio-based IO path.
> >
> > Users using a spinning disk still get IO scheduling in the host though.
> > What benefit is there in doing it in the guest as well?
> 
> The io scheduler waits for requests to merge and thus batch IOs 
> together. It's not important w.r.t spinning disks since the host can do 
> it but it causes much less vmexits which is the key issue for VMs. 

Is the amount of exits caused by virtio-blk significant at all with
EVENT_IDX?

^ permalink raw reply

* Re: [PATCH RFC] virtio-spec: flexible configuration layout
From: Michael S. Tsirkin @ 2012-06-18 11:54 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Krishna Kumar, kvm, Pawel Moll, Wang Sheng-Hui,
	Alexey Kardashevskiy, lkml - Kernel Mailing List, virtualization,
	Christian Borntraeger, penberg, Sasha Levin, Amit Shah, avi
In-Reply-To: <alpine.LFD.2.02.1111091434230.4936@tux.localdomain>

On Wed, Nov 09, 2011 at 02:36:43PM +0200, Pekka Enberg wrote:
> On Wed, 9 Nov 2011, Sasha Levin wrote:
> >They don't exist in kernel code either, for same reason as above.
> >
> >Nothing will break if we remove it since no one really used it, we were
> >probably the first and only implementation of the spec which considered
> >them :)
> 
> As long as we are able to run older versions of the KVM tool with
> newer kernels and vice versa, I see no reason why we can't drop
> 64-bit features from the KVM tool.
> 
> 			Pekka

So what happened? Did you guys do this?  Need to know what to do to make
progress.  IIUC Rusty removed the new fields in 0.9.3.
Does your tool still use them? Did any version of the tool released by
distros do so?

-- 
MST

^ permalink raw reply

* Re: [Qemu-devel] [PATCH RFC] virtio-pci: add MMIO property
From: Michael S. Tsirkin @ 2012-06-18 12:05 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Anthony Liguori, kvm, Alexey Kardashevskiy, qemu-devel,
	virtualization, avi, Anthony Liguori
In-Reply-To: <877gygqhbp.fsf@rustcorp.com.au>

On Tue, Mar 20, 2012 at 10:22:42AM +1030, Rusty Russell wrote:
> On Mon, 19 Mar 2012 17:13:06 -0500, Anthony Liguori <anthony@codemonkey.ws> wrote:
> > > Maybe just make this a hidden option like x-miio?
> > 
> > x-violate-the-virtio-spec-to-trick-old-linux-drivers-into-working-on-power?
> 
> "To configure the device, we use the first I/O region of the PCI
> device."
> 
> Meh, it does sound a little like we are specifying that it's an PCI I/O
> bar.
> 
> Let's resurrect the PCI-v2 idea, which is ready to implement now, and a
> nice cleanup?  Detach it from the change-of-ring-format idea which is
> turning out to be a tarpit.
> 
> Thanks,
> Rusty.

Yes. But it seems silly to even write code to play with device config in
memory when we agreed the right thing to do is to use a config vq
everywhere.

Now a question: does a oconfig vq look like a PCI specific
feature to you, a work-around for lack of multibyte atomic
accesses? If yes it's sane to make it a PCI capability.
Or is it something most transports would need? If yes we
need a feature bit and this is a chicken and egg problem ...

> -- 
>   How could I marry someone with more hair than me?  http://baldalex.org

^ permalink raw reply

* Re: [PATCH RFC] virtio-spec: flexible configuration layout
From: Sasha Levin @ 2012-06-18 12:05 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Krishna Kumar, kvm, Pawel Moll, Wang Sheng-Hui,
	Alexey Kardashevskiy, lkml - Kernel Mailing List, virtualization,
	Pekka Enberg, Christian Borntraeger, penberg, avi, Amit Shah
In-Reply-To: <20120618115440.GA25042@redhat.com>

On Mon, 2012-06-18 at 14:54 +0300, Michael S. Tsirkin wrote:
> On Wed, Nov 09, 2011 at 02:36:43PM +0200, Pekka Enberg wrote:
> > On Wed, 9 Nov 2011, Sasha Levin wrote:
> > >They don't exist in kernel code either, for same reason as above.
> > >
> > >Nothing will break if we remove it since no one really used it, we were
> > >probably the first and only implementation of the spec which considered
> > >them :)
> > 
> > As long as we are able to run older versions of the KVM tool with
> > newer kernels and vice versa, I see no reason why we can't drop
> > 64-bit features from the KVM tool.
> > 
> > 			Pekka
> 
> So what happened? Did you guys do this?  Need to know what to do to make
> progress.  IIUC Rusty removed the new fields in 0.9.3.
> Does your tool still use them? Did any version of the tool released by
> distros do so?

Yup, they were removed quite a while ago.

^ permalink raw reply

* Re: [PATCH RFC] virtio-spec: flexible configuration layout
From: Michael S. Tsirkin @ 2012-06-18 12:07 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Krishna Kumar, kvm, Pawel Moll, Wang Sheng-Hui,
	Alexey Kardashevskiy, lkml - Kernel Mailing List, virtualization,
	Pekka Enberg, Christian Borntraeger, penberg, avi, Amit Shah
In-Reply-To: <1340021117.22848.3.camel@lappy>

On Mon, Jun 18, 2012 at 02:05:17PM +0200, Sasha Levin wrote:
> On Mon, 2012-06-18 at 14:54 +0300, Michael S. Tsirkin wrote:
> > On Wed, Nov 09, 2011 at 02:36:43PM +0200, Pekka Enberg wrote:
> > > On Wed, 9 Nov 2011, Sasha Levin wrote:
> > > >They don't exist in kernel code either, for same reason as above.
> > > >
> > > >Nothing will break if we remove it since no one really used it, we were
> > > >probably the first and only implementation of the spec which considered
> > > >them :)
> > > 
> > > As long as we are able to run older versions of the KVM tool with
> > > newer kernels and vice versa, I see no reason why we can't drop
> > > 64-bit features from the KVM tool.
> > > 
> > > 			Pekka
> > 
> > So what happened? Did you guys do this?  Need to know what to do to make
> > progress.  IIUC Rusty removed the new fields in 0.9.3.
> > Does your tool still use them? Did any version of the tool released by
> > distros do so?
> 
> Yup, they were removed quite a while ago.

Thanks, that's nice. So we can reuse that bit if we need it
and it won't break your code.

-- 
MST

^ permalink raw reply

* [RFC 1/2] virtio-ring: Use threshold for switching to indirect descriptors
From: Sasha Levin @ 2012-06-18 14:09 UTC (permalink / raw)
  To: mst, asias, stefanha; +Cc: Sasha Levin, kvm, virtualization

Currently if VIRTIO_RING_F_INDIRECT_DESC is enabled we will use indirect
descriptors even if we have plenty of space in the ring. This means that
we take a performance hit at all times due to the overhead of creating
indirect descriptors.

Instead, use it only after we're below a configurable offset.

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 drivers/block/virtio_blk.c          |    4 ++++
 drivers/char/hw_random/virtio-rng.c |    4 ++++
 drivers/char/virtio_console.c       |    4 ++++
 drivers/net/virtio_net.c            |    4 ++++
 drivers/virtio/virtio_balloon.c     |    4 ++++
 drivers/virtio/virtio_ring.c        |   21 +++++++++++++++------
 include/linux/virtio.h              |    1 +
 net/9p/trans_virtio.c               |    4 ++++
 8 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 693187d..a2c8d97 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -19,6 +19,9 @@ static DEFINE_IDA(vd_index_ida);
 
 struct workqueue_struct *virtblk_wq;
 
+static unsigned int indirect_thresh = 0;
+module_param(indirect_thresh, uint, S_IRUGO);
+
 struct virtio_blk
 {
 	spinlock_t lock;
@@ -438,6 +441,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 	mutex_init(&vblk->config_lock);
 	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
 	vblk->config_enable = true;
+	vdev->indirect_thresh = indirect_thresh;
 
 	err = init_vq(vblk);
 	if (err)
diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
index 723725b..bcaddb9 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -25,6 +25,9 @@
 #include <linux/virtio_rng.h>
 #include <linux/module.h>
 
+static unsigned int indirect_thresh = 0;
+module_param(indirect_thresh, uint, S_IRUGO);
+
 static struct virtqueue *vq;
 static unsigned int data_avail;
 static DECLARE_COMPLETION(have_data);
@@ -90,6 +93,7 @@ static int virtrng_probe(struct virtio_device *vdev)
 	int err;
 
 	/* We expect a single virtqueue. */
+	vdev->indirect_thresh = indirect_thresh;
 	vq = virtio_find_single_vq(vdev, random_recv_done, "input");
 	if (IS_ERR(vq))
 		return PTR_ERR(vq);
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index cdf2f54..60397a4 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -37,6 +37,9 @@
 #include <linux/module.h>
 #include "../tty/hvc/hvc_console.h"
 
+static unsigned int indirect_thresh = 0;
+module_param(indirect_thresh, uint, S_IRUGO);
+
 /*
  * This is a global struct for storing common data for all the devices
  * this driver handles.
@@ -1729,6 +1732,7 @@ static int __devinit virtcons_probe(struct virtio_device *vdev)
 				       max_nr_ports),
 			      &portdev->config.max_nr_ports) == 0)
 		multiport = true;
+	vdev->indirect_thresh = indirect_thresh;
 
 	err = init_vqs(portdev);
 	if (err < 0) {
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index f18149a..5c1be92 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -34,6 +34,9 @@ static bool csum = true, gso = true;
 module_param(csum, bool, 0444);
 module_param(gso, bool, 0444);
 
+static unsigned int indirect_thresh = 0;
+module_param(indirect_thresh, uint, S_IRUGO);
+
 /* FIXME: MTU in config. */
 #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
 #define GOOD_COPY_LEN	128
@@ -1128,6 +1131,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
 		vi->mergeable_rx_bufs = true;
+	vdev->indirect_thresh = indirect_thresh;
 
 	err = init_vqs(vi);
 	if (err)
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index bfbc15c..4fc11ba 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -35,6 +35,9 @@
  */
 #define VIRTIO_BALLOON_PAGES_PER_PAGE (PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT)
 
+static unsigned int indirect_thresh = 0;
+module_param(indirect_thresh, uint, S_IRUGO);
+
 struct virtio_balloon
 {
 	struct virtio_device *vdev;
@@ -360,6 +363,7 @@ static int virtballoon_probe(struct virtio_device *vdev)
 	init_waitqueue_head(&vb->config_change);
 	vb->vdev = vdev;
 	vb->need_stats_update = 0;
+	vdev->indirect_thresh = indirect_thresh;
 
 	err = init_vqs(vb);
 	if (err)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 5aa43c3..99a64a7 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -87,8 +87,11 @@ struct vring_virtqueue
 	/* Other side has made a mess, don't try any more. */
 	bool broken;
 
-	/* Host supports indirect buffers */
-	bool indirect;
+	/* 
+	 * Min. number of free space in the ring to trigger direct
+	 * descriptor use
+	 */
+	unsigned int indirect_thresh;
 
 	/* Host publishes avail event idx */
 	bool event;
@@ -216,9 +219,12 @@ int virtqueue_add_buf(struct virtqueue *_vq,
 	}
 #endif
 
-	/* If the host supports indirect descriptor tables, and we have multiple
-	 * buffers, then go indirect. FIXME: tune this threshold */
-	if (vq->indirect && (out + in) > 1 && vq->num_free) {
+	/*
+	 * If the host supports indirect descriptor tables, and we have multiple
+	 * buffers, then go indirect.
+	 */
+	if ((out + in) > 1 && vq->num_free &&
+		(vq->num_free < vq->indirect_thresh)) {
 		head = vring_add_indirect(vq, sg, out, in, gfp);
 		if (likely(head >= 0))
 			goto add_head;
@@ -647,13 +653,16 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,
 	vq->broken = false;
 	vq->last_used_idx = 0;
 	vq->num_added = 0;
+	vq->indirect_thresh = 0;
 	list_add_tail(&vq->vq.list, &vdev->vqs);
 #ifdef DEBUG
 	vq->in_use = false;
 	vq->last_add_time_valid = false;
 #endif
 
-	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC);
+	if (virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
+		vq->indirect_thresh = vdev->indirect_thresh;
+
 	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
 
 	/* No callback?  Tell other side not to bother us. */
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 8efd28a..36019ec 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -69,6 +69,7 @@ struct virtio_device {
 	/* Note that this is a Linux set_bit-style bitmap. */
 	unsigned long features[1];
 	void *priv;
+	unsigned int indirect_thresh;
 };
 
 #define dev_to_virtio(dev) container_of(dev, struct virtio_device, dev)
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 5af18d1..357bfba 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -52,6 +52,9 @@
 
 #define VIRTQUEUE_NUM	128
 
+static unsigned int indirect_thresh = 0;
+module_param(indirect_thresh, uint, S_IRUGO);
+
 /* a single mutex to manage channel initialization and attachment */
 static DEFINE_MUTEX(virtio_9p_lock);
 static DECLARE_WAIT_QUEUE_HEAD(vp_wq);
@@ -501,6 +504,7 @@ static int p9_virtio_probe(struct virtio_device *vdev)
 	chan->vdev = vdev;
 
 	/* We expect one virtqueue, for requests. */
+	vdev->indirect_thresh = indirect_thresh;
 	chan->vq = virtio_find_single_vq(vdev, req_done, "requests");
 	if (IS_ERR(chan->vq)) {
 		err = PTR_ERR(chan->vq);
-- 
1.7.8.6

^ permalink raw reply related

* [RFC 2/2] virtio-ring: Allocate indirect buffers from cache when possible
From: Sasha Levin @ 2012-06-18 14:09 UTC (permalink / raw)
  To: mst, asias, stefanha; +Cc: Sasha Levin, kvm, virtualization
In-Reply-To: <1340028598-10142-1-git-send-email-levinsasha928@gmail.com>

Currently if VIRTIO_RING_F_INDIRECT_DESC is enabled we will
use indirect descriptors and allocate them using a simple
kmalloc().

This patch adds a cache which will allow indirect buffers under
a configurable size to be allocated from that cache instead.

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 drivers/block/virtio_blk.c          |    4 ++++
 drivers/char/hw_random/virtio-rng.c |    4 ++++
 drivers/char/virtio_console.c       |    4 ++++
 drivers/net/virtio_net.c            |    4 ++++
 drivers/virtio/virtio_balloon.c     |    4 ++++
 drivers/virtio/virtio_ring.c        |   28 ++++++++++++++++++++++++----
 include/linux/virtio.h              |    1 +
 net/9p/trans_virtio.c               |    5 +++++
 8 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index a2c8d97..3c6d1bd 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -22,6 +22,9 @@ struct workqueue_struct *virtblk_wq;
 static unsigned int indirect_thresh = 0;
 module_param(indirect_thresh, uint, S_IRUGO);
 
+static unsigned int indirect_alloc_thresh = 0;
+module_param(indirect_alloc_thresh, uint, S_IRUGO);
+
 struct virtio_blk
 {
 	spinlock_t lock;
@@ -442,6 +445,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
 	vblk->config_enable = true;
 	vdev->indirect_thresh = indirect_thresh;
+	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
 
 	err = init_vq(vblk);
 	if (err)
diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
index bcaddb9..6a32f76 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -28,6 +28,9 @@
 static unsigned int indirect_thresh = 0;
 module_param(indirect_thresh, uint, S_IRUGO);
 
+static unsigned int indirect_alloc_thresh = 0;
+module_param(indirect_alloc_thresh, uint, S_IRUGO);
+
 static struct virtqueue *vq;
 static unsigned int data_avail;
 static DECLARE_COMPLETION(have_data);
@@ -94,6 +97,7 @@ static int virtrng_probe(struct virtio_device *vdev)
 
 	/* We expect a single virtqueue. */
 	vdev->indirect_thresh = indirect_thresh;
+	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
 	vq = virtio_find_single_vq(vdev, random_recv_done, "input");
 	if (IS_ERR(vq))
 		return PTR_ERR(vq);
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 60397a4..7c33714 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -40,6 +40,9 @@
 static unsigned int indirect_thresh = 0;
 module_param(indirect_thresh, uint, S_IRUGO);
 
+static unsigned int indirect_alloc_thresh = 0;
+module_param(indirect_alloc_thresh, uint, S_IRUGO);
+
 /*
  * This is a global struct for storing common data for all the devices
  * this driver handles.
@@ -1733,6 +1736,7 @@ static int __devinit virtcons_probe(struct virtio_device *vdev)
 			      &portdev->config.max_nr_ports) == 0)
 		multiport = true;
 	vdev->indirect_thresh = indirect_thresh;
+	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
 
 	err = init_vqs(portdev);
 	if (err < 0) {
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 5c1be92..441bd2f 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -37,6 +37,9 @@ module_param(gso, bool, 0444);
 static unsigned int indirect_thresh = 0;
 module_param(indirect_thresh, uint, S_IRUGO);
 
+static unsigned int indirect_alloc_thresh = 0;
+module_param(indirect_alloc_thresh, uint, S_IRUGO);
+
 /* FIXME: MTU in config. */
 #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
 #define GOOD_COPY_LEN	128
@@ -1132,6 +1135,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
 		vi->mergeable_rx_bufs = true;
 	vdev->indirect_thresh = indirect_thresh;
+	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
 
 	err = init_vqs(vi);
 	if (err)
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 4fc11ba..6e6313b 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -38,6 +38,9 @@
 static unsigned int indirect_thresh = 0;
 module_param(indirect_thresh, uint, S_IRUGO);
 
+static unsigned int indirect_alloc_thresh = 0;
+module_param(indirect_alloc_thresh, uint, S_IRUGO);
+
 struct virtio_balloon
 {
 	struct virtio_device *vdev;
@@ -364,6 +367,7 @@ static int virtballoon_probe(struct virtio_device *vdev)
 	vb->vdev = vdev;
 	vb->need_stats_update = 0;
 	vdev->indirect_thresh = indirect_thresh;
+	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
 
 	err = init_vqs(vb);
 	if (err)
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 99a64a7..f0b0ce3 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -93,6 +93,10 @@ struct vring_virtqueue
 	 */
 	unsigned int indirect_thresh;
 
+	/* Buffers below this size will be allocated from cache */
+	unsigned int indirect_alloc_thresh;
+	struct kmem_cache *indirect_cache;
+
 	/* Host publishes avail event idx */
 	bool event;
 
@@ -135,7 +139,10 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
 	unsigned head;
 	int i;
 
-	desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp);
+	if ((out + in) <= vq->indirect_alloc_thresh)
+		desc = kmem_cache_alloc(vq->indirect_cache, gfp);
+	else
+		desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp);
 	if (!desc)
 		return -ENOMEM;
 
@@ -384,8 +391,13 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
 	i = head;
 
 	/* Free the indirect table */
-	if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT)
-		kfree(phys_to_virt(vq->vring.desc[i].addr));
+	if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT) {
+		if (vq->vring.desc[i].len < vq->indirect_alloc_thresh)
+			kfree(phys_to_virt(vq->vring.desc[i].addr));
+		else
+			kmem_cache_free(vq->indirect_cache,
+					phys_to_virt(vq->vring.desc[i].addr));
+	}
 
 	while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
 		i = vq->vring.desc[i].next;
@@ -654,14 +666,20 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,
 	vq->last_used_idx = 0;
 	vq->num_added = 0;
 	vq->indirect_thresh = 0;
+	vq->indirect_alloc_thresh = 0;
+	vq->indirect_cache = NULL;
 	list_add_tail(&vq->vq.list, &vdev->vqs);
 #ifdef DEBUG
 	vq->in_use = false;
 	vq->last_add_time_valid = false;
 #endif
 
-	if (virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
+	if (virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) {
 		vq->indirect_thresh = vdev->indirect_thresh;
+		vq->indirect_alloc_thresh = vdev->indirect_alloc_thresh;
+		if (vq->indirect_alloc_thresh)
+			vq->indirect_cache = KMEM_CACHE(vring_desc[vq->indirect_alloc_thresh], 0);
+	}
 
 	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
 
@@ -685,6 +703,8 @@ EXPORT_SYMBOL_GPL(vring_new_virtqueue);
 void vring_del_virtqueue(struct virtqueue *vq)
 {
 	list_del(&vq->list);
+	if (to_vvq(vq)->indirect_cache)
+		kmem_cache_destroy(to_vvq(vq)->indirect_cache);
 	kfree(to_vvq(vq));
 }
 EXPORT_SYMBOL_GPL(vring_del_virtqueue);
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 36019ec..f3f0d3f 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -70,6 +70,7 @@ struct virtio_device {
 	unsigned long features[1];
 	void *priv;
 	unsigned int indirect_thresh;
+	unsigned int indirect_alloc_thresh;
 };
 
 #define dev_to_virtio(dev) container_of(dev, struct virtio_device, dev)
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 357bfba..8424b36 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -55,6 +55,9 @@
 static unsigned int indirect_thresh = 0;
 module_param(indirect_thresh, uint, S_IRUGO);
 
+static unsigned int indirect_alloc_thresh = 0;
+module_param(indirect_alloc_thresh, uint, S_IRUGO);
+
 /* a single mutex to manage channel initialization and attachment */
 static DEFINE_MUTEX(virtio_9p_lock);
 static DECLARE_WAIT_QUEUE_HEAD(vp_wq);
@@ -505,6 +508,8 @@ static int p9_virtio_probe(struct virtio_device *vdev)
 
 	/* We expect one virtqueue, for requests. */
 	vdev->indirect_thresh = indirect_thresh;
+	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
+
 	chan->vq = virtio_find_single_vq(vdev, req_done, "requests");
 	if (IS_ERR(chan->vq)) {
 		err = PTR_ERR(chan->vq);
-- 
1.7.8.6

^ permalink raw reply related

* Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Tejun Heo @ 2012-06-18 21:28 UTC (permalink / raw)
  To: Rusty Russell
  Cc: kvm, Michael S. Tsirkin, linux-kernel, virtualization,
	Christoph Hellwig
In-Reply-To: <87zk81x7dp.fsf@rustcorp.com.au>

Hello, guys.

On Mon, Jun 18, 2012 at 07:35:22PM +0930, Rusty Russell wrote:
> On Mon, 18 Jun 2012 16:03:23 +0800, Asias He <asias@redhat.com> wrote:
> > On 06/18/2012 03:46 PM, Rusty Russell wrote:
> > > On Mon, 18 Jun 2012 14:53:10 +0800, Asias He <asias@redhat.com> wrote:
> > >> This patch introduces bio-based IO path for virtio-blk.
> > >
> > > Why make it optional?
> > 
> > request-based IO path is useful for users who do not want to bypass the 
> > IO scheduler in guest kernel, e.g. users using spinning disk. For users 
> > using fast disk device, e.g. SSD device, they can use bio-based IO path.
> 
> Users using a spinning disk still get IO scheduling in the host though.
> What benefit is there in doing it in the guest as well?

Another thing is that some of cgroup features are impelmented in IO
scheduler (cfq).  e.g. bio based driver will be able to use cgroup
based throttling but IO weights won't work.  Not sure how meaningful
this is tho.

With that said, I think this sort of feature switching is quite ugly.
The pros and cons of each choice aren't obvious unless one is familiar
with implementation details.  IMHO, if the benefits of ioscheds aren't
critical, it would be better to just go with bio based implementation.

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [PATCH 1/3] block: Introduce __blk_segment_map_sg() helper
From: Tejun Heo @ 2012-06-18 21:31 UTC (permalink / raw)
  To: Asias He; +Cc: Jens Axboe, Shaohua Li, linux-kernel, kvm, virtualization
In-Reply-To: <1340002390-3950-2-git-send-email-asias@redhat.com>

Hello, Asias.

On Mon, Jun 18, 2012 at 02:53:08PM +0800, Asias He wrote:
> Split the mapping code in blk_rq_map_sg() to a helper
> __blk_segment_map_sg(), so that other mapping function, e.g.
> blk_bio_map_sg(), can share the code.
> 
> Cc: Jens Axboe <axboe@kernel.dk>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Shaohua Li <shli@kernel.org>
> Cc: linux-kernel@vger.kernel.org
> Suggested-by: Tejun Heo <tj@kernel.org>
> Suggested-by: Jens Axboe <axboe@kernel.dk>
> Signed-off-by: Asias He <asias@redhat.com>
> ---
>  block/blk-merge.c |   80 ++++++++++++++++++++++++++++++-----------------------
>  1 file changed, 45 insertions(+), 35 deletions(-)
> 
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index 160035f..576b68e 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -110,6 +110,49 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
>  	return 0;
>  }
>  
> +static void
> +__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
> +		     struct scatterlist *sglist, struct bio_vec **bvprv,
> +		     struct scatterlist **sg, int *nsegs, int *cluster)
> +{
> +
> +	int nbytes = bvec->bv_len;
> +
> +	if (*bvprv && *cluster) {
> +		if ((*sg)->length + nbytes > queue_max_segment_size(q))
> +			goto new_segment;
> +
> +		if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec))
> +			goto new_segment;
> +		if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec))
> +			goto new_segment;
> +
> +		(*sg)->length += nbytes;
> +	} else {
> +new_segment:
> +		if (!*sg)
> +			*sg = sglist;
> +		else {
> +			/*
> +			 * If the driver previously mapped a shorter
> +			 * list, we could see a termination bit
> +			 * prematurely unless it fully inits the sg
> +			 * table on each mapping. We KNOW that there
> +			 * must be more entries here or the driver
> +			 * would be buggy, so force clear the
> +			 * termination bit to avoid doing a full
> +			 * sg_init_table() in drivers for each command.
> +			 */
> +			(*sg)->page_link &= ~0x02;
> +			*sg = sg_next(*sg);
> +		}
> +
> +		sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
> +		(*nsegs)++;
> +	}
> +	*bvprv = bvec;
> +}

I *hope* this is a bit prettier.  e.g. Do we really need to pass in
@sglist and keep using "goto new_segment"?

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [PATCH 1/3] block: Introduce __blk_segment_map_sg() helper
From: Asias He @ 2012-06-19  2:02 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Jens Axboe, Shaohua Li, linux-kernel, kvm, virtualization
In-Reply-To: <20120618213103.GC32733@google.com>

On 06/19/2012 05:31 AM, Tejun Heo wrote:
> Hello, Asias.
>
> On Mon, Jun 18, 2012 at 02:53:08PM +0800, Asias He wrote:
>> Split the mapping code in blk_rq_map_sg() to a helper
>> __blk_segment_map_sg(), so that other mapping function, e.g.
>> blk_bio_map_sg(), can share the code.
>>
>> Cc: Jens Axboe <axboe@kernel.dk>
>> Cc: Tejun Heo <tj@kernel.org>
>> Cc: Shaohua Li <shli@kernel.org>
>> Cc: linux-kernel@vger.kernel.org
>> Suggested-by: Tejun Heo <tj@kernel.org>
>> Suggested-by: Jens Axboe <axboe@kernel.dk>
>> Signed-off-by: Asias He <asias@redhat.com>
>> ---
>>   block/blk-merge.c |   80 ++++++++++++++++++++++++++++++-----------------------
>>   1 file changed, 45 insertions(+), 35 deletions(-)
>>
>> diff --git a/block/blk-merge.c b/block/blk-merge.c
>> index 160035f..576b68e 100644
>> --- a/block/blk-merge.c
>> +++ b/block/blk-merge.c
>> @@ -110,6 +110,49 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
>>   	return 0;
>>   }
>>
>> +static void
>> +__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
>> +		     struct scatterlist *sglist, struct bio_vec **bvprv,
>> +		     struct scatterlist **sg, int *nsegs, int *cluster)
>> +{
>> +
>> +	int nbytes = bvec->bv_len;
>> +
>> +	if (*bvprv && *cluster) {
>> +		if ((*sg)->length + nbytes > queue_max_segment_size(q))
>> +			goto new_segment;
>> +
>> +		if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec))
>> +			goto new_segment;
>> +		if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec))
>> +			goto new_segment;
>> +
>> +		(*sg)->length += nbytes;
>> +	} else {
>> +new_segment:
>> +		if (!*sg)
>> +			*sg = sglist;
>> +		else {
>> +			/*
>> +			 * If the driver previously mapped a shorter
>> +			 * list, we could see a termination bit
>> +			 * prematurely unless it fully inits the sg
>> +			 * table on each mapping. We KNOW that there
>> +			 * must be more entries here or the driver
>> +			 * would be buggy, so force clear the
>> +			 * termination bit to avoid doing a full
>> +			 * sg_init_table() in drivers for each command.
>> +			 */
>> +			(*sg)->page_link &= ~0x02;
>> +			*sg = sg_next(*sg);
>> +		}
>> +
>> +		sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
>> +		(*nsegs)++;
>> +	}
>> +	*bvprv = bvec;
>> +}
>
> I *hope* this is a bit prettier.  e.g. Do we really need to pass in
> @sglist and keep using "goto new_segment"?

I think this deserves another patch on top of this splitting one. I'd 
like to clean this up later.

-- 
Asias

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox