All of lore.kernel.org
 help / color / mirror / Atom feed
From: Minchan Kim <minchan@kernel.org>
To: Rusty Russell <rusty@rustcorp.com.au>
Cc: Chris Wright <chrisw@sous-sol.org>, Jens Axboe <axboe@kernel.dk>,
	Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>,
	kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
	Christoph Hellwig <hch@infradead.org>,
	Minchan Kim <minchan@kernel.org>,
	Minchan Kim <minchan@redhat.com>
Subject: [PATCH 5/6] virtio-blk: Support batch I/O for enhancing sequential IO
Date: Wed, 21 Dec 2011 10:00:53 +0900	[thread overview]
Message-ID: <1324429254-28383-6-git-send-email-minchan@kernel.org> (raw)
In-Reply-To: <1324429254-28383-1-git-send-email-minchan@kernel.org>

The BIO-based path has a disadvantage: it handles sequential streams
poorly because it cannot merge BIOs, whereas the request-based path can.

This patch adds a per-cpu BIO queue for batch I/O.
If a request is contiguous with the previous one, it is merged with
the previous one on the batch queue.
If a non-contiguous I/O is issued or 1ms passes, the batch queue is drained.

Signed-off-by: Minchan Kim <minchan@redhat.com>
---
 drivers/block/virtio_blk.c |  366 +++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 331 insertions(+), 35 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 4e476d6..e32c69e 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -19,6 +19,28 @@ static DEFINE_IDA(vd_index_ida);
 
 struct workqueue_struct *virtblk_wq;
 
+#define BIO_QUEUE_MAX	32
+
+struct per_cpu_bio
+{
+	struct bio *bios[BIO_QUEUE_MAX];
+	int idx;			/* current index */
+	struct virtio_blk *vblk;
+	struct request_queue *q;
+	struct delayed_work dwork;
+	unsigned int segments; 		/* the number of accumulated segement */
+	bool seq_mode;			/* sequential mode */
+	sector_t next_offset;		/*
+					 * next expected sector offset
+					 * for becoming sequential mode
+					 */
+};
+
+struct bio_queue
+{
+	struct per_cpu_bio __percpu *pcbio;
+};
+
 struct virtio_blk
 {
 	spinlock_t lock;
@@ -38,6 +60,9 @@ struct virtio_blk
 	/* What host tells us, plus 2 for header & tailer. */
 	unsigned int sg_elems;
 
+	/* bio queue for batch IO */
+	struct bio_queue bq;
+
 	/* Ida index - used to track minor number allocations. */
 	int index;
 };
@@ -57,6 +82,8 @@ struct virtblk_req
 	struct scatterlist sg[];
 };
 
+static void wait_virtq_flush(struct virtio_blk *vblk);
+
 static struct virtblk_req *alloc_virtblk_req(struct virtio_blk *vblk,
 		gfp_t gfp_mask)
 {
@@ -93,7 +120,6 @@ static void virtblk_request_done(struct virtio_blk *vblk,
 		req->errors = vbr->in_hdr.errors;
 	}
 	else if (req->cmd_type == REQ_TYPE_SPECIAL) {
-		printk("REQ_TYPE_SPECIAL done\n");
 		req->errors = (error != 0);
 	}
 
@@ -104,7 +130,15 @@ static void virtblk_request_done(struct virtio_blk *vblk,
 static void virtblk_bio_done(struct virtio_blk *vblk,
 		struct virtblk_req *vbr)
 {
-	bio_endio(vbr->private, virtblk_result(vbr));
+	struct bio *bio;
+	bio = vbr->private;
+
+	while(bio) {
+		struct bio *free_bio = bio;
+		bio = bio->bi_next;
+		bio_endio(free_bio, virtblk_result(vbr));
+	}
+
 	mempool_free(vbr, vblk->pool);
 }
 
@@ -298,52 +332,220 @@ static bool virtblk_plugged(struct virtio_blk *vblk)
 	return true;
 }
 
-static void virtblk_add_buf_wait(struct virtio_blk *vblk,
-	struct virtblk_req *vbr, unsigned long out, unsigned long in)
+bool seq_bio(struct bio *bio, struct per_cpu_bio __percpu *pcbio)
 {
-	DEFINE_WAIT(wait);
-	bool retry, notify;
+	struct bio *last_bio;
+	int index = pcbio->idx - 1;
 
-	for (;;) {
-		prepare_to_wait(&vblk->queue_wait, &wait,
-				TASK_UNINTERRUPTIBLE);
+	BUG_ON(index < 0 || index > BIO_QUEUE_MAX);
+	last_bio = pcbio->bios[index];
+
+	if (last_bio->bi_rw != bio->bi_rw)
+		return false;
+
+	if ((last_bio->bi_sector + (last_bio->bi_size >> 9)) ==
+				bio->bi_sector)
+		return true;
+
+	return false;
+}
+
+int add_pcbio_to_vq(struct per_cpu_bio __percpu *pcbio,
+		struct virtio_blk *vblk, struct request_queue *q,
+		int *notify)
+{
+	int i;
+	unsigned long num = 0, out = 0, in = 0;
+	bool retry;
+	struct virtblk_req *vbr;
+	struct bio *bio;
+
+	vbr = alloc_virtblk_req(vblk, GFP_ATOMIC);
+	if (!vbr)
+		return 1;
+
+	vbr->private = NULL;
+	vbr->next = NULL;
+	vbr->kind = VIRTIO_BLK_BIO;
+
+	bio = pcbio->bios[0];
+	BUG_ON(!bio);
+
+	vbr->out_hdr.type = 0;
+	vbr->out_hdr.sector = bio->bi_sector;
+	vbr->out_hdr.ioprio = bio_prio(bio);
+
+	sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
 
-		spin_lock_irq(&vblk->lock);
-		if (virtqueue_add_buf(vblk->vq, vbr->sg,
-			out, in, vbr) < 0) {
-			retry = true;
+	for ( i = 0; i < pcbio->idx; i++) {
+		struct bio *prev;
+		bio = pcbio->bios[i];
+
+		BUG_ON(!bio);
+		num += bio_map_sg(q, bio, vbr->sg + out + num);
+		BUG_ON(num > (vblk->sg_elems - 2));
+
+		prev = vbr->private;
+		if (prev)
+			bio->bi_next = prev;
+		vbr->private = bio;
+	}
+
+	sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
+		   sizeof(vbr->status));
+
+	if (num) {
+		if (bio->bi_rw & REQ_WRITE) {
+			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
+			out += num;
 		} else {
-			retry = false;
+			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
+			in += num;
 		}
-		notify = virtqueue_kick_prepare(vblk->vq);
-		spin_unlock_irq(&vblk->lock);
+	}
+
+	spin_lock_irq(&vblk->lock);
+	if (virtqueue_add_buf(vblk->vq, vbr->sg,
+		out, in, vbr) < 0) {
+		struct bio *bio, *next_bio;
 
-		if (notify)
-			virtqueue_notify(vblk->vq);
+		retry = true;
 
-		if (!retry)
-			break;
-		schedule();
+		bio = vbr->private;
+		while(bio) {
+			next_bio = bio->bi_next;
+			bio->bi_next = NULL;
+			bio = next_bio;
+		}
+
+		mempool_free(vbr, vblk->pool);
+
+	} else {
+
+		for ( i = 0; i < pcbio->idx; i++) {
+			pcbio->bios[i] = NULL;
+		}
+
+		pcbio->idx = 0;
+		pcbio->segments = 0;
+
+		retry = false;
 	}
-	finish_wait(&vblk->queue_wait, &wait);
+
+	*notify |= virtqueue_kick_prepare(vblk->vq);
+	spin_unlock_irq(&vblk->lock);
+
+	return retry;
 }
 
-static void virtblk_make_request(struct request_queue *q, struct bio *bio)
+/*
+ * Return 0 if the flush succeeded (or there was nothing to flush).
+ * A nonzero return means the batch could not be queued, so the
+ * caller should retry.
+ */
+int try_flush_pcb(struct per_cpu_bio __percpu *pcbio)
 {
-	struct virtio_blk *vblk = q->queuedata;
-	unsigned long num, out = 0, in = 0;
-	struct virtblk_req *vbr;
-	bool retry, notify;
+	int notify = 0;
 
-	BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems);
-	BUG_ON(bio->bi_rw & (REQ_FLUSH | REQ_FUA));
+	if (!pcbio->idx)
+		return 0;
 
-	vbr = alloc_virtblk_req(vblk, GFP_NOIO);
-	if (!vbr) {
-		bio_endio(bio, -ENOMEM);
-		return;
+	if (add_pcbio_to_vq(pcbio, pcbio->vblk, pcbio->q, &notify)) {
+		virtqueue_notify(pcbio->vblk->vq);
+		return 1;
 	}
 
+	if (notify && !virtblk_plugged(pcbio->vblk))
+		virtqueue_notify(pcbio->vblk->vq);
+
+	return 0;
+}
+
+static void virtblk_delay_q_flush(struct work_struct *work)
+{
+	struct per_cpu_bio __percpu *pcbio =
+		container_of(work, struct per_cpu_bio, dwork.work);
+
+	while(try_flush_pcb(pcbio))
+		wait_virtq_flush(pcbio->vblk);
+}
+
+void wait_virtq_flush(struct virtio_blk *vblk)
+{
+	DEFINE_WAIT(wait);
+
+	prepare_to_wait(&vblk->queue_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+	schedule();
+	finish_wait(&vblk->queue_wait, &wait);
+}
+
+void add_bio_to_pcbio(struct bio *bio, struct per_cpu_bio __percpu *pcbio)
+{
+	BUG_ON(pcbio->idx >= BIO_QUEUE_MAX);
+
+	pcbio->bios[pcbio->idx++] = bio;
+	pcbio->segments += bio->bi_phys_segments;
+	/*
+	 * If this bio is first bio on queue, start timer to flush
+	 * bio within 1ms.
+	 */
+	if (pcbio->idx == 1)
+		queue_delayed_work_on(smp_processor_id(),
+			virtblk_wq, &pcbio->dwork,
+			msecs_to_jiffies(1));
+}
+
+static void virtblk_add_buf_wait(struct virtio_blk *vblk,
+        struct virtblk_req *vbr, unsigned long out, unsigned long in)
+{
+        DEFINE_WAIT(wait);
+        bool retry, notify;
+
+        for (;;) {
+                prepare_to_wait(&vblk->queue_wait, &wait,
+                                TASK_UNINTERRUPTIBLE);
+
+                spin_lock_irq(&vblk->lock);
+                if (virtqueue_add_buf(vblk->vq, vbr->sg,
+                        out, in, vbr) < 0) {
+                        retry = true;
+                } else {
+                        retry = false;
+                }
+                notify = virtqueue_kick_prepare(vblk->vq);
+                spin_unlock_irq(&vblk->lock);
+
+                if (notify)
+                        virtqueue_notify(vblk->vq);
+
+                if (!retry)
+                        break;
+                schedule();
+        }
+        finish_wait(&vblk->queue_wait, &wait);
+}
+
+bool full_segment(struct per_cpu_bio __percpu *pcbio, struct bio *bio,
+		unsigned int max)
+{
+	bool full;
+	full = (pcbio->segments + bio->bi_phys_segments) > max;
+
+	return full;
+}
+
+int add_bio_to_vq(struct bio *bio, struct virtio_blk *vblk,
+		struct request_queue *q)
+{
+	int notify;
+	bool retry;
+	unsigned long num, out = 0, in = 0;
+	struct virtblk_req *vbr = alloc_virtblk_req(vblk, GFP_KERNEL);
+
+	if (!vbr)
+		return 1;
+
 	vbr->private = bio;
 	vbr->next = NULL;
 	vbr->kind = VIRTIO_BLK_BIO;
@@ -357,7 +559,7 @@ static void virtblk_make_request(struct request_queue *q, struct bio *bio)
 	num = bio_map_sg(q, bio, vbr->sg + out);
 
 	sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
-		   sizeof(vbr->status));
+			sizeof(vbr->status));
 
 	if (num) {
 		if (bio->bi_rw & REQ_WRITE) {
@@ -371,7 +573,7 @@ static void virtblk_make_request(struct request_queue *q, struct bio *bio)
 
 	spin_lock_irq(&vblk->lock);
 	if (virtqueue_add_buf(vblk->vq, vbr->sg,
-		out, in, vbr) < 0) {
+				out, in, vbr) < 0) {
 		retry = true;
 	} else {
 		retry = false;
@@ -385,6 +587,75 @@ static void virtblk_make_request(struct request_queue *q, struct bio *bio)
 
 	if (retry)
 		virtblk_add_buf_wait(vblk, vbr, out, in);
+	return 0;
+}
+
+bool seq_mode(struct per_cpu_bio __percpu *pcbio, struct bio *bio)
+{
+	if (pcbio->seq_mode == false)
+		return false;
+
+	if (pcbio->idx == 0)
+		return true;
+
+	return seq_bio(bio, pcbio);
+}
+
+void reset_seq_mode(struct per_cpu_bio __percpu *pcbio, struct bio *bio)
+{
+	if (bio->bi_sector == pcbio->next_offset)
+		pcbio->seq_mode = true;
+	else
+		pcbio->seq_mode = false;
+
+	pcbio->next_offset = bio->bi_sector + (bio->bi_size >> 9);
+}
+
+
+static void virtblk_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct virtio_blk *vblk = q->queuedata;
+	struct per_cpu_bio __percpu *pcbio;
+
+	BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems);
+	BUG_ON(bio->bi_rw & (REQ_FLUSH | REQ_FUA));
+retry:
+	preempt_disable();
+	pcbio = this_cpu_ptr(vblk->bq.pcbio);
+
+	if (seq_mode(pcbio, bio)) {
+		if (pcbio->idx >= BIO_QUEUE_MAX ||
+			full_segment(pcbio, bio, vblk->sg_elems -2)) {
+			if (try_flush_pcb(pcbio)) {
+				preempt_enable();
+				wait_virtq_flush(pcbio->vblk);
+				goto retry;
+			}
+
+			cancel_delayed_work(&pcbio->dwork);
+		}
+
+		add_bio_to_pcbio(bio, pcbio);
+	}
+	else {
+		while(try_flush_pcb(pcbio)) {
+			preempt_enable();
+			wait_virtq_flush(pcbio->vblk);
+			preempt_disable();
+			pcbio = this_cpu_ptr(vblk->bq.pcbio);
+		}
+
+		cancel_delayed_work(&pcbio->dwork);
+		reset_seq_mode(pcbio, bio);
+		preempt_enable();
+
+		while (add_bio_to_vq(bio, vblk, q))
+			wait_virtq_flush(pcbio->vblk);
+
+		preempt_disable();
+	}
+
+	preempt_enable();
 }
 
 /* return id (s/n) string for *disk to *id_str
@@ -532,6 +803,26 @@ static void virtblk_config_changed(struct virtio_device *vdev)
 	queue_work(virtblk_wq, &vblk->config_work);
 }
 
+void setup_per_cpu_bio(struct virtio_blk *vblk, struct request_queue *q)
+{
+	int cpu;
+
+	struct bio_queue *bq = &vblk->bq;
+	bq->pcbio = alloc_percpu(struct per_cpu_bio);
+	for_each_possible_cpu(cpu) {
+		struct per_cpu_bio __percpu *pcbio =
+					per_cpu_ptr(bq->pcbio, cpu);
+		pcbio->q = q;
+		pcbio->vblk = vblk;
+		pcbio->idx = 0;
+		pcbio->segments = 0;
+		pcbio->seq_mode = false;
+		pcbio->next_offset = 0;
+		memset(pcbio->bios, 0, BIO_QUEUE_MAX);
+		INIT_DELAYED_WORK(&pcbio->dwork, virtblk_delay_q_flush);
+	}
+}
+
 static int __devinit virtblk_probe(struct virtio_device *vdev)
 {
 	struct virtio_blk *vblk;
@@ -571,6 +862,8 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 	vblk->sg_elems = sg_elems;
 	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
 
+	memset(&vblk->bq, 0, sizeof(struct bio_queue));
+
 	/* We expect one virtqueue, for output. */
 	vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests");
 	if (IS_ERR(vblk->vq)) {
@@ -602,6 +895,8 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 	blk_queue_make_request(q, virtblk_make_request);
 	q->queuedata = vblk;
 
+	setup_per_cpu_bio(vblk, q);
+
 	if (index < 26) {
 		sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26);
 	} else if (index < (26 + 1) * 26) {
@@ -736,6 +1031,7 @@ static void __devexit virtblk_remove(struct virtio_device *vdev)
 	put_disk(vblk->disk);
 	mempool_destroy(vblk->pool);
 	vdev->config->del_vqs(vdev);
+	free_percpu(vblk->bq.pcbio);
 	kfree(vblk);
 	ida_simple_remove(&vd_index_ida, index);
 }
-- 
1.7.6.4

  parent reply	other threads:[~2011-12-21  1:00 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2011-12-21  1:00 [PATCH 0/6][RFC] virtio-blk: Change I/O path from request to BIO Minchan Kim
2011-12-21  1:00 ` [PATCH 1/6] block: add bio_map_sg Minchan Kim
2011-12-21  1:00 ` [PATCH 2/6] virtio: support unlocked queue kick Minchan Kim
2011-12-21  1:00 ` [PATCH 3/6] virtio-blk: remove the unused list of pending requests Minchan Kim
2011-12-21  1:00 ` [PATCH 4/6] virtio-blk: implement ->make_request Minchan Kim
2011-12-22 12:20   ` Stefan Hajnoczi
2011-12-22 20:28     ` Christoph Hellwig
2011-12-21  1:00 ` Minchan Kim [this message]
2011-12-21  1:00 ` [PATCH 6/6] virtio-blk: Emulate Flush/FUA Minchan Kim
2011-12-21  5:08 ` [PATCH 0/6][RFC] virtio-blk: Change I/O path from request to BIO Rusty Russell
2011-12-21  5:56   ` Minchan Kim
2011-12-21  8:28 ` Sasha Levin
2011-12-21  8:17   ` Minchan Kim
2011-12-21 19:11 ` Vivek Goyal
2011-12-22  1:05   ` Minchan Kim
2011-12-22 15:45     ` Vivek Goyal
2011-12-22 23:26       ` Minchan Kim
2011-12-22 12:57 ` Stefan Hajnoczi
2011-12-22 23:41   ` Minchan Kim
2012-01-01 16:45     ` Stefan Hajnoczi
2012-01-02  7:48       ` Dor Laor
2012-01-02 16:12       ` Paolo Bonzini
2012-01-02 16:15         ` Christoph Hellwig
2012-01-02 16:18           ` Paolo Bonzini
2012-01-02 16:23             ` Christoph Hellwig
2012-01-02 16:18       ` Christoph Hellwig
2012-01-02 16:21         ` Avi Kivity

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1324429254-28383-6-git-send-email-minchan@kernel.org \
    --to=minchan@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=chrisw@sous-sol.org \
    --cc=hch@infradead.org \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=minchan@redhat.com \
    --cc=rusty@rustcorp.com.au \
    --cc=stefanha@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.