All of lore.kernel.org
 help / color / mirror / Atom feed
From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
To: jaxboe@fusionio.com, vgoyal@redhat.com, jmarchan@redhat.com,
	torvalds@linux-foundation.org, linux-kernel@vger.kernel.org
Subject: [PATCH 1/2] Don't merge different partition's IOs
Date: Mon, 06 Dec 2010 18:44:47 +0900	[thread overview]
Message-ID: <4CFCB08F.4010509@jp.fujitsu.com> (raw)

From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>

PROBLEM:

/proc/diskstats would display a strange output as follows.

$ cat /proc/diskstats |grep sda
   8       0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089
   8       1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691
                                                ~~~~~~~~~~
   8       2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390
   8       3 sda3 54 487 2188 92 0 0 0 0 0 88 92
   8       4 sda4 4 0 8 0 0 0 0 0 0 0 0
   8       5 sda5 81 2027 2130 138 0 0 0 0 0 87 137

Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is
merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE.

The detailed root cause is as follows.

Assuming that there are two partition, sda1 and sda2.

1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight
   is 0 and sda2's one is 1.

        | hd_struct->in_flight
   ---------------------------
   sda1 |          0
   sda2 |          1
   ---------------------------

2. A bio belongs to sda1 is issued and is merged into the request mentioned on
   step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed
   from sda2 region to sda1 region. However the two partition's
   hd_struct->in_flight are not changed.

        | hd_struct->in_flight
   ---------------------------
   sda1 |          0
   sda2 |          1
   ---------------------------

3. The request is finished and blk_account_io_done() is called. In this case,
   sda2's hd_struct->in_flight, not a sda1's one, is decremented.

        | hd_struct->in_flight
   ---------------------------
   sda1 |         -1
   sda2 |          1
   ---------------------------

HOW TO FIX:

The patch fixes the problem by changing a merging rule.

The problem is caused by merging different partition's I/Os. So the patch
check whether a merging bio or request is a same partition as a request or not
by using a partition's start sector and size.

Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>

---
 block/blk-core.c       |    2 ++
 block/blk-merge.c      |   11 +++++++++++
 include/linux/blkdev.h |   14 ++++++++++++++
 3 files changed, 27 insertions(+)

Index: linux-2.6.37-rc3/block/blk-core.c
===================================================================
--- linux-2.6.37-rc3.orig/block/blk-core.c	2010-11-30 13:33:02.000000000 +0900
+++ linux-2.6.37-rc3/block/blk-core.c	2010-12-03 17:15:50.000000000 +0900
@@ -71,6 +71,8 @@ static void drive_stat_acct(struct reque
 	else {
 		part_round_stats(cpu, part);
 		part_inc_in_flight(part, rw);
+		rq->__part_start_sect = part->start_sect;
+		rq->__part_size = part->nr_sects;
 	}

 	part_stat_unlock();
Index: linux-2.6.37-rc3/block/blk-merge.c
===================================================================
--- linux-2.6.37-rc3.orig/block/blk-merge.c	2010-11-30 13:33:02.000000000 +0900
+++ linux-2.6.37-rc3/block/blk-merge.c	2010-12-03 17:15:50.000000000 +0900
@@ -235,6 +235,9 @@ int ll_back_merge_fn(struct request_queu
 	else
 		max_sectors = queue_max_sectors(q);

+	if (blk_rq_part_sector(req) + blk_rq_part_size(req) <= bio->bi_sector)
+		return 0;
+
 	if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
 		req->cmd_flags |= REQ_NOMERGE;
 		if (req == q->last_merge)
@@ -259,6 +262,8 @@ int ll_front_merge_fn(struct request_que
 	else
 		max_sectors = queue_max_sectors(q);

+	if (bio->bi_sector < blk_rq_part_sector(req))
+		return 0;

 	if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
 		req->cmd_flags |= REQ_NOMERGE;
@@ -382,6 +387,12 @@ static int attempt_merge(struct request_
 		return 0;

 	/*
+	 * Don't merge different partition's request
+	 */
+	if (blk_rq_part_sector(req) != blk_rq_part_sector(next))
+		return 0;
+
+	/*
 	 * not contiguous
 	 */
 	if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next))
Index: linux-2.6.37-rc3/include/linux/blkdev.h
===================================================================
--- linux-2.6.37-rc3.orig/include/linux/blkdev.h	2010-11-30 13:33:02.000000000 +0900
+++ linux-2.6.37-rc3/include/linux/blkdev.h	2010-11-30 16:46:19.000000000 +0900
@@ -91,6 +91,8 @@ struct request {
 	/* the following two fields are internal, NEVER access directly */
 	unsigned int __data_len;	/* total data len */
 	sector_t __sector;		/* sector cursor */
+	sector_t __part_start_sect;	/* partition's start sector*/
+	sector_t __part_size;		/* partition's size*/

 	struct bio *bio;
 	struct bio *biotail;
@@ -724,6 +726,8 @@ static inline struct request_queue *bdev
  * blk_rq_err_bytes()		: bytes left till the next error boundary
  * blk_rq_sectors()		: sectors left in the entire request
  * blk_rq_cur_sectors()		: sectors left in the current segment
+ * blk_rq_part_sector()		: partition's start sector
+ * blk_rq_part_size()		: partition's size
  */
 static inline sector_t blk_rq_pos(const struct request *rq)
 {
@@ -752,6 +756,16 @@ static inline unsigned int blk_rq_cur_se
 	return blk_rq_cur_bytes(rq) >> 9;
 }

+static inline sector_t blk_rq_part_sector(const struct request *rq)
+{
+	return rq->__part_start_sect;
+}
+
+static inline sector_t blk_rq_part_size(const struct request *rq)
+{
+	return rq->__part_size;
+}
+
 /*
  * Request issue related functions.
  */


             reply	other threads:[~2010-12-06  9:45 UTC|newest]

Thread overview: 47+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-12-06  9:44 Yasuaki Ishimatsu [this message]
2010-12-06 16:08 ` [PATCH 1/2] Don't merge different partition's IOs Linus Torvalds
2010-12-07  7:18   ` Satoru Takeuchi
2010-12-07 18:39     ` Vivek Goyal
2010-12-08  7:33     ` Jens Axboe
2010-12-08  7:59       ` Satoru Takeuchi
2010-12-08  8:06         ` Jens Axboe
2010-12-08  8:11           ` Satoru Takeuchi
2010-12-08 14:46             ` Jens Axboe
2010-12-08 15:51               ` Vivek Goyal
2010-12-08 15:58                 ` Vivek Goyal
2010-12-10 11:22                   ` Jerome Marchand
2010-12-10 16:12               ` Jerome Marchand
2010-12-10 16:55                 ` Vivek Goyal
2010-12-14 20:25                   ` Jens Axboe
2010-12-17 13:42                     ` [PATCH] block: fix accounting bug on cross partition merges Jerome Marchand
2010-12-17 19:06                       ` Jens Axboe
2010-12-17 22:32                         ` Vivek Goyal
2010-12-23 15:10                         ` Jerome Marchand
2010-12-23 15:39                           ` Vivek Goyal
2010-12-23 17:04                             ` Jerome Marchand
2010-12-24 19:29                               ` Vivek Goyal
2011-01-04 15:52                                 ` [PATCH 1/2] kref: add kref_test_and_get Jerome Marchand
2011-01-04 15:55                                   ` [PATCH 2/2] block: fix accounting bug on cross partition merges Jerome Marchand
2011-01-04 21:00                                     ` Greg KH
2011-01-05 13:51                                       ` Jerome Marchand
2011-01-05 16:00                                         ` Greg KH
2011-01-05 16:19                                           ` Jerome Marchand
2011-01-05 16:27                                             ` Greg KH
2011-01-05 13:55                                       ` Jens Axboe
2011-01-05 15:58                                         ` Greg KH
2011-01-05 18:46                                           ` Jens Axboe
2011-01-05 20:08                                             ` Greg KH
2011-01-05 21:38                                               ` Jens Axboe
2011-01-05 22:16                                                 ` Greg KH
2011-01-06  9:46                                                   ` Jens Axboe
2011-01-05 14:00                                     ` Jens Axboe
2011-01-05 14:09                                       ` Jerome Marchand
2011-01-05 14:17                                         ` Jens Axboe
2011-01-04 16:05                                   ` [PATCH 1/2] kref: add kref_test_and_get Eric Dumazet
2011-01-05 15:02                                     ` [PATCH 1/2 v2] " Jerome Marchand
2011-01-05 15:43                                       ` Alexey Dobriyan
2011-01-05 15:57                                         ` Greg KH
2011-01-05 15:56                                       ` Greg KH
2011-01-04 20:57                                   ` [PATCH 1/2] " Greg KH
2011-01-05 13:35                                     ` Jerome Marchand
2011-01-05 15:55                                       ` Greg KH

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4CFCB08F.4010509@jp.fujitsu.com \
    --to=isimatu.yasuaki@jp.fujitsu.com \
    --cc=jaxboe@fusionio.com \
    --cc=jmarchan@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=torvalds@linux-foundation.org \
    --cc=vgoyal@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.