All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mark Seger <Mark.Seger@hp.com>
To: axboe@suse.de, linux-kernel@vger.kernel.org
Subject: Patch for inconsistent recording of block device statistics
Date: Tue, 22 Mar 2005 16:50:11 -0500	[thread overview]
Message-ID: <42409313.1010308@hp.com> (raw)

The read/write statistics for both sectors and merges are calculated at 
the time requests first enter the request queue, but the remainder of the 
statistics, such as the number of reads/writes, are calculated at the time 
the I/O completes.  As a result, one cannot accurately determine the 
data rates read or written at the actual time the I/O is performed.  
This behavior is masked with smaller queue sizes but is very real and 
was very noticeable with earlier 2.6 kernels using the cfq scheduler, 
which had a default queue size of 8192, where the time difference between 
these sets of counters could exceed 10 seconds for large file writes and 
small monitoring intervals such as 1 second.  In that environment, one 
would see extremely high bursts of I/O, sometimes exceeding 500 or even 
1000 MB/sec for the first second or two, and then a drop to 0 for a long 
time, while the 'number of operations' counters accurately reflect what 
is really happening.

The attached patch fixes this problem by simply accumulating the 
read/write sector/merge data in temporary variables stored in the 
request queue entry and, when the I/O completes, copying those values to 
the disk statistics block.

-mark

diff -uprN -X dontdiff ../linux-2.6.11.4/drivers/block/ll_rw_blk.c 
../linux-2.6.11.4-mjs/drivers/block/ll_rw_blk.c
--- ../linux-2.6.11.4/drivers/block/ll_rw_blk.c    2005-03-15 
19:09:00.000000000 -0500
+++ ../linux-2.6.11.4-mjs/drivers/block/ll_rw_blk.c    2005-03-22 
15:43:07.000000000 -0500
@@ -2107,13 +2107,13 @@ void drive_stat_acct(struct request *rq,
         return;
 
     if (rw == READ) {
-        __disk_stat_add(rq->rq_disk, read_sectors, nr_sectors);
+                rq->read_sectors_accum += nr_sectors;
         if (!new_io)
-            __disk_stat_inc(rq->rq_disk, read_merges);
+                        rq->read_merges_accum += 1;
     } else if (rw == WRITE) {
-        __disk_stat_add(rq->rq_disk, write_sectors, nr_sectors);
+                rq->write_sectors_accum += nr_sectors;
         if (!new_io)
-            __disk_stat_inc(rq->rq_disk, write_merges);
+                        rq->write_merges_accum += 1;
     }
     if (new_io) {
         disk_round_stats(rq->rq_disk);
@@ -2487,6 +2487,11 @@ get_rq:
     req->rq_disk = bio->bi_bdev->bd_disk;
     req->start_time = jiffies;
 
+        req->write_sectors_accum=0;
+    req->write_merges_accum=0;
+        req->read_sectors_accum=0;
+    req->read_merges_accum=0;
+
     add_request(q, req);
 out:
     if (freereq)
@@ -2989,10 +2994,14 @@ void end_that_request_last(struct reques
             case WRITE:
             __disk_stat_inc(disk, writes);
             __disk_stat_add(disk, write_ticks, duration);
+                        __disk_stat_add(disk, write_sectors, 
req->write_sectors_accum);
+                        __disk_stat_add(disk, write_merges,  
req->write_merges_accum);
             break;
             case READ:
             __disk_stat_inc(disk, reads);
             __disk_stat_add(disk, read_ticks, duration);
+                        __disk_stat_add(disk, read_sectors, 
req->read_sectors_accum);
+                        __disk_stat_add(disk, read_merges,  
req->read_merges_accum);
             break;
         }
         disk_round_stats(disk);
diff -uprN -X dontdiff ../linux-2.6.11.4/include/linux/blkdev.h 
../linux-2.6.11.4-mjs/include/linux/blkdev.h
--- ../linux-2.6.11.4/include/linux/blkdev.h    2005-03-15 
19:09:02.000000000 -0500
+++ ../linux-2.6.11.4-mjs/include/linux/blkdev.h    2005-03-22 
15:42:47.000000000 -0500
@@ -176,6 +176,12 @@ struct request {
      * For Power Management requests
      */
     struct request_pm_state *pm;
+
+    /*
+     * accumulate intermediate stats
+     */
+        unsigned long read_sectors_accum, write_sectors_accum;
+        unsigned long read_merges_accum, write_merges_accum;
 };
 
 /*


             reply	other threads:[~2005-03-22 21:51 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-03-22 21:50 Mark Seger [this message]
2005-03-23  9:19 ` Patch for inconsistent recording of block device statistics Jens Axboe
2005-03-23 14:40   ` Mark Seger
2005-03-23 15:51     ` Jens Axboe
2005-03-23 18:23       ` Mark Seger
2005-03-23 18:33         ` Jens Axboe
2005-03-24  2:27           ` Mark Goodwin
2005-03-24  6:50             ` Jens Axboe
2005-03-23 15:49 ` Process level I/O stats? Mark Seger
2005-03-23 15:54   ` Jens Axboe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=42409313.1010308@hp.com \
    --to=mark.seger@hp.com \
    --cc=axboe@suse.de \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.