From: Edward Falk <efalk@google.com>
To: linux-kernel@vger.kernel.org, jens.axboe@oracle.com
Cc: Andrew Morton <akpm@osdl.org>
Subject: [PATCH] Introduce block I/O performance histograms
Date: Wed, 15 Nov 2006 19:15:52 -0800 [thread overview]
Message-ID: <455BD7E8.9020303@google.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 772 bytes --]
This patch introduces performance histogram record keeping for block
I/O, used for performance tuning. It is turned off by default.
When turned on, you simply do something like:
# cat /sys/block/sda/read_request_histo
rows = bytes columns = ms
10 20 50 100 200 500 1000 2000
2048 5 0 0 0 0 0 0 0
4096 0 0 0 0 0 0 0 0
8192 17231 135 41 10 0 0 0 0
16384 4400 24 6 2 0 0 0 0
32768 2897 34 4 4 0 0 0 0
65536 7089 87 5 1 2 0 0 0
#
Signed-off-by: Edward A. Falk <efalk@google.com>
[-- Attachment #2: block_histogram.patch --]
[-- Type: text/plain, Size: 14619 bytes --]
diff -uprN linux-2.6.18.1/block/Kconfig block_histogram/block/Kconfig
--- linux-2.6.18.1/block/Kconfig 2006-10-13 20:34:03.000000000 -0700
+++ block_histogram/block/Kconfig 2006-11-15 18:08:07.000000000 -0800
@@ -24,6 +24,31 @@ config BLK_DEV_IO_TRACE
git://brick.kernel.dk/data/git/blktrace.git
+config BLOCK_HISTOGRAM
+	bool "Performance histogram data"
+	help
+	  This option causes block devices to collect statistics on transfer
+	  sizes and times. Useful for performance-tuning a system. Creates
+	  entries in /sys/block/. Increases kernel size about 21k.
+
+	  If you are unsure, say N here.
+
+config HISTO_SIZE_BUCKETS
+	int "Number of size buckets in histogram"
+	depends on BLOCK_HISTOGRAM
+	range 2 16
+	default "6"
+	help
+	  This option controls how many buckets are used to collect
+	  transfer size statistics.
+
+	  At least two buckets are required: the size-to-bucket mapping
+	  shifts by (buckets - 2), which is undefined for smaller values.
+	  The upper limit keeps the per-disk tables and the sysfs output
+	  small.
+
+config HISTO_TIME_BUCKETS
+	int "Number of time buckets in histogram"
+	depends on BLOCK_HISTOGRAM
+	range 1 16
+	default "8"
+	help
+	  This option controls how many buckets are used to collect
+	  transfer time statistics.
+
+	  Bucket limits follow a 10, 20, 50, 100, ... millisecond
+	  progression; the upper limit keeps those values and the sysfs
+	  output small.
+
config LSF
bool "Support for Large Single Files"
depends on X86 || (MIPS && 32BIT) || PPC32 || ARCH_S390_31 || SUPERH || UML
diff -uprN linux-2.6.18.1/block/genhd.c block_histogram/block/genhd.c
--- linux-2.6.18.1/block/genhd.c 2006-10-13 20:34:03.000000000 -0700
+++ block_histogram/block/genhd.c 2006-11-15 18:08:07.000000000 -0800
@@ -16,6 +16,13 @@
#include <linux/buffer_head.h>
#include <linux/mutex.h>
+#ifdef CONFIG_BLOCK_HISTOGRAM
+/*
+ * Forward declarations of the histogram sysfs show() handlers.  They are
+ * defined at the end of this file but are referenced earlier, by the
+ * disk_attribute initialisers.
+ */
+static ssize_t disk_read_request_histo_read(struct gendisk *, char *page);
+static ssize_t disk_read_dma_histo_read(struct gendisk *, char *page);
+static ssize_t disk_write_request_histo_read(struct gendisk *, char *page);
+static ssize_t disk_write_dma_histo_read(struct gendisk *, char *page);
+#endif
+
struct subsystem block_subsys;
static DEFINE_MUTEX(block_subsys_lock);
@@ -412,6 +419,25 @@ static struct disk_attribute disk_attr_s
.show = disk_stats_read
};
+#ifdef CONFIG_BLOCK_HISTOGRAM
+/*
+ * sysfs attributes for the four histogram channels (read/write crossed
+ * with request/dma).  Each file is read-only (S_IRUGO) and is served by
+ * the show() handler of the same name.
+ */
+static struct disk_attribute disk_attr_read_request_histo = {
+ .attr = {.name = "read_request_histo", .mode = S_IRUGO },
+ .show = disk_read_request_histo_read,
+};
+static struct disk_attribute disk_attr_read_dma_histo = {
+ .attr = {.name = "read_dma_histo", .mode = S_IRUGO },
+ .show = disk_read_dma_histo_read,
+};
+static struct disk_attribute disk_attr_write_request_histo = {
+ .attr = {.name = "write_request_histo", .mode = S_IRUGO },
+ .show = disk_write_request_histo_read,
+};
+static struct disk_attribute disk_attr_write_dma_histo = {
+ .attr = {.name = "write_dma_histo", .mode = S_IRUGO },
+ .show = disk_write_dma_histo_read,
+};
+#endif
+
static struct attribute * default_attrs[] = {
&disk_attr_uevent.attr,
&disk_attr_dev.attr,
@@ -419,6 +445,12 @@ static struct attribute * default_attrs[
&disk_attr_removable.attr,
&disk_attr_size.attr,
&disk_attr_stat.attr,
+#ifdef CONFIG_BLOCK_HISTOGRAM
+ &disk_attr_read_request_histo.attr,
+ &disk_attr_read_dma_histo.attr,
+ &disk_attr_write_request_histo.attr,
+ &disk_attr_write_dma_histo.attr,
+#endif
NULL,
};
@@ -708,3 +740,288 @@ int invalidate_partition(struct gendisk
}
EXPORT_SYMBOL(invalidate_partition);
+
+
+
+#ifdef CONFIG_BLOCK_HISTOGRAM
+
+/*
+ * Explanation of histogram code:
+ *
+ * The block_histogram code implements a 2-variable histogram, with transfers
+ * tracked by transfer size and completion time.  The sysfs files are
+ * /sys/block/DEV/read_request_histo, /sys/block/DEV/write_request_histo,
+ * /sys/block/DEV/read_dma_histo, and /sys/block/DEV/write_dma_histo.
+ *
+ * The *request_histo files measure time from when the request is first
+ * submitted into the drive's queue.  The *dma_histo files measure time
+ * from when the request is transferred from the queue to the device.
+ *
+ * block_histogram_start_time() is used to note the time that the
+ * request was transferred to the device.  block_histogram_completion()
+ * is used to note the time the transfer was completed.
+ */
+
+
+#define HISTO_REQUEST 0
+#define HISTO_DMA 1
+
+/*
+ * Zero one histogram table inside a single disk_stats instance.
+ *
+ * @stats: statistics block that holds the four histogram tables
+ * @cmd:   READ selects a read table, any other value a write table
+ * @type:  HISTO_REQUEST selects a request-time table, any other value
+ *         a dma-time table
+ */
+static inline void block_histogram_init2(struct disk_stats *stats,
+					  int cmd, int type)
+{
+	const int is_read = (cmd == READ);
+
+	/* dma-time tables */
+	if (type != HISTO_REQUEST) {
+		if (is_read)
+			memset(&stats->read_dma_histo, 0,
+			       sizeof(stats->read_dma_histo));
+		else
+			memset(&stats->write_dma_histo, 0,
+			       sizeof(stats->write_dma_histo));
+		return;
+	}
+
+	/* request-time tables */
+	if (is_read)
+		memset(&stats->read_req_histo, 0,
+		       sizeof(stats->read_req_histo));
+	else
+		memset(&stats->write_req_histo, 0,
+		       sizeof(stats->write_req_histo));
+}
+
+
+/*
+ * Zero one histogram table of a disk.
+ *
+ * With CONFIG_SMP the table is cleared in the per-cpu statistics of every
+ * possible CPU; otherwise the disk carries a single statistics block and
+ * only that one is cleared.
+ *
+ * @disk: disk whose table is reset
+ * @cmd:  READ or WRITE, forwarded to block_histogram_init2()
+ * @type: HISTO_REQUEST or HISTO_DMA, forwarded to block_histogram_init2()
+ */
+static void block_histogram_init1(struct gendisk *disk, int cmd, int type)
+{
+#ifdef CONFIG_SMP
+	int cpu;
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (!cpu_possible(cpu))
+			continue;
+		block_histogram_init2(per_cpu_ptr(disk->dkstats, cpu),
+				      cmd, type);
+	}
+#else
+	block_histogram_init2(&disk->dkstats, cmd, type);
+#endif
+}
+
+
+/**
+ * Clear all four I/O histogram channels of a disk.
+ *
+ * @disk: disk whose histograms are reset
+ *
+ * block_histogram_init1() takes its arguments as (disk, cmd, type), so the
+ * READ/WRITE command goes first and the HISTO_* channel second.  Passing
+ * them the other way round only works as long as both pairs of constants
+ * happen to share the same two values.
+ */
+void block_histogram_init(struct gendisk *disk)
+{
+	block_histogram_init1(disk, READ, HISTO_REQUEST);
+	block_histogram_init1(disk, WRITE, HISTO_REQUEST);
+	block_histogram_init1(disk, READ, HISTO_DMA);
+	block_histogram_init1(disk, WRITE, HISTO_DMA);
+}
+EXPORT_SYMBOL(block_histogram_init);
+
+
+
+/*
+ * Map a transfer size to a histogram row.
+ *
+ * Rows grow by powers of two in units of BASE_HISTO_SIZE sectors:
+ * row 0 takes transfers below BASE_HISTO_SIZE sectors, row k (k >= 1)
+ * takes [BASE_HISTO_SIZE << (k - 1), BASE_HISTO_SIZE << k), and the last
+ * row additionally absorbs everything larger.
+ *
+ * @sectors: transfer size in sectors
+ */
+static inline int stats_size_bucket(int sectors)
+{
+	int units = sectors / BASE_HISTO_SIZE;
+	int row = 0;
+
+	/* Anything at or beyond the last boundary goes in the final row. */
+	if (units >= (1 << (CONFIG_HISTO_SIZE_BUCKETS - 2)))
+		return CONFIG_HISTO_SIZE_BUCKETS - 1;
+
+	/* Count how many times the size can be halved before reaching 0. */
+	while (units > 0) {
+		units /= 2;
+		row++;
+	}
+	return row;
+}
+
+
+/*
+ * Map a transfer time to a histogram column.
+ *
+ * Column limits follow the 1,2,5 progression starting at BASE_HISTO_TIME
+ * milliseconds (BASE, 2*BASE, 5*BASE, 10*BASE, ...).  A time is counted in
+ * the first column whose limit it does not exceed; anything beyond the
+ * second-to-last limit is counted in the last column.
+ *
+ * @jiffies: elapsed time in jiffies (this parameter shadows the global
+ *           tick counter inside the function).  Values <= 0 count as 0 ms.
+ */
+static inline int stats_time_bucket(int jiffies)
+{
+	static const unsigned int steps[3] = { 1, 2, 5 };
+	unsigned int decade = BASE_HISTO_TIME;
+	unsigned int ms = 0;
+	int col;
+
+	/*
+	 * Use jiffies_to_msecs() rather than "jiffies * (1000 / HZ)": the
+	 * open-coded form evaluates to 0 whenever HZ > 1000 and is inexact
+	 * whenever HZ does not divide 1000.
+	 */
+	if (jiffies > 0)
+		ms = jiffies_to_msecs(jiffies);
+
+	for (col = 0; col < CONFIG_HISTO_TIME_BUCKETS - 1; col++) {
+		if (ms <= decade * steps[col % 3])
+			break;
+		/* after the "5" step move on to the next power of ten */
+		if (col % 3 == 2)
+			decade *= 10;
+	}
+	return col;
+}
+
+
+/**
+ * Account one completed transfer in the histograms.
+ *
+ * Requests without REQ_CMD set are ignored.  Otherwise the request-time
+ * table for the direction is incremented, and, when a dma time is given,
+ * the dma-time table as well.
+ *
+ * @disk: disk device
+ * @flags: request flags; REQ_RW set selects the write tables, clear
+ *         selects the read tables
+ * @sectors: # sectors transferred
+ * @jiffies: time the whole request took
+ * @jiffies_dma: time the dma took, or a negative value if n/a
+ */
+void block_histogram_completion(struct gendisk *disk,
+				unsigned long flags,
+				int sectors,
+				int jiffies,
+				int jiffies_dma)
+{
+	int is_write;
+	int row;
+	int col;
+
+	if (!(flags & REQ_CMD))
+		return;
+
+	is_write = (flags & REQ_RW) != 0;
+	row = stats_size_bucket(sectors);
+	col = stats_time_bucket(jiffies);
+
+	if (is_write) {
+		__disk_stat_add(disk, write_req_histo[row][col], 1);
+	} else {
+		__disk_stat_add(disk, read_req_histo[row][col], 1);
+	}
+
+	/* No dma timing available for this transfer. */
+	if (jiffies_dma < 0)
+		return;
+
+	col = stats_time_bucket(jiffies_dma);
+	if (is_write) {
+		__disk_stat_add(disk, write_dma_histo[row][col], 1);
+	} else {
+		__disk_stat_add(disk, read_dma_histo[row][col], 1);
+	}
+}
+EXPORT_SYMBOL(block_histogram_completion);
+
+
+/**
+ * Helper: derive the elapsed times of a finished request and hand them to
+ * block_histogram_completion().
+ *
+ * A NULL @req is ignored.  A request whose io_start_time is 0 is treated as
+ * having no dma timing: it is reported with a dma time of -1, and the first
+ * 20 such occurrences are logged together with a stack dump.
+ *
+ * NOTE(review): this relies on io_start_time being 0 for every request that
+ * never went through block_histogram_start_time(); nothing visible in this
+ * patch initialises the field - confirm.
+ *
+ * @disk: disk the request completed on
+ * @req:  the completed request, may be NULL
+ */
+void block_histogram_req_cmplt(struct gendisk *disk, struct request *req)
+{
+	static int warnings_left = 20;
+	int queued_for;
+	int on_device_for;
+
+	if (!req)
+		return;
+
+	queued_for = jiffies - req->start_time;
+
+	if (req->io_start_time != 0) {
+		on_device_for = jiffies - req->io_start_time;
+	} else {
+		if (warnings_left > 0) {
+			printk(KERN_NOTICE
+			       "%s: unexpected transfer w/out start time\n",
+			       disk->disk_name);
+			dump_stack();
+			--warnings_left;
+		}
+		on_device_for = -1;
+	}
+
+	block_histogram_completion(disk, req->flags, req->nr_sectors,
+				   queued_for, on_device_for);
+}
+
+
+/**
+ * Tiny helper utility: append formatted output to a buffer and advance the
+ * caller's cursor, never writing or stepping past the end of the buffer.
+ *
+ * @ptr: in/out, current write position; advanced by the bytes stored
+ * @rem: in/out, bytes left at *ptr; reduced by the bytes stored
+ * @fmt: printf-style format, followed by its arguments
+ *
+ * Output that does not fit is truncated.  vscnprintf() is used instead of
+ * vsnprintf() because the latter returns the length the output *would*
+ * have had, which on truncation would push *ptr beyond the buffer and turn
+ * *rem negative.
+ */
+static inline void sysfs_out(char **ptr, ssize_t *rem, const char *fmt, ...)
+{
+	va_list ap;
+	int stored;
+
+	/* Nothing left to write into. */
+	if (*rem <= 0)
+		return;
+
+	va_start(ap, fmt);
+	stored = vscnprintf(*ptr, *rem, fmt, ap);
+	va_end(ap);
+
+	*ptr += stored;
+	*rem -= stored;
+}
+
+
+/**
+ * Format one histogram channel of a disk as text.
+ *
+ * The output is a key line, a header line with the column limits in ms,
+ * and one line per size row (row label in bytes, then the counters).
+ *
+ * @cmd:  READ selects the read tables, any other value the write tables
+ * @type: HISTO_REQUEST selects the request-time tables, any other value
+ *        the dma-time tables
+ * @disk: disk whose counters are reported
+ * @page: output buffer of PAGE_SIZE bytes
+ *
+ * Returns the number of bytes stored in @page.  Every write goes through
+ * scnprintf() against the real end of the buffer, so output that would
+ * not fit is truncated instead of running past the page (snprintf()
+ * returns the untruncated length and cannot be used to advance a cursor).
+ */
+static int dump_histo(int cmd, int type, struct gendisk *disk, char *page)
+{
+	static const int mults[3] = {1, 2, 5};
+	char *out = page;
+	char *end = page + PAGE_SIZE;
+	int size = BASE_HISTO_SIZE * 512;
+	int row, col, k;
+	int ms;
+	int v;
+
+	/* Key */
+	out += scnprintf(out, end - out, "rows = bytes columns = ms\n");
+
+	/* Row header */
+	out += scnprintf(out, end - out, " ");
+
+	for (col = 0, ms = BASE_HISTO_TIME;
+	     col < CONFIG_HISTO_TIME_BUCKETS; ms *= 10) {
+		for (k = 0; k < 3 && col < CONFIG_HISTO_TIME_BUCKETS;
+		     ++k, ++col)
+			out += scnprintf(out, end - out, "\t%d",
+					 ms * mults[k]);
+	}
+	out += scnprintf(out, end - out, "\n");
+
+	/* Payload */
+	for (row = 0; row < CONFIG_HISTO_SIZE_BUCKETS; row++, size *= 2) {
+		out += scnprintf(out, end - out, "%7d", size);
+		for (col = 0; col < CONFIG_HISTO_TIME_BUCKETS; col++) {
+			if (type == HISTO_REQUEST) {
+				if (cmd == READ)
+					v = disk_stat_read(disk,
+						read_req_histo[row][col]);
+				else
+					v = disk_stat_read(disk,
+						write_req_histo[row][col]);
+			} else {
+				if (cmd == READ)
+					v = disk_stat_read(disk,
+						read_dma_histo[row][col]);
+				else
+					v = disk_stat_read(disk,
+						write_dma_histo[row][col]);
+			}
+			out += scnprintf(out, end - out, "\t%d", v);
+		}
+		out += scnprintf(out, end - out, "\n");
+	}
+	return out - page;
+}
+
+
+/**
+ * sysfs show() methods for the four histogram channels.
+ *
+ * Each handler formats one channel into @page through dump_histo() and
+ * returns the value dump_histo() returns (the number of bytes written).
+ */
+
+/* Read requests, request-time histogram. */
+static ssize_t disk_read_request_histo_read(struct gendisk * disk, char *page)
+{
+ return dump_histo(READ, HISTO_REQUEST, disk, page);
+}
+
+/* Read requests, dma-time histogram. */
+static ssize_t disk_read_dma_histo_read(struct gendisk * disk, char *page)
+{
+ return dump_histo(READ, HISTO_DMA, disk, page);
+}
+
+/* Write requests, request-time histogram. */
+static ssize_t disk_write_request_histo_read(struct gendisk * disk, char *page)
+{
+ return dump_histo(WRITE, HISTO_REQUEST, disk, page);
+}
+
+/* Write requests, dma-time histogram. */
+static ssize_t disk_write_dma_histo_read(struct gendisk * disk, char *page)
+{
+ return dump_histo(WRITE, HISTO_DMA, disk, page);
+}
+
+#endif
diff -uprN linux-2.6.18.1/block/ll_rw_blk.c block_histogram/block/ll_rw_blk.c
--- linux-2.6.18.1/block/ll_rw_blk.c 2006-11-15 15:12:00.000000000 -0800
+++ block_histogram/block/ll_rw_blk.c 2006-11-15 18:08:07.000000000 -0800
@@ -3469,6 +3469,7 @@ void end_that_request_last(struct reques
__disk_stat_add(disk, ticks[rw], duration);
disk_round_stats(disk);
disk->in_flight--;
+ block_histogram_req_cmplt(disk, req);
}
if (req->end_io)
req->end_io(req, error);
diff -uprN linux-2.6.18.1/drivers/scsi/scsi_lib.c block_histogram/drivers/scsi/scsi_lib.c
--- linux-2.6.18.1/drivers/scsi/scsi_lib.c 2006-11-15 16:37:34.000000000 -0800
+++ block_histogram/drivers/scsi/scsi_lib.c 2006-11-15 18:08:07.000000000 -0800
@@ -1477,6 +1477,7 @@ static void scsi_request_fn(struct reque
/*
* Dispatch the command to the low-level driver.
*/
+ block_histogram_start_time(req);
rtn = scsi_dispatch_cmd(cmd);
spin_lock_irq(q->queue_lock);
if(rtn) {
diff -uprN linux-2.6.18.1/include/linux/blkdev.h block_histogram/include/linux/blkdev.h
--- linux-2.6.18.1/include/linux/blkdev.h 2006-10-13 20:34:03.000000000 -0700
+++ block_histogram/include/linux/blkdev.h 2006-11-15 18:08:07.000000000 -0800
@@ -154,7 +154,10 @@ struct request {
int rq_status; /* should split this into a few status bits */
int errors;
struct gendisk *rq_disk;
- unsigned long start_time;
+ unsigned long start_time; /* when request submitted */
+#ifdef CONFIG_BLOCK_HISTOGRAM
+ unsigned long io_start_time; /* when passed to hardware */
+#endif
/* Number of scatter-gather DMA addr+len pairs after
* physical address coalescing is performed.
@@ -840,5 +843,13 @@ void kblockd_flush(void);
#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
MODULE_ALIAS("block-major-" __stringify(major) "-*")
+#ifdef CONFIG_BLOCK_HISTOGRAM
+/*
+ * Record the current jiffies value in @req as the moment the request is
+ * handed to the hardware.
+ */
+static inline void block_histogram_start_time(struct request *req)
+{
+	req->io_start_time = jiffies;
+}
+#else
+/* Histograms compiled out: keep the call type-checked but do nothing. */
+static inline void block_histogram_start_time(struct request *req)
+{
+}
+#endif
#endif
diff -uprN linux-2.6.18.1/include/linux/genhd.h block_histogram/include/linux/genhd.h
--- linux-2.6.18.1/include/linux/genhd.h 2006-10-13 20:34:03.000000000 -0700
+++ block_histogram/include/linux/genhd.h 2006-11-15 18:08:07.000000000 -0800
@@ -11,6 +11,8 @@
#include <linux/types.h>
+struct request;
+
enum {
/* These three have identical behaviour; use the second one if DOS FDISK gets
confused about extended/logical partitions starting past cylinder 1023. */
@@ -89,6 +91,10 @@ struct hd_struct {
#define GENHD_FL_UP 16
#define GENHD_FL_SUPPRESS_PARTITION_INFO 32
+#define BASE_HISTO_SIZE 4 /* smallest transfer size, sectors */
+#define BASE_HISTO_TIME 10 /* shortest transfer time, ms */
+
+
struct disk_stats {
unsigned long sectors[2]; /* READs and WRITEs */
unsigned long ios[2];
@@ -96,6 +102,12 @@ struct disk_stats {
unsigned long ticks[2];
unsigned long io_ticks;
unsigned long time_in_queue;
+#ifdef CONFIG_BLOCK_HISTOGRAM
+ int read_req_histo[CONFIG_HISTO_SIZE_BUCKETS][CONFIG_HISTO_TIME_BUCKETS];
+ int read_dma_histo[CONFIG_HISTO_SIZE_BUCKETS][CONFIG_HISTO_TIME_BUCKETS];
+ int write_req_histo[CONFIG_HISTO_SIZE_BUCKETS][CONFIG_HISTO_TIME_BUCKETS];
+ int write_dma_histo[CONFIG_HISTO_SIZE_BUCKETS][CONFIG_HISTO_TIME_BUCKETS];
+#endif
};
struct gendisk {
@@ -230,6 +242,21 @@ extern struct gendisk *get_gendisk(dev_t
extern void set_device_ro(struct block_device *bdev, int flag);
extern void set_disk_ro(struct gendisk *disk, int flag);
+#ifdef CONFIG_BLOCK_HISTOGRAM
+/*
+ * Block I/O histogram interface:
+ *  - block_histogram_init() clears all histogram channels of a disk;
+ *  - block_histogram_completion() accounts one finished transfer, given its
+ *    flags, size and elapsed times;
+ *  - block_histogram_req_cmplt() computes the elapsed times of a finished
+ *    request and calls block_histogram_completion().
+ */
+extern void block_histogram_init(struct gendisk *disk);
+extern void block_histogram_completion(struct gendisk *disk,
+ unsigned long flags,
+ int sectors,
+ int jiffies,
+ int jiffies_dma);
+extern void block_histogram_req_cmplt(struct gendisk *disk, struct request *);
+#else
+/* Histograms compiled out: every call expands to nothing. */
+#define block_histogram_init(disk)
+#define block_histogram_completion(disk, flags, sectors, jiffies, jiffies_dma)
+#define block_histogram_req_cmplt(disk, req)
+#endif
+
+
/* drivers/char/random.c */
extern void add_disk_randomness(struct gendisk *disk);
extern void rand_initialize_disk(struct gendisk *disk);
next reply other threads:[~2006-11-16 3:16 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2006-11-16 3:15 Edward Falk [this message]
2006-11-16 6:58 ` [PATCH] Introduce block I/O performance histograms Jens Axboe
2006-11-16 20:01 ` Edward Falk
2006-11-16 20:21 ` Jens Axboe
2006-11-16 20:01 ` Greg KH
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=455BD7E8.9020303@google.com \
--to=efalk@google.com \
--cc=akpm@osdl.org \
--cc=jens.axboe@oracle.com \
--cc=linux-kernel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.