public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH][CFT] new IO scheduler for 2.4.20
@ 2003-05-30 22:09 Neil Schemenauer
  2003-05-30 23:40 ` Con Kolivas
  0 siblings, 1 reply; 11+ messages in thread
From: Neil Schemenauer @ 2003-05-30 22:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: conman, Marc-Christian Petersen, Matt

The major benefit of this patch is that read latency is much lower while
lots of writes are occurring.  On my machine, running:

 while :; do dd if=/dev/zero of=foo bs=1M count=1000 conv=notrunc; done

makes 2.4.20 unusable.  With this patch the "write bomb" causes no
particular problems.

With this version of the patch I've improved the bulk read performance
of the elevator.  The bonnie++ results are now:

                    -Per Chr- --Block-- -Rewrite- -Per Chr- --Block--
               Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP
2.4.20           1G 13001  97 34939  18 13034   7 12175  92 34112  14
2.4.20-nas       1G 12923  98 36471  17 13340   8 10809  83 35569  13

Note that the "rewrite" and "per-char read" stats are slightly bogus for
2.4.20-nas.  Reads get a boost in priority over writes.  When the
"per-char read" test has started there is still some writing happening
from the rewrite test.  I think the net effect is that the "rewrite"
number is too high and the "per-char read" number is too low.

I would be very pleased if someone could run some tests using bonnie,
contest, or their other favorite benchmarks and post the results.

  Neil


diff -u -ur linux-2.4.20/Makefile linux-iosched-2/Makefile
--- linux-2.4.20/Makefile	2003-04-14 14:47:20.000000000 -0400
+++ linux-iosched-2/Makefile	2003-05-30 17:27:16.000000000 -0400
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 4
 SUBLEVEL = 20
-EXTRAVERSION =
+EXTRAVERSION = -nas
 
 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
 
diff -u -ur linux-2.4.20/drivers/block/elevator.c linux-iosched-2/drivers/block/elevator.c
--- linux-2.4.20/drivers/block/elevator.c	2003-04-14 14:47:22.000000000 -0400
+++ linux-iosched-2/drivers/block/elevator.c	2003-05-30 17:28:57.000000000 -0400
@@ -74,6 +74,81 @@
 	return 0;
 }
 
+int elevator_neil_merge(request_queue_t *q, struct request **req,
+			 struct list_head * head,
+			 struct buffer_head *bh, int rw,
+			 int max_sectors)
+{
+	struct list_head *entry = &q->queue_head;
+	struct request *__rq;
+	unsigned int count = bh->b_size >> 9, ret = ELEVATOR_NO_MERGE;
+	unsigned int reads = 0, writes = 0;
+	/* XXX make tunable? */
+	unsigned int expire_time = jiffies - 1*HZ;
+
+	if (list_empty(&q->queue_head))
+		goto out;
+
+	/* try to merge requests, fall back to ordering them by sector */
+	while ((entry = entry->prev) != head) {
+		__rq = blkdev_entry_to_request(entry);
+
+		if (__rq->elevator_sequence <= 0)
+			break;
+		if (__rq->cmd == READ)
+			++reads;
+		else
+			++writes;
+		if (__rq->rq_dev != bh->b_rdev)
+			continue;
+		if (__rq->waiting)
+			continue;
+		if (__rq->cmd != rw)
+			continue;
+		if (time_before(__rq->start_time, expire_time))
+			break;
+		if (bh->b_rsector > __rq->sector)
+			*req = __rq;
+		if (__rq->nr_sectors + count > max_sectors)
+			continue;
+		if (__rq->sector + __rq->nr_sectors == bh->b_rsector) {
+			ret = ELEVATOR_BACK_MERGE;
+			*req = __rq;
+			goto out;
+		} else if (__rq->sector - count == bh->b_rsector) {
+			ret = ELEVATOR_FRONT_MERGE;
+			*req = __rq;
+			goto out;
+		}
+	}
+	if (!*req && rw == READ) {
+		int extra_writes = writes - reads;
+		/*
+		 * If there are more writes than reads in the queue then put
+		 * read requests ahead of the extra writes.  This prevents
+		 * writes from starving reads.
+		 */
+		entry = q->queue_head.prev;
+		while (extra_writes > 0 && entry != head) {
+			__rq = blkdev_entry_to_request(entry);
+			if (__rq->cmd == WRITE)
+				--extra_writes;
+			else if (time_before(__rq->start_time, expire_time))
+				break;
+			entry = entry->prev;
+		}
+		*req = blkdev_entry_to_request(entry);
+	}
+out:
+	return ret;
+}
+
+void elevator_neil_merge_req(struct request *req, struct request *next)
+{
+	if (time_before(next->start_time, req->start_time))
+		req->start_time = next->start_time;
+}
+
 
 int elevator_linus_merge(request_queue_t *q, struct request **req,
 			 struct list_head * head,
diff -u -ur linux-2.4.20/drivers/block/ll_rw_blk.c linux-iosched-2/drivers/block/ll_rw_blk.c
--- linux-2.4.20/drivers/block/ll_rw_blk.c	2003-04-14 14:47:22.000000000 -0400
+++ linux-iosched-2/drivers/block/ll_rw_blk.c	2003-05-30 17:27:16.000000000 -0400
@@ -480,7 +480,7 @@
 void blk_init_queue(request_queue_t * q, request_fn_proc * rfn)
 {
 	INIT_LIST_HEAD(&q->queue_head);
-	elevator_init(&q->elevator, ELEVATOR_LINUS);
+	elevator_init(&q->elevator, ELEVATOR_NEIL);
 	blk_init_free_list(q);
 	q->request_fn     	= rfn;
 	q->back_merge_fn       	= ll_back_merge_fn;
@@ -922,7 +922,8 @@
 			rw = READ;	/* drop into READ */
 		case READ:
 		case WRITE:
-			latency = elevator_request_latency(elevator, rw);
+			/* latency = elevator_request_latency(elevator, rw); */
+			latency = 1;
 			break;
 		default:
 			BUG();
diff -u -ur linux-2.4.20/include/linux/elevator.h linux-iosched-2/include/linux/elevator.h
--- linux-2.4.20/include/linux/elevator.h	2003-04-14 14:47:24.000000000 -0400
+++ linux-iosched-2/include/linux/elevator.h	2003-05-30 17:27:16.000000000 -0400
@@ -31,6 +31,9 @@
 void elevator_linus_merge_cleanup(request_queue_t *, struct request *, int);
 void elevator_linus_merge_req(struct request *, struct request *);
 
+int elevator_neil_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int);
+void elevator_neil_merge_req(struct request *, struct request *);
+
 typedef struct blkelv_ioctl_arg_s {
 	int queue_ID;
 	int read_latency;
@@ -101,3 +104,12 @@
 	})
 
 #endif
+
+#define ELEVATOR_NEIL							\
+((elevator_t) {								\
+	0,				/* read_latency */		\
+	0,				/* write_latency */		\
+									\
+	elevator_neil_merge,		/* elevator_merge_fn */		\
+	elevator_neil_merge_req,	/* elevator_merge_req_fn */	\
+	})

^ permalink raw reply	[flat|nested] 11+ messages in thread
* Re: [PATCH][CFT] new IO scheduler for 2.4.20
@ 2003-06-02 17:21 Andreas Dilger
  0 siblings, 0 replies; 11+ messages in thread
From: Andreas Dilger @ 2003-06-02 17:21 UTC (permalink / raw)
  To: Neil Schemenauer; +Cc: linux-kernel

On Sat, 31 May 2003 08:09, Neil Schemenauer wrote:
> The major benefit of this patch is that read latency is much lower while
> lots of writes are occuring.  On my machine, running:
>
>  while :; do dd if=/dev/zero of=foo bs=1M count=1000 conv=notrunc; done
>
> makes 2.4.20 unusable.  With this patch the "write bomb" causes no
> particular problems.
>
> With this version of the patch I've improved the bulk read performance
> of the elevator.  The bonnie++ results are now:
>
>                     -Per Chr- --Block-- -Rewrite- -Per Chr- --Block--
>                Size K/sec %CP K/sec %CP K/sec %CP K/sec %CP K/sec %CP
> 2.4.20           1G 13001  97 34939  18 13034   7 12175  92 34112  14
> 2.4.20-nas       1G 12923  98 36471  17 13340   8 10809  83 35569  13
>
> Note that the "rewrite" and "per-char read" stats are slightly bogus for
> 2.4.20-nas.  Reads get a boost in priority over writes.  When the
> "per-char read" test has started there is still some writing happening
> from the rewrite test.  I think the net effect is that the "rewrite"
> number is too high and the "per-char read" number is too low.
>
> I would be very pleased if someone could run some tests on using bonnie,
> contest, or their other favorite benchmarks and post the results.

In local testing we hit the following problem at boot:

SCSI device sda: 35565080 512-byte hdwr sectors (18209 MB)
Partition check:
  sda:elevator returned crap (-1069467520)
------------[ cut here ]------------
kernel BUG at ll_rw_blk.c:1043!
invalid operand: 0000

CPU:    0
EIP:    0060:[<c01d40ab>]    Not tainted
EFLAGS: 00010086

EIP is at __make_request [kernel] 0x3bb 
(2.4.20-9smp-l18-b_devel-30-05-2003-fercher-nas)
eax: 00000025   ebx: c466a2e0   ecx: 00000001   edx: 00000001
esi: 00000000   edi: c464b638   ebp: c44c9d30   esp: c44c9c90
ds: 0068   es: 0068   ss: 0068
Process swapper (pid: 1, stackpage=c44c9000)
Stack: c02c7ea6 c0413880 c464b640 c463b880 00000400 00000000 00000002 
00000002
        00000000 c44c8000 00000000 c0413880 c03e1fc4 c011ac62 c44c9d10 
c0413880
        00000000 c44c9cec c020707b c4623a38 c4623a38 c4695980 c4652400 
c4623a00
Call Trace:   [<c011ac62>] schedule [kernel] 0x362 (0xc44c9cc4))
[<c020707b>] scsi_insert_special_req [kernel] 0x1b (0xc44c9cd8))
[<c020723a>] scsi_queue_next_request [kernel] 0x4a (0xc44c9d04))
[<c01d43f9>] generic_make_request [kernel] 0x119 (0xc44c9d34))
[<c01d445f>] submit_bh [kernel] 0x4f (0xc44c9d68))
[<c014f140>] block_read_full_page [kernel] 0x2e0 (0xc44c9d84))
[<c012411d>] bh_action [kernel] 0x4d (0xc44c9dc8))
[<c0123fc4>] tasklet_hi_action [kernel] 0x74 (0xc44c9dd4))
[<c0135981>] add_to_page_cache_unique [kernel] 0x81 (0xc44c9e14))
[<c0138459>] read_cache_page [kernel] 0x89 (0xc44c9e30))
[<c0152a20>] blkdev_get_block [kernel] 0x0 (0xc44c9e38))
[<c014367c>] __alloc_pages [kernel] 0x7c (0xc44c9e44))
[<c0174d46>] read_dev_sector [kernel] 0x36 (0xc44c9e5c))
[<c0152ab0>] blkdev_readpage [kernel] 0x0 (0xc44c9e68))
[<c01756e9>] handle_ide_mess [kernel] 0x29 (0xc44c9e90))
[<c01758c5>] msdos_partition [kernel] 0x65 (0xc44c9ebc))
[<c0163cfc>] new_inode [kernel] 0xc (0xc44c9ed0))
[<c0174b4f>] check_partition [kernel] 0xff (0xc44c9ef0))
[<c02236fa>] sd_init_onedisk [kernel] 0x7ba (0xc44c9f24))
[<c0174cd2>] grok_partitions [kernel] 0xc2 (0xc44c9f5c))
[<c0223d76>] sd_finish [kernel] 0x106 (0xc44c9f80))
[<c0201985>] scsi_register_device_module [kernel] 0x155 (0xc44c9fa8))
[<c0105094>] init [kernel] 0x34 (0xc44c9fdc))
[<c0105060>] init [kernel] 0x0 (0xc44c9fe0))
[<c0107459>] kernel_thread_helper [kernel] 0x5 (0xc44c9ff0))


Code: 0f 0b 13 04 9a 7e 2c c0 59 5f 8b 95 74 ff ff ff 85 d2 74 15
  <0>Kernel panic: Attempted to kill init!

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://www-mddsp.enel.ucalgary.ca/People/adilger/


^ permalink raw reply	[flat|nested] 11+ messages in thread
* [PATCH][CFT] new IO scheduler for 2.4.20
@ 2003-04-17 17:28 Neil Schemenauer
  2003-04-17 20:41 ` Andrew Morton
  2003-04-21 11:33 ` Andrea Arcangeli
  0 siblings, 2 replies; 11+ messages in thread
From: Neil Schemenauer @ 2003-04-17 17:28 UTC (permalink / raw)
  To: linux-kernel; +Cc: axboe, akpm, conman

[-- Attachment #1: Type: text/plain, Size: 1113 bytes --]

Hi all,

Recently I was bitten badly by bad IO scheduler behavior on an important
Linux server.  An easy way to trigger this problem is to start a
streaming write process:

    while :
    do
            dd if=/dev/zero of=foo bs=1M count=512 conv=notrunc
    done

and then try doing a bunch of small reads:

    time (find kernel-tree -type f | xargs cat > /dev/null)

With Linux 2.4.20, the reading takes a very long time to finish.  I
looked at backporting Jens' deadline scheduler to 2.4 but decided it
would be too much work (especially for someone like me who doesn't know
much about kernel hacking).

I've come up with a fairly simple patch that seems to solve the
starvation problem.  Perhaps it is simple enough to get merged into the
stable branch (after a little more polish).  The basic idea is to give
reads a boost if there are lots of write requests in the queue.  I think
my solution also prevents reads and writes from being starved
indefinitely.

Throughput also seems good although I have not done a lot of testing
yet.  I would greatly appreciate it if people could give it a test.

  Neil

[-- Attachment #2: neil-iosched-1.diff --]
[-- Type: text/plain, Size: 4186 bytes --]

diff -u -ur linux-2.4.20/drivers/block/elevator.c linux-iosched-1d/drivers/block/elevator.c
--- linux-2.4.20/drivers/block/elevator.c	2003-04-14 14:47:22.000000000 -0400
+++ linux-iosched-1d/drivers/block/elevator.c	2003-04-17 12:15:10.000000000 -0400
@@ -74,6 +74,86 @@
 	return 0;
 }
 
+int elevator_neil_merge(request_queue_t *q, struct request **req,
+			 struct list_head * head,
+			 struct buffer_head *bh, int rw,
+			 int max_sectors)
+{
+	struct list_head *entry = &q->queue_head;
+	unsigned int count = bh->b_size >> 9, ret = ELEVATOR_NO_MERGE;
+	unsigned long read_sectors = 0, write_sectors = 0;
+	unsigned int expire_time = jiffies - 1*HZ;
+	struct request *__rq;
+
+	while ((entry = entry->prev) != head) {
+		__rq = blkdev_entry_to_request(entry);
+
+		/*
+		 * we can't insert beyond a zero sequence point
+		 */
+		if (__rq->elevator_sequence == 0) {
+			goto append;
+		}
+
+		if (__rq->cmd == READ)
+			read_sectors += __rq->nr_sectors;
+		else
+			write_sectors += __rq->nr_sectors;
+
+		if (__rq->rq_dev != bh->b_rdev)
+			continue;
+
+		if (__rq->waiting)
+			continue;
+
+		if (__rq->cmd != rw)
+			continue;
+
+		if (time_before(__rq->start_time, expire_time)) {
+			break;
+		}
+
+		if (__rq->sector + __rq->nr_sectors == bh->b_rsector) {
+			if (__rq->nr_sectors + count <= max_sectors)
+				ret = ELEVATOR_BACK_MERGE;
+			*req = __rq;
+			break;
+		} else if (__rq->sector - count == bh->b_rsector) {
+			if (__rq->nr_sectors + count <= max_sectors)
+				ret = ELEVATOR_FRONT_MERGE;
+			*req = __rq;
+			break;
+		}
+	}
+
+	if (!*req && rw == READ) {
+		long extra_writes = write_sectors - read_sectors;
+		/*
+		 * If there are more writes than reads in the queue then put
+		 * read requests ahead of the extra writes.  This prevents
+		 * writes from starving reads.
+		 */
+		entry = q->queue_head.prev;
+		while (extra_writes > 0 && entry != head) {
+			__rq = blkdev_entry_to_request(entry);
+			if (__rq->cmd == WRITE)
+				extra_writes -= __rq->nr_sectors;
+			entry = entry->prev;
+		}
+		*req = blkdev_entry_to_request(entry);
+	}
+
+append:
+
+	return ret;
+}
+
+void elevator_neil_merge_req(struct request *req, struct request *next)
+{
+	if (time_before(next->start_time, req->start_time))
+		req->start_time = next->start_time;
+}
+
 
 int elevator_linus_merge(request_queue_t *q, struct request **req,
 			 struct list_head * head,
diff -u -ur linux-2.4.20/drivers/block/ll_rw_blk.c linux-iosched-1d/drivers/block/ll_rw_blk.c
--- linux-2.4.20/drivers/block/ll_rw_blk.c	2003-04-14 14:47:22.000000000 -0400
+++ linux-iosched-1d/drivers/block/ll_rw_blk.c	2003-04-17 12:21:59.000000000 -0400
@@ -480,7 +480,7 @@
 void blk_init_queue(request_queue_t * q, request_fn_proc * rfn)
 {
 	INIT_LIST_HEAD(&q->queue_head);
-	elevator_init(&q->elevator, ELEVATOR_LINUS);
+	elevator_init(&q->elevator, ELEVATOR_NEIL);
 	blk_init_free_list(q);
 	q->request_fn     	= rfn;
 	q->back_merge_fn       	= ll_back_merge_fn;
@@ -922,7 +922,8 @@
 			rw = READ;	/* drop into READ */
 		case READ:
 		case WRITE:
-			latency = elevator_request_latency(elevator, rw);
+			/* latency = elevator_request_latency(elevator, rw); */
+			latency = 1;
 			break;
 		default:
 			BUG();
diff -u -ur linux-2.4.20/include/linux/elevator.h linux-iosched-1d/include/linux/elevator.h
--- linux-2.4.20/include/linux/elevator.h	2003-04-14 14:47:24.000000000 -0400
+++ linux-iosched-1d/include/linux/elevator.h	2003-04-15 10:17:23.000000000 -0400
@@ -31,6 +31,9 @@
 void elevator_linus_merge_cleanup(request_queue_t *, struct request *, int);
 void elevator_linus_merge_req(struct request *, struct request *);
 
+int elevator_neil_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int);
+void elevator_neil_merge_req(struct request *, struct request *);
+
 typedef struct blkelv_ioctl_arg_s {
 	int queue_ID;
 	int read_latency;
@@ -101,3 +104,12 @@
 	})
 
 #endif
+
+#define ELEVATOR_NEIL							\
+((elevator_t) {								\
+	0,				/* read_latency */		\
+	0,				/* write_latency */		\
+									\
+	elevator_neil_merge,		/* elevator_merge_fn */		\
+	elevator_neil_merge_req,	/* elevator_merge_req_fn */	\
+	})
Only in linux-iosched-1d: inst.sh

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2003-06-02 17:11 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-05-30 22:09 [PATCH][CFT] new IO scheduler for 2.4.20 Neil Schemenauer
2003-05-30 23:40 ` Con Kolivas
2003-05-31  0:52   ` Neil Schemenauer
2003-05-30 17:58     ` Robert Love
  -- strict thread matches above, loose matches on Subject: below --
2003-06-02 17:21 Andreas Dilger
2003-04-17 17:28 Neil Schemenauer
2003-04-17 20:41 ` Andrew Morton
2003-04-20 18:26   ` Neil Schemenauer
2003-04-20 22:06     ` Marc-Christian Petersen
2003-04-21  1:46       ` Neil Schemenauer
2003-04-21 11:33 ` Andrea Arcangeli

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox