public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Speed up the cdrw packet writing driver
@ 2004-08-14 19:13 Peter Osterlund
  2004-08-23 11:43 ` Jens Axboe
  0 siblings, 1 reply; 19+ messages in thread
From: Peter Osterlund @ 2004-08-14 19:13 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

Hi!

This patch replaces the pd->bio_queue linked list with an rbtree.  The
list can get very long (>200000 entries on a 1GB machine), so keeping
it sorted with a naive algorithm is far too expensive.

This patch also improves write performance when writing lots of data,
because the old code gave up on sorting if the queue became longer
than 10000 entries.  This caused unnecessary seeks.

Signed-off-by: Peter Osterlund <petero2@telia.com>
---

 linux-petero/drivers/block/pktcdvd.c |  193 +++++++++++++++++++++++------------
 linux-petero/include/linux/pktcdvd.h |   14 ++
 2 files changed, 143 insertions(+), 64 deletions(-)

diff -puN drivers/block/pktcdvd.c~packet-rbtree drivers/block/pktcdvd.c
--- linux/drivers/block/pktcdvd.c~packet-rbtree	2004-08-14 11:35:42.000000000 +0200
+++ linux-petero/drivers/block/pktcdvd.c	2004-08-14 19:26:50.612757536 +0200
@@ -254,6 +254,89 @@ static int pkt_grow_pktlist(struct pktcd
 	return 1;
 }
 
+static void *pkt_rb_alloc(int gfp_mask, void *data)
+{
+	return kmalloc(sizeof(struct pkt_rb_node), gfp_mask);
+}
+
+static void pkt_rb_free(void *ptr, void *data)
+{
+	kfree(ptr);
+}
+
+static inline struct pkt_rb_node *pkt_rbtree_next(struct pkt_rb_node *node)
+{
+	struct rb_node *n = rb_next(&node->rb_node);
+	if (!n)
+		return NULL;
+	return rb_entry(n, struct pkt_rb_node, rb_node);
+}
+
+static inline void pkt_rbtree_erase(struct pktcdvd_device *pd, struct pkt_rb_node *node)
+{
+	rb_erase(&node->rb_node, &pd->bio_queue);
+	mempool_free(node, pd->rb_pool);
+	pd->bio_queue_size--;
+	BUG_ON(pd->bio_queue_size < 0);
+}
+
+/*
+ * Find the first node in the pd->bio_queue rb tree with a starting sector >= s.
+ */
+static struct pkt_rb_node *pkt_rbtree_find(struct pktcdvd_device *pd, sector_t s)
+{
+	struct rb_node *n = pd->bio_queue.rb_node;
+	struct rb_node *next;
+	struct pkt_rb_node *tmp;
+
+	if (!n) {
+		BUG_ON(pd->bio_queue_size > 0);
+		return NULL;
+	}
+
+	for (;;) {
+		tmp = rb_entry(n, struct pkt_rb_node, rb_node);
+		if (s <= tmp->bio->bi_sector)
+			next = n->rb_left;
+		else
+			next = n->rb_right;
+		if (!next)
+			break;
+		n = next;
+	}
+
+	if (s > tmp->bio->bi_sector) {
+		tmp = pkt_rbtree_next(tmp);
+		if (!tmp)
+			return NULL;
+	}
+	BUG_ON(s > tmp->bio->bi_sector);
+	return tmp;
+}
+
+/*
+ * Insert a node into the pd->bio_queue rb tree.
+ */
+static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *node)
+{
+	struct rb_node **p = &pd->bio_queue.rb_node;
+	struct rb_node *parent = NULL;
+	sector_t s = node->bio->bi_sector;
+	struct pkt_rb_node *tmp;
+
+	while (*p) {
+		parent = *p;
+		tmp = rb_entry(parent, struct pkt_rb_node, rb_node);
+		if (s < tmp->bio->bi_sector)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&node->rb_node, parent, p);
+	rb_insert_color(&node->rb_node, &pd->bio_queue);
+	pd->bio_queue_size++;
+}
+
 /*
  * Add a bio to a single linked list defined by its head and tail pointers.
  */
@@ -839,8 +922,10 @@ static inline void pkt_set_state(struct 
 static int pkt_handle_queue(struct pktcdvd_device *pd)
 {
 	struct packet_data *pkt, *p;
-	struct bio *bio, *prev, *next;
+	struct bio *bio = NULL;
 	sector_t zone = 0; /* Suppress gcc warning */
+	struct pkt_rb_node *node, *first_node;
+	struct rb_node *n;
 
 	VPRINTK("handle_queue\n");
 
@@ -855,14 +940,30 @@ static int pkt_handle_queue(struct pktcd
 	 * Try to find a zone we are not already working on.
 	 */
 	spin_lock(&pd->lock);
-	for (bio = pd->bio_queue; bio; bio = bio->bi_next) {
+	first_node = pkt_rbtree_find(pd, pd->current_sector);
+	if (!first_node) {
+		n = rb_first(&pd->bio_queue);
+		if (n)
+			first_node = rb_entry(n, struct pkt_rb_node, rb_node);
+	}
+	node = first_node;
+	while (node) {
+		bio = node->bio;
 		zone = ZONE(bio->bi_sector, pd);
 		list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) {
 			if (p->sector == zone)
 				goto try_next_bio;
 		}
 		break;
-try_next_bio: ;
+try_next_bio:
+		node = pkt_rbtree_next(node);
+		if (!node) {
+			n = rb_first(&pd->bio_queue);
+			if (n)
+				node = rb_entry(n, struct pkt_rb_node, rb_node);
+		}
+		if (node == first_node)
+			node = NULL;
 	}
 	spin_unlock(&pd->lock);
 	if (!bio) {
@@ -873,6 +974,7 @@ try_next_bio: ;
 	pkt = pkt_get_packet_data(pd, zone);
 	BUG_ON(!pkt);
 
+	pd->current_sector = zone + pd->settings.size;
 	pkt->sector = zone;
 	pkt->frames = pd->settings.size >> 2;
 	BUG_ON(pkt->frames > PACKET_MAX_SIZE);
@@ -883,35 +985,18 @@ try_next_bio: ;
 	 * to this packet.
 	 */
 	spin_lock(&pd->lock);
-	prev = NULL;
 	VPRINTK("pkt_handle_queue: looking for zone %llx\n", (unsigned long long)zone);
-	bio = pd->bio_queue;
-	while (bio) {
+	while ((node = pkt_rbtree_find(pd, zone)) != NULL) {
+		bio = node->bio;
 		VPRINTK("pkt_handle_queue: found zone=%llx\n",
 			(unsigned long long)ZONE(bio->bi_sector, pd));
-		if (ZONE(bio->bi_sector, pd) == zone) {
-			if (prev) {
-				prev->bi_next = bio->bi_next;
-			} else {
-				pd->bio_queue = bio->bi_next;
-			}
-			if (bio == pd->bio_queue_tail)
-				pd->bio_queue_tail = prev;
-			next = bio->bi_next;
-			spin_lock(&pkt->lock);
-			pkt_add_list_last(bio, &pkt->orig_bios,
-					  &pkt->orig_bios_tail);
-			pkt->write_size += bio->bi_size / CD_FRAMESIZE;
-			if (pkt->write_size >= pkt->frames) {
-				VPRINTK("pkt_handle_queue: pkt is full\n");
-				next = NULL; /* Stop searching if the packet is full */
-			}
-			spin_unlock(&pkt->lock);
-			bio = next;
-		} else {
-			prev = bio;
-			bio = bio->bi_next;
-		}
+		if (ZONE(bio->bi_sector, pd) != zone)
+			break;
+		pkt_rbtree_erase(pd, node);
+		spin_lock(&pkt->lock);
+		pkt_add_list_last(bio, &pkt->orig_bios, &pkt->orig_bios_tail);
+		pkt->write_size += bio->bi_size / CD_FRAMESIZE;
+		spin_unlock(&pkt->lock);
 	}
 	spin_unlock(&pd->lock);
 
@@ -2032,6 +2117,7 @@ static int pkt_make_request(request_queu
 	sector_t zone;
 	struct packet_data *pkt;
 	int was_empty, blocked_bio;
+	struct pkt_rb_node *node;
 
 	pd = q->queuedata;
 	if (!pd) {
@@ -2119,39 +2205,13 @@ static int pkt_make_request(request_queu
 	/*
 	 * No matching packet found. Store the bio in the work queue.
 	 */
+	node = mempool_alloc(pd->rb_pool, GFP_NOIO);
+	BUG_ON(!node);
+	node->bio = bio;
 	spin_lock(&pd->lock);
-	if (pd->bio_queue == NULL) {
-		was_empty = 1;
-		bio->bi_next = NULL;
-		pd->bio_queue = bio;
-		pd->bio_queue_tail = bio;
-	} else {
-		struct bio *bio2, *insert_after;
-		int distance, z, cnt;
-
-		was_empty = 0;
-		z = ZONE(pd->bio_queue_tail->bi_sector, pd);
-		distance = (zone >= z ? zone - z : INT_MAX);
-		insert_after = pd->bio_queue_tail;
-		if (distance > pd->settings.size) {
-			for (bio2 = pd->bio_queue, cnt = 0; bio2 && (cnt < 10000);
-			     bio2 = bio2->bi_next, cnt++) {
-				int d2;
-				z = ZONE(bio2->bi_sector, pd);
-				d2 = (zone >= z ? zone - z : INT_MAX);
-				if (d2 < distance) {
-					distance = d2;
-					insert_after = bio2;
-					if (distance == 0)
-						break;
-				}
-			}
-		}
-		bio->bi_next = insert_after->bi_next;
-		insert_after->bi_next = bio;
-		if (insert_after == pd->bio_queue_tail)
-			pd->bio_queue_tail = bio;
-	}
+	BUG_ON(pd->bio_queue_size < 0);
+	was_empty = (pd->bio_queue_size == 0);
+	pkt_rbtree_insert(pd, node);
 	spin_unlock(&pd->lock);
 
 	/*
@@ -2254,7 +2314,8 @@ static int pkt_seq_show(struct seq_file 
 	seq_printf(m, "\tmode page offset:\t%u\n", pd->mode_offset);
 
 	seq_printf(m, "\nQueue state:\n");
-	seq_printf(m, "\tbios queued:\t\t%s\n", pd->bio_queue ? "yes" : "no");
+	seq_printf(m, "\tbios queued:\t\t%d\n", pd->bio_queue_size);
+	seq_printf(m, "\tcurrent sector:\t\t0x%llx\n", (unsigned long long)pd->current_sector);
 
 	pkt_count_states(pd, states);
 	seq_printf(m, "\tstate:\t\t\ti:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n",
@@ -2427,6 +2488,10 @@ static int pkt_setup_dev(struct pkt_ctrl
 		return ret;
 	memset(pd, 0, sizeof(struct pktcdvd_device));
 
+	pd->rb_pool = mempool_create(PKT_RB_POOL_SIZE, pkt_rb_alloc, pkt_rb_free, NULL);
+	if (!pd->rb_pool)
+		goto out_mem;
+
 	disk = alloc_disk(1);
 	if (!disk)
 		goto out_mem;
@@ -2436,6 +2501,7 @@ static int pkt_setup_dev(struct pkt_ctrl
 	spin_lock_init(&pd->iosched.lock);
 	sprintf(pd->name, "pktcdvd%d", idx);
 	init_waitqueue_head(&pd->wqueue);
+	pd->bio_queue = RB_ROOT;
 
 	disk->major = pkt_major;
 	disk->first_minor = idx;
@@ -2462,6 +2528,8 @@ out_new_dev:
 out_mem2:
 	put_disk(disk);
 out_mem:
+	if (pd->rb_pool)
+		mempool_destroy(pd->rb_pool);
 	kfree(pd);
 	return ret;
 }
@@ -2503,6 +2571,7 @@ static int pkt_remove_dev(struct pkt_ctr
 	put_disk(pd->disk);
 
 	pkt_devs[idx] = NULL;
+	mempool_destroy(pd->rb_pool);
 	kfree(pd);
 
 	/* This is safe: open() is still holding a reference. */
diff -puN include/linux/pktcdvd.h~packet-rbtree include/linux/pktcdvd.h
--- linux/include/linux/pktcdvd.h~packet-rbtree	2004-08-14 11:35:42.000000000 +0200
+++ linux-petero/include/linux/pktcdvd.h	2004-08-14 19:35:10.822714064 +0200
@@ -21,6 +21,8 @@
 
 #define	MAX_WRITERS		8
 
+#define PKT_RB_POOL_SIZE	512
+
 /*
  * How long we should hold a non-full packet before starting data gathering.
  */
@@ -224,6 +226,11 @@ struct packet_data
 	struct pktcdvd_device	*pd;
 };
 
+struct pkt_rb_node {
+	struct rb_node		rb_node;
+	struct bio		*bio;
+};
+
 struct pktcdvd_device
 {
 	struct block_device	*bdev;		/* dev attached */
@@ -245,10 +252,13 @@ struct pktcdvd_device
 	wait_queue_head_t	wqueue;
 
 	spinlock_t		lock;		/* Serialize access to bio_queue */
-	struct bio		*bio_queue;	/* Work queue of bios we need to handle */
-	struct bio		*bio_queue_tail;
+	struct rb_root		bio_queue;	/* Work queue of bios we need to handle */
+	int			bio_queue_size;	/* Number of nodes in bio_queue */
+	sector_t		current_sector;	/* Keep track of where the elevator is */
 	atomic_t		scan_queue;	/* Set to non-zero when pkt_handle_queue */
 						/* needs to be run. */
+	mempool_t		*rb_pool;	/* mempool for pkt_rb_node allocations */
+
 	struct packet_iosched   iosched;
 	struct gendisk		*disk;
 };
_

-- 
Peter Osterlund - petero2@telia.com
http://w1.894.telia.com/~u89404340

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Speed up the cdrw packet writing driver
  2004-08-14 19:13 [PATCH] Speed up the cdrw packet writing driver Peter Osterlund
@ 2004-08-23 11:43 ` Jens Axboe
  2004-08-23 16:07   ` Peter Osterlund
  0 siblings, 1 reply; 19+ messages in thread
From: Jens Axboe @ 2004-08-23 11:43 UTC (permalink / raw)
  To: Peter Osterlund; +Cc: Andrew Morton, linux-kernel

On Sat, Aug 14 2004, Peter Osterlund wrote:
> Hi!
> 
> This patch replaces the pd->bio_queue linked list with an rbtree.  The
> list can get very long (>200000 entries on a 1GB machine), so keeping
> it sorted with a naive algorithm is far too expensive.

It looks like you are assuming that bio->bi_sector is unique which isn't
necessarily true. In that respect, list -> rbtree conversion isn't
trivial (or, at least it requires extra code to handle this).

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Speed up the cdrw packet writing driver
  2004-08-23 11:43 ` Jens Axboe
@ 2004-08-23 16:07   ` Peter Osterlund
  2004-08-24 20:29     ` Jens Axboe
  0 siblings, 1 reply; 19+ messages in thread
From: Peter Osterlund @ 2004-08-23 16:07 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, linux-kernel

Jens Axboe <axboe@suse.de> writes:

> On Sat, Aug 14 2004, Peter Osterlund wrote:
> > Hi!
> > 
> > This patch replaces the pd->bio_queue linked list with an rbtree.  The
> > list can get very long (>200000 entries on a 1GB machine), so keeping
> > it sorted with a naive algorithm is far too expensive.
> 
> It looks like you are assuming that bio->bi_sector is unique which isn't
> necessarily true. In that respect, list -> rbtree conversion isn't
> trivial (or, at least it requires extra code to handle this).

I don't think that is assumed anywhere.

The pkt_rbtree_find() function returns the first node with a sector
number >= s, even if there are multiple bios with bi_sector == s. Note
that the code branches to the left if s == tmp->bio->bi_sector.

The pkt_rbtree_insert() function is careful to insert a bio after any
already existing bios with the same sector number. Note that it
branches to the right if s == tmp->bio->bi_sector.

The tree rotations done internally in rbtree.c also can't mess things
up, because tree rotations don't change the inorder traversal order of
a tree.

-- 
Peter Osterlund - petero2@telia.com
http://w1.894.telia.com/~u89404340

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Speed up the cdrw packet writing driver
  2004-08-23 16:07   ` Peter Osterlund
@ 2004-08-24 20:29     ` Jens Axboe
  2004-08-24 21:04       ` Peter Osterlund
  0 siblings, 1 reply; 19+ messages in thread
From: Jens Axboe @ 2004-08-24 20:29 UTC (permalink / raw)
  To: Peter Osterlund; +Cc: Andrew Morton, linux-kernel

On Mon, Aug 23 2004, Peter Osterlund wrote:
> Jens Axboe <axboe@suse.de> writes:
> 
> > On Sat, Aug 14 2004, Peter Osterlund wrote:
> > > Hi!
> > > 
> > > This patch replaces the pd->bio_queue linked list with an rbtree.  The
> > > list can get very long (>200000 entries on a 1GB machine), so keeping
> > > it sorted with a naive algorithm is far too expensive.
> > 
> > It looks like you are assuming that bio->bi_sector is unique which isn't
> > necessarily true. In that respect, list -> rbtree conversion isn't
> > trivial (or, at least it requires extra code to handle this).
> 
> I don't think that is assumed anywhere.
> 
> The pkt_rbtree_find() function returns the first node with a sector
> number >= s, even if there are multiple bios with bi_sector == s. Note
> that the code branches to the left if s == tmp->bio->bi_sector.
> 
> The pkt_rbtree_insert() function is careful to insert a bio after any
> already existing bios with the same sector number. Note that it
> branches to the right if s == tmp->bio->bi_sector.
> 
> The tree rotations done internally in rbtree.c also can't mess things
> up, because tree rotations don't change the inorder traversal order of
> a tree.

You are right, the code looks fine indeed. The bigger problem is
probably that a faster data structure is needed at all, having hundreds
of thousands bio's pending for a packet writing device is not nice at
all.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Speed up the cdrw packet writing driver
  2004-08-24 20:29     ` Jens Axboe
@ 2004-08-24 21:04       ` Peter Osterlund
  2004-08-24 21:47         ` Andrew Morton
  2004-08-25  6:50         ` Jens Axboe
  0 siblings, 2 replies; 19+ messages in thread
From: Peter Osterlund @ 2004-08-24 21:04 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, linux-kernel

Jens Axboe <axboe@suse.de> writes:

> On Mon, Aug 23 2004, Peter Osterlund wrote:
> > Jens Axboe <axboe@suse.de> writes:
> > 
> > > On Sat, Aug 14 2004, Peter Osterlund wrote:
> > > > 
> > > > This patch replaces the pd->bio_queue linked list with an rbtree.  The
> > > > list can get very long (>200000 entries on a 1GB machine), so keeping
> > > > it sorted with a naive algorithm is far too expensive.
> > > 
> > > It looks like you are assuming that bio->bi_sector is unique which isn't
> > > necessarily true. In that respect, list -> rbtree conversion isn't
> > > trivial (or, at least it requires extra code to handle this).
> > 
> > I don't think that is assumed anywhere.
> > 
> > [...]
> 
> You are right, the code looks fine indeed. The bigger problem is
> probably that a faster data structure is needed at all, having hundreds
> of thousands bio's pending for a packet writing device is not nice at
> all.

Why is it not nice? If the VM has decided to create 400MB of dirty
data on a DVD+RW packet device, I don't see a problem with submitting
all bio's at the same time to the packet device.

The situation happened when I dumped >1GB of data to a DVD+RW disc on
a 1GB RAM machine. For some reason, the number of pending bio's didn't
go much larger than 200000 (ie 400MB) even though it could probably
have gone to 800MB without swapping. The machine didn't feel
unresponsive during this test.

-- 
Peter Osterlund - petero2@telia.com
http://w1.894.telia.com/~u89404340

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Speed up the cdrw packet writing driver
  2004-08-24 21:04       ` Peter Osterlund
@ 2004-08-24 21:47         ` Andrew Morton
  2004-08-24 21:57           ` Lee Revell
  2004-08-24 22:03           ` Peter Osterlund
  2004-08-25  6:50         ` Jens Axboe
  1 sibling, 2 replies; 19+ messages in thread
From: Andrew Morton @ 2004-08-24 21:47 UTC (permalink / raw)
  To: Peter Osterlund; +Cc: axboe, linux-kernel

Peter Osterlund <petero2@telia.com> wrote:
>
> Jens Axboe <axboe@suse.de> writes:
> 
> > On Mon, Aug 23 2004, Peter Osterlund wrote:
> > > Jens Axboe <axboe@suse.de> writes:
> > > 
> > > > On Sat, Aug 14 2004, Peter Osterlund wrote:
> > > > > 
> > > > > This patch replaces the pd->bio_queue linked list with an rbtree.  The
> > > > > list can get very long (>200000 entries on a 1GB machine), so keeping
> > > > > it sorted with a naive algorithm is far too expensive.
> > > > 
> > > > It looks like you are assuming that bio->bi_sector is unique which isn't
> > > > necessarily true. In that respect, list -> rbtree conversion isn't
> > > > trivial (or, at least it requires extra code to handle this).
> > > 
> > > I don't think that is assumed anywhere.
> > > 
> > > [...]
> > 
> > You are right, the code looks fine indeed. The bigger problem is
> > probably that a faster data structure is needed at all, having hundreds
> > of thousands bio's pending for a packet writing device is not nice at
> > all.
> 
> Why is it not nice? If the VM has decided to create 400MB of dirty
> data on a DVD+RW packet device, I don't see a problem with submitting
> all bio's at the same time to the packet device.

We also have a limit on the number of in-flight requests:
/sys/block/sda/queue/nr_requests.  It defaults to 128.

Are you saying that your requests are so huge that each one has 1000 BIOs? 
That would be odd, for an IDE interface.

> The situation happened when I dumped >1GB of data to a DVD+RW disc on
> a 1GB RAM machine. For some reason, the number of pending bio's didn't
> go much larger than 200000 (ie 400MB) even though it could probably
> have gone to 800MB without swapping. The machine didn't feel
> unresponsive during this test.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Speed up the cdrw packet writing driver
  2004-08-24 21:47         ` Andrew Morton
@ 2004-08-24 21:57           ` Lee Revell
  2004-08-29 13:52             ` Alan Cox
  2004-08-24 22:03           ` Peter Osterlund
  1 sibling, 1 reply; 19+ messages in thread
From: Lee Revell @ 2004-08-24 21:57 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Peter Osterlund, axboe, linux-kernel

On Tue, 2004-08-24 at 17:47, Andrew Morton wrote:
> Peter Osterlund <petero2@telia.com> wrote:
> > 
> > Why is it not nice? If the VM has decided to create 400MB of dirty
> > data on a DVD+RW packet device, I don't see a problem with submitting
> > all bio's at the same time to the packet device.
> 
> We also have a limit on the number of in-flight requests:
> /sys/block/sda/queue/nr_requests.  It defaults to 128.
> 
> Are you saying that your requests are so huge that each one has 1000 BIOs? 
> That would be odd, for an IDE interface.
> 

This defaults to 8192 on my (IDE) system.  IIRC this value is larger if
48-bit addressing is in use (drive size > ~128GB).  It does not seem
right to me that the size of your hard drive should dictate the amount
of I/O allowed to be in flight.

Lee


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Speed up the cdrw packet writing driver
  2004-08-24 21:47         ` Andrew Morton
  2004-08-24 21:57           ` Lee Revell
@ 2004-08-24 22:03           ` Peter Osterlund
  2004-08-24 22:56             ` Andrew Morton
  1 sibling, 1 reply; 19+ messages in thread
From: Peter Osterlund @ 2004-08-24 22:03 UTC (permalink / raw)
  To: Andrew Morton; +Cc: axboe, linux-kernel

Andrew Morton <akpm@osdl.org> writes:

> Peter Osterlund <petero2@telia.com> wrote:
> >
> > Jens Axboe <axboe@suse.de> writes:
> > 
> > > On Mon, Aug 23 2004, Peter Osterlund wrote:
> > > > Jens Axboe <axboe@suse.de> writes:
> > > > 
> > > > > On Sat, Aug 14 2004, Peter Osterlund wrote:
> > > > > > 
> > > > > > This patch replaces the pd->bio_queue linked list with an rbtree.  The
> > > > > > list can get very long (>200000 entries on a 1GB machine), so keeping
> > > > > > it sorted with a naive algorithm is far too expensive.
> > > > > 
> > > > > It looks like you are assuming that bio->bi_sector is unique which isn't
> > > > > necessarily true. In that respect, list -> rbtree conversion isn't
> > > > > trivial (or, at least it requires extra code to handle this).
> > > > 
> > > > I don't think that is assumed anywhere.
> > > > 
> > > > [...]
> > > 
> > > You are right, the code looks fine indeed. The bigger problem is
> > > probably that a faster data structure is needed at all, having hundreds
> > > of thousands bio's pending for a packet writing device is not nice at
> > > all.
> > 
> > Why is it not nice? If the VM has decided to create 400MB of dirty
> > data on a DVD+RW packet device, I don't see a problem with submitting
> > all bio's at the same time to the packet device.
> 
> We also have a limit on the number of in-flight requests:
> /sys/block/sda/queue/nr_requests.  It defaults to 128.
> 
> Are you saying that your requests are so huge that each one has 1000 BIOs? 
> That would be odd, for an IDE interface.

No, the thing is that the packet driver doesn't create requests at
all. It stuffs incoming bio's in the rbtree, then the worker thread
collects bio's from the rbtree for the same "zone" on the disc (each
zone is 64KB on a CDRW and 32KB on a DVD). The driver then creates a
new bio with the same size as the zone size and submits it to the real
CD/DVD device.

-- 
Peter Osterlund - petero2@telia.com
http://w1.894.telia.com/~u89404340

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Speed up the cdrw packet writing driver
  2004-08-24 22:03           ` Peter Osterlund
@ 2004-08-24 22:56             ` Andrew Morton
  2004-08-25  5:38               ` Peter Osterlund
  0 siblings, 1 reply; 19+ messages in thread
From: Andrew Morton @ 2004-08-24 22:56 UTC (permalink / raw)
  To: Peter Osterlund; +Cc: axboe, linux-kernel

Peter Osterlund <petero2@telia.com> wrote:
>
> > Are you saying that your requests are so huge that each one has 1000 BIOs? 
> > That would be odd, for an IDE interface.
> 
> No, the thing is that the packet driver doesn't create requests at
> all. It stuffs incoming bio's in the rbtree, then the worker thread
> collects bio's from the rbtree for the same "zone" on the disc (each
> zone is 64kb on a CDRW and 32KB on a DVD). The driver then creates a
> new bio with the same size as the zone size and submits it to the real
> CD/DVD device.

Good lord.  I assume there's a good reason for this?

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Speed up the cdrw packet writing driver
  2004-08-24 22:56             ` Andrew Morton
@ 2004-08-25  5:38               ` Peter Osterlund
  0 siblings, 0 replies; 19+ messages in thread
From: Peter Osterlund @ 2004-08-25  5:38 UTC (permalink / raw)
  To: Andrew Morton; +Cc: axboe, linux-kernel

Andrew Morton <akpm@osdl.org> writes:

> Peter Osterlund <petero2@telia.com> wrote:
> >
> > > Are you saying that your requests are so huge that each one has 1000 BIOs? 
> > > That would be odd, for an IDE interface.
> > 
> > No, the thing is that the packet driver doesn't create requests at
> > all. It stuffs incoming bio's in the rbtree, then the worker thread
> > collects bio's from the rbtree for the same "zone" on the disc (each
> > zone is 64kb on a CDRW and 32KB on a DVD). The driver then creates a
> > new bio with the same size as the zone size and submits it to the real
> > CD/DVD device.
> 
> Good lord.  I assume there's a good reason for this?

The reason is that generic_make_request() takes a bio, not a request.
Loop and md don't deal with "struct request" either, they also work
directly at the bio level.

The reason the driver doesn't process the bios in sequential order
like md and loop seems to do, is because it wants to maximize I/O
performance. With a typical DVD drive, write bandwidth is 5.3MB/s and
seek times when writing are 500ms. Also, there is a big penalty for
not being able to completely fill a "zone", because it means that the
driver will have to read the missing pieces from the disc before being
able to submit the "write bio".

Is there some problem doing what the pktcdvd driver is doing?

-- 
Peter Osterlund - petero2@telia.com
http://w1.894.telia.com/~u89404340

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Speed up the cdrw packet writing driver
  2004-08-24 21:04       ` Peter Osterlund
  2004-08-24 21:47         ` Andrew Morton
@ 2004-08-25  6:50         ` Jens Axboe
  2004-08-28  9:59           ` Peter Osterlund
  1 sibling, 1 reply; 19+ messages in thread
From: Jens Axboe @ 2004-08-25  6:50 UTC (permalink / raw)
  To: Peter Osterlund; +Cc: Andrew Morton, linux-kernel

On Tue, Aug 24 2004, Peter Osterlund wrote:
> Jens Axboe <axboe@suse.de> writes:
> 
> > On Mon, Aug 23 2004, Peter Osterlund wrote:
> > > Jens Axboe <axboe@suse.de> writes:
> > > 
> > > > On Sat, Aug 14 2004, Peter Osterlund wrote:
> > > > > 
> > > > > This patch replaces the pd->bio_queue linked list with an rbtree.  The
> > > > > list can get very long (>200000 entries on a 1GB machine), so keeping
> > > > > it sorted with a naive algorithm is far too expensive.
> > > > 
> > > > It looks like you are assuming that bio->bi_sector is unique which isn't
> > > > necessarily true. In that respect, list -> rbtree conversion isn't
> > > > trivial (or, at least it requires extra code to handle this).
> > > 
> > > I don't think that is assumed anywhere.
> > > 
> > > [...]
> > 
> > You are right, the code looks fine indeed. The bigger problem is
> > probably that a faster data structure is needed at all, having hundreds
> > of thousands bio's pending for a packet writing device is not nice at
> > all.
> 
> Why is it not nice? If the VM has decided to create 400MB of dirty
> data on a DVD+RW packet device, I don't see a problem with submitting
> all bio's at the same time to the packet device.

It's not nice because flushing 400MB of dirty data to the packet device
will take _ages_. That the vm will dirty that much data for you
non-blocking is a problem in itself. It would still be a really good
idea for the packet writing driver to "congest" itself, like it happens
for struct request devices, to prevent build up of these huge queues.

> The situation happened when I dumped >1GB of data to a DVD+RW disc on
> a 1GB RAM machine. For some reason, the number of pending bio's didn't
> go much larger than 200000 (ie 400MB) even though it could probably
> have gone to 800MB without swapping. The machine didn't feel
> unresponsive during this test.

But it very well could have. If you dive into the bio mempool (or the
biovec pool) and end up having most of those reserved entries built up
for execution half an hour from now, you'll stall (or at least hinder)
other processes from getting real work done.

Latencies are horrible, I don't think it makes sense to allow more than
a few handful of pending zone writes in the packet writing driver.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Speed up the cdrw packet writing driver
  2004-08-25  6:50         ` Jens Axboe
@ 2004-08-28  9:59           ` Peter Osterlund
  2004-08-28 13:07             ` Jens Axboe
  0 siblings, 1 reply; 19+ messages in thread
From: Peter Osterlund @ 2004-08-28  9:59 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, linux-kernel

Jens Axboe <axboe@suse.de> writes:

> On Tue, Aug 24 2004, Peter Osterlund wrote:
> > Jens Axboe <axboe@suse.de> writes:
> > 
> > > You are right, the code looks fine indeed. The bigger problem is
> > > probably that a faster data structure is needed at all, having hundreds
> > > of thousands bio's pending for a packet writing device is not nice at
> > > all.
> > 
> > Why is it not nice? If the VM has decided to create 400MB of dirty
> > data on a DVD+RW packet device, I don't see a problem with submitting
> > all bio's at the same time to the packet device.
> 
> It's not nice because flushing 400MB of dirty data to the packet device
> will take _ages_. That the vm will dirty that much data for you
> non-blocking is a problem in itself. It would still be a really good
> idea for the packet writing driver to "congest" itself, like it happens
> for struct request devices, to prevent build up of these huge queues.

That would be easy to do, see patch below.

> > The situation happened when I dumped >1GB of data to a DVD+RW disc on
> > a 1GB RAM machine. For some reason, the number of pending bio's didn't
> > go much larger than 200000 (ie 400MB) even though it could probably
> > have gone to 800MB without swapping. The machine didn't feel
> > unresponsive during this test.
> 
> But it very well could have. If you dive into the bio mempool (or the
> biovec pool) and end up having most of those reserved entries built up
> for execution half an hour from now, you'll stall (or at least hinder)
> other processes from getting real work done.
> 
> Latencies are horrible, I don't think it makes sense to allow more than
> a few handful of pending zone writes in the packet writing driver.

I ran some tests copying files to a 4x DVD+RW disc for different
values of the maximum number of allowed bios in the queue. I used
kernel 2.6.8.1 with the packet writing patches from the -mm tree. All
tests used the udf filesystem.

Test 1: dd-ing 250MB from /dev/zero:

        1024:  	53.2s
        8192:	54.6s
        inf:	51.2s

Test 2: Writing 29 files, total size 120MB (Source files cached in RAM
before the test started):

        1024:	71.6s
        8192:	81.1s
        32768:	52.7s
        49152:	31.4s
        65536:	26.6s
        inf:	27.7s

Test 3: Writing 48 files, total size 196MB (Source files cached in RAM
before the test started):

        65536:	65.8s
        inf:	40.2s

Test 4: Repeat of test 3:

        65536:	67.4s
        inf:	41.8s

The conclusion is that when writing one big file, it doesn't hurt to
limit the number of pending bios, but when writing many files,
limiting the amount of queued data to "only" 128MB makes the write
performance 60% slower.

Note that the reduced I/O performance will also reduce the lifetime of
the disc, because some sectors will be written more often than
necessary.

---

 linux-petero/drivers/block/pktcdvd.c |    3 +++
 1 files changed, 3 insertions(+)

diff -puN drivers/block/pktcdvd.c~packet-congest drivers/block/pktcdvd.c
--- linux/drivers/block/pktcdvd.c~packet-congest	2004-08-28 10:18:48.350080048 +0200
+++ linux-petero/drivers/block/pktcdvd.c	2004-08-28 11:26:32.851181960 +0200
@@ -2159,6 +2159,9 @@ static int pkt_make_request(request_queu
 		goto end_io;
 	}
 
+	while (pd->bio_queue_size >= 8192)
+		blk_congestion_wait(WRITE, HZ / 5);
+
 	blk_queue_bounce(q, &bio);
 
 	zone = ZONE(bio->bi_sector, pd);
_

-- 
Peter Osterlund - petero2@telia.com
http://w1.894.telia.com/~u89404340

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Speed up the cdrw packet writing driver
  2004-08-28  9:59           ` Peter Osterlund
@ 2004-08-28 13:07             ` Jens Axboe
  2004-08-28 18:42               ` Peter Osterlund
  0 siblings, 1 reply; 19+ messages in thread
From: Jens Axboe @ 2004-08-28 13:07 UTC (permalink / raw)
  To: Peter Osterlund; +Cc: Andrew Morton, linux-kernel

On Sat, Aug 28 2004, Peter Osterlund wrote:
> > > The situation happened when I dumped >1GB of data to a DVD+RW disc on
> > > a 1GB RAM machine. For some reason, the number of pending bio's didn't
> > > go much larger than 200000 (ie 400MB) even though it could probably
> > > have gone to 800MB without swapping. The machine didn't feel
> > > unresponsive during this test.
> > 
> > But it very well could have. If you dive into the bio mempool (or the
> > biovec pool) and end up having most of those reserved entries built up
> > for execution half an hour from now, you'll stall (or at least hinder)
> > other processes from getting real work done.
> > 
> > Latencies are horrible, I don't think it makes sense to allow more than
> > a few handful of pending zone writes in the packet writing driver.
> 
> I ran some tests copying files to a 4x DVD+RW disc for different
> values of the maximum number of allowed bios in the queue. I used
> kernel 2.6.8.1 with the packet writing patches from the -mm tree. All
> tests used the udf filesystem.
> 
> Test 1: dd-ing 250MB from /dev/zero:
> 
>         1024:  	53.2s
>         8192:	54.6s
>         inf:	51.2s

Out of how many runs did you average those numbers? Look close enough to
be normal variance, so not conclusive I'd say.

> Test 2: Writing 29 files, total size 120MB (Source files cached in RAM
> before the test started):
> 
>         1024:	71.6s
>         8192:	81.1s
>         32768:	52.7s
>         49152:	31.4s
>         65536:	26.6s
>         inf:	27.7s
> 
> Test 3: Writing 48 files, total size 196MB (Source files cached in RAM
> before the test started):
> 
>         65536:	65.8s
>         inf:	40.2s
> 
> Test 4: Repeat of test 3:
> 
>         65536:	67.4s
>         inf:	41.8s
> 
> The conclusion is that when writing one big file, it doesn't hurt to
> limit the number of pending bios, but when writing many files,
> limiting the amount of queued data to "only" 128MB makes the write
> performance 60% slower.

I'd rather say that the above is indicative of a layout (or other)
problem with the file system. 64K bio queue must be at least 256MB
pending in itself, how can inf even make a difference there? You would
gain so much more performance by fixing the fs instead of attempting to
build up hundreds of megabytes of pending data, I think that's a really
bad idea (not just a suboptimal solution, also extremely bad for the
rest of the system).

> Note that the reduced I/O performance will also reduce the lifetime of
> the disc, because some sectors will be written more often than
> necessary.

Surely, yes.

> +	while (pd->bio_queue_size >= 8192)
> +		blk_congestion_wait(WRITE, HZ / 5);
> +

This is not really how you'd want to do it. When you reach that
threshold, mark the queue congested and get pdflush to leave you alone
instead. Then clear the congestion when you get below a certain number
again. For this test I don't think it should make a difference, though.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Speed up the cdrw packet writing driver
  2004-08-28 13:07             ` Jens Axboe
@ 2004-08-28 18:42               ` Peter Osterlund
  2004-08-28 19:45                 ` Andrew Morton
  0 siblings, 1 reply; 19+ messages in thread
From: Peter Osterlund @ 2004-08-28 18:42 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Andrew Morton, linux-kernel

Jens Axboe <axboe@suse.de> writes:

> On Sat, Aug 28 2004, Peter Osterlund wrote:
> > > > The situation happened when I dumped >1GB of data to a DVD+RW disc on
> > > > a 1GB RAM machine. For some reason, the number of pending bio's didn't
> > > > go much larger than 200000 (ie 400MB) even though it could probably
> > > > have gone to 800MB without swapping. The machine didn't feel
> > > > unresponsive during this test.
> > > 
> > > But it very well could have. If you dive into the bio mempool (or the
> > > biovec pool) and end up having most of those reserved entries built up
> > > for execution half an hour from now, you'll stall (or at least hinder)
> > > other processes from getting real work done.
> > > 
> > > Latencies are horrible, I don't think it makes sense to allow more than
> > > a few handful of pending zone writes in the packet writing driver.
> > 
> > I ran some tests copying files to a 4x DVD+RW disc for different
> > values of the maximum number of allowed bios in the queue. I used
> > kernel 2.6.8.1 with the packet writing patches from the -mm tree. All
> > tests used the udf filesystem.
> > 
> > Test 1: dd-ing 250MB from /dev/zero:
> > 
> >         1024:  	53.2s
> >         8192:	54.6s
> >         inf:	51.2s
> 
> Out of how many runs did you average those numbers? Look close enough to
> be normal variance, so not conclusive I'd say.

One run only. I agree the difference is probably not significant,
that's why I said below that for one big file, limiting the queue size
doesn't hurt.

> > Test 2: Writing 29 files, total size 120MB (Source files cached in RAM
> > before the test started):
> > 
> >         1024:	71.6s
> >         8192:	81.1s
> >         32768:	52.7s
> >         49152:	31.4s
> >         65536:	26.6s
> >         inf:	27.7s
> > 
> > Test 3: Writing 48 files, total size 196MB (Source files cached in RAM
> > before the test started):
> > 
> >         65536:	65.8s
> >         inf:	40.2s
> > 
> > Test 4: Repeat of test 3:
> > 
> >         65536:	67.4s
> >         inf:	41.8s
> > 
> > The conclusion is that when writing one big file, it doesn't hurt to
> > limit the number of pending bios, but when writing many files,
> > limiting the amount of queued data to "only" 128MB makes the write
> > performance 60% slower.
> 
> I'd rather say that the above is indicative of a layout (or other)
> problem with the file system. 64K bio queue must be at least 256MB
> pending in itself, how can inf even make a difference there?

Because 64K bio queue is 128MB. Each bio is 2KB when using the udf
filesystem.

> You would gain so much more performance by fixing the fs

I reran the 48 files test using ext2 instead of udf and got this:

Test 5: (48 files, 196MB, ext2)

        1024:   169.7s
        2048:   110.4s
        2048:   144.4s
        4096:   76.8s
        4096:   71.4s
        inf:    59.9s
        inf:    57.0s
        inf:    68.9s

A typical bio seems to be 32KB when using ext2. In the "inf" tests,
the number of queued bios didn't go above 6500. Performance suffers a
lot for ext2 too, although not as bad as for udf when allowing 128MB
in the queue (ie 4096 bios).

The reason ext2 is slower than udf in the "inf" case appears to be
because it reads more from the disc than udf does in these tests.
(Not read gathering from the packet driver, but reads originating from
the ext2 file system.)

> instead of attempting to build up hundreds of megabytes of pending
> data,

Note that the packet writing driver only allocates one rb_node per bio
in the writeout paths, it doesn't allocate any bio or bio_vec
objects. The bio objects are allocated by higher layers before they
call generic_make_request().

> I think that's a really bad idea (not just a suboptimal solution,
> also extremely bad for the rest of the system).

With an otherwise idle system, running

        dd if=/dev/zero bs=1M count=2048 | pipemeter -a -i 2 >tmpfile

on an IDE disk produces 40-45MB/s. 

When running two such commands writing both to the HD and the DVD at
the same time, I get: (The DVD and the HD are on different IDE
channels)

        No queue limit: 4-5MB/s to the DVD, 5-6MB/s to the HD.
        max 4096 bios:  ~4MB/s to the DVD, 8.4MB/s average to the HD.

However, with the 4096 bio limit, the HD writing speed is very
irregular. It blocks for ~30s, then writes data at 30-45MB/s for a
short time, then blocks again, etc. I didn't observe this behavior
when there was no bio queue limit. The DVD writing speed was pretty
stable in both cases, ie the bio queue stayed at ~200000 and 4096
respectively.

Is this a general VM limitation? Has anyone been able to saturate the
write bandwidth of two different block devices at the same time, when
they operate at vastly different speeds (45MB/s vs 5MB/s), and when
the writes are large enough to cause memory pressure?

> > +	while (pd->bio_queue_size >= 8192)
> > +		blk_congestion_wait(WRITE, HZ / 5);
> > +
> 
> This is not really how you'd want to do it. When you reach that
> threshold, mark the queue congested and get pdflush to leave you alone
> instead. Then clear the congestion when you get below a certain number
> again. For this test I don't think it should make a difference, though.

I tried the patch below, then ran:

        dd if=/dev/zero bs=1M count=2048 | pipemeter -a -i 2 >/udf/tmpfile

The queue length stayed below 4096 until there was memory pressure.
The queue then became very long (>200000) and the driver reported:

        pkt_make_request: Process pipemeter didn't honor congestion

Maybe some code path forgets to check the queue congestion state.


 linux-petero/drivers/block/ll_rw_blk.c |    6 ++++--
 linux-petero/drivers/block/pktcdvd.c   |   14 ++++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff -puN drivers/block/pktcdvd.c~packet-congest drivers/block/pktcdvd.c
--- linux/drivers/block/pktcdvd.c~packet-congest	2004-08-28 10:18:48.000000000 +0200
+++ linux-petero/drivers/block/pktcdvd.c	2004-08-28 19:49:44.168377544 +0200
@@ -51,6 +51,10 @@
 
 #include <asm/uaccess.h>
 
+void clear_queue_congested(request_queue_t *q, int rw);
+void set_queue_congested(request_queue_t *q, int rw);
+
+
 #if PACKET_DEBUG
 #define DPRINTK(fmt, args...) printk(KERN_NOTICE fmt, ##args)
 #else
@@ -981,6 +985,9 @@ try_next_bio:
 	}
 	spin_unlock(&pd->lock);
 
+	if (pd->bio_queue_size < 3*1024)
+		clear_queue_congested(pd->disk->queue, WRITE);
+
 	pkt->sleep_time = max(PACKET_WAIT_TIME, 1);
 	pkt_set_state(pkt, PACKET_WAITING_STATE);
 	atomic_set(&pkt->run_sm, 1);
@@ -2159,6 +2166,13 @@ static int pkt_make_request(request_queu
 		goto end_io;
 	}
 
+	if (pd->bio_queue_size >= 4096)
+		set_queue_congested(q, WRITE);
+
+	if (pd->bio_queue_size >= 5*1024)
+		printk("pkt_make_request: Process %s didn't honor congestion\n",
+		       current->comm);
+
 	blk_queue_bounce(q, &bio);
 
 	zone = ZONE(bio->bi_sector, pd);
diff -puN drivers/block/ll_rw_blk.c~packet-congest drivers/block/ll_rw_blk.c
--- linux/drivers/block/ll_rw_blk.c~packet-congest	2004-08-28 19:27:46.000000000 +0200
+++ linux-petero/drivers/block/ll_rw_blk.c	2004-08-28 19:28:31.000000000 +0200
@@ -111,7 +111,7 @@ static void blk_queue_congestion_thresho
  * congested queues, and wake up anyone who was waiting for requests to be
  * put back.
  */
-static void clear_queue_congested(request_queue_t *q, int rw)
+void clear_queue_congested(request_queue_t *q, int rw)
 {
 	enum bdi_state bit;
 	wait_queue_head_t *wqh = &congestion_wqh[rw];
@@ -122,18 +122,20 @@ static void clear_queue_congested(reques
 	if (waitqueue_active(wqh))
 		wake_up(wqh);
 }
+EXPORT_SYMBOL(clear_queue_congested);
 
 /*
  * A queue has just entered congestion.  Flag that in the queue's VM-visible
  * state flags and increment the global gounter of congested queues.
  */
-static void set_queue_congested(request_queue_t *q, int rw)
+void set_queue_congested(request_queue_t *q, int rw)
 {
 	enum bdi_state bit;
 
 	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
 	set_bit(bit, &q->backing_dev_info.state);
 }
+EXPORT_SYMBOL(set_queue_congested);
 
 /**
  * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
_

-- 
Peter Osterlund - petero2@telia.com
http://w1.894.telia.com/~u89404340

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Speed up the cdrw packet writing driver
  2004-08-28 18:42               ` Peter Osterlund
@ 2004-08-28 19:45                 ` Andrew Morton
  2004-08-28 20:57                   ` Peter Osterlund
  0 siblings, 1 reply; 19+ messages in thread
From: Andrew Morton @ 2004-08-28 19:45 UTC (permalink / raw)
  To: Peter Osterlund; +Cc: axboe, linux-kernel

Peter Osterlund <petero2@telia.com> wrote:
>
> Is this a general VM limitation? Has anyone been able to saturate the
>  write bandwidth of two different block devices at the same time, when
>  they operate at vastly different speeds (45MB/s vs 5MB/s), and when
>  the writes are large enough to cause memory pressure?

I haven't explicitly tested the pdflush code in a while, and I never tested
on devices with such disparate bandwidth.  But it _should_ work.

The basic design of the pdflush writeback path is:

	for ( ; ; ) {
		for (each superblock) {
			if (no pdflush thread is working this sb's queue &&
			    the superblock's backingdev is not congested) {
				do some writeout, up to congestion, trying
				to not block on request queue exhaustion
			}
		}
		blk_congestion_wait()
	}

So it basically spins around all the queues keeping them full in a
non-blocking manner.

There _are_ times when pdflush will accidentally block.  Say, doing a
metadata read.  In that case other pdflush instances will keep other queues
busy.

I tested it up to 12 disks.  Works OK.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Speed up the cdrw packet writing driver
  2004-08-28 19:45                 ` Andrew Morton
@ 2004-08-28 20:57                   ` Peter Osterlund
  2004-08-28 21:23                     ` Andrew Morton
  0 siblings, 1 reply; 19+ messages in thread
From: Peter Osterlund @ 2004-08-28 20:57 UTC (permalink / raw)
  To: Andrew Morton; +Cc: axboe, linux-kernel

Andrew Morton <akpm@osdl.org> writes:

> Peter Osterlund <petero2@telia.com> wrote:
> >
> > Is this a general VM limitation? Has anyone been able to saturate the
> >  write bandwidth of two different block devices at the same time, when
> >  they operate at vastly different speeds (45MB/s vs 5MB/s), and when
> >  the writes are large enough to cause memory pressure?
> 
> I haven't explicitly tested the pdflush code in a while, and I never tested
> on devices with such disparate bandwidth.  But it _should_ work.
> 
> The basic design of the pdflush writeback path is:
> 
> 	for ( ; ; ) {
> 		for (each superblock) {
> 			if (no pdflush thread is working this sb's queue &&
> 			    the superblock's backingdev is not congested) {
> 				do some writeout, up to congestion, trying
> 				to not block on request queue exhaustion
> 			}
> 		}
> 		blk_congestion_wait()
> 	}
> 
> So it basically spins around all the queues keeping them full in a
> non-blocking manner.
> 
> There _are_ times when pdflush will accidentally block.  Say, doing a
> metadata read.  In that case other pdflush instances will keep other queues
> busy.
> 
> I tested it up to 12 disks.  Works OK.

OK, this should make sure that dirty data is flushed as fast as the
disks can handle, but is there anything that makes sure there will be
enough dirty data to flush for each disk?

Assume there are two processes writing one file each to two different
block devices. To be able to dirty more data a process will have to
allocate a page, and a page becomes available whenever one of the
disks finishes an I/O operation. If both processes have a 50/50 chance
to get the freed page, they will dirty data equally fast once steady
state has been reached, even if the block devices have very different
write bandwidths.

-- 
Peter Osterlund - petero2@telia.com
http://w1.894.telia.com/~u89404340

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Speed up the cdrw packet writing driver
  2004-08-28 20:57                   ` Peter Osterlund
@ 2004-08-28 21:23                     ` Andrew Morton
  2004-08-29 12:17                       ` Peter Osterlund
  0 siblings, 1 reply; 19+ messages in thread
From: Andrew Morton @ 2004-08-28 21:23 UTC (permalink / raw)
  To: Peter Osterlund; +Cc: axboe, linux-kernel

Peter Osterlund <petero2@telia.com> wrote:
>
> > So it basically spins around all the queues keeping them full in a
>  > non-blocking manner.
>  > 
>  > There _are_ times when pdflush will accidentally block.  Say, doing a
>  > metadata read.  In that case other pdflush instances will keep other queues
>  > busy.
>  > 
>  > I tested it up to 12 disks.  Works OK.
> 
>  OK, this should make sure that dirty data is flushed as fast as the
>  disks can handle, but is there anything that makes sure there will be
>  enough dirty data to flush for each disk?
> 
>  Assume there are two processes writing one file each to two different
>  block devices. To be able to dirty more data a process will have to
>  allocate a page, and a page becomes available whenever one of the
>  disks finishes an I/O operation. If both processes have a 50/50 chance
>  to get the freed page, they will dirty data equally fast once steady
>  state has been reached, even if the block devices have very different
>  write bandwidths.

hm.

Page allocation is fairly decoupled from the dirty memory thresholds.  The
process which wants to write to the fast disk will skirt around the pages
which are dirty against the slow disk and will reclaim clean pagecache
instead.  So the writer to the fast disk _should_ be able to allocate pages
sufficiently quickly.

The balance_dirty_pages() logic doesn't care (or know) about whether the
pages are dirty against a slow device or a fast one - it just keeps the
queues full while clamping the amount of dirty memory.  I do think that
it's possible for the writer to the fast device to get blocked in
balance_dirty_pages() due to there being lots of dirty pages against a slow
device.

Or not - the fact that the fast device retires writes more quickly will
cause waiters in balance_dirty_pages() to unblock promptly.

Or not not - we'll probably get into the situation where almost all of the
dirty pages are dirty against the slower device.

hm.  It needs some verification testing.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Speed up the cdrw packet writing driver
  2004-08-28 21:23                     ` Andrew Morton
@ 2004-08-29 12:17                       ` Peter Osterlund
  0 siblings, 0 replies; 19+ messages in thread
From: Peter Osterlund @ 2004-08-29 12:17 UTC (permalink / raw)
  To: Andrew Morton; +Cc: axboe, linux-kernel

Andrew Morton <akpm@osdl.org> writes:

> Peter Osterlund <petero2@telia.com> wrote:
> >
> >  OK, this should make sure that dirty data is flushed as fast as the
> >  disks can handle, but is there anything that makes sure there will be
> >  enough dirty data to flush for each disk?
> 
> Page allocation is fairly decoupled from the dirty memory thresholds.  The
> process which wants to write to the fast disk will skirt around the pages
> which are dirty against the slow disk and will reclaim clean pagecache
> instead.  So the writer to the fast disk _should_ be able to allocate pages
> sufficiently quickly.
> 
> The balance_dirty_pages() logic doesn't care (or know) about whether the
> pages are dirty against a slow device or a fast one - it just keeps the
> queues full while clamping the amount of dirty memory.  I do think that
> it's possible for the writer to the fast device to get blocked in
> balance_dirty_pages() due to there being lots of dirty pages against a slow
> device.
> 
> Or not - the fact that the fast device retires writes more quickly will
> cause waiters in balance_dirty_pages() to unblock promptly.
> 
> Or not not - we'll probably get into the situation where almost all of the
> dirty pages are dirty against the slower device.
> 
> hm.  It needs some verification testing.

I ran some more tests with the patch below, which marks the queue
congested as Jens suggested. I started a dd process writing to the DVD
and got a stable 5.3MB/s writing speed. Then I waited until 400MB
memory was dirty and started a second dd process writing to the hard
disk. This dd process now writes at 45MB/s. The bio queue in the
packet driver stays at ~4000 entries during the test, so I think this
proves that the VM *can* handle two devices well even though they have
very different write bandwidths.

However, all is not well, because write performance on the DVD goes to
~200kb/s when I start the second dd process (because of lots of read
gathering in the packet driver) and as I noted in an earlier mail,
write performance is also bad if many files are written, even when
there are no concurrent writes to the hard disk.

If I dd directly to /dev/pktcdvd/0 instead of a file on a udf or ext2
filesystem, I don't see the DVD write speed slowdown when I start the
second dd process that writes to the hard disk. For some reason, dirty
data is flushed in a suboptimal order when the udf or ext2 filesystems
are involved.

---

 linux-petero/drivers/block/ll_rw_blk.c |    6 ++++--
 linux-petero/drivers/block/pktcdvd.c   |    8 ++++++++
 linux-petero/include/linux/blkdev.h    |    2 ++
 3 files changed, 14 insertions(+), 2 deletions(-)

diff -puN drivers/block/pktcdvd.c~packet-congest drivers/block/pktcdvd.c
--- linux/drivers/block/pktcdvd.c~packet-congest	2004-08-28 10:18:48.000000000 +0200
+++ linux-petero/drivers/block/pktcdvd.c	2004-08-29 13:21:20.000000000 +0200
@@ -981,6 +981,9 @@ try_next_bio:
 	}
 	spin_unlock(&pd->lock);
 
+	if (pd->bio_queue_size < 3600)
+		clear_queue_congested(pd->disk->queue, WRITE);
+
 	pkt->sleep_time = max(PACKET_WAIT_TIME, 1);
 	pkt_set_state(pkt, PACKET_WAITING_STATE);
 	atomic_set(&pkt->run_sm, 1);
@@ -2159,6 +2162,11 @@ static int pkt_make_request(request_queu
 		goto end_io;
 	}
 
+	if (pd->bio_queue_size >= 4096)
+		set_queue_congested(q, WRITE);
+	while (pd->bio_queue_size >= 4200)
+		blk_congestion_wait(WRITE, HZ / 10);
+
 	blk_queue_bounce(q, &bio);
 
 	zone = ZONE(bio->bi_sector, pd);
diff -puN drivers/block/ll_rw_blk.c~packet-congest drivers/block/ll_rw_blk.c
--- linux/drivers/block/ll_rw_blk.c~packet-congest	2004-08-28 19:27:46.000000000 +0200
+++ linux-petero/drivers/block/ll_rw_blk.c	2004-08-28 19:28:31.000000000 +0200
@@ -111,7 +111,7 @@ static void blk_queue_congestion_thresho
  * congested queues, and wake up anyone who was waiting for requests to be
  * put back.
  */
-static void clear_queue_congested(request_queue_t *q, int rw)
+void clear_queue_congested(request_queue_t *q, int rw)
 {
 	enum bdi_state bit;
 	wait_queue_head_t *wqh = &congestion_wqh[rw];
@@ -122,18 +122,20 @@ static void clear_queue_congested(reques
 	if (waitqueue_active(wqh))
 		wake_up(wqh);
 }
+EXPORT_SYMBOL(clear_queue_congested);
 
 /*
  * A queue has just entered congestion.  Flag that in the queue's VM-visible
  * state flags and increment the global gounter of congested queues.
  */
-static void set_queue_congested(request_queue_t *q, int rw)
+void set_queue_congested(request_queue_t *q, int rw)
 {
 	enum bdi_state bit;
 
 	bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested;
 	set_bit(bit, &q->backing_dev_info.state);
 }
+EXPORT_SYMBOL(set_queue_congested);
 
 /**
  * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
diff -puN include/linux/blkdev.h~packet-congest include/linux/blkdev.h
--- linux/include/linux/blkdev.h~packet-congest	2004-08-29 13:08:50.000000000 +0200
+++ linux-petero/include/linux/blkdev.h	2004-08-29 13:11:30.000000000 +0200
@@ -518,6 +518,8 @@ extern void blk_recount_segments(request
 extern int blk_phys_contig_segment(request_queue_t *q, struct bio *, struct bio *);
 extern int blk_hw_contig_segment(request_queue_t *q, struct bio *, struct bio *);
 extern int scsi_cmd_ioctl(struct file *, struct gendisk *, unsigned int, void __user *);
+extern void clear_queue_congested(request_queue_t *q, int rw);
+extern void set_queue_congested(request_queue_t *q, int rw);
 extern void blk_start_queue(request_queue_t *q);
 extern void blk_stop_queue(request_queue_t *q);
 extern void __blk_stop_queue(request_queue_t *q);
_

-- 
Peter Osterlund - petero2@telia.com
http://w1.894.telia.com/~u89404340

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Speed up the cdrw packet writing driver
  2004-08-24 21:57           ` Lee Revell
@ 2004-08-29 13:52             ` Alan Cox
  0 siblings, 0 replies; 19+ messages in thread
From: Alan Cox @ 2004-08-29 13:52 UTC (permalink / raw)
  To: Lee Revell
  Cc: Andrew Morton, Peter Osterlund, axboe, Linux Kernel Mailing List

On Maw, 2004-08-24 at 22:57, Lee Revell wrote:
> This defaults to 8192 on my (IDE) system.  IIRC this value is larger if
> 48-bit addressing is in use (drive size > ~128GB).  It does not seem
> right to me that the size of your hard drive should dictate the amount
> of I/O allowed to be in flight.

The largest I/O allowed in LBA28 mode is 256 sectors, and in LBA48
rather larger. Its a limit of the command functionality. The tradeoff
the other way is that LBA28 commands are slightly faster to issue.


^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2004-08-29 14:55 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-08-14 19:13 [PATCH] Speed up the cdrw packet writing driver Peter Osterlund
2004-08-23 11:43 ` Jens Axboe
2004-08-23 16:07   ` Peter Osterlund
2004-08-24 20:29     ` Jens Axboe
2004-08-24 21:04       ` Peter Osterlund
2004-08-24 21:47         ` Andrew Morton
2004-08-24 21:57           ` Lee Revell
2004-08-29 13:52             ` Alan Cox
2004-08-24 22:03           ` Peter Osterlund
2004-08-24 22:56             ` Andrew Morton
2004-08-25  5:38               ` Peter Osterlund
2004-08-25  6:50         ` Jens Axboe
2004-08-28  9:59           ` Peter Osterlund
2004-08-28 13:07             ` Jens Axboe
2004-08-28 18:42               ` Peter Osterlund
2004-08-28 19:45                 ` Andrew Morton
2004-08-28 20:57                   ` Peter Osterlund
2004-08-28 21:23                     ` Andrew Morton
2004-08-29 12:17                       ` Peter Osterlund

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox