Re: Horrible drive performance under concurrent i/o jobs (dlh problem?)

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

From: Andrew Morton <akpm@digeo.com>
To: Torben Frey <kernel@mailsammler.de>
Cc: Con Kolivas <conman@kolivas.net>,
	linux kernel mailing list <linux-kernel@vger.kernel.org>
Subject: Re: Horrible drive performance under concurrent i/o jobs (dlh problem?)
Date: Wed, 18 Dec 2002 14:37:08 -0800	[thread overview]
Message-ID: <3E00F894.BDAB4E05@digeo.com> (raw)
In-Reply-To: 3E00F3B4.7050209@mailsammler.de

Torben Frey wrote:
> 
> Hi Con,
> 
> thanks for your fast reply. Unfortunately - I cannot patch a vanilla
> 2.4.20 kernel using patch -p1. The first hunk fails, the other ones are
> found with offsets or even fuzz. Although I applied the first hunk
> manually, compiling fails.
> 
> Do I need the other patches, too? Or a special version of the kernel?
> 

Here's a diff against base 2.4.20.  It may be a little out of date
wrt Andrea's latest but it should tell us if we're looking in the
right place.

I doubt it though.  This problem will be exceedingly rare on kernels
which do not have a voluntary scheduling point in submit_bh().  SMP and
preemptible kernels will hit it, but rarely.

So please try this patch.  Also it would be interesting to know if
read activity against the device fixes the problem.  Try doing a

	cat /dev/sda1 > /dev/null

and see if that unjams things.  If so then yes, it's a queue unplugging
problem.


 drivers/block/ll_rw_blk.c |   25 ++++++++++++++++++++-----
 fs/buffer.c               |   22 +++++++++++++++++++++-
 fs/reiserfs/inode.c       |    1 +
 include/linux/pagemap.h   |    2 ++
 kernel/ksyms.c            |    1 +
 mm/filemap.c              |   14 ++++++++++++++
 6 files changed, 59 insertions(+), 6 deletions(-)

--- 24/drivers/block/ll_rw_blk.c~fix-pausing	Wed Dec 18 14:32:06 2002
+++ 24-akpm/drivers/block/ll_rw_blk.c	Wed Dec 18 14:32:06 2002
@@ -590,12 +590,20 @@ static struct request *__get_request_wai
 	register struct request *rq;
 	DECLARE_WAITQUEUE(wait, current);
 
-	generic_unplug_device(q);
 	add_wait_queue_exclusive(&q->wait_for_requests[rw], &wait);
 	do {
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (q->rq[rw].count == 0)
+		if (q->rq[rw].count == 0) {
+			/*
+			 * All we care about is not to stall if any request
+			 * is been released after we set TASK_UNINTERRUPTIBLE.
+			 * This is the most efficient place to unplug the queue
+			 * in case we hit the race and we can get the request
+			 * without waiting.
+			 */
+			generic_unplug_device(q);
 			schedule();
+		}
 		spin_lock_irq(&io_request_lock);
 		rq = get_request(q, rw);
 		spin_unlock_irq(&io_request_lock);
@@ -829,9 +837,11 @@ void blkdev_release_request(struct reque
 	 */
 	if (q) {
 		list_add(&req->queue, &q->rq[rw].free);
-		if (++q->rq[rw].count >= q->batch_requests &&
-				waitqueue_active(&q->wait_for_requests[rw]))
-			wake_up(&q->wait_for_requests[rw]);
+		if (++q->rq[rw].count >= q->batch_requests) {
+			smp_mb();
+			if (waitqueue_active(&q->wait_for_requests[rw]))
+				wake_up(&q->wait_for_requests[rw]);
+		}
 	}
 }
 
@@ -1200,6 +1210,11 @@ void submit_bh(int rw, struct buffer_hea
 
 	generic_make_request(rw, bh);
 
+	/* fix race condition with wait_on_buffer() */
+	smp_mb(); /* spin_unlock may have inclusive semantics */
+	if (waitqueue_active(&bh->b_wait))
+		wake_up(&bh->b_wait);
+
 	switch (rw) {
 		case WRITE:
 			kstat.pgpgout += count;
--- 24/fs/buffer.c~fix-pausing	Wed Dec 18 14:32:06 2002
+++ 24-akpm/fs/buffer.c	Wed Dec 18 14:32:06 2002
@@ -153,10 +153,23 @@ void __wait_on_buffer(struct buffer_head
 	get_bh(bh);
 	add_wait_queue(&bh->b_wait, &wait);
 	do {
-		run_task_queue(&tq_disk);
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 		if (!buffer_locked(bh))
 			break;
+		/*
+		 * We must read tq_disk in TQ_ACTIVE after the
+		 * add_wait_queue effect is visible to other cpus.
+		 * We could unplug some line above it wouldn't matter
+		 * but we can't do that right after add_wait_queue
+		 * without an smp_mb() in between because spin_unlock
+		 * has inclusive semantics.
+		 * Doing it here is the most efficient place so we
+		 * don't do a suprious unplug if we get a racy
+		 * wakeup that make buffer_locked to return 0, and
+		 * doing it here avoids an explicit smp_mb() we
+		 * rely on the implicit one in set_task_state.
+		 */
+		run_task_queue(&tq_disk);
 		schedule();
 	} while (buffer_locked(bh));
 	tsk->state = TASK_RUNNING;
@@ -1512,6 +1525,9 @@ static int __block_write_full_page(struc
 
 	/* Done - end_buffer_io_async will unlock */
 	SetPageUptodate(page);
+
+	wakeup_page_waiters(page);
+
 	return 0;
 
 out:
@@ -1543,6 +1559,7 @@ out:
 	} while (bh != head);
 	if (need_unlock)
 		UnlockPage(page);
+	wakeup_page_waiters(page);
 	return err;
 }
 
@@ -1770,6 +1787,8 @@ int block_read_full_page(struct page *pa
 		else
 			submit_bh(READ, bh);
 	}
+
+	wakeup_page_waiters(page);
 	
 	return 0;
 }
@@ -2383,6 +2402,7 @@ int brw_page(int rw, struct page *page, 
 		submit_bh(rw, bh);
 		bh = next;
 	} while (bh != head);
+	wakeup_page_waiters(page);
 	return 0;
 }
 
--- 24/fs/reiserfs/inode.c~fix-pausing	Wed Dec 18 14:32:06 2002
+++ 24-akpm/fs/reiserfs/inode.c	Wed Dec 18 14:32:06 2002
@@ -1993,6 +1993,7 @@ static int reiserfs_write_full_page(stru
     */
     if (nr) {
         submit_bh_for_writepage(arr, nr) ;
+	wakeup_page_waiters(page);
     } else {
         UnlockPage(page) ;
     }
--- 24/include/linux/pagemap.h~fix-pausing	Wed Dec 18 14:32:06 2002
+++ 24-akpm/include/linux/pagemap.h	Wed Dec 18 14:32:06 2002
@@ -97,6 +97,8 @@ static inline void wait_on_page(struct p
 		___wait_on_page(page);
 }
 
+extern void wakeup_page_waiters(struct page * page);
+
 /*
  * Returns locked page at given index in given cache, creating it if needed.
  */
--- 24/kernel/ksyms.c~fix-pausing	Wed Dec 18 14:32:06 2002
+++ 24-akpm/kernel/ksyms.c	Wed Dec 18 14:32:06 2002
@@ -293,6 +293,7 @@ EXPORT_SYMBOL(filemap_fdatasync);
 EXPORT_SYMBOL(filemap_fdatawait);
 EXPORT_SYMBOL(lock_page);
 EXPORT_SYMBOL(unlock_page);
+EXPORT_SYMBOL(wakeup_page_waiters);
 
 /* device registration */
 EXPORT_SYMBOL(register_chrdev);
--- 24/mm/filemap.c~fix-pausing	Wed Dec 18 14:32:06 2002
+++ 24-akpm/mm/filemap.c	Wed Dec 18 14:32:06 2002
@@ -909,6 +909,20 @@ void lock_page(struct page *page)
 }
 
 /*
+ * This must be called after every submit_bh with end_io
+ * callbacks that would result into the blkdev layer waking
+ * up the page after a queue unplug.
+ */
+void wakeup_page_waiters(struct page * page)
+{
+	wait_queue_head_t * head;
+
+	head = page_waitqueue(page);
+	if (waitqueue_active(head))
+		wake_up(head);
+}
+
+/*
  * a rather lightweight function, finding and getting a reference to a
  * hashed page atomically.
  */

_

next prev parent reply	other threads:[~2002-12-18 22:29 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2002-12-18 21:10 Horrible drive performance under concurrent i/o jobs (dlh problem?) Con Kolivas
2002-12-18 22:16 ` Torben Frey
2002-12-18 22:37   ` Andrew Morton [this message]
2002-12-18 23:30     ` Torben Frey
2002-12-18 23:46       ` Andrew Morton
2002-12-18 22:40   ` Torben Frey
  -- strict thread matches above, loose matches on Subject: below --
2002-12-19 14:29 Torben Frey
2002-12-20  1:47 ` Nuno Silva
2002-12-27 13:04   ` Torben Frey
2002-12-20 14:27 ` Roger Larsson
2002-12-18 19:06 Torben Frey
2002-12-20 20:40 ` Joseph D. Wagner
2002-12-20 22:25   ` David Lang
2002-12-21  6:00     ` Joseph D. Wagner
2002-12-23  1:29       ` David Lang
2002-12-24  9:18       ` Roy Sigurd Karlsbakk
2002-12-24 17:21         ` jw schultz
2002-12-24 21:00           ` Jeremy Fitzhardinge
2002-12-25  1:34           ` Rik van Riel
2002-12-25  2:02           ` jw schultz
2002-12-25  3:41           ` Barry K. Nathan
2002-12-23 17:48   ` Krzysztof Halasa
2002-12-23 18:13 ` Denis Vlasenko

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=3E00F894.BDAB4E05@digeo.com \
    --to=akpm@digeo.com \
    --cc=conman@kolivas.net \
    --cc=kernel@mailsammler.de \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox