From: Andrew Morton <akpm@digeo.com>
To: Torben Frey <kernel@mailsammler.de>
Cc: Con Kolivas <conman@kolivas.net>,
linux kernel mailing list <linux-kernel@vger.kernel.org>
Subject: Re: Horrible drive performance under concurrent i/o jobs (dlh problem?)
Date: Wed, 18 Dec 2002 14:37:08 -0800 [thread overview]
Message-ID: <3E00F894.BDAB4E05@digeo.com> (raw)
In-Reply-To: 3E00F3B4.7050209@mailsammler.de
Torben Frey wrote:
>
> Hi Con,
>
> thanks for your fast reply. Unfortunately - I cannot patch a vanilla
> 2.4.20 kernel using patch -p1. The first hunk fails, the other ones are
> found with offsets or even fuzz. Although I applied the first hunk
> manually, compiling fails.
>
> Do I need the other patches, too? Or a special version of the kernel?
>
Here's a diff against base 2.4.20. It may be a little out of date
wrt Andrea's latest but it should tell us if we're looking in the
right place.
I doubt it though. This problem will be exceedingly rare on kernels
which do not have a voluntary scheduling point in submit_bh(). SMP and
preemptible kernels will hit it, but rarely.
So please try this patch. Also it would be interesting to know if
read activity against the device fixes the problem. Try doing a
cat /dev/sda1 > /dev/null
and see if that unjams things. If so then yes, it's a queue unplugging
problem.
drivers/block/ll_rw_blk.c | 25 ++++++++++++++++++++-----
fs/buffer.c | 22 +++++++++++++++++++++-
fs/reiserfs/inode.c | 1 +
include/linux/pagemap.h | 2 ++
kernel/ksyms.c | 1 +
mm/filemap.c | 14 ++++++++++++++
6 files changed, 59 insertions(+), 6 deletions(-)
--- 24/drivers/block/ll_rw_blk.c~fix-pausing Wed Dec 18 14:32:06 2002
+++ 24-akpm/drivers/block/ll_rw_blk.c Wed Dec 18 14:32:06 2002
@@ -590,12 +590,20 @@ static struct request *__get_request_wai
register struct request *rq;
DECLARE_WAITQUEUE(wait, current);
- generic_unplug_device(q);
add_wait_queue_exclusive(&q->wait_for_requests[rw], &wait);
do {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (q->rq[rw].count == 0)
+ if (q->rq[rw].count == 0) {
+ /*
+ * All we care about is not to stall if any request
+ * is been released after we set TASK_UNINTERRUPTIBLE.
+ * This is the most efficient place to unplug the queue
+ * in case we hit the race and we can get the request
+ * without waiting.
+ */
+ generic_unplug_device(q);
schedule();
+ }
spin_lock_irq(&io_request_lock);
rq = get_request(q, rw);
spin_unlock_irq(&io_request_lock);
@@ -829,9 +837,11 @@ void blkdev_release_request(struct reque
*/
if (q) {
list_add(&req->queue, &q->rq[rw].free);
- if (++q->rq[rw].count >= q->batch_requests &&
- waitqueue_active(&q->wait_for_requests[rw]))
- wake_up(&q->wait_for_requests[rw]);
+ if (++q->rq[rw].count >= q->batch_requests) {
+ smp_mb();
+ if (waitqueue_active(&q->wait_for_requests[rw]))
+ wake_up(&q->wait_for_requests[rw]);
+ }
}
}
@@ -1200,6 +1210,11 @@ void submit_bh(int rw, struct buffer_hea
generic_make_request(rw, bh);
+ /* fix race condition with wait_on_buffer() */
+ smp_mb(); /* spin_unlock may have inclusive semantics */
+ if (waitqueue_active(&bh->b_wait))
+ wake_up(&bh->b_wait);
+
switch (rw) {
case WRITE:
kstat.pgpgout += count;
--- 24/fs/buffer.c~fix-pausing Wed Dec 18 14:32:06 2002
+++ 24-akpm/fs/buffer.c Wed Dec 18 14:32:06 2002
@@ -153,10 +153,23 @@ void __wait_on_buffer(struct buffer_head
get_bh(bh);
add_wait_queue(&bh->b_wait, &wait);
do {
- run_task_queue(&tq_disk);
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
if (!buffer_locked(bh))
break;
+ /*
+ * We must read tq_disk in TQ_ACTIVE after the
+ * add_wait_queue effect is visible to other cpus.
+ * We could unplug some line above it wouldn't matter
+ * but we can't do that right after add_wait_queue
+ * without an smp_mb() in between because spin_unlock
+ * has inclusive semantics.
+ * Doing it here is the most efficient place so we
+ * don't do a suprious unplug if we get a racy
+ * wakeup that make buffer_locked to return 0, and
+ * doing it here avoids an explicit smp_mb() we
+ * rely on the implicit one in set_task_state.
+ */
+ run_task_queue(&tq_disk);
schedule();
} while (buffer_locked(bh));
tsk->state = TASK_RUNNING;
@@ -1512,6 +1525,9 @@ static int __block_write_full_page(struc
/* Done - end_buffer_io_async will unlock */
SetPageUptodate(page);
+
+ wakeup_page_waiters(page);
+
return 0;
out:
@@ -1543,6 +1559,7 @@ out:
} while (bh != head);
if (need_unlock)
UnlockPage(page);
+ wakeup_page_waiters(page);
return err;
}
@@ -1770,6 +1787,8 @@ int block_read_full_page(struct page *pa
else
submit_bh(READ, bh);
}
+
+ wakeup_page_waiters(page);
return 0;
}
@@ -2383,6 +2402,7 @@ int brw_page(int rw, struct page *page,
submit_bh(rw, bh);
bh = next;
} while (bh != head);
+ wakeup_page_waiters(page);
return 0;
}
--- 24/fs/reiserfs/inode.c~fix-pausing Wed Dec 18 14:32:06 2002
+++ 24-akpm/fs/reiserfs/inode.c Wed Dec 18 14:32:06 2002
@@ -1993,6 +1993,7 @@ static int reiserfs_write_full_page(stru
*/
if (nr) {
submit_bh_for_writepage(arr, nr) ;
+ wakeup_page_waiters(page);
} else {
UnlockPage(page) ;
}
--- 24/include/linux/pagemap.h~fix-pausing Wed Dec 18 14:32:06 2002
+++ 24-akpm/include/linux/pagemap.h Wed Dec 18 14:32:06 2002
@@ -97,6 +97,8 @@ static inline void wait_on_page(struct p
___wait_on_page(page);
}
+extern void wakeup_page_waiters(struct page * page);
+
/*
* Returns locked page at given index in given cache, creating it if needed.
*/
--- 24/kernel/ksyms.c~fix-pausing Wed Dec 18 14:32:06 2002
+++ 24-akpm/kernel/ksyms.c Wed Dec 18 14:32:06 2002
@@ -293,6 +293,7 @@ EXPORT_SYMBOL(filemap_fdatasync);
EXPORT_SYMBOL(filemap_fdatawait);
EXPORT_SYMBOL(lock_page);
EXPORT_SYMBOL(unlock_page);
+EXPORT_SYMBOL(wakeup_page_waiters);
/* device registration */
EXPORT_SYMBOL(register_chrdev);
--- 24/mm/filemap.c~fix-pausing Wed Dec 18 14:32:06 2002
+++ 24-akpm/mm/filemap.c Wed Dec 18 14:32:06 2002
@@ -909,6 +909,20 @@ void lock_page(struct page *page)
}
/*
+ * This must be called after every submit_bh with end_io
+ * callbacks that would result into the blkdev layer waking
+ * up the page after a queue unplug.
+ */
+void wakeup_page_waiters(struct page * page)
+{
+ wait_queue_head_t * head;
+
+ head = page_waitqueue(page);
+ if (waitqueue_active(head))
+ wake_up(head);
+}
+
+/*
* a rather lightweight function, finding and getting a reference to a
* hashed page atomically.
*/
_
next prev parent reply other threads:[~2002-12-18 22:29 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2002-12-18 21:10 Horrible drive performance under concurrent i/o jobs (dlh problem?) Con Kolivas
2002-12-18 22:16 ` Torben Frey
2002-12-18 22:37 ` Andrew Morton [this message]
2002-12-18 23:30 ` Torben Frey
2002-12-18 23:46 ` Andrew Morton
2002-12-18 22:40 ` Torben Frey
-- strict thread matches above, loose matches on Subject: below --
2002-12-19 14:29 Torben Frey
2002-12-20 1:47 ` Nuno Silva
2002-12-27 13:04 ` Torben Frey
2002-12-20 14:27 ` Roger Larsson
2002-12-18 19:06 Torben Frey
2002-12-20 20:40 ` Joseph D. Wagner
2002-12-20 22:25 ` David Lang
2002-12-21 6:00 ` Joseph D. Wagner
2002-12-23 1:29 ` David Lang
2002-12-24 9:18 ` Roy Sigurd Karlsbakk
2002-12-24 17:21 ` jw schultz
2002-12-24 21:00 ` Jeremy Fitzhardinge
2002-12-25 1:34 ` Rik van Riel
2002-12-25 2:02 ` jw schultz
2002-12-25 3:41 ` Barry K. Nathan
2002-12-23 17:48 ` Krzysztof Halasa
2002-12-23 18:13 ` Denis Vlasenko
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=3E00F894.BDAB4E05@digeo.com \
--to=akpm@digeo.com \
--cc=conman@kolivas.net \
--cc=kernel@mailsammler.de \
--cc=linux-kernel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox