From: Paul Clements <Paul.Clements@SteelEye.com>
To: neilb@cse.unsw.edu.au, marcelo@conectiva.com.br
Cc: linux-kernel@vger.kernel.org
Subject: [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync counter errors
Date: Fri, 22 Mar 2002 13:28:40 -0500 [thread overview]
Message-ID: <3C9B77D8.ACD68643@SteelEye.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 1669 bytes --]
The following patch addresses several critical problems in the raid1
driver. Please apply. We've tested these patches pretty thoroughly
against the 2.4.9-21 Red Hat kernel. The source for raid1 is nearly
identical from 2.4.9-21 to 2.4.18. The patch is against 2.4.18. If you
have any questions, concerns, or comments, please send them to me.
Thanks,
Paul
--
Paul Clements
SteelEye Technology
Paul.Clements@SteelEye.com
--
The problems are, briefly:
1) overuse of device_lock spin lock
The device_lock was being used for two separate, unrelated purposes.
This was causing too much contention and caused a deadlock in one case.
The device_lock will be split into two separate locks by introducing a
new spin lock, "memory_lock".
2) non-atomic memory allocation
Due to use of GFP_KERNEL rather than GFP_ATOMIC, certain threads of the
raid1 driver were scheduled out while holding a spin lock, causing a
deadlock to occur. Memory allocation during critical sections where a
spin lock is held will be changed to atomic allocations.
3) incorrect enabling/disabling of interrupts during locking
In several cases, the wrong spin_lock* macros were being used. There
were a few cases where the irqsave/irqrestore versions of the macros
were needed, but were not used. The symptom of these problems was that
interrupts were enabled or disabled at inappropriate times, resulting in
deadlocks.
4) incorrect setting of conf->cnt_future and conf->phase resync counters
The symptoms of this problem were that, if I/O was occurring when a
resync ended (or was aborted), the resync would hang and never complete.
This eventually would cause all I/O to the md device to hang.
[-- Attachment #2: raid1_2_4_18_official_patch.diff --]
[-- Type: text/plain, Size: 9507 bytes --]
diff -urN linux-2.4.18.PRISTINE/drivers/md/raid1.c linux-2.4.18/drivers/md/raid1.c
--- linux-2.4.18.PRISTINE/drivers/md/raid1.c Fri Mar 22 12:53:32 2002
+++ linux-2.4.18/drivers/md/raid1.c Fri Mar 22 13:05:05 2002
@@ -11,6 +11,8 @@
*
* Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
* Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
+ * Various fixes by Paul Clements <Paul.Clements@SteelEye.com> and
+ * James Bottomley <James.Bottomley@SteelEye.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -64,7 +66,7 @@
while(cnt) {
struct buffer_head *t;
- md_spin_lock_irq(&conf->device_lock);
+ md_spin_lock_irq(&conf->memory_lock);
if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
while (cnt) {
t = conf->freebh;
@@ -75,7 +77,7 @@
conf->freebh_cnt--;
cnt--;
}
- md_spin_unlock_irq(&conf->device_lock);
+ md_spin_unlock_irq(&conf->memory_lock);
if (cnt == 0)
break;
t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
@@ -98,7 +100,7 @@
static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
{
unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
+ spin_lock_irqsave(&conf->memory_lock, flags);
while (bh) {
struct buffer_head *t = bh;
bh=bh->b_next;
@@ -110,7 +112,7 @@
conf->freebh_cnt++;
}
}
- spin_unlock_irqrestore(&conf->device_lock, flags);
+ spin_unlock_irqrestore(&conf->memory_lock, flags);
wake_up(&conf->wait_buffer);
}
@@ -124,12 +126,12 @@
bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
if (!bh) break;
- md_spin_lock_irq(&conf->device_lock);
+ md_spin_lock_irq(&conf->memory_lock);
bh->b_pprev = &conf->freebh;
bh->b_next = conf->freebh;
conf->freebh = bh;
conf->freebh_cnt++;
- md_spin_unlock_irq(&conf->device_lock);
+ md_spin_unlock_irq(&conf->memory_lock);
i++;
}
@@ -140,14 +142,14 @@
{
/* discard all buffer_heads */
- md_spin_lock_irq(&conf->device_lock);
+ md_spin_lock_irq(&conf->memory_lock);
while (conf->freebh) {
struct buffer_head *bh = conf->freebh;
conf->freebh = bh->b_next;
kmem_cache_free(bh_cachep, bh);
conf->freebh_cnt--;
}
- md_spin_unlock_irq(&conf->device_lock);
+ md_spin_unlock_irq(&conf->memory_lock);
}
@@ -156,7 +158,7 @@
struct raid1_bh *r1_bh = NULL;
do {
- md_spin_lock_irq(&conf->device_lock);
+ md_spin_lock_irq(&conf->memory_lock);
if (!conf->freer1_blocked && conf->freer1) {
r1_bh = conf->freer1;
conf->freer1 = r1_bh->next_r1;
@@ -165,7 +167,7 @@
r1_bh->state = (1 << R1BH_PreAlloc);
r1_bh->bh_req.b_state = 0;
}
- md_spin_unlock_irq(&conf->device_lock);
+ md_spin_unlock_irq(&conf->memory_lock);
if (r1_bh)
return r1_bh;
r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
@@ -191,11 +193,11 @@
if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
+ spin_lock_irqsave(&conf->memory_lock, flags);
r1_bh->next_r1 = conf->freer1;
conf->freer1 = r1_bh;
conf->freer1_cnt++;
- spin_unlock_irqrestore(&conf->device_lock, flags);
+ spin_unlock_irqrestore(&conf->memory_lock, flags);
/* don't need to wakeup wait_buffer because
* raid1_free_bh below will do that
*/
@@ -226,14 +228,14 @@
static void raid1_shrink_r1bh(raid1_conf_t *conf)
{
- md_spin_lock_irq(&conf->device_lock);
+ md_spin_lock_irq(&conf->memory_lock);
while (conf->freer1) {
struct raid1_bh *r1_bh = conf->freer1;
conf->freer1 = r1_bh->next_r1;
conf->freer1_cnt--;
kfree(r1_bh);
}
- md_spin_unlock_irq(&conf->device_lock);
+ md_spin_unlock_irq(&conf->memory_lock);
}
@@ -245,10 +247,10 @@
raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
r1_bh->mirror_bh_list = NULL;
- spin_lock_irqsave(&conf->device_lock, flags);
+ spin_lock_irqsave(&conf->memory_lock, flags);
r1_bh->next_r1 = conf->freebuf;
conf->freebuf = r1_bh;
- spin_unlock_irqrestore(&conf->device_lock, flags);
+ spin_unlock_irqrestore(&conf->memory_lock, flags);
raid1_free_bh(conf, bh);
}
@@ -256,12 +258,12 @@
{
struct raid1_bh *r1_bh;
- md_spin_lock_irq(&conf->device_lock);
- wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
+ md_spin_lock_irq(&conf->memory_lock);
+ wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->memory_lock);
r1_bh = conf->freebuf;
conf->freebuf = r1_bh->next_r1;
r1_bh->next_r1= NULL;
- md_spin_unlock_irq(&conf->device_lock);
+ md_spin_unlock_irq(&conf->memory_lock);
return r1_bh;
}
@@ -269,17 +271,18 @@
static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
{
int i = 0;
+ unsigned long flags;
- md_spin_lock_irq(&conf->device_lock);
+ md_spin_lock_irqsave(&conf->memory_lock, flags);
while (i < cnt) {
struct raid1_bh *r1_bh;
struct page *page;
- page = alloc_page(GFP_KERNEL);
+ page = alloc_page(GFP_ATOMIC);
if (!page)
break;
- r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
+ r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_ATOMIC);
if (!r1_bh) {
__free_page(page);
break;
@@ -291,20 +294,21 @@
conf->freebuf = r1_bh;
i++;
}
- md_spin_unlock_irq(&conf->device_lock);
+ md_spin_unlock_irqrestore(&conf->memory_lock, flags);
return i;
}
static void raid1_shrink_buffers (raid1_conf_t *conf)
{
- md_spin_lock_irq(&conf->device_lock);
+ unsigned long flags;
+ md_spin_lock_irqsave(&conf->memory_lock, flags);
while (conf->freebuf) {
struct raid1_bh *r1_bh = conf->freebuf;
conf->freebuf = r1_bh->next_r1;
__free_page(r1_bh->bh_req.b_page);
kfree(r1_bh);
}
- md_spin_unlock_irq(&conf->device_lock);
+ md_spin_unlock_irqrestore(&conf->memory_lock, flags);
}
static int raid1_map (mddev_t *mddev, kdev_t *rdev)
@@ -723,7 +727,7 @@
#define DISK_FAILED KERN_ALERT \
"raid1: Disk failure on %s, disabling device. \n" \
-" Operation continuing on %d devices\n"
+" Operation continuing on %d devices\n"
#define START_SYNCING KERN_ALERT \
"raid1: start syncing spare disk.\n"
@@ -815,26 +819,33 @@
static void close_sync(raid1_conf_t *conf)
{
mddev_t *mddev = conf->mddev;
+ unsigned long flags;
/* If reconstruction was interrupted, we need to close the "active" and "pending"
* holes.
* we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0
*/
/* this is really needed when recovery stops too... */
- spin_lock_irq(&conf->segment_lock);
+ spin_lock_irqsave(&conf->segment_lock, flags);
conf->start_active = conf->start_pending;
conf->start_ready = conf->start_pending;
wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
- conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
- conf->start_future = mddev->sb->size+1;
- conf->cnt_pending = conf->cnt_future;
- conf->cnt_future = 0;
- conf->phase = conf->phase ^1;
- wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
+ /* have to be careful here - we want phase to go to 0 (although,
+ * this isn't really necessary since it's just a toggle) - but we
+ * cannot be sure of the current phase value */
+ if (conf->phase) {
+ /* close the future window and wait for pending I/O to finish */
+ conf->start_active = conf->start_ready = conf->start_pending = conf->start_future;
+ conf->start_future = mddev->sb->size+1;
+ conf->cnt_pending = conf->cnt_future;
+ conf->cnt_future = 0;
+ conf->phase = conf->phase ^1;
+ wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
+ }
+ /* open the future window across the whole device, closing the others */
conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
- conf->phase = 0;
- conf->cnt_future = conf->cnt_done;;
+ conf->cnt_future += conf->cnt_done;
conf->cnt_done = 0;
- spin_unlock_irq(&conf->segment_lock);
+ spin_unlock_irqrestore(&conf->segment_lock, flags);
wake_up(&conf->wait_done);
}
@@ -1347,6 +1358,7 @@
conf->window = buffs*(PAGE_SIZE>>9)/2;
conf->cnt_future += conf->cnt_done+conf->cnt_pending;
conf->cnt_done = conf->cnt_pending = 0;
+ wake_up(&conf->wait_ready);
if (conf->cnt_ready || conf->cnt_active)
MD_BUG();
}
@@ -1614,7 +1626,7 @@
conf->nr_disks = sb->nr_disks;
conf->mddev = mddev;
conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
-
+ conf->memory_lock = MD_SPIN_LOCK_UNLOCKED;
conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
init_waitqueue_head(&conf->wait_buffer);
init_waitqueue_head(&conf->wait_done);
diff -urN linux-2.4.18.PRISTINE/include/linux/raid/raid1.h linux-2.4.18/include/linux/raid/raid1.h
--- linux-2.4.18.PRISTINE/include/linux/raid/raid1.h Sun Aug 12 15:39:02 2001
+++ linux-2.4.18/include/linux/raid/raid1.h Fri Mar 22 13:03:53 2002
@@ -33,6 +33,9 @@
int resync_mirrors;
struct mirror_info *spare;
md_spinlock_t device_lock;
+ md_spinlock_t memory_lock;/* use this to protect
+ * memory (de)allocations instead
+ * of using the device_lock */
/* buffer pool */
/* buffer_heads that we have pre-allocated have b_pprev -> &freebh
next reply other threads:[~2002-03-22 18:31 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2002-03-22 18:28 Paul Clements [this message]
-- strict thread matches above, loose matches on Subject: below --
2002-03-24 22:42 [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync counter errors Neil Brown
2002-03-24 23:24 ` Andrew Morton
2002-03-25 18:33 ` Paul Clements
2002-03-25 18:52 ` Paul Clements
2002-03-26 1:52 Neil Brown
2002-03-26 22:06 ` Paul Clements
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=3C9B77D8.ACD68643@SteelEye.com \
--to=paul.clements@steeleye.com \
--cc=linux-kernel@vger.kernel.org \
--cc=marcelo@conectiva.com.br \
--cc=neilb@cse.unsw.edu.au \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox