public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync  counter errors
@ 2002-03-22 18:28 Paul Clements
  0 siblings, 0 replies; 7+ messages in thread
From: Paul Clements @ 2002-03-22 18:28 UTC (permalink / raw)
  To: neilb, marcelo; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1669 bytes --]


The following patch addresses several critical problems in the raid1
driver. Please apply. We've tested these patches pretty thoroughly
against the 2.4.9-21 Red Hat kernel. The source for raid1 is nearly
identical from 2.4.9-21 to 2.4.18. The patch is against 2.4.18. If you
have any questions, concerns, or comments, please send them to me.

Thanks,
Paul

--
Paul Clements
SteelEye Technology
Paul.Clements@SteelEye.com
--

The problems are, briefly:

1) overuse of device_lock spin lock 

The device_lock was being used for two separate, unrelated purposes.
This was causing too much contention and caused a deadlock in one case.
The device_lock will be split into two separate locks by introducing a
new spin lock, "memory_lock".

2) non-atomic memory allocation

Due to use of GFP_KERNEL rather than GFP_ATOMIC, certain threads of the
raid1 driver were scheduled out while holding a spin lock, causing a
deadlock to occur. Memory allocation during critical sections where a
spin lock is held will be changed to atomic allocations.

3) incorrect enabling/disabling of interrupts during locking

In several cases, the wrong spin_lock* macros were being used. There
were a few cases where the irqsave/irqrestore versions of the macros
were needed, but were not used. The symptom of these problems was that
interrupts were enabled or disabled at inappropriate times, resulting in
deadlocks.

4) incorrect setting of conf->cnt_future and conf->phase resync counters

The symptoms of this problem were that, if I/O was occurring when a
resync ended (or was aborted), the resync would hang and never complete.
This eventually would cause all I/O to the md device to hang.

[-- Attachment #2: raid1_2_4_18_official_patch.diff --]
[-- Type: text/plain, Size: 9507 bytes --]

diff -urN linux-2.4.18.PRISTINE/drivers/md/raid1.c linux-2.4.18/drivers/md/raid1.c
--- linux-2.4.18.PRISTINE/drivers/md/raid1.c	Fri Mar 22 12:53:32 2002
+++ linux-2.4.18/drivers/md/raid1.c	Fri Mar 22 13:05:05 2002
@@ -11,6 +11,8 @@
  *
  * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
  * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
+ * Various fixes by Paul Clements <Paul.Clements@SteelEye.com> and
+ *                  James Bottomley <James.Bottomley@SteelEye.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -64,7 +66,7 @@
 
 	while(cnt) {
 		struct buffer_head *t;
-		md_spin_lock_irq(&conf->device_lock);
+		md_spin_lock_irq(&conf->memory_lock);
 		if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
 			while (cnt) {
 				t = conf->freebh;
@@ -75,7 +77,7 @@
 				conf->freebh_cnt--;
 				cnt--;
 			}
-		md_spin_unlock_irq(&conf->device_lock);
+		md_spin_unlock_irq(&conf->memory_lock);
 		if (cnt == 0)
 			break;
 		t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
@@ -98,7 +100,7 @@
 static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
 {
 	unsigned long flags;
-	spin_lock_irqsave(&conf->device_lock, flags);
+	spin_lock_irqsave(&conf->memory_lock, flags);
 	while (bh) {
 		struct buffer_head *t = bh;
 		bh=bh->b_next;
@@ -110,7 +112,7 @@
 			conf->freebh_cnt++;
 		}
 	}
-	spin_unlock_irqrestore(&conf->device_lock, flags);
+	spin_unlock_irqrestore(&conf->memory_lock, flags);
 	wake_up(&conf->wait_buffer);
 }
 
@@ -124,12 +126,12 @@
 		bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
 		if (!bh) break;
 
-		md_spin_lock_irq(&conf->device_lock);
+		md_spin_lock_irq(&conf->memory_lock);
 		bh->b_pprev = &conf->freebh;
 		bh->b_next = conf->freebh;
 		conf->freebh = bh;
 		conf->freebh_cnt++;
-		md_spin_unlock_irq(&conf->device_lock);
+		md_spin_unlock_irq(&conf->memory_lock);
 
 		i++;
 	}
@@ -140,14 +142,14 @@
 {
 	/* discard all buffer_heads */
 
-	md_spin_lock_irq(&conf->device_lock);
+	md_spin_lock_irq(&conf->memory_lock);
 	while (conf->freebh) {
 		struct buffer_head *bh = conf->freebh;
 		conf->freebh = bh->b_next;
 		kmem_cache_free(bh_cachep, bh);
 		conf->freebh_cnt--;
 	}
-	md_spin_unlock_irq(&conf->device_lock);
+	md_spin_unlock_irq(&conf->memory_lock);
 }
 		
 
@@ -156,7 +158,7 @@
 	struct raid1_bh *r1_bh = NULL;
 
 	do {
-		md_spin_lock_irq(&conf->device_lock);
+		md_spin_lock_irq(&conf->memory_lock);
 		if (!conf->freer1_blocked && conf->freer1) {
 			r1_bh = conf->freer1;
 			conf->freer1 = r1_bh->next_r1;
@@ -165,7 +167,7 @@
 			r1_bh->state = (1 << R1BH_PreAlloc);
 			r1_bh->bh_req.b_state = 0;
 		}
-		md_spin_unlock_irq(&conf->device_lock);
+		md_spin_unlock_irq(&conf->memory_lock);
 		if (r1_bh)
 			return r1_bh;
 		r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
@@ -191,11 +193,11 @@
 
 	if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
 		unsigned long flags;
-		spin_lock_irqsave(&conf->device_lock, flags);
+		spin_lock_irqsave(&conf->memory_lock, flags);
 		r1_bh->next_r1 = conf->freer1;
 		conf->freer1 = r1_bh;
 		conf->freer1_cnt++;
-		spin_unlock_irqrestore(&conf->device_lock, flags);
+		spin_unlock_irqrestore(&conf->memory_lock, flags);
 		/* don't need to wakeup wait_buffer because
 		 *  raid1_free_bh below will do that
 		 */
@@ -226,14 +228,14 @@
 
 static void raid1_shrink_r1bh(raid1_conf_t *conf)
 {
-	md_spin_lock_irq(&conf->device_lock);
+	md_spin_lock_irq(&conf->memory_lock);
 	while (conf->freer1) {
 		struct raid1_bh *r1_bh = conf->freer1;
 		conf->freer1 = r1_bh->next_r1;
 		conf->freer1_cnt--;
 		kfree(r1_bh);
 	}
-	md_spin_unlock_irq(&conf->device_lock);
+	md_spin_unlock_irq(&conf->memory_lock);
 }
 
 
@@ -245,10 +247,10 @@
 	raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
 	r1_bh->mirror_bh_list = NULL;
 	
-	spin_lock_irqsave(&conf->device_lock, flags);
+	spin_lock_irqsave(&conf->memory_lock, flags);
 	r1_bh->next_r1 = conf->freebuf;
 	conf->freebuf = r1_bh;
-	spin_unlock_irqrestore(&conf->device_lock, flags);
+	spin_unlock_irqrestore(&conf->memory_lock, flags);
 	raid1_free_bh(conf, bh);
 }
 
@@ -256,12 +258,12 @@
 {
 	struct raid1_bh *r1_bh;
 
-	md_spin_lock_irq(&conf->device_lock);
-	wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
+	md_spin_lock_irq(&conf->memory_lock);
+	wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->memory_lock);
 	r1_bh = conf->freebuf;
 	conf->freebuf = r1_bh->next_r1;
 	r1_bh->next_r1= NULL;
-	md_spin_unlock_irq(&conf->device_lock);
+	md_spin_unlock_irq(&conf->memory_lock);
 
 	return r1_bh;
 }
@@ -269,17 +271,18 @@
 static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
 {
 	int i = 0;
+	unsigned long flags;
 
-	md_spin_lock_irq(&conf->device_lock);
+	md_spin_lock_irqsave(&conf->memory_lock, flags);
 	while (i < cnt) {
 		struct raid1_bh *r1_bh;
 		struct page *page;
 
-		page = alloc_page(GFP_KERNEL);
+		page = alloc_page(GFP_ATOMIC);
 		if (!page)
 			break;
 
-		r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
+		r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_ATOMIC);
 		if (!r1_bh) {
 			__free_page(page);
 			break;
@@ -291,20 +294,21 @@
 		conf->freebuf = r1_bh;
 		i++;
 	}
-	md_spin_unlock_irq(&conf->device_lock);
+	md_spin_unlock_irqrestore(&conf->memory_lock, flags);
 	return i;
 }
 
 static void raid1_shrink_buffers (raid1_conf_t *conf)
 {
-	md_spin_lock_irq(&conf->device_lock);
+	unsigned long flags;
+	md_spin_lock_irqsave(&conf->memory_lock, flags);
 	while (conf->freebuf) {
 		struct raid1_bh *r1_bh = conf->freebuf;
 		conf->freebuf = r1_bh->next_r1;
 		__free_page(r1_bh->bh_req.b_page);
 		kfree(r1_bh);
 	}
-	md_spin_unlock_irq(&conf->device_lock);
+	md_spin_unlock_irqrestore(&conf->memory_lock, flags);
 }
 
 static int raid1_map (mddev_t *mddev, kdev_t *rdev)
@@ -723,7 +727,7 @@
 
 #define DISK_FAILED KERN_ALERT \
 "raid1: Disk failure on %s, disabling device. \n" \
-"	Operation continuing on %d devices\n"
+"        Operation continuing on %d devices\n"
 
 #define START_SYNCING KERN_ALERT \
 "raid1: start syncing spare disk.\n"
@@ -815,26 +819,33 @@
 static void close_sync(raid1_conf_t *conf)
 {
 	mddev_t *mddev = conf->mddev;
+	unsigned long flags;
 	/* If reconstruction was interrupted, we need to close the "active" and "pending"
 	 * holes.
 	 * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0
 	 */
 	/* this is really needed when recovery stops too... */
-	spin_lock_irq(&conf->segment_lock);
+	spin_lock_irqsave(&conf->segment_lock, flags);
 	conf->start_active = conf->start_pending;
 	conf->start_ready = conf->start_pending;
 	wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
-	conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
-	conf->start_future = mddev->sb->size+1;
-	conf->cnt_pending = conf->cnt_future;
-	conf->cnt_future = 0;
-	conf->phase = conf->phase ^1;
-	wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
+	/* have to be careful here - we want phase to go to 0 (although,
+	 * this isn't really necessary since it's just a toggle) - but we
+	 * cannot be sure of the current phase value */
+	if (conf->phase) {
+		/* close the future window and wait for pending I/O to finish */
+		conf->start_active = conf->start_ready = conf->start_pending = conf->start_future;
+		conf->start_future = mddev->sb->size+1;
+		conf->cnt_pending = conf->cnt_future;
+		conf->cnt_future = 0;
+		conf->phase = conf->phase ^1;
+		wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
+	}
+	/* open the future window across the whole device, closing the others */
 	conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
-	conf->phase = 0;
-	conf->cnt_future = conf->cnt_done;;
+	conf->cnt_future += conf->cnt_done;
 	conf->cnt_done = 0;
-	spin_unlock_irq(&conf->segment_lock);
+	spin_unlock_irqrestore(&conf->segment_lock, flags);
 	wake_up(&conf->wait_done);
 }
 
@@ -1347,6 +1358,7 @@
 		conf->window = buffs*(PAGE_SIZE>>9)/2;
 		conf->cnt_future += conf->cnt_done+conf->cnt_pending;
 		conf->cnt_done = conf->cnt_pending = 0;
+		wake_up(&conf->wait_ready);
 		if (conf->cnt_ready || conf->cnt_active)
 			MD_BUG();
 	}
@@ -1614,7 +1626,7 @@
 	conf->nr_disks = sb->nr_disks;
 	conf->mddev = mddev;
 	conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
-
+	conf->memory_lock = MD_SPIN_LOCK_UNLOCKED;
 	conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
 	init_waitqueue_head(&conf->wait_buffer);
 	init_waitqueue_head(&conf->wait_done);
diff -urN linux-2.4.18.PRISTINE/include/linux/raid/raid1.h linux-2.4.18/include/linux/raid/raid1.h
--- linux-2.4.18.PRISTINE/include/linux/raid/raid1.h	Sun Aug 12 15:39:02 2001
+++ linux-2.4.18/include/linux/raid/raid1.h	Fri Mar 22 13:03:53 2002
@@ -33,6 +33,9 @@
 	int			resync_mirrors;
 	struct mirror_info	*spare;
 	md_spinlock_t		device_lock;
+	md_spinlock_t		memory_lock;/* use this to protect 
+					     * memory (de)allocations instead
+					     * of using the device_lock */
 
 	/* buffer pool */
 	/* buffer_heads that we have pre-allocated have b_pprev -> &freebh

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync  counter errors
@ 2002-03-24 22:42 Neil Brown
  2002-03-24 23:24 ` Andrew Morton
  2002-03-25 18:52 ` Paul Clements
  0 siblings, 2 replies; 7+ messages in thread
From: Neil Brown @ 2002-03-24 22:42 UTC (permalink / raw)
  To: Paul Clements; +Cc: marcelo, linux-kernel

On Friday March 22, Paul.Clements@SteelEye.com wrote:
> 
> The following patch addresses several critical problems in the raid1
> driver. Please apply.

I would have thought that the "please apply" request should wait until
*after* you have asked for and received comments...

>                       We've tested these patches pretty thoroughly
> against the 2.4.9-21 Red Hat kernel.

"We've tested" is a long way different to "these patches have been
around for a while, and several people have used them without
problems, and there has been independent review" which I would have
thought was the benchmark for the "stable" kernel.

>                                      The source for raid1 is nearly
> identical from 2.4.9-21 to 2.4.18. The patch is against 2.4.18. If you
> have any questions, concerns, or comments, please send them to me.
> 
......
> 
> The problems are, briefly:
> 
> 1) overuse of device_lock spin lock 
> 
> The device_lock was being used for two separate, unrelated purposes.
> This was causing too much contention and caused a deadlock in one case.
> The device_lock will be split into two separate locks by introducing a
> new spin lock, "memory_lock".

I can believe that there could be extra contention because of the dual
use of this spin lock.  Do you have lockmeter numbers at all?

However I cannot see how it would cause a deadlock.  Could you please
give details?


> 
> 2) non-atomic memory allocation
> 
> Due to use of GFP_KERNEL rather than GFP_ATOMIC, certain threads of the
> raid1 driver were scheduled out while holding a spin lock, causing a
> deadlock to occur. Memory allocation during critical sections where a
> spin lock is held will be changed to atomic allocations.

You are definitely right that we should not be calling kmalloc with a
spinlock held - my bad.
However I don't think your fix is ideal.  The relevant code is
"raid1_grow_buffers" which allocates a bunch of buffers and attaches
them to the device structure.
The lock is only really needed for the attachment.  A better fix would
be to build a separate list, and then just claim the lock while
attaching that list to the structure.

> 
> 3) incorrect enabling/disabling of interrupts during locking
> 
> In several cases, the wrong spin_lock* macros were being used. There
> were a few cases where the irqsave/irqrestore versions of the macros
> were needed, but were not used. The symptom of these problems was that
> interrupts were enabled or disabled at inappropriate times, resulting in
> deadlocks.

I don't believe that this is true.
The save/restore versions are only needed if the code might be called
from interrupt context.  However the routines where you made this
change: raid1_grow_buffers, raid1_shrink_buffers, close_sync, 
are only ever called from process context, with interrupts enabled.
Or am I missing something?


> 
> 4) incorrect setting of conf->cnt_future and conf->phase resync counters
> 
> The symptoms of this problem were that, if I/O was occurring when a
> resync ended (or was aborted), the resync would hang and never complete.
> This eventually would cause all I/O to the md device to hang.

I'll have to look at this one a bit more closely.  I'll let you know 
what I think of it.

NeilBrown

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync  counter errors
  2002-03-24 22:42 [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync counter errors Neil Brown
@ 2002-03-24 23:24 ` Andrew Morton
  2002-03-25 18:33   ` Paul Clements
  2002-03-25 18:52 ` Paul Clements
  1 sibling, 1 reply; 7+ messages in thread
From: Andrew Morton @ 2002-03-24 23:24 UTC (permalink / raw)
  To: Neil Brown; +Cc: Paul Clements, marcelo, linux-kernel

Neil Brown wrote:
> 
> ...
> The save/restore versions are only needed if the code might be called
> from interrupt context.

Or if the caller may wish to keep interrupts disabled.

> However the routines where you made this
> change: raid1_grow_buffers, raid1_shrink_buffers, close_sync,
> are only ever called from process context, with interrupts enabled.
> Or am I missing something?

If those functions are always called with interrupts enabled then
no, you're not missing anything ;)

However a bare spin_unlock_irq() in a function means that
callers which wish to keep interrupts disabled are subtly
subverted.   We've had bugs from this before.

So the irqrestore functions are much more robust.  I believe
that they should be the default choice.  The non-restore
versions should be viewed as a micro-optimised version,
to be used with caution.  The additional expense of the save/restore
is quite tiny - 20-30 cycles, perhaps.

-

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync  counter errors
  2002-03-24 23:24 ` Andrew Morton
@ 2002-03-25 18:33   ` Paul Clements
  0 siblings, 0 replies; 7+ messages in thread
From: Paul Clements @ 2002-03-25 18:33 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Neil Brown, Paul Clements, marcelo, linux-kernel

On Sun, 24 Mar 2002, Andrew Morton wrote:

> However a bare spin_unlock_irq() in a function means that
> callers which wish to keep interrupts disabled are subtly
> subverted.   We've had bugs from this before.

Yes, that was precisely what was happening in raid1. There were
"nested" spin_lock_irq() calls.

> So the irqrestore functions are much more robust.  I believe
> that they should be the default choice.  The non-restore
> versions should be viewed as a micro-optimised version,
> to be used with caution.  The additional expense of the save/restore
> is quite tiny - 20-30 cycles, perhaps.

I was wondering about the performance of these. I was reluctant 
to change all occurrences of spin_lock_irq() to the save/restore 
versions, even though that seemed like the safest thing to do, so 
I had to analyze every code path where spin_locks were involved 
to see which ones absolutely needed to change...very tedious. 

Thanks for the explanations.

--
Paul


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync  counter errors
  2002-03-24 22:42 [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync counter errors Neil Brown
  2002-03-24 23:24 ` Andrew Morton
@ 2002-03-25 18:52 ` Paul Clements
  1 sibling, 0 replies; 7+ messages in thread
From: Paul Clements @ 2002-03-25 18:52 UTC (permalink / raw)
  To: Neil Brown; +Cc: Paul Clements, marcelo, linux-kernel

Neil,

Thanks for your feedback. Replies below...

--
Paul Clements
SteelEye Technology
Paul.Clements@SteelEye.com


On Mon, 25 Mar 2002, Neil Brown wrote:

> On Friday March 22, Paul.Clements@SteelEye.com wrote:
> > 
> > The problems are, briefly:
> > 
> > 1) overuse of device_lock spin lock 
> > 
> > The device_lock was being used for two separate, unrelated purposes.
> > This was causing too much contention and caused a deadlock in one case.
> > The device_lock will be split into two separate locks by introducing a
> > new spin lock, "memory_lock".
> 
> I can believe that there could be extra contention because of the dual
> use of this spin lock.  Do you have lockmeter numbers at all?

No, I'm not familiar with that. How do I get those? Is it fairly simple?

I wasn't so much concerned about extra contention as the (in my mind) 
logical separation of these two different tasks, and the fact that 
the lack of separation had led to a deadlock.


> However I cannot see how it would cause a deadlock.  Could you please
> give details?

raid1_diskop() calls close_sync() -- close_sync() schedules itself out
to wait for pending I/O to quiesce so that the resync can end... 
meanwhile #CPUs (in my case, 2) tasks enter into any of the memory 
(de)allocation routines and spin on the device_lock forever...

> > 
> > 2) non-atomic memory allocation
> > 
> > Due to use of GFP_KERNEL rather than GFP_ATOMIC, certain threads of the
> > raid1 driver were scheduled out while holding a spin lock, causing a
> > deadlock to occur. Memory allocation during critical sections where a
> > spin lock is held will be changed to atomic allocations.
> 
> You are definately right that we should not be calling kmalloc with a
> spinlock held - my bad.
> However I don't think your fix is ideal.  The relevant code is
> "raid1_grow_buffers" which allocates a bunch of buffers and attaches
> them to the device structure.
> The lock is only realy needed for the attachment.  A better fix would
> be to build a separate list, and then just claim the lock while
> attaching that list to the structure.

Unfortunately, this won't work, because the segment_lock is also held
while this code is executing (see raid1_sync_request).

 
> > 
> > 3) incorrect enabling/disabling of interrupts during locking
> > 
> > In several cases, the wrong spin_lock* macros were being used. There
> > were a few cases where the irqsave/irqrestore versions of the macros
> > were needed, but were not used. The symptom of these problems was that
> > interrupts were enabled or disabled at inappropriate times, resulting in
> > deadlocks.
> 
> I don't believe that this is true.
> The save/restore versions are only needed if the code might be called
> from interrupt context.  However the routines where you made this
> change: raid1_grow_buffers, raid1_shrink_buffers, close_sync, 
> are only ever called from process context, with interrupts enabled.
> Or am I missing something?

please see my other e-mail reply to Andrew Morton regarding this...

 
> > 
> > 4) incorrect setting of conf->cnt_future and conf->phase resync counters
> > 
> > The symptoms of this problem were that, if I/O was occurring when a
> > resync ended (or was aborted), the resync would hang and never complete.
> > This eventually would cause all I/O to the md device to hang.
> 
> I'll have to look at this one a bit more closely.  I'll let you know 
> what I think of it.

OK. If you come up with something better, please let me know.


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync  counter errors
@ 2002-03-26  1:52 Neil Brown
  2002-03-26 22:06 ` Paul Clements
  0 siblings, 1 reply; 7+ messages in thread
From: Neil Brown @ 2002-03-26  1:52 UTC (permalink / raw)
  To: Paul.Clements; +Cc: linux-kernel

On Monday March 25, kernel@steeleye.com wrote:
> Neil,
> 
> Thanks for your feedback. Replies below...
> 

ditto :-)

> > 
> > I can believe that there could be extra contention because of the dual
> > use of this spin lock.  Do you have lockmeter numbers at all?
> 
> No, I'm not familiar with that. How do I get those? Is it fairly
> simple?

I've never tried myself, so I don't know how simple it is.  The relevant
home page seems to be:
        http://oss.sgi.com/projects/lockmeter/

> 
> I wasn't so much concerned about extra contention as the (in my mind) 
> logical separation of these two different tasks, and the fact that 
> the lack of separation had led to a deadlock.
> 

Certainly these are logically distinct uses of the same lock, though
they are still closely related.
There is often a tension between splitting a lock to reduce
granularity and keeping the number of spinlocks to a minimum.
In general I would only split a lock if there was either a clear
semantic need, or measurable performance impact.

If a deadlock were a consequence of not splitting, that would be a
strong argument, but I think we can avoid the deadlock by other means.

> 
> > However I cannot see how it would cause a deadlock.  Could you please
> > give details?
> 
> raid1_diskop() calls close_sync() -- close_sync() schedules itself out
> to wait for pending I/O to quiesce so that the resync can end... 
> meanwhile #CPUs (in my case, 2) tasks enter into any of the memory 
> (de)allocation routines and spin on the device_lock forever...

Ahhhh.... Thanks....
close_sync() definately shouldn't be called with a spinlock held.  In
the patch below I have moved it out of the locked region.

> > You are definately right that we should not be calling kmalloc with a
> > spinlock held - my bad.
> > However I don't think your fix is ideal.  The relevant code is
> > "raid1_grow_buffers" which allocates a bunch of buffers and attaches
> > them to the device structure.
> > The lock is only realy needed for the attachment.  A better fix would
> > be to build a separate list, and then just claim the lock while
> > attaching that list to the structure.
> 
> Unfortunately, this won't work, because the segment_lock is also held
> while this code is executing (see raid1_sync_request).
> 

Good point, thanks.  I think this means that the call to
raid1_grow_buffers needs to be moved out from inside the locked
region.  This is done in the following patch.

>  
> > > 
> > > 3) incorrect enabling/disabling of interrupts during locking
> > > 
...
> > 
> > I don't believe that this is true.
> > The save/restore versions are only needed if the code might be called
> > from interrupt context.  However the routines where you made this
> > change: raid1_grow_buffers, raid1_shrink_buffers, close_sync, 
> > are only ever called from process context, with interrupts enabled.
> > Or am I missing something?
> 
> please see my other e-mail reply to Andrew Morton regarding this...

OK, I understand now.  spin_lock_irq is being called while
spin_lock_irq is already in-force.  I think this is best fixed by
moving calls outside of locked regions as mentioned above.


> 
>  
> > > 
> > > 4) incorrect setting of conf->cnt_future and conf->phase resync counters
> > > 
> > > The symptoms of this problem were that, if I/O was occurring when a
> > > resync ended (or was aborted), the resync would hang and never complete.
> > > This eventually would cause all I/O to the md device to hang.
> > 
> > I'll have to look at this one a bit more closely.  I'll let you know 
> > what I think of it.
> 
> OK. If you come up with something better, please let me know.

OK, I've had a look, and I see what the problem is:

	conf->start_future = mddev->sb->size+1;

start_future is in sectors.  sb->size is in Kibibytes :-(
Should be
	conf->start_future = (mddev->sb->size<<1)+1;

This error would explain your symptom.


Below is a patch which I believe should address all of your symptoms
and the bugs that they expose.  If your are able to test it and let me
know how it works for you I would appreciate it.

Thanks
NeilBrown



 ----------- Diffstat output ------------
 ./drivers/md/raid1.c |   54 +++++++++++++++++++++++++++++++++++++--------------
 1 files changed, 40 insertions(+), 14 deletions(-)

--- ./drivers/md/raid1.c	2002/03/25 21:53:59	1.1
+++ ./drivers/md/raid1.c	2002/03/26 01:47:56	1.2
@@ -269,8 +269,9 @@
 static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
 {
 	int i = 0;
+	struct raid1_bh *head = NULL, **tail;
+	tail = &head;
 
-	md_spin_lock_irq(&conf->device_lock);
 	while (i < cnt) {
 		struct raid1_bh *r1_bh;
 		struct page *page;
@@ -287,10 +288,18 @@
 		memset(r1_bh, 0, sizeof(*r1_bh));
 		r1_bh->bh_req.b_page = page;
 		r1_bh->bh_req.b_data = page_address(page);
-		r1_bh->next_r1 = conf->freebuf;
-		conf->freebuf = r1_bh;
+		*tail = r1_bh;
+		r1_bh->next_r1 = NULL;
+		tail = & r1_bh->next_r1;
 		i++;
 	}
+	/* this lock probably isn't needed, as at the time when
+	 * we are allocating buffers, nobody else will be touching the
+	 * freebuf list.  But it doesn't hurt....
+	 */
+	md_spin_lock_irq(&conf->device_lock);
+	*tail = conf->freebuf;
+	conf->freebuf = head;
 	md_spin_unlock_irq(&conf->device_lock);
 	return i;
 }
@@ -825,7 +834,7 @@
 	conf->start_ready = conf->start_pending;
 	wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
 	conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
-	conf->start_future = mddev->sb->size+1;
+	conf->start_future = (mddev->sb->size<<1)+1;
 	conf->cnt_pending = conf->cnt_future;
 	conf->cnt_future = 0;
 	conf->phase = conf->phase ^1;
@@ -849,6 +858,14 @@
 	mdk_rdev_t *spare_rdev, *failed_rdev;
 
 	print_raid1_conf(conf);
+
+	switch (state) {
+	case DISKOP_SPARE_ACTIVE:
+	case DISKOP_SPARE_INACTIVE:
+		/* need to wait for pending sync io before locking device */
+		close_sync(conf);
+	}
+
 	md_spin_lock_irq(&conf->device_lock);
 	/*
 	 * find the disk ...
@@ -951,7 +968,11 @@
 	 * Deactivate a spare disk:
 	 */
 	case DISKOP_SPARE_INACTIVE:
-		close_sync(conf);
+		if (conf->start_future > 0) {
+			MD_BUG();
+			err = -EBUSY;
+			break;
+		}
 		sdisk = conf->mirrors + spare_disk;
 		sdisk->operational = 0;
 		sdisk->write_only = 0;
@@ -964,7 +985,11 @@
 	 * property)
 	 */
 	case DISKOP_SPARE_ACTIVE:
-		close_sync(conf);
+		if (conf->start_future > 0) {
+			MD_BUG();
+			err = -EBUSY;
+			break;
+		}
 		sdisk = conf->mirrors + spare_disk;
 		fdisk = conf->mirrors + failed_disk;
 
@@ -1328,23 +1353,25 @@
 	int bsize;
 	int disk;
 	int block_nr;
+	int buffs;
 
+	if (!sector_nr) {
+		/* we want enough buffers to hold twice the window of 128*/
+		buffs = 128 *2 / (PAGE_SIZE>>9);
+		buffs = raid1_grow_buffers(conf, buffs);
+		if (buffs < 2)
+			goto nomem;
+		conf->window = buffs*(PAGE_SIZE>>9)/2;
+	}
 	spin_lock_irq(&conf->segment_lock);
 	if (!sector_nr) {
 		/* initialize ...*/
-		int buffs;
 		conf->start_active = 0;
 		conf->start_ready = 0;
 		conf->start_pending = 0;
 		conf->start_future = 0;
 		conf->phase = 0;
-		/* we want enough buffers to hold twice the window of 128*/
-		buffs = 128 *2 / (PAGE_SIZE>>9);
-		buffs = raid1_grow_buffers(conf, buffs);
-		if (buffs < 2)
-			goto nomem;
 		
-		conf->window = buffs*(PAGE_SIZE>>9)/2;
 		conf->cnt_future += conf->cnt_done+conf->cnt_pending;
 		conf->cnt_done = conf->cnt_pending = 0;
 		if (conf->cnt_ready || conf->cnt_active)
@@ -1429,7 +1456,6 @@
 
 nomem:
 	raid1_shrink_buffers(conf);
-	spin_unlock_irq(&conf->segment_lock);
 	return -ENOMEM;
 }
 

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync  counter errors
  2002-03-26  1:52 Neil Brown
@ 2002-03-26 22:06 ` Paul Clements
  0 siblings, 0 replies; 7+ messages in thread
From: Paul Clements @ 2002-03-26 22:06 UTC (permalink / raw)
  To: Neil Brown; +Cc: Paul.Clements, linux-kernel

Neil,

I got a chance to test your patch for several hours today. 
It looks like all the problems are fixed.

Thanks,
Paul

--
Paul Clements
SteelEye Technology
Paul.Clements@SteelEye.com


On Tue, 26 Mar 2002, Neil Brown wrote:

> On Monday March 25, kernel@steeleye.com wrote:
> > Neil,
> > 
> > Thanks for your feedback. Replies below...
> > 
> 
> ditto :-)
> 
> > > 
> > > I can believe that there could be extra contention because of the dual
> > > use of this spin lock.  Do you have lockmeter numbers at all?
> > 
> > No, I'm not familiar with that. How do I get those? Is it fairly
> > simple?
> 
> I've never tried myself, so I don't know how simple it is.  The relevant
> home page seems to be:
>         http://oss.sgi.com/projects/lockmeter/
> 
> > 
> > I wasn't so much concerned about extra contention as the (in my mind) 
> > logical separation of these two different tasks, and the fact that 
> > the lack of separation had led to a deadlock.
> > 
> 
> Certainly these are logically distinct uses of the same lock, though
> they are still closely related.
> There is often a tension between splitting a lock to reduce
> granularity and keeping the number of spinlocks to a minimum.
> In general I would only split a lock if there was either a clear
> semantic need, or measurable performance impact.
> 
> If a deadlock were a consequence of not splitting, that would be a
> strong argument, but I think we can avoid the deadlock by other means.
> 
> > 
> > > However I cannot see how it would cause a deadlock.  Could you please
> > > give details?
> > 
> > raid1_diskop() calls close_sync() -- close_sync() schedules itself out
> > to wait for pending I/O to quiesce so that the resync can end... 
> > meanwhile #CPUs (in my case, 2) tasks enter into any of the memory 
> > (de)allocation routines and spin on the device_lock forever...
> 
> Ahhhh.... Thanks....
> close_sync() definitely shouldn't be called with a spinlock held.  In
> the patch below I have moved it out of the locked region.
> 
> > > You are definately right that we should not be calling kmalloc with a
> > > spinlock held - my bad.
> > > However I don't think your fix is ideal.  The relevant code is
> > > "raid1_grow_buffers" which allocates a bunch of buffers and attaches
> > > them to the device structure.
> > > The lock is only realy needed for the attachment.  A better fix would
> > > be to build a separate list, and then just claim the lock while
> > > attaching that list to the structure.
> > 
> > Unfortunately, this won't work, because the segment_lock is also held
> > while this code is executing (see raid1_sync_request).
> > 
> 
> Good point, thanks.  I think this means that the call to
> raid1_grow_buffers needs to be moved out from inside the locked
> region.  This is done in the following patch.
> 
> >  
> > > > 
> > > > 3) incorrect enabling/disabling of interrupts during locking
> > > > 
> ...
> > > 
> > > I don't believe that this is true.
> > > The save/restore versions are only needed if the code might be called
> > > from interrupt context.  However the routines where you made this
> > > change: raid1_grow_buffers, raid1_shrink_buffers, close_sync, 
> > > are only ever called from process context, with interrupts enabled.
> > > Or am I missing something?
> > 
> > please see my other e-mail reply to Andrew Morton regarding this...
> 
> OK, I understand now.  spin_lock_irq is being called while
> spin_lock_irq is already in-force.  I think this is best fixed by
> moving calls outside of locked regions as mentioned above.
> 
> 
> > 
> >  
> > > > 
> > > > 4) incorrect setting of conf->cnt_future and conf->phase resync counters
> > > > 
> > > > The symptoms of this problem were that, if I/O was occurring when a
> > > > resync ended (or was aborted), the resync would hang and never complete.
> > > > This eventually would cause all I/O to the md device to hang.
> > > 
> > > I'll have to look at this one a bit more closely.  I'll let you know 
> > > what I think of it.
> > 
> > OK. If you come up with something better, please let me know.
> 
> OK, I've had a look, and I see what the problem is:
> 
> 	conf->start_future = mddev->sb->size+1;
> 
> start_future is in sectors.  sb->size is in Kibibytes :-(
> Should be
> 	conf->start_future = (mddev->sb->size<<1)+1;
> 
> This error would explain your symptom.
> 
> 
> Below is a patch which I believe should address all of your symptoms
> and the bugs that they expose.  If your are able to test it and let me
> know how it works for you I would appreciate it.
> 
> Thanks
> NeilBrown
> 
> 
> 
>  ----------- Diffstat output ------------
>  ./drivers/md/raid1.c |   54 +++++++++++++++++++++++++++++++++++++--------------
>  1 files changed, 40 insertions(+), 14 deletions(-)
> 
> --- ./drivers/md/raid1.c	2002/03/25 21:53:59	1.1
> +++ ./drivers/md/raid1.c	2002/03/26 01:47:56	1.2
> @@ -269,8 +269,9 @@
>  static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
>  {
>  	int i = 0;
> +	struct raid1_bh *head = NULL, **tail;
> +	tail = &head;
>  
> -	md_spin_lock_irq(&conf->device_lock);
>  	while (i < cnt) {
>  		struct raid1_bh *r1_bh;
>  		struct page *page;
> @@ -287,10 +288,18 @@
>  		memset(r1_bh, 0, sizeof(*r1_bh));
>  		r1_bh->bh_req.b_page = page;
>  		r1_bh->bh_req.b_data = page_address(page);
> -		r1_bh->next_r1 = conf->freebuf;
> -		conf->freebuf = r1_bh;
> +		*tail = r1_bh;
> +		r1_bh->next_r1 = NULL;
> +		tail = & r1_bh->next_r1;
>  		i++;
>  	}
> +	/* this lock probably isn't needed, as at the time when
> +	 * we are allocating buffers, nobody else will be touching the
> +	 * freebuf list.  But it doesn't hurt....
> +	 */
> +	md_spin_lock_irq(&conf->device_lock);
> +	*tail = conf->freebuf;
> +	conf->freebuf = head;
>  	md_spin_unlock_irq(&conf->device_lock);
>  	return i;
>  }
> @@ -825,7 +834,7 @@
>  	conf->start_ready = conf->start_pending;
>  	wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
>  	conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
> -	conf->start_future = mddev->sb->size+1;
> +	conf->start_future = (mddev->sb->size<<1)+1;
>  	conf->cnt_pending = conf->cnt_future;
>  	conf->cnt_future = 0;
>  	conf->phase = conf->phase ^1;
> @@ -849,6 +858,14 @@
>  	mdk_rdev_t *spare_rdev, *failed_rdev;
>  
>  	print_raid1_conf(conf);
> +
> +	switch (state) {
> +	case DISKOP_SPARE_ACTIVE:
> +	case DISKOP_SPARE_INACTIVE:
> +		/* need to wait for pending sync io before locking device */
> +		close_sync(conf);
> +	}
> +
>  	md_spin_lock_irq(&conf->device_lock);
>  	/*
>  	 * find the disk ...
> @@ -951,7 +968,11 @@
>  	 * Deactivate a spare disk:
>  	 */
>  	case DISKOP_SPARE_INACTIVE:
> -		close_sync(conf);
> +		if (conf->start_future > 0) {
> +			MD_BUG();
> +			err = -EBUSY;
> +			break;
> +		}
>  		sdisk = conf->mirrors + spare_disk;
>  		sdisk->operational = 0;
>  		sdisk->write_only = 0;
> @@ -964,7 +985,11 @@
>  	 * property)
>  	 */
>  	case DISKOP_SPARE_ACTIVE:
> -		close_sync(conf);
> +		if (conf->start_future > 0) {
> +			MD_BUG();
> +			err = -EBUSY;
> +			break;
> +		}
>  		sdisk = conf->mirrors + spare_disk;
>  		fdisk = conf->mirrors + failed_disk;
>  
> @@ -1328,23 +1353,25 @@
>  	int bsize;
>  	int disk;
>  	int block_nr;
> +	int buffs;
>  
> +	if (!sector_nr) {
> +		/* we want enough buffers to hold twice the window of 128*/
> +		buffs = 128 *2 / (PAGE_SIZE>>9);
> +		buffs = raid1_grow_buffers(conf, buffs);
> +		if (buffs < 2)
> +			goto nomem;
> +		conf->window = buffs*(PAGE_SIZE>>9)/2;
> +	}
>  	spin_lock_irq(&conf->segment_lock);
>  	if (!sector_nr) {
>  		/* initialize ...*/
> -		int buffs;
>  		conf->start_active = 0;
>  		conf->start_ready = 0;
>  		conf->start_pending = 0;
>  		conf->start_future = 0;
>  		conf->phase = 0;
> -		/* we want enough buffers to hold twice the window of 128*/
> -		buffs = 128 *2 / (PAGE_SIZE>>9);
> -		buffs = raid1_grow_buffers(conf, buffs);
> -		if (buffs < 2)
> -			goto nomem;
>  		
> -		conf->window = buffs*(PAGE_SIZE>>9)/2;
>  		conf->cnt_future += conf->cnt_done+conf->cnt_pending;
>  		conf->cnt_done = conf->cnt_pending = 0;
>  		if (conf->cnt_ready || conf->cnt_active)
> @@ -1429,7 +1456,6 @@
>  
>  nomem:
>  	raid1_shrink_buffers(conf);
> -	spin_unlock_irq(&conf->segment_lock);
>  	return -ENOMEM;
>  }
>  
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2002-03-26 22:07 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-03-24 22:42 [PATCH] 2.4.18 raid1 - fix SMP locking/interrupt errors, fix resync counter errors Neil Brown
2002-03-24 23:24 ` Andrew Morton
2002-03-25 18:33   ` Paul Clements
2002-03-25 18:52 ` Paul Clements
  -- strict thread matches above, loose matches on Subject: below --
2002-03-26  1:52 Neil Brown
2002-03-26 22:06 ` Paul Clements
2002-03-22 18:28 Paul Clements

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox