* Locking bugs in 2.4 md.c
@ 2003-09-18 15:23 Lars Marowsky-Bree
2003-09-19 2:08 ` Neil Brown
0 siblings, 1 reply; 8+ messages in thread
From: Lars Marowsky-Bree @ 2003-09-18 15:23 UTC (permalink / raw)
To: linux-raid, Neil Brown; +Cc: axboe
Hi Neil,
I can pretty reliably Oops the 2.4 md raid1 in md_update_sb with a
while true; do
    mdadm /dev/md0 -f /dev/foo -r /dev/foo
    mdadm /dev/md0 -a /dev/foo
done &
while true; do
    mdadm /dev/md1 -f /dev/bar -r /dev/bar
    mdadm /dev/md1 -a /dev/bar
done &
in parallel on several md devices. It will finally die in md_update_sb,
and appears to be related to some locking bugs.
(The above is a stress test and makes it occur faster; customers report
it happens in the field too, that's why I went looking.)
In March (see http://www.spinics.net/lists/raid/msg02335.html) you wrote
you had a patch which made the locking in 2.4 "better", even though it
was rather ugly. Well, ugly it might be, but needed still ;-)
I assume you want to use the reconfig_sem for it? If you don't have a
recent patch, could you sketch out what you wanted to do so I could
start from there?
Thanks!
Sincerely,
Lars Marowsky-Brée <lmb@suse.de>
(Or maybe you want to backport the 2.6 md to 2.4... ;-)
--
High Availability & Clustering ever tried. ever failed. no matter.
SuSE Labs try again. fail again. fail better.
Research & Development, SuSE Linux AG -- Samuel Beckett
-
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
* Re: Locking bugs in 2.4 md.c
2003-09-18 15:23 Locking bugs in 2.4 md.c Lars Marowsky-Bree
@ 2003-09-19 2:08 ` Neil Brown
2003-09-19 7:26 ` Lars Marowsky-Bree
2003-09-24 10:31 ` [fixes] " Lars Marowsky-Bree
0 siblings, 2 replies; 8+ messages in thread
From: Neil Brown @ 2003-09-19 2:08 UTC (permalink / raw)
To: Lars Marowsky-Bree; +Cc: linux-raid, axboe
On Thursday September 18, lmb@suse.de wrote:
> Hi Neil,
>
> I can pretty reliably Oops the 2.4 md raid1 in md_update_sb with a
>
> while true; do
>     mdadm /dev/md0 -f /dev/foo -r /dev/foo
>     mdadm /dev/md0 -a /dev/foo
> done &
> while true; do
>     mdadm /dev/md1 -f /dev/bar -r /dev/bar
>     mdadm /dev/md1 -a /dev/bar
> done &
>
> in parallel on several md devices. It will finally die in md_update_sb,
> and appears to be related to some locking bugs.
>
> (The above is a stress test and makes it occur faster; customers report
> it happens in the field too, that's why I went looking.)
>
> In March (see http://www.spinics.net/lists/raid/msg02335.html) you wrote
> you had a patch which made the locking in 2.4 "better", even though it
> was rather ugly. Well, ugly it might be, but needed still ;-)
>
> I assume you want to use the reconfig_sem for it? If you don't have a
> recent patch, could you sketch out what you wanted to do so I could
> start from there?
A patch against 2.4-current is below. It probably works but I haven't
reviewed it in the context of other recent changes to md so I make no
promises.
>
> (Or maybe you want to backport the 2.6 md to 2.4... ;-)
That might be nice, but it was lots of changes!!
NeilBrown
===============================
Improve locking of MD related structure, particularly when reconfiguring
----------- Diffstat output ------------
./drivers/md/md.c | 212 +++++++++++++++++++++++++++++---------------
./drivers/md/multipath.c | 8 +
./drivers/md/raid1.c | 8 +
./drivers/md/raid5.c | 8 +
./include/linux/raid/md_k.h | 23 +++-
5 files changed, 174 insertions(+), 85 deletions(-)
diff ./drivers/md/md.c~current~ ./drivers/md/md.c
--- ./drivers/md/md.c~current~ 2003-09-19 11:55:45.000000000 +1000
+++ ./drivers/md/md.c 2003-09-19 11:59:34.000000000 +1000
@@ -34,6 +34,7 @@
#include <linux/sysctl.h>
#include <linux/raid/xor.h>
#include <linux/devfs_fs_kernel.h>
+#include <linux/rwsem.h>
#include <linux/init.h>
@@ -130,16 +131,69 @@ static struct gendisk md_gendisk=
/*
* Enables to iterate over all existing md arrays
+ *
+ * Locking rules:
+ * - access to all_mddevs requires all_mddevs_sem.
+ * - an mddev can be locked while all_mddevs_sem is held.
+ * - When removing an mddev, we
+ * lock the mddev
+ * check that ->active is 1 (us).
+ * set "dying"
+ * unlock the mddev
+ * claim all_mddevs_sem
+ * actually remove device
+ * release all_mddevs_sem
+ * - to get a reference to an mddev, we
+ * claim all_mddevs_sem
+ * find the mddev in the list
+ * check that it isn't "dying"
+ * increase ->active or take a lock
*/
static MD_LIST_HEAD(all_mddevs);
+static DECLARE_RWSEM(all_mddevs_sem);
/*
- * The mapping between kdev and mddev is not necessary a simple
+ * The mapping between kdev and mddev is not necessarily a simple
* one! Eg. HSM uses several sub-devices to implement Logical
* Volumes. All these sub-devices map to the same mddev.
*/
dev_mapping_t mddev_map[MAX_MD_DEVS];
+
+static inline mddev_t * kdev_to_mddev (kdev_t dev)
+{
+ mddev_t *mddev;
+ if (MAJOR(dev) != MD_MAJOR)
+ BUG();
+ down_read(&all_mddevs_sem);
+ mddev = mddev_map[MINOR(dev)].mddev;
+ if (mddev && !mddev->dying)
+ atomic_inc(&mddev->active);
+ else
+ mddev = NULL;
+ up_read(&all_mddevs_sem);
+ return mddev;
+}
+
+static inline mddev_t * kdev_to_mddev_lock_interruptible (kdev_t dev, int *err)
+{
+ mddev_t *mddev;
+ if (MAJOR(dev) != MD_MAJOR)
+ BUG();
+ down_read(&all_mddevs_sem);
+ mddev = mddev_map[MINOR(dev)].mddev;
+ *err = 0;
+ if (mddev) {
+ if (mddev->dying) {
+ *err = -EBUSY;
+ mddev = NULL;
+ } else
+ *err = lock_mddev_interruptible(mddev);
+ }
+ up_read(&all_mddevs_sem);
+ return mddev;
+}
+
void add_mddev_mapping(mddev_t * mddev, kdev_t dev, void *data)
{
unsigned int minor = MINOR(dev);
@@ -175,13 +229,19 @@ void del_mddev_mapping(mddev_t * mddev,
static int md_make_request(request_queue_t *q, int rw, struct buffer_head * bh)
{
mddev_t *mddev = kdev_to_mddev(bh->b_rdev);
+ int rv;
if (mddev && mddev->pers)
- return mddev->pers->make_request(mddev, rw, bh);
+ rv = mddev->pers->make_request(mddev, rw, bh);
else {
buffer_IO_error(bh);
- return 0;
+ rv = 0;
}
+ if (mddev)
+ /* should really drop count when request completes... */
+ if (atomic_dec_and_test(&mddev->active))
+ BUG();
+ return rv;
}
static mddev_t * alloc_mddev(kdev_t dev)
@@ -199,20 +259,22 @@ static mddev_t * alloc_mddev(kdev_t dev)
memset(mddev, 0, sizeof(*mddev));
mddev->__minor = MINOR(dev);
- init_MUTEX(&mddev->reconfig_sem);
+ init_MUTEX_LOCKED(&mddev->reconfig_sem);
init_MUTEX(&mddev->recovery_sem);
init_MUTEX(&mddev->resync_sem);
MD_INIT_LIST_HEAD(&mddev->disks);
MD_INIT_LIST_HEAD(&mddev->all_mddevs);
- atomic_set(&mddev->active, 0);
+ atomic_set(&mddev->active, 1);
/*
* The 'base' mddev is the one with data NULL.
* personalities can create additional mddevs
* if necessary.
*/
+ down_write(&all_mddevs_sem);
add_mddev_mapping(mddev, dev, 0);
md_list_add(&mddev->all_mddevs, &all_mddevs);
+ up_write(&all_mddevs_sem);
MOD_INC_USE_COUNT;
@@ -744,18 +806,10 @@ static void free_mddev(mddev_t *mddev)
md_size[mdidx(mddev)] = 0;
md_hd_struct[mdidx(mddev)].nr_sects = 0;
- /*
- * Make sure nobody else is using this mddev
- * (careful, we rely on the global kernel lock here)
- */
- while (sem_getcount(&mddev->resync_sem) != 1)
- schedule();
- while (sem_getcount(&mddev->recovery_sem) != 1)
- schedule();
-
+ down_write(&all_mddevs_sem);
del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
md_list_del(&mddev->all_mddevs);
- MD_INIT_LIST_HEAD(&mddev->all_mddevs);
+ up_write(&all_mddevs_sem);
kfree(mddev);
MOD_DEC_USE_COUNT;
}
@@ -826,7 +880,9 @@ void md_print_devices(void)
printk("md: **********************************\n");
printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
printk("md: **********************************\n");
- ITERATE_MDDEV(mddev,tmp) {
+
+ down_read(&all_mddevs_sem);
+ ITERATE_MDDEV/*_LOCK*/(mddev,tmp) {
printk("md%d: ", mdidx(mddev));
ITERATE_RDEV(mddev,rdev,tmp2)
@@ -841,6 +897,7 @@ void md_print_devices(void)
ITERATE_RDEV(mddev,rdev,tmp2)
print_rdev(rdev);
}
+ up_read(&all_mddevs_sem);
printk("md: **********************************\n");
printk("\n");
}
@@ -921,10 +978,6 @@ static int write_disk_sb(mdk_rdev_t * rd
MD_BUG();
return 1;
}
- if (rdev->faulty) {
- MD_BUG();
- return 1;
- }
if (rdev->sb->md_magic != MD_SB_MAGIC) {
MD_BUG();
return 1;
@@ -1010,6 +1063,11 @@ int md_update_sb(mddev_t * mddev)
struct md_list_head *tmp;
mdk_rdev_t *rdev;
+ if (!mddev->dying && !down_trylock(&mddev->reconfig_sem)) {
+ up(&mddev->reconfig_sem);
+ BUG();
+ }
+
if (!mddev->sb_dirty) {
printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0));
return 0;
@@ -1825,6 +1883,9 @@ static int do_md_stop(mddev_t * mddev, i
if (mddev->recovery_running)
md_interrupt_thread(md_recovery_thread);
+ mddev->dying = 1; /* make sure nobody tries to use this */
+ unlock_mddev(mddev);
+
/*
* This synchronizes with signal delivery to the
* resync or reconstruction thread. It also nicely
@@ -1846,6 +1907,7 @@ static int do_md_stop(mddev_t * mddev, i
if (mddev->pers->stop(mddev)) {
if (mddev->ro)
set_device_ro(dev, 1);
+ mddev->dying = 0;
OUT(-EBUSY);
}
if (mddev->ro)
@@ -1863,8 +1925,11 @@ static int do_md_stop(mddev_t * mddev, i
mddev->sb_dirty = 1;
md_update_sb(mddev);
}
- if (ro)
+ if (ro) {
set_device_ro(dev, 1);
+ lock_mddev(mddev);
+ mddev->dying = 0;
+ }
}
/*
@@ -1896,7 +1961,7 @@ int detect_old_array(mdp_super_t *sb)
}
-static void autorun_array(mddev_t *mddev)
+static int autorun_array(mddev_t *mddev)
{
mdk_rdev_t *rdev;
struct md_list_head *tmp;
@@ -1904,7 +1969,8 @@ static void autorun_array(mddev_t *mddev
if (mddev->disks.prev == &mddev->disks) {
MD_BUG();
- return;
+ unlock_mddev(mddev);
+ return 0;
}
printk(KERN_INFO "md: running: ");
@@ -1922,7 +1988,10 @@ static void autorun_array(mddev_t *mddev
*/
mddev->sb_dirty = 0;
do_md_stop (mddev, 0);
+ return err;
}
+ unlock_mddev(mddev);
+ return 0;
}
/*
@@ -1978,6 +2047,7 @@ static void autorun_devices(kdev_t count
mdidx(mddev), partition_name(rdev0->dev));
ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
export_rdev(rdev);
+ atomic_dec(&mddev->active);
continue;
}
mddev = alloc_mddev(md_kdev);
@@ -1985,15 +2055,15 @@ static void autorun_devices(kdev_t count
printk(KERN_ERR "md: cannot allocate memory for md drive.\n");
break;
}
- if (md_kdev == countdev)
- atomic_inc(&mddev->active);
printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
bind_rdev_to_array(rdev, mddev);
md_list_del(&rdev->pending);
MD_INIT_LIST_HEAD(&rdev->pending);
}
- autorun_array(mddev);
+ if (autorun_array(mddev)== 0
+ && md_kdev != countdev)
+ atomic_dec(&mddev->active);
}
printk(KERN_INFO "md: ... autorun DONE.\n");
}
@@ -2638,7 +2708,9 @@ static int md_ioctl(struct inode *inode,
* Commands creating/starting a new array:
*/
- mddev = kdev_to_mddev(dev);
+ mddev = kdev_to_mddev_lock_interruptible(dev, &err);
+ if (mddev == NULL && err)
+ goto abort;
switch (cmd)
{
@@ -2648,7 +2720,7 @@ static int md_ioctl(struct inode *inode,
printk(KERN_WARNING "md: array md%d already exists!\n",
mdidx(mddev));
err = -EEXIST;
- goto abort;
+ goto abort_unlock;
}
default:;
}
@@ -2660,17 +2732,6 @@ static int md_ioctl(struct inode *inode,
err = -ENOMEM;
goto abort;
}
- atomic_inc(&mddev->active);
-
- /*
- * alloc_mddev() should possibly self-lock.
- */
- err = lock_mddev(mddev);
- if (err) {
- printk(KERN_WARNING "md: ioctl, reason %d, cmd %d\n",
- err, cmd);
- goto abort;
- }
if (mddev->sb) {
printk(KERN_WARNING "md: array md%d already has a superblock!\n",
@@ -2693,14 +2754,11 @@ static int md_ioctl(struct inode *inode,
goto done_unlock;
case START_ARRAY:
- /*
- * possibly make it lock the array ...
- */
err = autostart_array((kdev_t)arg, dev);
if (err) {
printk(KERN_WARNING "md: autostart %s failed!\n",
partition_name((kdev_t)arg));
- goto abort;
+ goto abort_unlock;
}
goto done;
@@ -2715,11 +2773,7 @@ static int md_ioctl(struct inode *inode,
err = -ENODEV;
goto abort;
}
- err = lock_mddev(mddev);
- if (err) {
- printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
- goto abort;
- }
+
/* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
err = -ENODEV;
@@ -2888,17 +2942,17 @@ static int md_open(struct inode *inode,
/*
* Always succeed, but increment the usage count
*/
- mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
- if (mddev)
- atomic_inc(&mddev->active);
+ kdev_to_mddev(inode->i_rdev);
return (0);
}
static int md_release(struct inode *inode, struct file * file)
{
mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
- if (mddev)
+ if (mddev) {
+ atomic_dec(&mddev->active);
atomic_dec(&mddev->active);
+ }
return 0;
}
@@ -3183,7 +3237,8 @@ static void *md_seq_next(struct seq_file
{
struct list_head *tmp;
mddev_t *next_mddev, *mddev = v;
-
+
+ down_read(&all_mddevs_sem);
++*pos;
if (v == (void*)2)
return NULL;
@@ -3205,7 +3260,7 @@ static void *md_seq_next(struct seq_file
static void md_seq_stop(struct seq_file *seq, void *v)
{
-
+ up_read(&all_mddevs_sem);
}
static int md_seq_show(struct seq_file *seq, void *v)
@@ -3233,6 +3288,8 @@ static int md_seq_show(struct seq_file *
status_unused(seq);
return 0;
}
+ if (mddev->dying)
+ return 0;
seq_printf(seq, "md%d : %sactive", mdidx(mddev),
mddev->pers ? "" : "in");
@@ -3432,6 +3489,7 @@ int md_do_sync(mddev_t *mddev, mdp_disk_
recheck:
serialize = 0;
+ down_read(&all_mddevs_sem);
ITERATE_MDDEV(mddev2,tmp) {
if (mddev2 == mddev)
continue;
@@ -3443,6 +3501,7 @@ recheck:
break;
}
}
+ up_read(&all_mddevs_sem);
if (serialize) {
interruptible_sleep_on(&resync_wait);
if (md_signal_pending(current)) {
@@ -3589,8 +3648,10 @@ void md_do_recovery(void *data)
struct md_list_head *tmp;
printk(KERN_INFO "md: recovery thread got woken up ...\n");
-restart:
- ITERATE_MDDEV(mddev,tmp) {
+
+ restart:
+ down_read(&all_mddevs_sem);
+ ITERATE_MDDEV_LOCK(mddev,tmp) {
sb = mddev->sb;
if (!sb)
continue;
@@ -3617,9 +3678,13 @@ restart:
continue;
if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
continue;
+ unlock_mddev(mddev);
+ up_read(&all_mddevs_sem);
down(&mddev->recovery_sem);
mddev->recovery_running = 1;
err = md_do_sync(mddev, spare);
+
+ lock_mddev(mddev);
if (err == -EIO) {
printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n",
mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
@@ -3645,26 +3710,28 @@ restart:
DISKOP_SPARE_INACTIVE);
up(&mddev->recovery_sem);
mddev->recovery_running = 0;
- continue;
} else {
mddev->recovery_running = 0;
up(&mddev->recovery_sem);
+
+ if (!disk_faulty(spare)) {
+ /*
+ * the SPARE_ACTIVE diskop possibly changes the
+ * pointer too
+ */
+ mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
+ mark_disk_sync(spare);
+ mark_disk_active(spare);
+ sb->active_disks++;
+ sb->spare_disks--;
+ }
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
}
- if (!disk_faulty(spare)) {
- /*
- * the SPARE_ACTIVE diskop possibly changes the
- * pointer too
- */
- mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
- mark_disk_sync(spare);
- mark_disk_active(spare);
- sb->active_disks++;
- sb->spare_disks--;
- }
- mddev->sb_dirty = 1;
- md_update_sb(mddev);
+ unlock_mddev(mddev);
goto restart;
}
+ up_read(&all_mddevs_sem);
printk(KERN_INFO "md: recovery thread finished ...\n");
}
@@ -3680,7 +3747,7 @@ int md_notify_reboot(struct notifier_blo
printk(KERN_INFO "md: stopping all md devices.\n");
- ITERATE_MDDEV(mddev,tmp)
+ ITERATE_MDDEV_LOCK(mddev,tmp)
do_md_stop (mddev, 1);
/*
* certain more exotic SCSI devices are known to be
@@ -4024,6 +4091,9 @@ void md__init md_setup_drive(void)
mddev->sb_dirty = 0;
do_md_stop(mddev, 0);
printk(KERN_WARNING "md: starting md%d failed\n", minor);
+ } else {
+ unlock_mddev(mddev);
+ atomic_dec(&mddev->active);
}
}
}
diff ./drivers/md/multipath.c~current~ ./drivers/md/multipath.c
--- ./drivers/md/multipath.c~current~ 2003-09-19 11:55:45.000000000 +1000
+++ ./drivers/md/multipath.c 2003-09-19 11:53:09.000000000 +1000
@@ -700,8 +700,12 @@ static void multipathd (void *data)
md_spin_unlock_irqrestore(&retry_list_lock, flags);
mddev = mp_bh->mddev;
- if (mddev->sb_dirty)
- md_update_sb(mddev);
+ if (mddev->sb_dirty) {
+ lock_mddev(mddev);
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+ unlock_mddev(mddev);
+ }
bh = &mp_bh->bh_req;
dev = bh->b_dev;
diff ./drivers/md/raid1.c~current~ ./drivers/md/raid1.c
--- ./drivers/md/raid1.c~current~ 2003-09-19 11:55:45.000000000 +1000
+++ ./drivers/md/raid1.c 2003-09-19 11:53:09.000000000 +1000
@@ -1160,8 +1160,12 @@ static void raid1d (void *data)
mddev_t *mddev = conf->mddev;
kdev_t dev;
- if (mddev->sb_dirty)
- md_update_sb(mddev);
+ if (mddev->sb_dirty) {
+ lock_mddev(mddev);
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+ unlock_mddev(mddev);
+ }
for (;;) {
md_spin_lock_irqsave(&retry_list_lock, flags);
diff ./drivers/md/raid5.c~current~ ./drivers/md/raid5.c
--- ./drivers/md/raid5.c~current~ 2003-09-19 11:55:45.000000000 +1000
+++ ./drivers/md/raid5.c 2003-09-19 11:53:09.000000000 +1000
@@ -1301,8 +1301,12 @@ static void raid5d (void *data)
handled = 0;
- if (mddev->sb_dirty)
- md_update_sb(mddev);
+ if (mddev->sb_dirty) {
+ lock_mddev(mddev);
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+ unlock_mddev(mddev);
+ }
md_spin_lock_irq(&conf->device_lock);
while (1) {
struct list_head *first;
diff ./include/linux/raid/md_k.h~current~ ./include/linux/raid/md_k.h
--- ./include/linux/raid/md_k.h~current~ 2003-09-19 11:55:45.000000000 +1000
+++ ./include/linux/raid/md_k.h 2003-09-19 11:53:09.000000000 +1000
@@ -75,13 +75,6 @@ typedef struct dev_mapping_s {
extern dev_mapping_t mddev_map [MAX_MD_DEVS];
-static inline mddev_t * kdev_to_mddev (kdev_t dev)
-{
- if (MAJOR(dev) != MD_MAJOR)
- BUG();
- return mddev_map[MINOR(dev)].mddev;
-}
-
/*
* options passed in raidrun:
*/
@@ -207,6 +200,7 @@ struct mddev_s
unsigned long resync_mark_cnt;/* blocks written at resync_mark */
char *name;
int recovery_running;
+ int dying;
struct semaphore reconfig_sem;
struct semaphore recovery_sem;
struct semaphore resync_sem;
@@ -310,7 +304,20 @@ extern mdp_disk_t *get_spare(mddev_t *md
tmp = tmp->next, tmp->prev != &all_mddevs \
; )
-static inline int lock_mddev (mddev_t * mddev)
+#define ITERATE_MDDEV_LOCK(mddev,tmp) \
+ \
+ for (tmp = all_mddevs.next; \
+ mddev = md_list_entry(tmp, mddev_t, all_mddevs), \
+ tmp = tmp->next, tmp->prev != &all_mddevs \
+ && (down(&mddev->reconfig_sem), 1) \
+ ; up(&mddev->reconfig_sem)) if (!mddev->dying)
+
+static inline void lock_mddev (mddev_t * mddev)
+{
+ return down(&mddev->reconfig_sem);
+}
+
+static inline int lock_mddev_interruptible (mddev_t * mddev)
{
return down_interruptible(&mddev->reconfig_sem);
}
* Re: Locking bugs in 2.4 md.c
2003-09-19 2:08 ` Neil Brown
@ 2003-09-19 7:26 ` Lars Marowsky-Bree
2003-09-19 8:36 ` Lars Marowsky-Bree
2003-09-24 10:31 ` [fixes] " Lars Marowsky-Bree
1 sibling, 1 reply; 8+ messages in thread
From: Lars Marowsky-Bree @ 2003-09-19 7:26 UTC (permalink / raw)
To: Neil Brown; +Cc: linux-raid, axboe
On 2003-09-19T12:08:29,
Neil Brown <neilb@cse.unsw.edu.au> said:
> A patch against 2.4-current is below. It probably works but I haven't
> reviewed it in the context of other recent changes to md so I make no
> promises.
Thanks. I'll merge it against the current 2.4.
I had already started adding lock_mddev()/unlock_mddev() calls around
the likely candidates, but that didn't actually help: raid1d was somehow
racing with md_update_sb, and I couldn't figure out how ;-)
A global lock may indeed be better.
I'll let you know how it goes.
Sincerely,
Lars Marowsky-Brée <lmb@suse.de>
--
High Availability & Clustering ever tried. ever failed. no matter.
SuSE Labs try again. fail again. fail better.
Research & Development, SuSE Linux AG -- Samuel Beckett
* Re: Locking bugs in 2.4 md.c
2003-09-19 7:26 ` Lars Marowsky-Bree
@ 2003-09-19 8:36 ` Lars Marowsky-Bree
2003-09-19 14:18 ` Lars Marowsky-Bree
2003-09-22 5:54 ` Neil Brown
0 siblings, 2 replies; 8+ messages in thread
From: Lars Marowsky-Bree @ 2003-09-19 8:36 UTC (permalink / raw)
To: Neil Brown; +Cc: linux-raid, axboe
On 2003-09-19T09:26:10,
Lars Marowsky-Bree <lmb@suse.de> said:
> > A patch against 2.4-current is below. It probably works but I haven't
> > reviewed it in the context of other recent changes to md so I make no
> > promises.
> Thanks. I'll merge it against the current 2.4.
Hi Neil, it turns out your patch does exactly what I did with
lock_mddev(), just at a coarser granularity (which I think is a good
thing, but I was too shy to introduce a new lock).
However, I can still reliably panic the kernel with the sequence I gave
on an SMP box, even though the panic now seems to occur in different
places...
So if you have any further ideas about what could cause this, I'd
appreciate any hints ;-)
I'll dig in deeper now and fire up kgdb...
Sincerely,
Lars Marowsky-Brée <lmb@suse.de>
--
High Availability & Clustering ever tried. ever failed. no matter.
SuSE Labs try again. fail again. fail better.
Research & Development, SuSE Linux AG -- Samuel Beckett
* Re: Locking bugs in 2.4 md.c
2003-09-19 8:36 ` Lars Marowsky-Bree
@ 2003-09-19 14:18 ` Lars Marowsky-Bree
2003-09-22 5:54 ` Neil Brown
1 sibling, 0 replies; 8+ messages in thread
From: Lars Marowsky-Bree @ 2003-09-19 14:18 UTC (permalink / raw)
To: linux-raid
[-- Attachment #1: Type: text/plain, Size: 922 bytes --]
On 2003-09-19T10:36:40,
Lars Marowsky-Bree <lmb@suse.de> said:
OK. Some good news, some bad news.
Neil's patch (my version of it is attached) does fix the bugs I saw,
if combined with my additional change to raid1.c to stop printing
empty slots (printing them is completely pointless anyway; I only added
the change to reduce useless debugging output).
That this second change is required suggests that timing is the
critical part. (I was logging to the serial console, so the change
altered the timing immensely.)
Alas, I can now no longer reproduce the bug on an 8-way HT machine. I
was never fully able to figure out where it occurred, but at least the
attached patch is a partial fix for the bug...
Sincerely,
Lars Marowsky-Brée <lmb@suse.de>
--
High Availability & Clustering ever tried. ever failed. no matter.
SuSE Labs try again. fail again. fail better.
Research & Development, SUSE LINUX AG -- Samuel Beckett
[-- Attachment #2: md-locking --]
[-- Type: text/plain, Size: 16802 bytes --]
diff -x '*~' -x '*.rej' -x '*.orig' -x CVS -x '*.o' -x '.*' -ruN linux-2.4.21-SP3.orig/include/linux/raid/md_k.h linux-2.4.21-SP3.lmb/include/linux/raid/md_k.h
--- linux-2.4.21-SP3.orig/include/linux/raid/md_k.h 2003-09-18 12:07:48.000000000 +0200
+++ linux-2.4.21-SP3.lmb/include/linux/raid/md_k.h 2003-09-19 09:49:41.000000000 +0200
@@ -75,13 +75,6 @@
extern dev_mapping_t mddev_map [MAX_MD_DEVS];
-static inline mddev_t * kdev_to_mddev (kdev_t dev)
-{
- if (MAJOR(dev) != MD_MAJOR)
- BUG();
- return mddev_map[MINOR(dev)].mddev;
-}
-
/*
* options passed in raidrun:
*/
@@ -214,6 +210,7 @@
unsigned long resync_mark_cnt;/* blocks written at resync_mark */
char *name;
int recovery_running;
+ int dying;
struct semaphore reconfig_sem;
struct semaphore recovery_sem;
struct semaphore resync_sem;
@@ -317,7 +314,20 @@
tmp = tmp->next, tmp->prev != &all_mddevs \
; )
-static inline int lock_mddev (mddev_t * mddev)
+#define ITERATE_MDDEV_LOCK(mddev,tmp) \
+ \
+ for (tmp = all_mddevs.next; \
+ mddev = md_list_entry(tmp, mddev_t, all_mddevs), \
+ tmp = tmp->next, tmp->prev != &all_mddevs \
+ && (down(&mddev->reconfig_sem), 1) \
+ ; up(&mddev->reconfig_sem)) if (!mddev->dying)
+
+static inline void lock_mddev (mddev_t * mddev)
+{
+ return down(&mddev->reconfig_sem);
+}
+
+static inline int lock_mddev_interruptible (mddev_t * mddev)
{
return down_interruptible(&mddev->reconfig_sem);
}
diff -x '*~' -x '*.rej' -x '*.orig' -x CVS -x '*.o' -x '.*' -ruN linux-2.4.21-SP3.orig/drivers/md/md.c linux-2.4.21-SP3.lmb/drivers/md/md.c
--- linux-2.4.21-SP3.orig/drivers/md/md.c 2003-09-18 12:08:06.000000000 +0200
+++ linux-2.4.21-SP3.lmb/drivers/md/md.c 2003-09-19 10:16:05.000000000 +0200
@@ -34,6 +34,7 @@
#include <linux/sysctl.h>
#include <linux/raid/xor.h>
#include <linux/devfs_fs_kernel.h>
+#include <linux/rwsem.h>
#include <linux/init.h>
@@ -130,16 +131,69 @@
/*
* Enables to iterate over all existing md arrays
+ *
+ * Locking rules:
+ * - access to all_mddevs requires all_mddevs_sem.
+ * - an mddev can be locked while all_mddevs_sem is held.
+ * - When removing an mddev, we
+ * lock the mddev
+ * check that ->active is 1 (us).
+ * set "dying"
+ * unlock the mddev
+ * claim all_mddevs_sem
+ * actually remove device
+ * release all_mddevs_sem
+ * - to get a reference to an mddev, we
+ * claim all_mddevs_sem
+ * find the mddev in the list
+ * check that it isn't "dying"
+ * increase ->active or take a lock
*/
static MD_LIST_HEAD(all_mddevs);
+static DECLARE_RWSEM(all_mddevs_sem);
/*
- * The mapping between kdev and mddev is not necessary a simple
+ * The mapping between kdev and mddev is not necessarily a simple
* one! Eg. HSM uses several sub-devices to implement Logical
* Volumes. All these sub-devices map to the same mddev.
*/
dev_mapping_t mddev_map[MAX_MD_DEVS];
+
+static inline mddev_t * kdev_to_mddev (kdev_t dev)
+{
+ mddev_t *mddev;
+ if (MAJOR(dev) != MD_MAJOR)
+ BUG();
+ down_read(&all_mddevs_sem);
+ mddev = mddev_map[MINOR(dev)].mddev;
+ if (mddev && !mddev->dying)
+ atomic_inc(&mddev->active);
+ else
+ mddev = NULL;
+ up_read(&all_mddevs_sem);
+ return mddev;
+}
+
+static inline mddev_t * kdev_to_mddev_lock_interruptible (kdev_t dev, int *err)
+{
+ mddev_t *mddev;
+ if (MAJOR(dev) != MD_MAJOR)
+ BUG();
+ down_read(&all_mddevs_sem);
+ mddev = mddev_map[MINOR(dev)].mddev;
+ *err = 0;
+ if (mddev) {
+ if (mddev->dying) {
+ *err = -EBUSY;
+ mddev = NULL;
+ } else
+ *err = lock_mddev_interruptible(mddev);
+ }
+ up_read(&all_mddevs_sem);
+ return mddev;
+}
+
void add_mddev_mapping(mddev_t * mddev, kdev_t dev, void *data)
{
unsigned int minor = MINOR(dev);
@@ -175,13 +229,19 @@
static int md_make_request(request_queue_t *q, int rw, struct buffer_head * bh)
{
mddev_t *mddev = kdev_to_mddev(bh->b_rdev);
+ int rv;
if (mddev && mddev->pers)
- return mddev->pers->make_request(mddev, rw, bh);
+ rv = mddev->pers->make_request(mddev, rw, bh);
else {
buffer_IO_error(bh);
- return 0;
+ rv = 0;
}
+ if (mddev)
+ /* should really drop count when request completes... */
+ if (atomic_dec_and_test(&mddev->active))
+ BUG();
+ return rv;
}
static mddev_t * alloc_mddev(kdev_t dev)
@@ -199,20 +259,22 @@
memset(mddev, 0, sizeof(*mddev));
mddev->__minor = MINOR(dev);
- init_MUTEX(&mddev->reconfig_sem);
+ init_MUTEX_LOCKED(&mddev->reconfig_sem);
init_MUTEX(&mddev->recovery_sem);
init_MUTEX(&mddev->resync_sem);
MD_INIT_LIST_HEAD(&mddev->disks);
MD_INIT_LIST_HEAD(&mddev->all_mddevs);
- atomic_set(&mddev->active, 0);
+ atomic_set(&mddev->active, 1);
/*
* The 'base' mddev is the one with data NULL.
* personalities can create additional mddevs
* if necessary.
*/
+ down_write(&all_mddevs_sem);
add_mddev_mapping(mddev, dev, 0);
md_list_add(&mddev->all_mddevs, &all_mddevs);
+ up_write(&all_mddevs_sem);
MOD_INC_USE_COUNT;
@@ -745,18 +807,10 @@
md_size[mdidx(mddev)] = 0;
md_hd_struct[mdidx(mddev)].nr_sects = 0;
- /*
- * Make sure nobody else is using this mddev
- * (careful, we rely on the global kernel lock here)
- */
- while (sem_getcount(&mddev->resync_sem) != 1)
- schedule();
- while (sem_getcount(&mddev->recovery_sem) != 1)
- schedule();
-
+ down_write(&all_mddevs_sem);
del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
md_list_del(&mddev->all_mddevs);
- MD_INIT_LIST_HEAD(&mddev->all_mddevs);
+ up_write(&all_mddevs_sem);
kfree(mddev);
MOD_DEC_USE_COUNT;
}
@@ -827,7 +881,10 @@
printk("md: **********************************\n");
printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
printk("md: **********************************\n");
- ITERATE_MDDEV(mddev,tmp) {
+
+ down_read(&all_mddevs_sem);
+ /* XXX Review whether locking is needed for the mddev here or not */
+ ITERATE_MDDEV_LOCK(mddev,tmp) {
printk("md%d: ", mdidx(mddev));
ITERATE_RDEV(mddev,rdev,tmp2)
@@ -842,6 +899,7 @@
ITERATE_RDEV(mddev,rdev,tmp2)
print_rdev(rdev);
}
+ up_read(&all_mddevs_sem);
printk("md: **********************************\n");
printk("\n");
}
@@ -922,10 +980,6 @@
MD_BUG();
return 1;
}
- if (rdev->faulty) {
- MD_BUG();
- return 1;
- }
if (rdev->sb->md_magic != MD_SB_MAGIC) {
MD_BUG();
return 1;
@@ -1011,6 +1065,11 @@
struct md_list_head *tmp;
mdk_rdev_t *rdev;
+ if (!mddev->dying && !down_trylock(&mddev->reconfig_sem)) {
+ up(&mddev->reconfig_sem);
+ BUG();
+ }
+
if (!mddev->sb_dirty) {
printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0));
return 0;
@@ -1047,8 +1106,13 @@
printk(KERN_INFO "md: %s ",
partition_name(rdev->dev));
- if (rdev->faulty) {
+ if (!rdev) {
+ MD_BUG();
+ printk("(rdev == NULL)\n");
+ } else if (rdev->faulty) {
printk("(skipping faulty)\n");
+ } else if (!rdev->sb) {
+ printk("(rdev->sb == NULL)\n");
} else if (rdev_is_alias(rdev)) {
printk("(skipping alias)\n");
} else if (disk_faulty(&rdev->sb->this_disk)) {
@@ -1815,6 +1879,9 @@
if (mddev->recovery_running)
md_interrupt_thread(md_recovery_thread);
+ mddev->dying = 1; /* make sure nobody tries to use this */
+ unlock_mddev(mddev);
+
/*
* This synchronizes with signal delivery to the
* resync or reconstruction thread. It also nicely
@@ -1836,6 +1903,7 @@
if (mddev->pers->stop(mddev)) {
if (mddev->ro)
set_device_ro(dev, 1);
+ mddev->dying = 0;
OUT(-EBUSY);
}
if (mddev->ro)
@@ -1853,8 +1921,11 @@
mddev->sb_dirty = 1;
md_update_sb(mddev);
}
- if (ro)
+ if (ro) {
set_device_ro(dev, 1);
+ lock_mddev(mddev);
+ mddev->dying = 0;
+ }
}
/*
@@ -1886,7 +1957,7 @@
}
-static void autorun_array(mddev_t *mddev)
+static int autorun_array(mddev_t *mddev)
{
mdk_rdev_t *rdev;
struct md_list_head *tmp;
@@ -1894,7 +1965,7 @@
if (mddev->disks.prev == &mddev->disks) {
MD_BUG();
- return;
+ goto out_unlock;
}
printk(KERN_INFO "md: running: ");
@@ -1912,6 +1983,7 @@
*/
mddev->sb_dirty = 0;
do_md_stop (mddev, 0);
+ return err;
} else {
/* Create an rdev for the freshly started md device
* and add to the end of the list */
@@ -1924,17 +1996,21 @@
* imported the device! */
if (!rdev) {
MD_BUG();
- return;
+ goto out_unlock;
}
if (rdev->faulty) {
MD_BUG();
- return;
+ goto out_unlock;
}
printk("md: added md%d to the autodetection\n",
mdidx(mddev));
md_list_add(&rdev->pending, pending_raid_disks.prev);
}
}
+
+out_unlock:
+ unlock_mddev(mddev);
+ return 0;
}
/*
@@ -1990,6 +2066,7 @@
mdidx(mddev), partition_name(rdev0->dev));
ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp)
export_rdev(rdev);
+ atomic_dec(&mddev->active);
continue;
}
mddev = alloc_mddev(md_kdev);
@@ -1997,15 +2074,15 @@
printk(KERN_ERR "md: cannot allocate memory for md drive.\n");
break;
}
- if (md_kdev == countdev)
- atomic_inc(&mddev->active);
printk(KERN_INFO "md: created md%d\n", mdidx(mddev));
ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
bind_rdev_to_array(rdev, mddev);
md_list_del(&rdev->pending);
MD_INIT_LIST_HEAD(&rdev->pending);
}
- autorun_array(mddev);
+ if (autorun_array(mddev)== 0
+ && md_kdev != countdev)
+ atomic_dec(&mddev->active);
}
printk(KERN_INFO "md: ... autorun DONE.\n");
}
@@ -2697,7 +2774,9 @@
* Commands creating/starting a new array:
*/
- mddev = kdev_to_mddev(dev);
+ mddev = kdev_to_mddev_lock_interruptible(dev, &err);
+ if (mddev == NULL && err)
+ goto abort;
switch (cmd)
{
@@ -2707,7 +2786,7 @@
printk(KERN_WARNING "md: array md%d already exists!\n",
mdidx(mddev));
err = -EEXIST;
- goto abort;
+ goto abort_unlock;
}
default:;
}
@@ -2719,17 +2798,6 @@
err = -ENOMEM;
goto abort;
}
- atomic_inc(&mddev->active);
-
- /*
- * alloc_mddev() should possibly self-lock.
- */
- err = lock_mddev(mddev);
- if (err) {
- printk(KERN_WARNING "md: ioctl, reason %d, cmd %d\n",
- err, cmd);
- goto abort;
- }
if (mddev->sb) {
printk(KERN_WARNING "md: array md%d already has a superblock!\n",
@@ -2752,14 +2820,11 @@
goto done_unlock;
case START_ARRAY:
- /*
- * possibly make it lock the array ...
- */
err = autostart_array((kdev_t)arg, dev);
if (err) {
printk(KERN_WARNING "md: autostart %s failed!\n",
partition_name((kdev_t)arg));
- goto abort;
+ goto abort_unlock;
}
goto done;
@@ -2774,11 +2839,7 @@
err = -ENODEV;
goto abort;
}
- err = lock_mddev(mddev);
- if (err) {
- printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",err, cmd);
- goto abort;
- }
+
/* if we don't have a superblock yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */
if (!mddev->sb && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
err = -ENODEV;
@@ -2948,17 +3009,17 @@
/*
* Always succeed, but increment the usage count
*/
- mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
- if (mddev)
- atomic_inc(&mddev->active);
+ kdev_to_mddev(inode->i_rdev);
return (0);
}
static int md_release(struct inode *inode, struct file * file)
{
mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
- if (mddev)
+ if (mddev) {
atomic_dec(&mddev->active);
+ atomic_dec(&mddev->active);
+ }
return 0;
}
@@ -3239,7 +3300,8 @@
{
struct list_head *tmp;
mddev_t *next_mddev, *mddev = v;
-
+
+ down_read(&all_mddevs_sem);
++*pos;
if (v == (void*)2)
return NULL;
@@ -3261,7 +3323,7 @@
static void md_seq_stop(struct seq_file *seq, void *v)
{
-
+ up_read(&all_mddevs_sem);
}
static int md_seq_show(struct seq_file *seq, void *v)
@@ -3289,6 +3351,8 @@
status_unused(seq);
return 0;
}
+ if (mddev->dying)
+ return 0;
seq_printf(seq, "md%d : %sactive", mdidx(mddev),
mddev->pers ? "" : "in");
@@ -3489,6 +3553,7 @@
recheck:
serialize = 0;
+ down_read(&all_mddevs_sem);
ITERATE_MDDEV(mddev2,tmp) {
if (mddev2 == mddev)
continue;
@@ -3500,6 +3565,7 @@
break;
}
}
+ up_read(&all_mddevs_sem);
if (serialize) {
interruptible_sleep_on(&resync_wait);
if (md_signal_pending(current)) {
@@ -3638,8 +3704,10 @@
struct md_list_head *tmp;
printk(KERN_INFO "md: recovery thread got woken up ...\n");
-restart:
- ITERATE_MDDEV(mddev,tmp) {
+
+ restart:
+ down_read(&all_mddevs_sem);
+ ITERATE_MDDEV_LOCK(mddev,tmp) {
sb = mddev->sb;
if (!sb)
continue;
@@ -3668,9 +3736,13 @@
continue;
if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
continue;
+ unlock_mddev(mddev);
+ up_read(&all_mddevs_sem);
down(&mddev->recovery_sem);
mddev->recovery_running = 1;
err = md_do_sync(mddev, spare);
+
+ lock_mddev(mddev);
if (err == -EIO) {
printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n",
mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor)));
@@ -3696,26 +3768,28 @@
DISKOP_SPARE_INACTIVE);
up(&mddev->recovery_sem);
mddev->recovery_running = 0;
- continue;
} else {
mddev->recovery_running = 0;
up(&mddev->recovery_sem);
+
+ if (!disk_faulty(spare)) {
+ /*
+ * the SPARE_ACTIVE diskop possibly changes the
+ * pointer too
+ */
+ mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
+ mark_disk_sync(spare);
+ mark_disk_active(spare);
+ sb->active_disks++;
+ sb->spare_disks--;
+ }
+ mddev->sb_dirty = 1;
+ md_update_sb(mddev);
}
- if (!disk_faulty(spare)) {
- /*
- * the SPARE_ACTIVE diskop possibly changes the
- * pointer too
- */
- mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
- mark_disk_sync(spare);
- mark_disk_active(spare);
- sb->active_disks++;
- sb->spare_disks--;
- }
- mddev->sb_dirty = 1;
- md_update_sb(mddev);
+ unlock_mddev(mddev);
goto restart;
}
+ up_read(&all_mddevs_sem);
printk(KERN_INFO "md: recovery thread finished ...\n");
}
@@ -3731,7 +3805,7 @@
printk(KERN_INFO "md: stopping all md devices.\n");
- ITERATE_MDDEV(mddev,tmp)
+ ITERATE_MDDEV_LOCK(mddev,tmp)
do_md_stop (mddev, 1);
/*
* certain more exotic SCSI devices are known to be
@@ -4075,6 +4149,9 @@
mddev->sb_dirty = 0;
do_md_stop(mddev, 0);
printk(KERN_WARNING "md: starting md%d failed\n", minor);
+ } else {
+ unlock_mddev(mddev);
+ atomic_dec(&mddev->active);
}
}
}
diff -x '*~' -x '*.rej' -x '*.orig' -x CVS -x '*.o' -x '.*' -ruN linux-2.4.21-SP3.orig/drivers/md/multipath.c linux-2.4.21-SP3.lmb/drivers/md/multipath.c
--- linux-2.4.21-SP3.orig/drivers/md/multipath.c 2003-09-18 12:07:48.000000000 +0200
+++ linux-2.4.21-SP3.lmb/drivers/md/multipath.c 2003-09-19 09:49:41.000000000 +0200
@@ -971,8 +971,12 @@
md_spin_unlock_irqrestore(&retry_list_lock, flags);
mddev = mp_bh->mddev;
- if (mddev->sb_dirty)
- md_update_sb(mddev);
+ if (mddev->sb_dirty) {
+ lock_mddev(mddev);
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+ unlock_mddev(mddev);
+ }
bh = &mp_bh->bh_req;
dev = bh->b_dev;
diff -x '*~' -x '*.rej' -x '*.orig' -x CVS -x '*.o' -x '.*' -ruN linux-2.4.21-SP3.orig/drivers/md/raid1.c linux-2.4.21-SP3.lmb/drivers/md/raid1.c
--- linux-2.4.21-SP3.orig/drivers/md/raid1.c 2003-09-18 12:07:20.000000000 +0200
+++ linux-2.4.21-SP3.lmb/drivers/md/raid1.c 2003-09-19 10:46:42.000000000 +0200
@@ -823,6 +823,9 @@
for (i = 0; i < MD_SB_DISKS; i++) {
tmp = conf->mirrors + i;
+ /* lmb: Skip completely empty slots */
+ if (tmp->spare || tmp->operational || tmp->number ||
+ tmp->raid_disk || tmp->used_slot)
printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
i, tmp->spare,tmp->operational,
tmp->number,tmp->raid_disk,tmp->used_slot,
@@ -1161,8 +1164,12 @@
mddev_t *mddev = conf->mddev;
kdev_t dev;
- if (mddev->sb_dirty)
- md_update_sb(mddev);
+ if (mddev->sb_dirty) {
+ lock_mddev(mddev);
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+ unlock_mddev(mddev);
+ }
for (;;) {
md_spin_lock_irqsave(&retry_list_lock, flags);
diff -x '*~' -x '*.rej' -x '*.orig' -x CVS -x '*.o' -x '.*' -ruN linux-2.4.21-SP3.orig/drivers/md/raid5.c linux-2.4.21-SP3.lmb/drivers/md/raid5.c
--- linux-2.4.21-SP3.orig/drivers/md/raid5.c 2003-09-18 12:07:58.000000000 +0200
+++ linux-2.4.21-SP3.lmb/drivers/md/raid5.c 2003-09-19 09:49:41.000000000 +0200
@@ -1294,8 +1294,12 @@
handled = 0;
- if (mddev->sb_dirty)
- md_update_sb(mddev);
+ if (mddev->sb_dirty) {
+ lock_mddev(mddev);
+ if (mddev->sb_dirty)
+ md_update_sb(mddev);
+ unlock_mddev(mddev);
+ }
md_spin_lock_irq(&conf->device_lock);
while (1) {
struct list_head *first;
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: Locking bugs in 2.4 md.c
2003-09-19 8:36 ` Lars Marowsky-Bree
2003-09-19 14:18 ` Lars Marowsky-Bree
@ 2003-09-22 5:54 ` Neil Brown
2003-09-23 11:08 ` Lars Marowsky-Bree
1 sibling, 1 reply; 8+ messages in thread
From: Neil Brown @ 2003-09-22 5:54 UTC (permalink / raw)
To: Lars Marowsky-Bree; +Cc: linux-raid, axboe
On Friday September 19, lmb@suse.de wrote:
> On 2003-09-19T09:26:10,
> Lars Marowsky-Bree <lmb@suse.de> said:
>
> > > A patch against 2.4-current is below. It probably works but I haven't
> > > reviewed it in the context of other recent changes to md so I make no
> > > promises.
> > Thanks. I'll merge it against the current 2.4.
>
> Hi Neil, turns out your patch did exactly what I did with lock_mddev(),
> just with a bigger granularity (which I think is a good thing, but I was
> too shy to introduce a new lock).
>
> However, I can still reliably panic the kernel with the sequence I gave
> on a SMP box, even though the panic now seems to occur in different
> places...
I can't !!
If you can show me a stack trace or some pointers to where it panics, I
can look further.
NeilBrown
* Re: Locking bugs in 2.4 md.c
2003-09-22 5:54 ` Neil Brown
@ 2003-09-23 11:08 ` Lars Marowsky-Bree
0 siblings, 0 replies; 8+ messages in thread
From: Lars Marowsky-Bree @ 2003-09-23 11:08 UTC (permalink / raw)
To: linux-raid
On 2003-09-22T15:54:46,
Neil Brown <neilb@cse.unsw.edu.au> said:
> If you can show me a stack trace or some pointers to where it panics, I
> can look further.
OK, it looks like the patch indeed fixes the locking races in md. The
other one I've been seeing was unrelated to md.
Though I did change the ITERATE_MDDEV in the complete raid state
printout to the locked ITERATE_MDDEV_LOCK version.
Sincerely,
Lars Marowsky-Brée <lmb@suse.de>
--
High Availability & Clustering ever tried. ever failed. no matter.
SuSE Labs try again. fail again. fail better.
Research & Development, SUSE LINUX AG -- Samuel Beckett
-
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
* [fixes] Re: Locking bugs in 2.4 md.c
2003-09-19 2:08 ` Neil Brown
2003-09-19 7:26 ` Lars Marowsky-Bree
@ 2003-09-24 10:31 ` Lars Marowsky-Bree
1 sibling, 0 replies; 8+ messages in thread
From: Lars Marowsky-Bree @ 2003-09-24 10:31 UTC (permalink / raw)
To: Neil Brown; +Cc: linux-raid
[-- Attachment #1.1: Type: text/plain, Size: 946 bytes --]
On 2003-09-19T12:08:29,
Neil Brown <neilb@cse.unsw.edu.au> said:
Hi Neil,
your patch needs the following two addenda.
Andi Kleen: Locking in md_seq_next is wrong, should happen in
md_seq_start
Andrea & I: Refcounting for the raidautostart feature is wrong.
This is actually a long-standing bug which is caught by your diffs,
which notice the wrong counter and BUG().
What would happen: user space open()s /dev/md0 and issues
ioctl(RAID_AUTOSTART), which creates all md devices and ultimately
leaves active == 0; the subsequent close() of /dev/md0 then decrements
active to -1, and the next call to md_make_request oopses.
The patch fixes this and also adds a safeguard to md_release().
Sincerely,
Lars Marowsky-Brée <lmb@suse.de>
--
High Availability & Clustering ever tried. ever failed. no matter.
SuSE Labs try again. fail again. fail better.
Research & Development, SUSE LINUX AG -- Samuel Beckett
[-- Attachment #1.2: md-seqlock --]
[-- Type: text/plain, Size: 442 bytes --]
--- linux/drivers/md/md.c-o 2003-09-24 09:45:39.000000000 +0000
+++ linux/drivers/md/md.c 2003-09-24 09:47:23.000000000 +0000
@@ -3287,6 +3287,7 @@
if (l > 0x10000)
return NULL;
+ down_read(&all_mddevs_sem);
if (!l--)
/* header */
return (void*)1;
@@ -3304,7 +3305,6 @@
struct list_head *tmp;
mddev_t *next_mddev, *mddev = v;
- down_read(&all_mddevs_sem);
++*pos;
if (v == (void*)2)
return NULL;
[-- Attachment #1.3: md-refcount-fix --]
[-- Type: text/plain, Size: 1351 bytes --]
--- linux-2.4.21/drivers/md/md.c 2003-09-24 10:54:35.000000000 +0200
+++ linux-2.4.21.new/drivers/md/md.c 2003-09-24 10:53:34.000000000 +0200
@@ -60,7 +60,7 @@
#endif
#ifndef MODULE
-static void autostart_arrays (void);
+static void autostart_arrays (kdev_t);
#endif
static mdk_personality_t *pers[MAX_PERSONALITY];
@@ -2753,7 +2753,7 @@
#ifndef MODULE
case RAID_AUTORUN:
err = 0;
- autostart_arrays();
+ autostart_arrays(dev);
goto done;
#endif
@@ -3017,8 +3017,11 @@
{
mddev_t *mddev = kdev_to_mddev(inode->i_rdev);
if (mddev) {
- atomic_dec(&mddev->active);
- atomic_dec(&mddev->active);
+ if (atomic_dec_and_test(&mddev->active))
+ /* Refcount should _never_ drop to zero here! */
+ BUG();
+ else
+ atomic_dec(&mddev->active);
}
return 0;
}
@@ -3923,7 +3926,7 @@
}
-static void autostart_arrays(void)
+static void autostart_arrays(kdev_t countdev)
{
mdk_rdev_t *rdev;
int i;
@@ -3954,7 +3957,7 @@
}
dev_cnt = 0;
- autorun_devices(-1);
+ autorun_devices(countdev);
}
static struct {
@@ -4183,7 +4186,7 @@
if (raid_setup_args.noautodetect)
printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=noautodetect)\n");
else
- autostart_arrays();
+ autostart_arrays(-1);
md_setup_drive();
return 0;
}
[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]
end of thread, other threads:[~2003-09-24 10:31 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-09-18 15:23 Locking bugs in 2.4 md.c Lars Marowsky-Bree
2003-09-19 2:08 ` Neil Brown
2003-09-19 7:26 ` Lars Marowsky-Bree
2003-09-19 8:36 ` Lars Marowsky-Bree
2003-09-19 14:18 ` Lars Marowsky-Bree
2003-09-22 5:54 ` Neil Brown
2003-09-23 11:08 ` Lars Marowsky-Bree
2003-09-24 10:31 ` [fixes] " Lars Marowsky-Bree