linux-raid.vger.kernel.org archive mirror
* [PATCH AUTOSEL 5.4 154/350] md: no longer compare spare disk superblock events in super_load
       [not found] <20191210210735.9077-1-sashal@kernel.org>
@ 2019-12-10 21:04 ` Sasha Levin
  2019-12-10 21:04 ` [PATCH AUTOSEL 5.4 156/350] md/bitmap: avoid race window between md_bitmap_resize and bitmap_file_clear_bit Sasha Levin
  1 sibling, 0 replies; 2+ messages in thread
From: Sasha Levin @ 2019-12-10 21:04 UTC (permalink / raw)
  To: linux-kernel, stable; +Cc: Yufen Yu, Song Liu, Sasha Levin, linux-raid

From: Yufen Yu <yuyufen@huawei.com>

[ Upstream commit 6a5cb53aaa4ef515ddeffa04ce18b771121127b4 ]

We have a test case as follows:

  mdadm -CR /dev/md1 -l 1 -n 4 /dev/sd[a-d] \
	--assume-clean --bitmap=internal
  mdadm -S /dev/md1
  mdadm -A /dev/md1 /dev/sd[b-c] --run --force

  mdadm --zero /dev/sda
  mdadm /dev/md1 -a /dev/sda

  echo offline > /sys/block/sdc/device/state
  echo offline > /sys/block/sdb/device/state
  sleep 5
  mdadm -S /dev/md1

  echo running > /sys/block/sdb/device/state
  echo running > /sys/block/sdc/device/state
  mdadm -A /dev/md1 /dev/sd[a-c] --run --force

When we re-add /dev/sda to the array, it starts recovery. After the
other two disks in md1 are set offline, the recovery is interrupted
and the superblock update info cannot be written to the offline disks,
while the spare disk (/dev/sda) can still continue to update its
superblock info.

After stopping the array and assembling it again, we found that the
array fails to run, with the following kernel messages:

[  172.986064] md: kicking non-fresh sdb from array!
[  173.004210] md: kicking non-fresh sdc from array!
[  173.022383] md/raid1:md1: active with 0 out of 4 mirrors
[  173.022406] md1: failed to create bitmap (-5)
[  173.023466] md: md1 stopped.

Since both sdb and sdc have an 'sb->events' value smaller than the one
in sda, they are kicked from the array. However, the only remaining
disk, sda, was in the 'spare' state before the stop, so it cannot be
added to the conf->mirrors[] array. In the end, the array fails to
assemble and run.

In fact, we could have used the older disks sdb or sdc to assemble the
array. That means we should not choose a 'spare' disk as the fresh disk
in analyze_sbs().

To fix the problem, do not compare superblock events when the disk is a
spare, the same as validate_super() already does.
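
To make the selection logic easier to follow outside the kernel, here is
a minimal user-space sketch (the struct, disk names and event counts are
made up for illustration; this is not the md code itself) of how assembly
picks the freshest member once spares are excluded from the event-counter
comparison:

  /*
   * Standalone illustration only: 'struct member' and the sample data
   * below are hypothetical and do not exist in drivers/md.
   */
  #include <stdbool.h>
  #include <stdio.h>

  struct member {
      const char *name;
      unsigned long long events;  /* stands in for sb->events */
      bool is_spare;              /* neither SYNC nor ACTIVE */
  };

  int main(void)
  {
      struct member disks[] = {
          { "sda", 110, true  },  /* spare kept updating its sb */
          { "sdb", 100, false },
          { "sdc", 100, false },
      };
      const struct member *freshest = NULL;
      int i;

      for (i = 0; i < 3; i++) {
          /*
           * Skip spares: their event count says nothing about how
           * up to date the array metadata is.
           */
          if (disks[i].is_spare)
              continue;
          if (!freshest || disks[i].events > freshest->events)
              freshest = &disks[i];
      }

      printf("freshest: %s\n", freshest ? freshest->name : "none");
      return 0;
  }

Without the is_spare check, sda (events 110) would be chosen as freshest,
sdb and sdc would be kicked as non-fresh, and assembly would fail exactly
as in the log above.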

Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/md/md.c | 57 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 51 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1be7abeb24fdc..fc6ae8276a92f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1149,7 +1149,15 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
 		rdev->desc_nr = sb->this_disk.number;
 
 	if (!refdev) {
-		ret = 1;
+		/*
+		 * Insist on good event counter while assembling, except
+		 * for spares (which don't need an event count)
+		 */
+		if (sb->disks[rdev->desc_nr].state & (
+			(1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
+			ret = 1;
+		else
+			ret = 0;
 	} else {
 		__u64 ev1, ev2;
 		mdp_super_t *refsb = page_address(refdev->sb_page);
@@ -1165,7 +1173,14 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
 		}
 		ev1 = md_event(sb);
 		ev2 = md_event(refsb);
-		if (ev1 > ev2)
+
+		/*
+		 * Insist on good event counter while assembling, except
+		 * for spares (which don't need an event count)
+		 */
+		if (sb->disks[rdev->desc_nr].state & (
+			(1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)) &&
+			(ev1 > ev2))
 			ret = 1;
 		else
 			ret = 0;
@@ -1525,6 +1540,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
 	sector_t sectors;
 	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
 	int bmask;
+	__u64 role;
 
 	/*
 	 * Calculate the position of the superblock in 512byte sectors.
@@ -1658,8 +1674,20 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
 	    sb->level != 0)
 		return -EINVAL;
 
+	role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+
 	if (!refdev) {
-		ret = 1;
+		/*
+		 * Insist on good event counter while assembling, except for
+		 * spares (which don't need an event count)
+		 */
+		if (rdev->desc_nr >= 0 &&
+		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
+			(role < MD_DISK_ROLE_MAX ||
+			 role == MD_DISK_ROLE_JOURNAL))
+			ret = 1;
+		else
+			ret = 0;
 	} else {
 		__u64 ev1, ev2;
 		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
@@ -1676,7 +1704,14 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
 		ev1 = le64_to_cpu(sb->events);
 		ev2 = le64_to_cpu(refsb->events);
 
-		if (ev1 > ev2)
+		/*
+		 * Insist on good event counter while assembling, except for
+		 * spares (which don't need an event count)
+		 */
+		if (rdev->desc_nr >= 0 &&
+		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
+			(role < MD_DISK_ROLE_MAX ||
+			 role == MD_DISK_ROLE_JOURNAL) && ev1 > ev2)
 			ret = 1;
 		else
 			ret = 0;
@@ -3597,7 +3632,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
  * Check a full RAID array for plausibility
  */
 
-static void analyze_sbs(struct mddev *mddev)
+static int analyze_sbs(struct mddev *mddev)
 {
 	int i;
 	struct md_rdev *rdev, *freshest, *tmp;
@@ -3618,6 +3653,12 @@ static void analyze_sbs(struct mddev *mddev)
 			md_kick_rdev_from_array(rdev);
 		}
 
+	/* Cannot find a valid fresh disk */
+	if (!freshest) {
+		pr_warn("md: cannot find a valid disk\n");
+		return -EINVAL;
+	}
+
 	super_types[mddev->major_version].
 		validate_super(mddev, freshest);
 
@@ -3652,6 +3693,8 @@ static void analyze_sbs(struct mddev *mddev)
 			clear_bit(In_sync, &rdev->flags);
 		}
 	}
+
+	return 0;
 }
 
 /* Read a fixed-point number.
@@ -5570,7 +5613,9 @@ int md_run(struct mddev *mddev)
 	if (!mddev->raid_disks) {
 		if (!mddev->persistent)
 			return -EINVAL;
-		analyze_sbs(mddev);
+		err = analyze_sbs(mddev);
+		if (err)
+			return -EINVAL;
 	}
 
 	if (mddev->level != LEVEL_NONE)
-- 
2.20.1


* [PATCH AUTOSEL 5.4 156/350] md/bitmap: avoid race window between md_bitmap_resize and bitmap_file_clear_bit
       [not found] <20191210210735.9077-1-sashal@kernel.org>
  2019-12-10 21:04 ` [PATCH AUTOSEL 5.4 154/350] md: no longer compare spare disk superblock events in super_load Sasha Levin
@ 2019-12-10 21:04 ` Sasha Levin
  1 sibling, 0 replies; 2+ messages in thread
From: Sasha Levin @ 2019-12-10 21:04 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Guoqing Jiang, Jack Wang, NeilBrown, Song Liu, Sasha Levin,
	linux-raid

From: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>

[ Upstream commit fadcbd2901a0f7c8721f3bdb69eac95c272dc8ed ]

We need to move "spin_lock_irq(&bitmap->counts.lock)" to before the unmap
of the previous storage, otherwise a panic like the one below can happen.

[  902.353802] sdl: detected capacity change from 1077936128 to 3221225472
[  902.616948] general protection fault: 0000 [#1] SMP
[snip]
[  902.618588] CPU: 12 PID: 33698 Comm: md0_raid1 Tainted: G           O    4.14.144-1-pserver #4.14.144-1.1~deb10
[  902.618870] Hardware name: Supermicro SBA-7142G-T4/BHQGE, BIOS 3.00       10/24/2012
[  902.619120] task: ffff9ae1860fc600 task.stack: ffffb52e4c704000
[  902.619301] RIP: 0010:bitmap_file_clear_bit+0x90/0xd0 [md_mod]
[  902.619464] RSP: 0018:ffffb52e4c707d28 EFLAGS: 00010087
[  902.619626] RAX: ffe8008b0d061000 RBX: ffff9ad078c87300 RCX: 0000000000000000
[  902.619792] RDX: ffff9ad986341868 RSI: 0000000000000803 RDI: ffff9ad078c87300
[  902.619986] RBP: ffff9ad0ed7a8000 R08: 0000000000000000 R09: 0000000000000000
[  902.620154] R10: ffffb52e4c707ec0 R11: ffff9ad987d1ed44 R12: ffff9ad0ed7a8360
[  902.620320] R13: 0000000000000003 R14: 0000000000060000 R15: 0000000000000800
[  902.620487] FS:  0000000000000000(0000) GS:ffff9ad987d00000(0000) knlGS:0000000000000000
[  902.620738] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  902.620901] CR2: 000055ff12aecec0 CR3: 0000001005207000 CR4: 00000000000406e0
[  902.621068] Call Trace:
[  902.621256]  bitmap_daemon_work+0x2dd/0x360 [md_mod]
[  902.621429]  ? find_pers+0x70/0x70 [md_mod]
[  902.621597]  md_check_recovery+0x51/0x540 [md_mod]
[  902.621762]  raid1d+0x5c/0xeb0 [raid1]
[  902.621939]  ? try_to_del_timer_sync+0x4d/0x80
[  902.622102]  ? del_timer_sync+0x35/0x40
[  902.622265]  ? schedule_timeout+0x177/0x360
[  902.622453]  ? call_timer_fn+0x130/0x130
[  902.622623]  ? find_pers+0x70/0x70 [md_mod]
[  902.622794]  ? md_thread+0x94/0x150 [md_mod]
[  902.622959]  md_thread+0x94/0x150 [md_mod]
[  902.623121]  ? wait_woken+0x80/0x80
[  902.623280]  kthread+0x119/0x130
[  902.623437]  ? kthread_create_on_node+0x60/0x60
[  902.623600]  ret_from_fork+0x22/0x40
[  902.624225] RIP: bitmap_file_clear_bit+0x90/0xd0 [md_mod] RSP: ffffb52e4c707d28

This happens because mdadm was running on another CPU doing a resize, so
bitmap_resize was called to replace the bitmap, as the trace below shows.

PID: 38801  TASK: ffff9ad074a90e00  CPU: 0   COMMAND: "mdadm"
   [exception RIP: queued_spin_lock_slowpath+56]
   [snip]
-- <NMI exception stack> --
 #5 [ffffb52e60f17c58] queued_spin_lock_slowpath at ffffffff9c0b27b8
 #6 [ffffb52e60f17c58] bitmap_resize at ffffffffc0399877 [md_mod]
 #7 [ffffb52e60f17d30] raid1_resize at ffffffffc0285bf9 [raid1]
 #8 [ffffb52e60f17d50] update_size at ffffffffc038a31a [md_mod]
 #9 [ffffb52e60f17d70] md_ioctl at ffffffffc0395ca4 [md_mod]

The procedure that keeps bitmap resizing safe is: allocate the new
storage space, then quiesce, copy the bits, replace the bitmap, and
restart.

However, the daemon (bitmap_daemon_work) can still run even while the
array is quiesced. So when bitmap_file_clear_bit is triggered by raid1d,
it assumes it is safe to access store->filemap because counts->lock is
held, but resize could change the storage without the protection of that
lock.
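
The shape of this race, and why taking the lock before tearing down the
old storage closes it, can be modelled with a small user-space sketch
(pthread-based, with hypothetical names such as 'storage' and 'resize';
it only models the pattern, not the md-bitmap code itself):

  /*
   * 'storage' stands in for bitmap->storage/filemap, resize() for
   * md_bitmap_resize(), and worker_clear_bit() for the path
   * bitmap_daemon_work -> bitmap_file_clear_bit.
   */
  #include <pthread.h>
  #include <stdlib.h>
  #include <string.h>

  static pthread_mutex_t counts_lock = PTHREAD_MUTEX_INITIALIZER;
  static unsigned char *storage;
  static size_t storage_size = 4096;

  static void worker_clear_bit(size_t bit)
  {
      pthread_mutex_lock(&counts_lock);
      /*
       * Safe only if replacement of 'storage' also happens under
       * counts_lock; otherwise this may dereference freed memory.
       */
      storage[bit / 8] &= ~(1u << (bit % 8));
      pthread_mutex_unlock(&counts_lock);
  }

  static void resize(size_t new_size)
  {
      unsigned char *new_storage = calloc(1, new_size);
      size_t copy = new_size < storage_size ? new_size : storage_size;

      memcpy(new_storage, storage, copy);

      /*
       * The fix: take the lock *before* freeing ("unmapping") the old
       * storage, so no concurrent worker can still be using it.
       */
      pthread_mutex_lock(&counts_lock);
      free(storage);
      storage = new_storage;
      storage_size = new_size;
      pthread_mutex_unlock(&counts_lock);
  }

  int main(void)
  {
      storage = calloc(1, storage_size);
      worker_clear_bit(12);
      resize(2 * storage_size);
      worker_clear_bit(12);
      free(storage);
      return 0;
  }

Before the fix, the lock was only taken after the old storage had been
unmapped and replaced, so the worker could win the race and write through
a stale mapping, which is the general protection fault seen above.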

Cc: Jack Wang <jinpu.wang@cloud.ionos.com>
Cc: NeilBrown <neilb@suse.com>
Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/md/md-bitmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index b092c7b5282f9..3ad18246fcb3c 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -2139,6 +2139,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 		memcpy(page_address(store.sb_page),
 		       page_address(bitmap->storage.sb_page),
 		       sizeof(bitmap_super_t));
+	spin_lock_irq(&bitmap->counts.lock);
 	md_bitmap_file_unmap(&bitmap->storage);
 	bitmap->storage = store;
 
@@ -2154,7 +2155,6 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 	blocks = min(old_counts.chunks << old_counts.chunkshift,
 		     chunks << chunkshift);
 
-	spin_lock_irq(&bitmap->counts.lock);
 	/* For cluster raid, need to pre-allocate bitmap */
 	if (mddev_is_clustered(bitmap->mddev)) {
 		unsigned long page;
-- 
2.20.1
