All of lore.kernel.org
 help / color / mirror / Atom feed
From: NeilBrown <neilb@suse.de>
To: Andrew Morton <akpm@osdl.org>
Cc: linux-raid@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 007 of 11] md: Allow checkpoint of recovery with version-1 superblock.
Date: Mon, 1 May 2006 15:30:42 +1000	[thread overview]
Message-ID: <1060501053042.22985@suse.de> (raw)
In-Reply-To: 20060501152229.18367.patches@notabene


For a while we have had checkpointing of resync.
The version-1 superblock allows recovery to be checkpointed
as well, and this patch implements that.

Due to early carelessness we need to add a feature flag
to signal that the recovery_offset field is in use; otherwise
older kernels would assume that a partially recovered array
is in fact fully recovered.

Signed-off-by: Neil Brown <neilb@suse.de>

### Diffstat output
 ./drivers/md/md.c           |  115 +++++++++++++++++++++++++++++++++++---------
 ./drivers/md/raid1.c        |    3 -
 ./drivers/md/raid10.c       |    3 -
 ./drivers/md/raid5.c        |    1 
 ./include/linux/raid/md_k.h |    6 ++
 ./include/linux/raid/md_p.h |    5 +
 6 files changed, 109 insertions(+), 24 deletions(-)

diff ./drivers/md/md.c~current~ ./drivers/md/md.c
--- ./drivers/md/md.c~current~	2006-05-01 15:10:18.000000000 +1000
+++ ./drivers/md/md.c	2006-05-01 15:12:34.000000000 +1000
@@ -1165,7 +1165,11 @@ static int super_1_validate(mddev_t *mdd
 			set_bit(Faulty, &rdev->flags);
 			break;
 		default:
-			set_bit(In_sync, &rdev->flags);
+			if ((le32_to_cpu(sb->feature_map) &
+			     MD_FEATURE_RECOVERY_OFFSET))
+				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
+			else
+				set_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = role;
 			break;
 		}
@@ -1189,6 +1193,7 @@ static void super_1_sync(mddev_t *mddev,
 
 	sb->feature_map = 0;
 	sb->pad0 = 0;
+	sb->recovery_offset = cpu_to_le64(0);
 	memset(sb->pad1, 0, sizeof(sb->pad1));
 	memset(sb->pad2, 0, sizeof(sb->pad2));
 	memset(sb->pad3, 0, sizeof(sb->pad3));
@@ -1209,6 +1214,14 @@ static void super_1_sync(mddev_t *mddev,
 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
 	}
+
+	if (rdev->raid_disk >= 0 &&
+	    !test_bit(In_sync, &rdev->flags) &&
+	    rdev->recovery_offset > 0) {
+		sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
+		sb->recovery_offset = cpu_to_le64(rdev->recovery_offset);
+	}
+
 	if (mddev->reshape_position != MaxSector) {
 		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
 		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
@@ -1233,11 +1246,12 @@ static void super_1_sync(mddev_t *mddev,
 			sb->dev_roles[i] = cpu_to_le16(0xfffe);
 		else if (test_bit(In_sync, &rdev2->flags))
 			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
+		else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
+			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
 		else
 			sb->dev_roles[i] = cpu_to_le16(0xffff);
 	}
 
-	sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
 	sb->sb_csum = calc_sb_1_csum(sb);
 }
 
@@ -2590,8 +2604,6 @@ static struct kobject *md_probe(dev_t de
 	return NULL;
 }
 
-void md_wakeup_thread(mdk_thread_t *thread);
-
 static void md_safemode_timeout(unsigned long data)
 {
 	mddev_t *mddev = (mddev_t *) data;
@@ -2773,6 +2785,36 @@ static int do_md_run(mddev_t * mddev)
 	mddev->queue->queuedata = mddev;
 	mddev->queue->make_request_fn = mddev->pers->make_request;
 
+	/* If there is a partially-recovered drive we need to
+	 * start recovery here.  If we leave it to md_check_recovery,
+	 * it will remove the drives and not do the right thing
+	 */
+	if (mddev->degraded) {
+		struct list_head *rtmp;
+		int spares = 0;
+		ITERATE_RDEV(mddev,rdev,rtmp)
+			if (rdev->raid_disk >= 0 &&
+			    !test_bit(In_sync, &rdev->flags) &&
+			    !test_bit(Faulty, &rdev->flags))
+				/* complete an interrupted recovery */
+				spares++;
+		if (spares && mddev->pers->sync_request) {
+			mddev->recovery = 0;
+			set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+			mddev->sync_thread = md_register_thread(md_do_sync,
+								mddev,
+								"%s_resync");
+			if (!mddev->sync_thread) {
+				printk(KERN_ERR "%s: could not start resync"
+				       " thread...\n",
+				       mdname(mddev));
+				/* leave the spares where they are, it shouldn't hurt */
+				mddev->recovery = 0;
+			} else
+				md_wakeup_thread(mddev->sync_thread);
+		}
+	}
+
 	mddev->changed = 1;
 	md_new_event(mddev);
 	return 0;
@@ -2806,6 +2848,7 @@ static int restart_array(mddev_t *mddev)
 		 */
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 		md_wakeup_thread(mddev->thread);
+		md_wakeup_thread(mddev->sync_thread);
 		err = 0;
 	} else {
 		printk(KERN_ERR "md: %s has no personality assigned.\n",
@@ -2829,6 +2872,7 @@ static int do_md_stop(mddev_t * mddev, i
 		}
 
 		if (mddev->sync_thread) {
+			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 			md_unregister_thread(mddev->sync_thread);
 			mddev->sync_thread = NULL;
@@ -2858,13 +2902,14 @@ static int do_md_stop(mddev_t * mddev, i
 			if (mddev->ro)
 				mddev->ro = 0;
 		}
-		if (!mddev->in_sync) {
+		if (!mddev->in_sync || mddev->sb_dirty) {
 			/* mark array as shutdown cleanly */
 			mddev->in_sync = 1;
 			md_update_sb(mddev);
 		}
 		if (ro)
 			set_disk_ro(disk, 1);
+		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	}
 
 	/*
@@ -4652,10 +4697,14 @@ void md_do_sync(mddev_t *mddev)
 	struct list_head *tmp;
 	sector_t last_check;
 	int skipped = 0;
+	struct list_head *rtmp;
+	mdk_rdev_t *rdev;
 
 	/* just incase thread restarts... */
 	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
 		return;
+	if (mddev->ro) /* never try to sync a read-only array */
+		return;
 
 	/* we overload curr_resync somewhat here.
 	 * 0 == not engaged in resync at all
@@ -4714,17 +4763,30 @@ void md_do_sync(mddev_t *mddev)
 		}
 	} while (mddev->curr_resync < 2);
 
+	j = 0;
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 		/* resync follows the size requested by the personality,
 		 * which defaults to physical size, but can be virtual size
 		 */
 		max_sectors = mddev->resync_max_sectors;
 		mddev->resync_mismatches = 0;
+		/* we don't use the checkpoint if there's a bitmap */
+		if (!mddev->bitmap &&
+		    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+			j = mddev->recovery_cp;
 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		max_sectors = mddev->size << 1;
-	else
+	else {
 		/* recovery follows the physical size of devices */
 		max_sectors = mddev->size << 1;
+		j = MaxSector;
+		ITERATE_RDEV(mddev,rdev,rtmp)
+			if (rdev->raid_disk >= 0 &&
+			    !test_bit(Faulty, &rdev->flags) &&
+			    !test_bit(In_sync, &rdev->flags) &&
+			    rdev->recovery_offset < j)
+				j = rdev->recovery_offset;
+	}
 
 	printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
 	printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
@@ -4734,12 +4796,7 @@ void md_do_sync(mddev_t *mddev)
 	       speed_max(mddev));
 
 	is_mddev_idle(mddev); /* this also initializes IO event counters */
-	/* we don't use the checkpoint if there's a bitmap */
-	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap
-	    && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
-		j = mddev->recovery_cp;
-	else
-		j = 0;
+
 	io_sectors = 0;
 	for (m = 0; m < SYNC_MARKS; m++) {
 		mark[m] = jiffies;
@@ -4860,15 +4917,28 @@ void md_do_sync(mddev_t *mddev)
 	if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
 	    test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
-	    mddev->curr_resync > 2 &&
-	    mddev->curr_resync >= mddev->recovery_cp) {
-		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
-			printk(KERN_INFO 
-				"md: checkpointing recovery of %s.\n",
-				mdname(mddev));
-			mddev->recovery_cp = mddev->curr_resync;
-		} else
-			mddev->recovery_cp = MaxSector;
+	    mddev->curr_resync > 2) {
+		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
+			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+				if (mddev->curr_resync >= mddev->recovery_cp) {
+					printk(KERN_INFO
+					       "md: checkpointing recovery of %s.\n",
+					       mdname(mddev));
+					mddev->recovery_cp = mddev->curr_resync;
+				}
+			} else
+				mddev->recovery_cp = MaxSector;
+		} else {
+			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
+				mddev->curr_resync = MaxSector;
+			ITERATE_RDEV(mddev,rdev,rtmp)
+				if (rdev->raid_disk >= 0 &&
+				    !test_bit(Faulty, &rdev->flags) &&
+				    !test_bit(In_sync, &rdev->flags) &&
+				    rdev->recovery_offset < mddev->curr_resync)
+					rdev->recovery_offset = mddev->curr_resync;
+			mddev->sb_dirty = 1;
+		}
 	}
 
  skip:
@@ -4989,6 +5059,8 @@ void md_check_recovery(mddev_t *mddev)
 		clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 
+		if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+			goto unlock;
 		/* no recovery is running.
 		 * remove any failed drives, then
 		 * add spares if possible.
@@ -5011,6 +5083,7 @@ void md_check_recovery(mddev_t *mddev)
 			ITERATE_RDEV(mddev,rdev,rtmp)
 				if (rdev->raid_disk < 0
 				    && !test_bit(Faulty, &rdev->flags)) {
+					rdev->recovery_offset = 0;
 					if (mddev->pers->hot_add_disk(mddev,rdev)) {
 						char nm[20];
 						sprintf(nm, "rd%d", rdev->raid_disk);

diff ./drivers/md/raid1.c~current~ ./drivers/md/raid1.c
--- ./drivers/md/raid1.c~current~	2006-05-01 15:10:00.000000000 +1000
+++ ./drivers/md/raid1.c	2006-05-01 15:12:34.000000000 +1000
@@ -1888,7 +1888,8 @@ static int run(mddev_t *mddev)
 
 		disk = conf->mirrors + i;
 
-		if (!disk->rdev) {
+		if (!disk->rdev ||
+		    !test_bit(In_sync, &rdev->flags)) {
 			disk->head_position = 0;
 			mddev->degraded++;
 		}

diff ./drivers/md/raid10.c~current~ ./drivers/md/raid10.c
--- ./drivers/md/raid10.c~current~	2006-05-01 15:10:17.000000000 +1000
+++ ./drivers/md/raid10.c	2006-05-01 15:12:34.000000000 +1000
@@ -2015,7 +2015,8 @@ static int run(mddev_t *mddev)
 
 		disk = conf->mirrors + i;
 
-		if (!disk->rdev) {
+		if (!disk->rdev ||
+		    !test_bit(In_sync, &rdev->flags)) {
 			disk->head_position = 0;
 			mddev->degraded++;
 		}

diff ./drivers/md/raid5.c~current~ ./drivers/md/raid5.c
--- ./drivers/md/raid5.c~current~	2006-05-01 15:10:18.000000000 +1000
+++ ./drivers/md/raid5.c	2006-05-01 15:12:34.000000000 +1000
@@ -3555,6 +3555,7 @@ static int raid5_start_reshape(mddev_t *
 				set_bit(In_sync, &rdev->flags);
 				conf->working_disks++;
 				added_devices++;
+				rdev->recovery_offset = 0;
 				sprintf(nm, "rd%d", rdev->raid_disk);
 				sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
 			} else

diff ./include/linux/raid/md_k.h~current~ ./include/linux/raid/md_k.h
--- ./include/linux/raid/md_k.h~current~	2006-05-01 15:10:17.000000000 +1000
+++ ./include/linux/raid/md_k.h	2006-05-01 15:12:34.000000000 +1000
@@ -88,6 +88,10 @@ struct mdk_rdev_s
 					 * array and could again if we did a partial
 					 * resync from the bitmap
 					 */
+	sector_t	recovery_offset;/* If this device has been partially
+					 * recovered, this is where we were
+					 * up to.
+					 */
 
 	atomic_t	nr_pending;	/* number of pending requests.
 					 * only maintained for arrays that
@@ -183,6 +187,8 @@ struct mddev_s
 #define	MD_RECOVERY_REQUESTED	6
 #define	MD_RECOVERY_CHECK	7
 #define MD_RECOVERY_RESHAPE	8
+#define	MD_RECOVERY_FROZEN	9
+
 	unsigned long			recovery;
 
 	int				in_sync;	/* know to not need resync */

diff ./include/linux/raid/md_p.h~current~ ./include/linux/raid/md_p.h
--- ./include/linux/raid/md_p.h~current~	2006-05-01 15:09:20.000000000 +1000
+++ ./include/linux/raid/md_p.h	2006-05-01 15:12:34.000000000 +1000
@@ -265,9 +265,12 @@ struct mdp_superblock_1 {
 
 /* feature_map bits */
 #define MD_FEATURE_BITMAP_OFFSET	1
+#define	MD_FEATURE_RECOVERY_OFFSET	2 /* recovery_offset is present and
+					   * must be honoured
+					   */
 #define	MD_FEATURE_RESHAPE_ACTIVE	4
 
-#define	MD_FEATURE_ALL			5
+#define	MD_FEATURE_ALL			(1|2|4)
 
 #endif 
 

  parent reply	other threads:[~2006-05-01  5:30 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2006-05-01  5:29 [PATCH 000 of 11] md: Introduction - assort md enhancements for 2.6.18 NeilBrown
2006-05-01  5:29 ` NeilBrown
2006-05-01  5:30 ` [PATCH 001 of 11] md: Reformat code in raid1_end_write_request to avoid goto NeilBrown
2006-05-01  5:30 ` [PATCH 002 of 11] md: Remove arbitrary limit on chunk size NeilBrown
2006-05-01  5:30 ` [PATCH 003 of 11] md: Remove useless ioctl warning NeilBrown
2006-05-01  5:30 ` [PATCH 004 of 11] md: Increase the delay before marking metadata clean, and make it configurable NeilBrown
2006-05-01  5:44   ` Andrew Morton
2006-05-01  6:02     ` Neil Brown
2006-05-01  6:13       ` Andrew Morton
2006-05-01 15:17         ` Linus Torvalds
2006-05-01  6:15       ` Nick Piggin
2006-05-02  5:56   ` bert hubert
2006-05-09  1:40     ` Neil Brown
2006-05-01  5:30 ` [PATCH 006 of 11] md: Remove nuisance message at shutdown NeilBrown
2006-05-01  5:30 ` NeilBrown [this message]
2006-05-01  5:30 ` [PATCH 008 of 11] md: Allow a linear array to have drives added while active NeilBrown
2006-05-01  5:30 ` [PATCH 009 of 11] md: Support stripe/offset mode in raid10 NeilBrown
2006-05-02 16:38   ` Al Boldi
2006-05-03  0:05     ` Neil Brown
2006-05-03  4:00       ` Al Boldi
2006-05-08  7:17         ` Neil Brown
2006-05-08 16:59           ` Al Boldi
2006-05-17 21:32           ` Raid5 resize "testing opportunity" Patrik Jonsson
2006-05-17 23:49             ` Neil Brown
2006-05-19  0:40               ` Patrik Jonsson
2006-05-19  0:44                 ` Neil Brown
2006-05-19 20:11                   ` Per Lindstrand
2006-05-01  5:31 ` [PATCH 010 of 11] md: make md_print_devices() static NeilBrown
2006-05-01  5:31   ` NeilBrown
2006-05-01  5:31 ` [PATCH 011 of 11] md: Split reshape portion of raid5 sync_request into a separate function NeilBrown
     [not found] ` <1060501053025.22961@suse.de>
2006-05-01  5:40   ` [PATCH 005 of 11] md: Merge raid5 and raid6 code H. Peter Anvin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1060501053042.22985@suse.de \
    --to=neilb@suse.de \
    --cc=akpm@osdl.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-raid@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes;
see the mirroring instructions for how to clone and mirror
all data and code used by this external index.