linux-raid.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Dan Williams <dan.j.williams@intel.com>
To: neilb@suse.de
Cc: ed.ciechanowski@intel.com, marcin.labun@intel.com,
	linux-raid@vger.kernel.org
Subject: [PATCH 2/2] md: add 'recovery_start' sysfs attribute
Date: Sat, 12 Dec 2009 21:17:12 -0700	[thread overview]
Message-ID: <20091213041711.12532.26335.stgit@dwillia2-linux.ch.intel.com> (raw)
In-Reply-To: <20091213041123.12532.15225.stgit@dwillia2-linux.ch.intel.com>

Enable external metadata arrays to manage rebuild checkpointing via an
md/recovery_start attribute that overrides rdev->recovery_offset.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---

 Documentation/md.txt |   15 +++++++++--
 drivers/md/md.c      |   69 +++++++++++++++++++++++++++++++++++++++++++-------
 drivers/md/md.h      |    1 +
 3 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/Documentation/md.txt b/Documentation/md.txt
index 4edd39e..2b03814 100644
--- a/Documentation/md.txt
+++ b/Documentation/md.txt
@@ -233,9 +233,18 @@ All md devices contain:
 
   resync_start
      The point at which resync should start.  If no resync is needed,
-     this will be a very large number.  At array creation it will
-     default to 0, though starting the array as 'clean' will
-     set it much larger.
+     this will be a very large number (or 'none' since 2.6.30-rc1).  At
+     array creation it will default to 0, though starting the array as
+     'clean' will set it much larger.
+
+  recovery_start
+     The point at which recovery should start when rebuilding a degraded
+     array member.  A non-zero value overrides the 'recovery_offset'
+     read from the metadata.  Writing zero tells md to use, and report,
+     the recovery_offset read from the metadata ('none' is reported if
+     no member is awaiting recovery).  The value resets itself to zero
+     once it has been consumed by the recovery process.  It cannot be
+     changed while a recovery is in flight.
 
    new_dev
      This file can be written but not read.  The value written should
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3e8fb67..5f09d40 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2983,6 +2983,56 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len)
 static struct md_sysfs_entry md_resync_start =
 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
 
+static sector_t md_recovery_offset(mddev_t *mddev)
+{
+	/* this is sometimes called outside mddev_lock() hence the
+	 * rcu_read_lock()
+	 */
+	sector_t recovery_offset = MaxSector;
+	mdk_rdev_t *rdev;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+		if (rdev->raid_disk >= 0 &&
+		    !test_bit(Faulty, &rdev->flags) &&
+		    !test_bit(In_sync, &rdev->flags) &&
+		    rdev->recovery_offset < recovery_offset)
+			recovery_offset = rdev->recovery_offset;
+	rcu_read_unlock();
+
+	return recovery_offset;
+}
+
+static ssize_t recovery_start_show(mddev_t *mddev, char *page)
+{
+	unsigned long long recovery_start = mddev->recovery_start;
+
+	if (recovery_start == 0)
+		recovery_start = md_recovery_offset(mddev);
+
+	if (recovery_start == MaxSector)
+		return sprintf(page, "none\n");
+
+	return sprintf(page, "%llu\n", recovery_start);
+}
+
+static ssize_t recovery_start_store(mddev_t *mddev, const char *buf, size_t len)
+{
+	unsigned long long recovery_start;
+
+	if (strict_strtoull(buf, 10, &recovery_start))
+		return -EINVAL;
+
+	if (!mddev->ro || !mddev->degraded || md_recovery_offset(mddev) > 0)
+		return -EBUSY;
+
+	mddev->recovery_start = recovery_start;
+	return len;
+}
+
+static struct md_sysfs_entry md_recovery_start =
+__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
+
 /*
  * The array state can be:
  *
@@ -3788,6 +3838,7 @@ static struct attribute *md_default_attrs[] = {
 	&md_chunk_size.attr,
 	&md_size.attr,
 	&md_resync_start.attr,
+	&md_recovery_start.attr,
 	&md_metadata.attr,
 	&md_new_device.attr,
 	&md_safe_delay.attr,
@@ -4426,6 +4477,7 @@ out:
 		mddev->dev_sectors = 0;
 		mddev->raid_disks = 0;
 		mddev->recovery_cp = 0;
+		mddev->recovery_start = 0;
 		mddev->resync_min = 0;
 		mddev->resync_max = MaxSector;
 		mddev->reshape_position = MaxSector;
@@ -6338,18 +6390,15 @@ void md_do_sync(mddev_t *mddev)
 
 	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		max_sectors = mddev->dev_sectors;
-	else {
+	else if (mddev->recovery_start) {
+		/* userspace requested override of rdev->recovery_offset */
+		max_sectors = mddev->dev_sectors;
+		j = mddev->recovery_start;
+		mddev->recovery_start = 0;
+	} else {
 		/* recovery follows the physical size of devices */
 		max_sectors = mddev->dev_sectors;
-		j = MaxSector;
-		rcu_read_lock();
-		list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
-			if (rdev->raid_disk >= 0 &&
-			    !test_bit(Faulty, &rdev->flags) &&
-			    !test_bit(In_sync, &rdev->flags) &&
-			    rdev->recovery_offset < j)
-				j = rdev->recovery_offset;
-		rcu_read_unlock();
+		j = md_recovery_offset(mddev);
 	}
 
 	printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
diff --git a/drivers/md/md.h b/drivers/md/md.h
index f184b69..03a18b4 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -252,6 +252,7 @@ struct mddev_s
 	atomic_t			recovery_active; /* blocks scheduled, but not written */
 	wait_queue_head_t		recovery_wait;
 	sector_t			recovery_cp;
+	sector_t			recovery_start;	/* override rdev->recovery_offset */
 	sector_t			resync_min;	/* user requested sync
 							 * starts here */
 	sector_t			resync_max;	/* resync should pause


  parent reply	other threads:[~2009-12-13  4:17 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-12-13  4:17 [GIT PATCH 0/2] external-metadata recovery checkpointing for 2.6.33 Dan Williams
2009-12-13  4:17 ` [PATCH 1/2] md: rcu_read_lock() walk of mddev->disks in md_do_sync() Dan Williams
2009-12-13  4:17 ` Dan Williams [this message]
2009-12-14  4:07 ` [GIT PATCH 0/2] external-metadata recovery checkpointing for 2.6.33 Neil Brown
2009-12-14  4:49   ` Dan Williams
2009-12-14  5:35     ` Neil Brown
2009-12-15  0:37   ` Dan Williams
2009-12-15  4:19     ` Dan Williams
2009-12-15 18:03       ` Dan Williams
2009-12-16  5:16         ` Neil Brown
2009-12-16  6:24           ` Dan Williams

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20091213041711.12532.26335.stgit@dwillia2-linux.ch.intel.com \
    --to=dan.j.williams@intel.com \
    --cc=ed.ciechanowski@intel.com \
    --cc=linux-raid@vger.kernel.org \
    --cc=marcin.labun@intel.com \
    --cc=neilb@suse.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).