linux-raid.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: NeilBrown <neilb@suse.de>
To: Adam Kwolek <adam.kwolek@intel.com>
Cc: linux-raid@vger.kernel.org, ed.ciechanowski@intel.com,
	marcin.labun@intel.com, dan.j.williams@intel.com,
	lukasz.dorau@intel.com
Subject: Re: [PATCH 08/12] FIX: use md position to reshape restart
Date: Thu, 9 Feb 2012 12:36:25 +1100	[thread overview]
Message-ID: <20120209123625.13dd5ae4@notabene.brown> (raw)
In-Reply-To: <20120207140359.20627.10861.stgit@gklab-128-013.igk.intel.com>

[-- Attachment #1: Type: text/plain, Size: 5119 bytes --]

On Tue, 07 Feb 2012 15:03:59 +0100 Adam Kwolek <adam.kwolek@intel.com> wrote:

> When reshape is broken, it can occur that metadata is not saved properly.
> This can cause that reshape process is farther in md than metadata states.
> 
> On reshape restart use md position as start position, if it is farther than
> position specified in metadata. Opposite situation treat as error.
> 
> Signed-off-by: Adam Kwolek <adam.kwolek@intel.com>
> ---
> 
>  Grow.c |   86 +++++++++++++++++++++++++++++++++++++++++++++-------------------
>  1 files changed, 60 insertions(+), 26 deletions(-)
> 
> diff --git a/Grow.c b/Grow.c
> index 70bdee1..0668a16 100644
> --- a/Grow.c
> +++ b/Grow.c
> @@ -1862,6 +1862,55 @@ release:
>  	return rv;
>  }
>  
> +/* verify_reshape_position()
> + *	Function checks if reshape position in metadata is not farther
> + *	than position in md.
> + * Return value:
> + *	 0 : not valid sysfs entry
> + *		it can be caused by not started reshape, it should be started
> + *		by reshape array or raid0 array is before takeover
> + *	-1 :	error, reshape position is obviously wrong
> + *	 1 :	success, reshape progress correct or updated
> +*/
> +static int verify_reshape_position(struct mdinfo *info, int level)
> +{
> +	int ret_val = 0;
> +	char buf[40];
> +
> +	/* read sync_max, failure can mean raid0 array */
> +	if (sysfs_get_str(info, NULL, "sync_max", buf, 40) > 0) {
> +		char *ep;
> +		unsigned long long position = strtoull(buf, &ep, 0);
> +
> +		dprintf(Name": Read sync_max sysfs entry is: %s\n", buf);
> +		if (!(ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' '))) {
> +			position *= get_data_disks(level,
> +						   info->new_layout,
> +						   info->array.raid_disks);
> +			if (info->reshape_progress < position) {
> +				dprintf("Corrected reshape progress (%llu) to "
> +					"md position (%llu)\n",
> +					info->reshape_progress, position);
> +				info->reshape_progress = position;
> +				ret_val = 1;
> +			} else if (info->reshape_progress > position) {
> +				fprintf(stderr, Name ": Fatal error: array "
> +					"reshape was not properly frozen "
> +					"(expected reshape position is %llu, "
> +					"but reshape progress is %llu.\n",
> +					position, info->reshape_progress);
> +				ret_val = -1;
> +			} else {
> +				dprintf("Reshape position in md and metadata "
> +					"are the same;");
> +				ret_val = 1;
> +			}
> +		}
> +	}
> +
> +	return ret_val;
> +}
> +
>  static int reshape_array(char *container, int fd, char *devname,
>  			 struct supertype *st, struct mdinfo *info,
>  			 int force, struct mddev_dev *devlist,
> @@ -2251,9 +2300,16 @@ started:
>  
>  	sra->new_chunk = info->new_chunk;
>  
> -	if (restart)
> +	if (restart) {
> +		/* for external metadata checkpoint saved by mdmon can be lost
> +		 * or missed /due to e.g. crash/. Check if md is not during
> +		 * restart farther than metadata points to.
> +		 * If so, this means metadata information is obsolete.
> +		 */
> +		if (st->ss->external)
> +			verify_reshape_position(info, reshape.level);
>  		sra->reshape_progress = info->reshape_progress;
> -	else {
> +	} else {
>  		sra->reshape_progress = 0;
>  		if (reshape.after.data_disks < reshape.before.data_disks)
>  			/* start from the end of the new array */
> @@ -3765,8 +3821,6 @@ int Grow_continue_command(char *devname, int fd,
>  	char buf[40];
>  	int cfd = -1;
>  	int fd2 = -1;
> -	char *ep;
> -	unsigned long long position;
>  
>  	dprintf("Grow continue from command line called for %s\n",
>  		devname);
> @@ -3894,28 +3948,8 @@ int Grow_continue_command(char *devname, int fd,
>  	/* verify that array under reshape is started from
>  	 * correct position
>  	 */
> -	ret_val = sysfs_get_str(content, NULL, "sync_max", buf, 40);
> -	if (ret_val <= 0) {
                    ^^^

This test is '<=' however ...


> -		fprintf(stderr, Name
> -			": cannot open verify reshape progress for %s (%i)\n",
> -			content->sys_name, ret_val);
> -		ret_val = 1;
> -		goto Grow_continue_command_exit;
> -	}
> -	dprintf(Name ": Read sync_max sysfs entry is: %s\n", buf);
> -	position = strtoull(buf, &ep, 0);
> -	if (ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' ')) {
> -		fprintf(stderr, Name ": Fatal error: array reshape was"
> -			" not properly frozen\n");
> -		ret_val = 1;
> -		goto Grow_continue_command_exit;
> -	}
> -	position *= get_data_disks(map_name(pers, mdstat->level),
> -				   content->new_layout,
> -				   content->array.raid_disks);
> -	if (position != content->reshape_progress) {
> -		fprintf(stderr, Name ": Fatal error: array reshape was"
> -			" not properly frozen.\n");
> +	if (verify_reshape_position(content,
> +				    map_name(pers, mdstat->level)) < 0) {
                                                                   ^^

this one is just '<'.  However it looks like it should be the same.

Was there a reason for the change, or was it an oversight?

Thanks,
NeilBrown




>  		ret_val = 1;
>  		goto Grow_continue_command_exit;
>  	}


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

  reply	other threads:[~2012-02-09  1:36 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-02-07 14:02 [PATCH 00/12] Problems during restart container reshape Adam Kwolek
2012-02-07 14:03 ` [PATCH 01/12] FIX: NULL pointer to strdup() can be passed Adam Kwolek
2012-02-07 14:03 ` [PATCH 02/12] imsm: FIX: No new missing disks are allowed during general migration Adam Kwolek
2012-02-07 14:03 ` [PATCH 03/12] FIX: Array is not run when expansion disks are added Adam Kwolek
2012-02-07 14:03 ` [PATCH 04/12] imsm: FIX: imsm_get_allowed_degradation() doesn't count degradation for raid1 Adam Kwolek
2012-02-07 14:03 ` [PATCH 05/12] Fix: Sometimes mdmon throws core dump during reshape Adam Kwolek
2012-02-07 14:03 ` [PATCH 06/12] Flush mdmon before next reshape step during container operation Adam Kwolek
2012-02-07 14:03 ` [PATCH 07/12] imsm: FIX: Chunk size migration problem Adam Kwolek
2012-02-07 14:03 ` [PATCH 08/12] FIX: use md position to reshape restart Adam Kwolek
2012-02-09  1:36   ` NeilBrown [this message]
2012-02-09  8:16     ` Kwolek, Adam
2012-02-09  9:19       ` NeilBrown
2012-02-09  9:49         ` Kwolek, Adam
2012-02-09 10:19           ` NeilBrown
2012-02-09 10:23             ` Kwolek, Adam
2012-02-09 10:53               ` Kwolek, Adam
2012-02-07 14:04 ` [PATCH 09/12] imsm: " Adam Kwolek
2012-02-07 14:04 ` [PATCH 10/12] imsm: FIX: Clear migration record when migration switches to next volume Adam Kwolek
2012-02-07 14:04 ` [PATCH 11/12] FIX: restart reshape when reshape process is stopped just between 2 reshapes Adam Kwolek
2012-02-07 14:04 ` [PATCH 12/12] FIX: Do not try to (continue) reshape using inactive array Adam Kwolek
2012-02-09  1:39 ` [PATCH 00/12] Problems during restart container reshape NeilBrown

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20120209123625.13dd5ae4@notabene.brown \
    --to=neilb@suse.de \
    --cc=adam.kwolek@intel.com \
    --cc=dan.j.williams@intel.com \
    --cc=ed.ciechanowski@intel.com \
    --cc=linux-raid@vger.kernel.org \
    --cc=lukasz.dorau@intel.com \
    --cc=marcin.labun@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).