Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [PATCH 2/5] md/raid10: get rid of duplicated conditional expression
From: Namhyung Kim @ 2011-06-15  2:02 UTC (permalink / raw)
  To: Neil Brown; +Cc: linux-raid
In-Reply-To: <1308103324-2375-1-git-send-email-namhyung@gmail.com>

Variable 'first' is initialized to zero and updated to @rdev->raid_disk
only if that value is >= 0. Thus condition '>= first' always implies
'>= 0' so the latter is not needed.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid10.c |    3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index fc56bdd8c3fb..fcb86e86bc31 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1093,8 +1093,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
 
-	if (rdev->saved_raid_disk >= 0 &&
-	    rdev->saved_raid_disk >= first &&
+	if (rdev->saved_raid_disk >= first &&
 	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
 		mirror = rdev->saved_raid_disk;
 	else
-- 
1.7.5.2


^ permalink raw reply related

* [PATCH 3/5] md/raid10: factor out common bio handling code
From: Namhyung Kim @ 2011-06-15  2:02 UTC (permalink / raw)
  To: Neil Brown; +Cc: linux-raid
In-Reply-To: <1308103324-2375-1-git-send-email-namhyung@gmail.com>

When a normal-write or sync-read/write bio completes, we should
find out the disk number the bio belongs to. Factor that common
code out into a separate function.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid10.c |   44 +++++++++++++++++++++++---------------------
 1 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index fcb86e86bc31..a53779ffdf89 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -244,6 +244,23 @@ static inline void update_head_pos(int slot, r10bio_t *r10_bio)
 		r10_bio->devs[slot].addr + (r10_bio->sectors);
 }
 
+/*
+ * Find the disk number which triggered given bio
+ */
+static int find_bio_disk(conf_t *conf, r10bio_t *r10_bio, struct bio *bio)
+{
+	int slot;
+
+	for (slot = 0; slot < conf->copies; slot++)
+		if (r10_bio->devs[slot].bio == bio)
+			break;
+
+	BUG_ON(slot == conf->copies);
+	update_head_pos(slot, r10_bio);
+
+	return r10_bio->devs[slot].devnum;
+}
+
 static void raid10_end_read_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -289,13 +306,10 @@ static void raid10_end_write_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r10bio_t *r10_bio = bio->bi_private;
-	int slot, dev;
+	int dev;
 	conf_t *conf = r10_bio->mddev->private;
 
-	for (slot = 0; slot < conf->copies; slot++)
-		if (r10_bio->devs[slot].bio == bio)
-			break;
-	dev = r10_bio->devs[slot].devnum;
+	dev = find_bio_disk(conf, r10_bio, bio);
 
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
@@ -316,8 +330,6 @@ static void raid10_end_write_request(struct bio *bio, int error)
 		 */
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
 
-	update_head_pos(slot, r10_bio);
-
 	/*
 	 *
 	 * Let's see if all mirrored write operations have finished
@@ -1173,14 +1185,9 @@ static void end_sync_read(struct bio *bio, int error)
 {
 	r10bio_t *r10_bio = bio->bi_private;
 	conf_t *conf = r10_bio->mddev->private;
-	int i,d;
+	int d;
 
-	for (i=0; i<conf->copies; i++)
-		if (r10_bio->devs[i].bio == bio)
-			break;
-	BUG_ON(i == conf->copies);
-	update_head_pos(i, r10_bio);
-	d = r10_bio->devs[i].devnum;
+	d = find_bio_disk(conf, r10_bio, bio);
 
 	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
@@ -1211,18 +1218,13 @@ static void end_sync_write(struct bio *bio, int error)
 	r10bio_t *r10_bio = bio->bi_private;
 	mddev_t *mddev = r10_bio->mddev;
 	conf_t *conf = mddev->private;
-	int i,d;
+	int d;
 
-	for (i = 0; i < conf->copies; i++)
-		if (r10_bio->devs[i].bio == bio)
-			break;
-	d = r10_bio->devs[i].devnum;
+	d = find_bio_disk(conf, r10_bio, bio);
 
 	if (!uptodate)
 		md_error(mddev, conf->mirrors[d].rdev);
 
-	update_head_pos(i, r10_bio);
-
 	rdev_dec_pending(conf->mirrors[d].rdev, mddev);
 	while (atomic_dec_and_test(&r10_bio->remaining)) {
 		if (r10_bio->master_bio == NULL) {
-- 
1.7.5.2


^ permalink raw reply related

* [PATCH v2 4/5] md/raid10: share pages between read and write bio's during recovery
From: Namhyung Kim @ 2011-06-15  2:02 UTC (permalink / raw)
  To: Neil Brown; +Cc: linux-raid
In-Reply-To: <1308103324-2375-1-git-send-email-namhyung@gmail.com>

When performing a recovery, only the first 2 slots in r10_bio are in use,
for read and write respectively. However, the pages in the write bio
are never used and are just replaced with the read bio's when the read completes.

Get rid of those unused pages and share read pages properly.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid10.c |   24 +++++++++++++-----------
 1 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a53779ffdf89..dea73bdb99b8 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -123,7 +123,15 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 	for (j = 0 ; j < nalloc; j++) {
 		bio = r10_bio->devs[j].bio;
 		for (i = 0; i < RESYNC_PAGES; i++) {
-			page = alloc_page(gfp_flags);
+			if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
+						&conf->mddev->recovery)) {
+				/* we can share bv_page's during recovery */
+				struct bio *rbio = r10_bio->devs[0].bio;
+				page = rbio->bi_io_vec[i].bv_page;
+				get_page(page);
+			} else {
+				page = alloc_page(gfp_flags);
+			}
 			if (unlikely(!page))
 				goto out_free_pages;
 
@@ -1360,20 +1368,14 @@ done:
 static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 {
 	conf_t *conf = mddev->private;
-	int i, d;
-	struct bio *bio, *wbio;
-
+	int d;
+	struct bio *wbio;
 
-	/* move the pages across to the second bio
+	/*
+	 * share the pages with the first bio
 	 * and submit the write request
 	 */
-	bio = r10_bio->devs[0].bio;
 	wbio = r10_bio->devs[1].bio;
-	for (i=0; i < wbio->bi_vcnt; i++) {
-		struct page *p = bio->bi_io_vec[i].bv_page;
-		bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
-		wbio->bi_io_vec[i].bv_page = p;
-	}
 	d = r10_bio->devs[1].devnum;
 
 	atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-- 
1.7.5.2


^ permalink raw reply related

* [PATCH 5/5] md/raid10: spread read for subordinate r10bios during recovery
From: Namhyung Kim @ 2011-06-15  2:02 UTC (permalink / raw)
  To: Neil Brown; +Cc: linux-raid
In-Reply-To: <1308103324-2375-1-git-send-email-namhyung@gmail.com>

In the current scheme, multiple read requests could be directed to
the first active disk during recovery if there are several disk
failures at the same time. Spreading those requests over other in-sync
disks might be helpful.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
---
 drivers/md/raid10.c |   10 +++++++---
 1 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index dea73bdb99b8..d0188e49f881 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1832,6 +1832,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
 	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 		/* recovery... the complicated one */
 		int j, k;
+		int last_read = -1;
 		r10_bio = NULL;
 
 		for (i=0 ; i<conf->raid_disks; i++) {
@@ -1891,7 +1892,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
 						      &sync_blocks, still_degraded);
 
 			for (j=0; j<conf->copies;j++) {
-				int d = r10_bio->devs[j].devnum;
+				int c = (last_read + j + 1) % conf->copies;
+				int d = r10_bio->devs[c].devnum;
+
 				if (!conf->mirrors[d].rdev ||
 				    !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
 					continue;
@@ -1902,13 +1905,14 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
 				bio->bi_private = r10_bio;
 				bio->bi_end_io = end_sync_read;
 				bio->bi_rw = READ;
-				bio->bi_sector = r10_bio->devs[j].addr +
+				bio->bi_sector = r10_bio->devs[c].addr +
 					conf->mirrors[d].rdev->data_offset;
 				bio->bi_bdev = conf->mirrors[d].rdev->bdev;
 				atomic_inc(&conf->mirrors[d].rdev->nr_pending);
 				atomic_inc(&r10_bio->remaining);
-				/* and we write to 'i' */
+				last_read = c;
 
+				/* and we write to 'i' */
 				for (k=0; k<conf->copies; k++)
 					if (r10_bio->devs[k].devnum == i)
 						break;
-- 
1.7.5.2


^ permalink raw reply related

* Re: [PATCH v2 1/5] md/raid10: optimize read_balance() for 'far offset' arrays
From: NeilBrown @ 2011-06-15  2:57 UTC (permalink / raw)
  To: Namhyung Kim; +Cc: linux-raid
In-Reply-To: <1308103324-2375-2-git-send-email-namhyung@gmail.com>

On Wed, 15 Jun 2011 11:02:00 +0900 Namhyung Kim <namhyung@gmail.com> wrote:

> If @conf->far_offset > 0, there is only 1 stripe so that we can treat
> the array same as 'near' arrays.
> 
> Signed-off-by: Namhyung Kim <namhyung@gmail.com>
> ---
>  drivers/md/raid10.c |    2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 6e846688962f..fc56bdd8c3fb 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -531,7 +531,7 @@ retry:
>  			break;
>  
>  		/* for far > 1 always use the lowest address */
> -		if (conf->far_copies > 1)
> +		if (conf->far_copies > 1 && conf->far_offset == 0)
>  			new_distance = r10_bio->devs[slot].addr;
>  		else
>  			new_distance = abs(r10_bio->devs[slot].addr -

Hi,
 I realise that I said before that this part was OK but having thought about
 it some more I am not convinced.
 Using the 'distance' from where the head previously was is a fairly poor
 heuristic.  In some cases it is the best we have, but it still isn't good.

 With an 'offset' layout, like with a 'far' layout, sections of the array are
 in a RAID0 layout and we know that reading from a RAID0 gives good speed by
 algorithmically distributing the reads evenly over all the devices.

 Setting new_distance to the 'addr' means that we use the algorithmic
 approach to distribute reads.  Setting it to the difference between addr and
 head_position uses the heuristic approach.

 I much prefer the algorithmic (RAID0) approach where it is possible.  So I'm
 not going to apply this patch.  However if you can demonstrate real speedups
 in some realistic test I will reconsider my position.

Thanks,
NeilBrown

^ permalink raw reply

* Re: [PATCH 5/5] md/raid10: spread read for subordinate r10bios during recovery
From: NeilBrown @ 2011-06-15  3:09 UTC (permalink / raw)
  To: Namhyung Kim; +Cc: linux-raid
In-Reply-To: <1308103324-2375-6-git-send-email-namhyung@gmail.com>

On Wed, 15 Jun 2011 11:02:04 +0900 Namhyung Kim <namhyung@gmail.com> wrote:

> In the current scheme, multiple read request could be directed to
> the first active disk during recovery if there are several disk
> failure at the same time. Spreading those requests on other in-sync
> disks might be helpful.

I don't find this convincing either.  Spreading requests over disks in a
haphazard way is not certain to improve anything and could easily cause
regressions.

For example if I have an 'n3' array on 3 devices, this will read alternately
from the first 2 devices while rebuilding the last.  This is simply a waste.
One disk would be enough to keep the rebuilding disk busy - the other should be
left for regular reads.

Again: if you can demonstrate a speed up in some configuration I'll be happy
to reconsider the patch.

Thanks,
NeilBrown


> 
> Signed-off-by: Namhyung Kim <namhyung@gmail.com>
> ---
>  drivers/md/raid10.c |   10 +++++++---
>  1 files changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index dea73bdb99b8..d0188e49f881 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -1832,6 +1832,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
>  	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
>  		/* recovery... the complicated one */
>  		int j, k;
> +		int last_read = -1;
>  		r10_bio = NULL;
>  
>  		for (i=0 ; i<conf->raid_disks; i++) {
> @@ -1891,7 +1892,9 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
>  						      &sync_blocks, still_degraded);
>  
>  			for (j=0; j<conf->copies;j++) {
> -				int d = r10_bio->devs[j].devnum;
> +				int c = (last_read + j + 1) % conf->copies;
> +				int d = r10_bio->devs[c].devnum;
> +
>  				if (!conf->mirrors[d].rdev ||
>  				    !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
>  					continue;
> @@ -1902,13 +1905,14 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr,
>  				bio->bi_private = r10_bio;
>  				bio->bi_end_io = end_sync_read;
>  				bio->bi_rw = READ;
> -				bio->bi_sector = r10_bio->devs[j].addr +
> +				bio->bi_sector = r10_bio->devs[c].addr +
>  					conf->mirrors[d].rdev->data_offset;
>  				bio->bi_bdev = conf->mirrors[d].rdev->bdev;
>  				atomic_inc(&conf->mirrors[d].rdev->nr_pending);
>  				atomic_inc(&r10_bio->remaining);
> -				/* and we write to 'i' */
> +				last_read = c;
>  
> +				/* and we write to 'i' */
>  				for (k=0; k<conf->copies; k++)
>  					if (r10_bio->devs[k].devnum == i)
>  						break;


^ permalink raw reply

* Re: [PATCH RESEND 0/5] md/raid10 changes
From: NeilBrown @ 2011-06-15  3:09 UTC (permalink / raw)
  To: Namhyung Kim; +Cc: linux-raid
In-Reply-To: <1308103324-2375-1-git-send-email-namhyung@gmail.com>

On Wed, 15 Jun 2011 11:01:59 +0900 Namhyung Kim <namhyung@gmail.com> wrote:

> Hello Neil,
> 
> This is a resend of my previous raid10 patches. Some of them are revised
> according to your comments and some of them are not reviewed at all.
> Please take a look.
> 
> Thanks.
> 
> 
> Namhyung Kim (5):
>   md/raid10: optimize read_balance() for 'far offset' arrays
>   md/raid10: get rid of duplicated conditional expression
>   md/raid10: factor out common bio handling code
>   md/raid10: share pages between read and write bio's during recovery
>   md/raid10: spread read for subordinate r10bios during recovery
> 

Thanks.
 I've rejected the first and last as explained separately but the others look
 good and I have added them to my tree.  They will probably be submitted in
 the next merge window.

Thanks,
NeilBrown


^ permalink raw reply

* Re: [PATCH RESEND 0/5] md/raid10 changes
From: Namhyung Kim @ 2011-06-15  3:33 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid
In-Reply-To: <20110615130954.3e58a12c@notabene.brown>

2011-06-15 (수), 13:09 +1000, NeilBrown:
> On Wed, 15 Jun 2011 11:01:59 +0900 Namhyung Kim <namhyung@gmail.com> wrote:
> 
> > Hello Neil,
> > 
> > This is a resend of my previous raid10 patches. Some of them are revised
> > according to your comments and some of them are not reviewed at all.
> > Please take a look.
> > 
> > Thanks.
> > 
> > 
> > Namhyung Kim (5):
> >   md/raid10: optimize read_balance() for 'far offset' arrays
> >   md/raid10: get rid of duplicated conditional expression
> >   md/raid10: factor out common bio handling code
> >   md/raid10: share pages between read and write bio's during recovery
> >   md/raid10: spread read for subordinate r10bios during recovery
> > 
> 
> Thanks.
>  I've reject the first and last as explained separately but the others look
>  good and I have added them to my tree.  They will probably be submitted in
>  the next merge window.
> 
> Thanks,
> NeilBrown
> 

Hi,

Thanks for reviewing them. As I said before, I don't have any realistic
test environment for those patches. They've just come from the code
inspection, and I (wrongly) thought they would be somewhat helpful,
sorry. :)


-- 
Regards,
Namhyung Kim


--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v2 1/5] md/raid10: optimize read_balance() for 'far offset' arrays
From: Keld Jørn Simonsen @ 2011-06-15  6:51 UTC (permalink / raw)
  To: Namhyung Kim; +Cc: Neil Brown, linux-raid
In-Reply-To: <1308103324-2375-2-git-send-email-namhyung@gmail.com>

On Wed, Jun 15, 2011 at 11:02:00AM +0900, Namhyung Kim wrote:
> If @conf->far_offset > 0, there is only 1 stripe so that we can treat
> the array same as 'near' arrays.

does it also work with more than 2 copies - eg 3 copies?
I think the original code just takes the available data blocks with the 
lowest address.

Best regards
keld

> Signed-off-by: Namhyung Kim <namhyung@gmail.com>
> ---
>  drivers/md/raid10.c |    2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 6e846688962f..fc56bdd8c3fb 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -531,7 +531,7 @@ retry:
>  			break;
>  
>  		/* for far > 1 always use the lowest address */
> -		if (conf->far_copies > 1)
> +		if (conf->far_copies > 1 && conf->far_offset == 0)
>  			new_distance = r10_bio->devs[slot].addr;
>  		else
>  			new_distance = abs(r10_bio->devs[slot].addr -
> -- 
> 1.7.5.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: HDD reports errors while completing RAID6 array check
From: Gordon Henderson @ 2011-06-15  9:11 UTC (permalink / raw)
  To: Linux-RAID
In-Reply-To: <BANLkTi=2xO5ZR5Vy2Qb2WJPzCPesy32nkA@mail.gmail.com>

[-- Attachment #1: Type: TEXT/PLAIN, Size: 3356 bytes --]

On Mon, 13 Jun 2011, Mathias Burén wrote:

> On 13 June 2011 19:30, Tim Blundell <tim.blundell@gmail.com> wrote:
>>
>> On 6/11/2011 5:49 AM, Mathias Burén wrote:
>>>
>>> === START OF INFORMATION SECTION ===
>>> Model Family:     Western Digital Caviar Green (Adv. Format) family
>>> Device Model:     WDC WD20EARS-00MVWB0
>>> Serial Number:    WD-WMAZ20188479
>>> Firmware Version: 50.0AB50
>>> User Capacity:    2,000,398,934,016 bytes
>>> Device is:        In smartctl database [for details use: -P show]
>>> ATA Version is:   8
>>> ATA Standard is:  Exact ATA specification draft version not indicated
>>> Local Time is:    Sat Jun 11 10:48:05 2011 IST
>>> SMART support is: Available - device has SMART capability.
>>> SMART support is: Enabled
>>
>> Not certain if this was mentioned. While WDC WD20EARS drives can be used in
>> an RAID array, WD recommends using there RAID capable drives in an
>> enterprise environment.
>> I tried using same drives in a simple RAID-1 array and had serious
>> performance issues (sync taking a week) and stalls when writing to disk. Are
>> you using the stock firmware on these drives?

> I'm using stock firmware as far as I know (I've not flashed them
> manually), and I experience no performance issues. Of course, my
> system is limited (RAID6 with an Intel Atom), so I can't really push
> them all out to test it. But still, no issues.

I've just put a pair into my own workstation - which is an Atom (2 core/4 
threads) with 2GB of RAM, running stock Debian Squeeze, however I've just 
installed my own kernel... (2.6.35.13)

They work just fine! Sync took overnight to complete on all partitions.

I'm a fan of multiple partitions, so my /proc/mdstat looks like:

Personalities : [linear] [raid0] [raid1] [raid10]
md1 : active raid1 sdb1[1] sda1[0]
       1048512 blocks [2/2] [UU]

md2 : active raid10 sdb2[1] sda2[0]
       8387584 blocks 512K chunks 2 far-copies [2/2] [UU]

md3 : active raid10 sdb3[1] sda3[0]
       2096128 blocks 512K chunks 2 far-copies [2/2] [UU]

md5 : active raid10 sda5[0] sdb5[1]
       922439680 blocks 512K chunks 2 far-copies [2/2] [UU]

md6 : active raid10 sdb6[1] sda6[0]
       1019538432 blocks 512K chunks 2 far-copies [2/2] [UU]

And a quick & dirty speed test looks like:

# hdparm -tT /dev/md{1,2}

/dev/md1:
  Timing cached reads:   1080 MB in  2.00 seconds = 539.70 MB/sec
  Timing buffered disk reads: 352 MB in  3.01 seconds = 116.76 MB/sec

/dev/md2:
  Timing cached reads:   1106 MB in  2.00 seconds = 552.92 MB/sec
  Timing buffered disk reads: 534 MB in  3.00 seconds = 177.78 MB/sec

which are numbers I'm quite happy with.

md1 is raid1 as I wasn't sure if LILO likes RAID10 yet. It just contains 
root. My 'df -h -t ext4' output looks like:

Filesystem            Size  Used Avail Use% Mounted on
/dev/md1             1008M  235M  722M  25% /
/dev/md2              7.9G  4.2G  3.4G  55% /usr
/dev/md5              866G  178G  645G  22% /var
/dev/md6              958G  200M  909G   1% /archive

With these drives (WDC EARS) it is absolutely essential that you partition 
them correctly - partitions *must* start on a 4K aligned boundary (sector 
must be evenly divisible by 8). They have a 4K physical sector size, but a 
512-byte logical sector size - and as Linux also uses a 4K block size, then 
any mis-alignment seriously degrades drive performance.

Gordon

^ permalink raw reply

* mark spare as active sync device?
From: Lars Täuber @ 2011-06-15 10:10 UTC (permalink / raw)
  To: linux-raid

Hi there,

is it possible to mark a nearly completely synced device as active in sync?

A device failed shortly before a spare drive was completely
synced. I want to get the directory structure from the device.

md3 : active raid6 sdz[16] sdaf[13] sds[0] sdac[10] sdah[15] sdag[14] sdae[12] sdad [11](F) sdab[9] sdaa[17](F) sdy[6] sdx[5] sdw[4] sdv[3] sdu[2] sdt[1]
      27349202944 blocks level 6, 64k chunk, algorithm 2 [16/13] [UUUUUUU__UU_UUUU]
      [===================>.]  recovery = 98.3% (1921986020/1953514496) finish=25.9min speed=20244K/sec

/dev/sdz was the device that was inserted and is now marked as spare. Is it possible to mark sdz as active and in sync?

Is there any chance to get data from this array?

Thanks
Lars

^ permalink raw reply

* ***Can You Do It?***
From: Mr, K.J Robert @ 2011-06-15 11:47 UTC (permalink / raw)


Greetings from Hong Kong,

I find it pleasurable to request your partnership in business, I want to solicit your assistance and honesty to receive money on my behalf. The reason am contacting you is because my status would not permit me to do this alone.

I will send you the full details and more information about myself and the funds. If interested, please reply through my alternate Email: jkrobert77@fm.bb

Sincerely Yours
Robert




^ permalink raw reply

* Re: [PATCH v2 1/5] md/raid10: optimize read_balance() for 'far offset' arrays
From: Namhyung Kim @ 2011-06-15 12:25 UTC (permalink / raw)
  To: Keld Jørn Simonsen; +Cc: Neil Brown, linux-raid
In-Reply-To: <20110615065144.GA28174@www2.open-std.org>

2011-06-15 (수), 08:51 +0200, Keld Jørn Simonsen:
> On Wed, Jun 15, 2011 at 11:02:00AM +0900, Namhyung Kim wrote:
> > If @conf->far_offset > 0, there is only 1 stripe so that we can treat
> > the array same as 'near' arrays.
> 
> does it also work with more than 2 copies - eg 3 copies?
> I think the original code just takes the available data blocks with the 
> lowest address.
> 

Hi,

Let me clarify this: AFAIK, 'far offset' array saves redundant data in
the diagonally adjacent chunk/disk, so it could be roughly thought as
'raid0' array with reduced size - just ignore redundant chunks here. It
was my mistake considering it as 'near' array. :(

Therefore, it makes more sense distributing reads over the array based
on some criteria - here, the address of starting sector - like RAID0
does. Now I see that the same goes to the 'far copies' array exactly, so
the original code is correct.

Thanks.

-- 
Regards,
Namhyung Kim

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v2 1/5] md/raid10: optimize read_balance() for 'far offset' arrays
From: Namhyung Kim @ 2011-06-15 14:35 UTC (permalink / raw)
  To: Keld Jørn Simonsen; +Cc: Neil Brown, linux-raid
In-Reply-To: <1308140733.1358.41.camel@leonhard>

Namhyung Kim <namhyung@gmail.com> writes:
> 2011-06-15 (수), 08:51 +0200, Keld Jørn Simonsen:
>> On Wed, Jun 15, 2011 at 11:02:00AM +0900, Namhyung Kim wrote:
>> > If @conf->far_offset > 0, there is only 1 stripe so that we can treat
>> > the array same as 'near' arrays.
>> 
>> does it also work with more than 2 copies - eg 3 copies?
>> I think the original code just takes the available data blocks with the 
>> lowest address.
>> 
>
> Hi,
>
> Let me clarify this: AFAIK, 'far offset' array saves redundant data in
> the diagonally adjacent chunk/disk, so it could be roughly thought as
> 'raid0' array with reduced size - just ignore redundant chunks here. It
> was my mistake considering it as 'near' array. :(
>

I'm confused again. If fo > 0 && fc > 1 && nc > 1 then it turns out to be
a near array with reduced size, no? Does it still need to be treated
as RAID0?


> Therefore, it makes more sense distributing reads over the array based
> on some criteria - here, the address of starting sector - like RAID0
> does. Now I see that the same goes to the 'far copies' array exactly, so
> the original code is correct.
>
> Thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH/RFC] Fix resync hang after surprise removal
From: Jim Paradis @ 2011-06-15 16:02 UTC (permalink / raw)
  To: linux-raid; +Cc: Jim Paradis

We ran into a situation where surprise removal of a non-boot 2-disk raid1
array with I/O running can result in a tight loop in which md claims to be
resyncing the array.

It appears that remove_and_add_spares() in md.c contains two sets of conditions
used to determine if there is a spare available.  The disk that was pulled
has been marked 'faulty' in rdev->flags and its raid_disk value is >= 0.
Since it is neither In_Sync nor Blocked, spares gets incremented and so md
thinks there is a spare when in fact there is not.

One of my colleagues at Stratus proposed this patch, which rearranges the
order of the tests and makes them mutually exclusive.  Running with this
patch resolves the problem in our lab: we were able to run stress tests
with surprise removals without incident.

Since neither of us is an md expert, we'd like feedback as to whether
this patch is reasonable and whether it can be pushed upstream.

Jim Paradis
Onsite Red Hat Partner Engineer
Stratus Technologies, Inc.

 md.c |   17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

Signed-off-by: Jim Paradis <james.paradis@stratus.com>

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4332fc2..cdc5276 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7086,10 +7086,6 @@ static int remove_and_add_spares(mddev_t *mddev)

 	if (mddev->degraded && !mddev->recovery_disabled) {
 		list_for_each_entry(rdev, &mddev->disks, same_set) {
-			if (rdev->raid_disk >= 0 &&
-			    !test_bit(In_sync, &rdev->flags) &&
-			    !test_bit(Blocked, &rdev->flags))
-				spares++;
 			if (rdev->raid_disk < 0
 			    && !test_bit(Faulty, &rdev->flags)) {
 				rdev->recovery_offset = 0;
@@ -7100,11 +7096,18 @@ static int remove_and_add_spares(mddev_t *mddev)
 					if (sysfs_create_link(&mddev->kobj,
 							      &rdev->kobj, nm))
 						/* failure here is OK */;
-					spares++;
+						spares++;
 					md_new_event(mddev);
 					set_bit(MD_CHANGE_DEVS, &mddev->flags);
-				} else
-					break;
+				}
+			}
+			else if (rdev->raid_disk >= 0 &&
+			    !test_bit(In_sync, &rdev->flags) &&
+			    !test_bit(Blocked, &rdev->flags)) {
+				spares++;
+			}
+			else {
+				break;
 			}
 		}
 	}

^ permalink raw reply related

* Re: [PATCH v2 1/5] md/raid10: optimize read_balance() for 'far offset' arrays
From: NeilBrown @ 2011-06-15 23:56 UTC (permalink / raw)
  To: Namhyung Kim; +Cc: Keld Jørn Simonsen, linux-raid
In-Reply-To: <87aadj9n7a.fsf@gmail.com>

On Wed, 15 Jun 2011 23:35:53 +0900 Namhyung Kim <namhyung@gmail.com> wrote:

> Namhyung Kim <namhyung@gmail.com> writes:
> > 2011-06-15 (수), 08:51 +0200, Keld Jørn Simonsen:
> >> On Wed, Jun 15, 2011 at 11:02:00AM +0900, Namhyung Kim wrote:
> >> > If @conf->far_offset > 0, there is only 1 stripe so that we can treat
> >> > the array same as 'near' arrays.
> >> 
> >> does it also work with more than 2 copies - eg 3 copies?
> >> I think the original code just takes the available data blocks with the 
> >> lowest address.
> >> 
> >
> > Hi,
> >
> > Let me clarify this: AFAIK, 'far offset' array saves redundant data in
> > the diagonally adjacent chunk/disk, so it could be roughly thought as
> > 'raid0' array with reduced size - just ignore redundant chunks here. It
> > was my mistake considering it as 'near' array. :(
> >
> 
> I'm confused again. If fo > 0 && fc > 1 && nc > 1 then it turns out to be
> a near array with reduced size, no? Does it still need to be treated
> as RAID0?

This would be a mix of near and offset.  I'm not at all sure what the "best"
read balancing approach would be.  But as I don't think anyone would ever
actually use it, I don't think it really matters.

Thanks,
NeilBrown



> 
> 
> > Therefore, it makes more sense distributing reads over the array based
> > on some criteria - here, the address of starting sector - like RAID0
> > does. Now I see that the same goes to the 'far copies' array exactly, so
> > the original code is correct.
> >
> > Thanks.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: Why do I get different results for 'mdadm --detail' & 'mdadm --examine' for the same array?
From: jeffs_linux @ 2011-06-16  0:04 UTC (permalink / raw)
  To: linux-raid
In-Reply-To: <1307843366.27897.1462148437@webmail.messagingengine.com>

anyone?


^ permalink raw reply

* Re: Why do I get different results for 'mdadm --detail' & 'mdadm --examine' for the same array?
From: NeilBrown @ 2011-06-16  0:18 UTC (permalink / raw)
  To: jeffs_linux; +Cc: linux-raid
In-Reply-To: <1307840471.18429.1462140465@webmail.messagingengine.com>

On Sat, 11 Jun 2011 18:01:11 -0700 jeffs_linux@123mail.org wrote:

> Hi,
> 
> I'm working on setting up my 1st Linux production server with RAID for
> our office.

---- cut lots of detail-----
It is very good to include lots of detail, but it is more effective if you include it
after you have asked the question, otherwise one gets bored long before one
reaches the actual question....

> 
> 
> Now, I'm going about characterizing the arrays, and the Volumes on them,
> so I can deal with recovery if & when it's necessary.
> 
> When I "look" at the array with these two commands,
> 
> mdadm --examine --scan
> 	ARRAY /dev/md/jeffadm1 metadata=1.2
> 	UUID=d84afb64:e6fa2b64:ff21c975:f9765431 name=jeffadm:jeffadm1
> 	ARRAY /dev/md126 UUID=19f2b21c:e54f9e1a:be5ad16e:9754ab5e
> 
> mdadm --detail --scan
> 	ARRAY /dev/md127 metadata=1.2 name=jeffadm:jeffadm1
> 	UUID=d84afb64:e6fa2b64:ff21c975:f9765431
> 	ARRAY /dev/md/0_0 metadata=0.90
> 	UUID=19f2b21c:e54f9e1a:be5ad16e:9754ab5e
> 
> 
> I get different results for each one.

It is just the different names that have you bothered - correct?

Names are messy things.  We pretend that everything has just one name but
that isn't really true (reminds me of "The Traveller in Black" ... he had
many names but one nature.  But I digress).

In the two different cases the 'name' of the array is found in different ways.
Both are correct, they are just different.

So unless there is some real confusion being caused, best just to move on and
not let it worry you..

NeilBrown



> 
> From my reading about naming in mdadm.conf, I was expecting to see:
> 
>   /dev/md/0_0
>   /dev/jeffadm:jeffadm1
> 
> 
> Why do I get this mix of different results,
> 
> 	/dev/md/jeffadm1
> 	/dev/md126
> 
> from the "--detail" output, and
> 
> 	/dev/md127 metadata=1.2 name=jeffadm:jeffadm1
> 	/dev/md/0_0
> 
> according to the "--examine" output?
> 
> Is my mdadm.conf OK?  What really should I expect to see for the names
> of my arrays?
> 
> Jeff
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply

* Re: Why do I get different results for 'mdadm --detail' & 'mdadm --examine' for the same array?
From: jeffs_linux @ 2011-06-16  0:39 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid
In-Reply-To: <20110616101812.761f7397@notabene.brown>

On Thu, 16 Jun 2011 10:18 +1000, "NeilBrown" <neilb@suse.de> wrote:
> ---- cut lots of detail-----
> It is very good to include lots of detail, but more effective you include
> it
> after you have asked the question, otherwise one gets bored long before
> one
> reaches the actual question....

Ok.  I got beaten up for top posting once, and was told to 'keep things
in order'.  Next time, though -- question first, in BOTH the title and
the body.

> > I get different results for each one.
> 
> It is just the different names that has you bothered - correct?

No, not just the names. I guess I don't know the PURPOSE of the names,
if the names we give things aren't necessarily the ones that get used.

But more of a problem are the different UUIDs:

cat /dev/.mdadm/map
---------------------------------------------------------------
md126 0.90 19f2b21c:e54f9e1a:be5ad16e:9754ab5e /dev/md/0_0
md127 1.2 79fb7ad4:289bfae5:86c535ff:202960f2 /dev/md127
---------------------------------------------------------------

mdadm --detail --scan
        ARRAY /dev/md127 metadata=1.2 name=jeffadm:jeffadm1
        UUID=d84afb64:e6fa2b64:ff21c975:f9765431
        ARRAY /dev/md/0_0 metadata=0.90
        UUID=19f2b21c:e54f9e1a:be5ad16e:9754ab5e

Notice that the UUIDs for /dev/md/0_0 match, but for /dev/md127 they
don't.

jeff

^ permalink raw reply

* Re: mark spare as active sync device?
From: NeilBrown @ 2011-06-16  0:54 UTC (permalink / raw)
  To: Lars Täuber; +Cc: linux-raid
In-Reply-To: <20110615121026.f689e4b2.taeuber@bbaw.de>

On Wed, 15 Jun 2011 12:10:26 +0200 Lars Täuber <taeuber@bbaw.de> wrote:

> Hi there,
> 
> is it possible to mark a nearly completely synced device as active in sync?
> 
> A device failed shortly before a spare drive was completely
> synced. I want to get the directory structure from the device.
> 
> md3 : active raid6 sdz[16] sdaf[13] sds[0] sdac[10] sdah[15] sdag[14] sdae[12] sdad [11](F) sdab[9] sdaa[17](F) sdy[6] sdx[5] sdw[4] sdv[3] sdu[2] sdt[1]
>       27349202944 blocks level 6, 64k chunk, algorithm 2 [16/13] [UUUUUUU__UU_UUUU]
>       [===================>.]  recovery = 98.3% (1921986020/1953514496) finish=25.9min speed=20244K/sec
> 
> /dev/sdz was the device that was inserted and is now marked as spare. Is it possible to mark sdz as active and in sync?

Not really.... and should you mark it as the in-sync member number 7, 8, or
11 ??

> 
> Is there any chance to get data from this array?

Your best bet is to try to 'create' the array again using '--assume-clean'
and putting the devices that you think are working in the correct place.
e.g. something like:

 mdadm -S /dev/md3
 mdadm -C /dev/md3 --metadata=0.90 --level=6 --algorithm=2 --chunk=64 \
   --raid-devices=16  --assume-clean \
   /dev/sds /dev/sdt /dev/sdu /dev/sdv   \
   /dev/sdw /dev/sdx /dev/sdy /dev/sdz   \
   missing /dev/sdab /dev/sdac missing   \
   /dev/sdae /dev/sdaf /dev/sdag /dev/sdah

Then "fsck -n /dev/md3"  and see if it looks reasonably OK.
If it doesn't, try placing /dev/sdz in place of a different 'missing'.

Of course you should double check the order of devices that I have given
here, and all the other details.

NeilBrown

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: Why do I get different results for 'mdadm --detail' & 'mdadm --examine' for the same array?
From: NeilBrown @ 2011-06-16  1:22 UTC (permalink / raw)
  To: jeffs_linux; +Cc: linux-raid
In-Reply-To: <1308184754.28723.1463652469@webmail.messagingengine.com>

On Wed, 15 Jun 2011 17:39:14 -0700 jeffs_linux@123mail.org wrote:

> 
> 
> On Thu, 16 Jun 2011 10:18 +1000, "NeilBrown" <neilb@suse.de> wrote:
> > ---- cut lots of detail-----
> > It is very good to include lots of detail, but more effective you include
> > it
> > after you have asked the question, otherwise one gets bored long before
> > one
> > reaches the actual question....
> 
> Ok.  I got beaten up for top posting once, and was told to 'keep things
> in order'.  Next time, though -- question first, in BOTH the title and
> the body.

:-)  You can't win, can you...

> 
> > > I get different results for each one.
> > 
> > It is just the different names that has you bothered - correct?
> 
> No, not just the names. I guess I don't know the PURPOSE of the names,
> if the names we give things aren't necessarily the ones that get used.

Well, there are the names that people like to use (/dev/md/jeffadm) and names
that the kernel likes to use (md1, md2, md127).

When you use --examine, it tries to use the name that people might like.
When you use --detail --scan it primarily has access to the names the kernel
likes, so it uses those.

> 
> But more a problem are the different UUID's:.
> 
> cat /dev/.mdadm/map
> ---------------------------------------------------------------
> md126 0.90 19f2b21c:e54f9e1a:be5ad16e:9754ab5e /dev/md/0_0
> md127 1.2 79fb7ad4:289bfae5:86c535ff:202960f2 /dev/md127
> ---------------------------------------------------------------
> 
> mdadm --detail --scan
>         ARRAY /dev/md127 metadata=1.2 name=jeffadm:jeffadm1
>         UUID=d84afb64:e6fa2b64:ff21c975:f9765431
>         ARRAY /dev/md/0_0 metadata=0.90
>         UUID=19f2b21c:e54f9e1a:be5ad16e:9754ab5e

Yes, that is weird.  I don't know how they came to be out of sync.

  mdadm --incremental --rebuild-map

will fix it..

NeilBrown


> 
> 
> Notice that the UUIDs for /dev/md/0_0 match, but for /dev/md127 the
> don't.
> 
> 
> jeff


^ permalink raw reply

* Re: [PATCH/RFC] Fix resync hang after surprise removal
From: NeilBrown @ 2011-06-16  1:36 UTC (permalink / raw)
  To: Jim Paradis; +Cc: linux-raid
In-Reply-To: <20110615160117.31326.31562.sendpatchset@localhost.localdomain>

On Wed, 15 Jun 2011 12:02:15 -0400 Jim Paradis <jparadis@redhat.com> wrote:

> We ran into a situation where surprise removal of a non-boot 2-disk raid1
> array with I/O running can result in a tight loop in which md claims to be
> resyncing the array.
> 
> It appears that remove_add_spares() in md.c contains two sets of conditions
> used to determine if there is a spare available.  The disk that was pulled
> has been marked 'faulty' in rdev->flags and its raid_disk value is >= 0.
> Since it is neither In_Sync nor Blocked, spares gets incremented and so md
> thinks there is a spare when in fact there is not.
> 
> One of my colleagues at Stratus proposed this patch, which rearranges the
> order of the tests and makes them mutually exclusive.  Running with this
> patch resolves the problem in our lab: we were able to run stress tests
> with surprise removals without incident.
> 
> Since neither of us is an md expert, we'd like feedback as to whether
> this patch is reasonable and whether it can be pushed upstream.

Hi,
 thanks for the report and the patch.

 However I don't think the patch really does what you want.

 The two tests are already mutually exclusive as one begins with
      raid_disk >= 0
 and the other with
      raid_disk < 0
 and neither changes raid_disk.

 The reason the patch has an effect is the 'break' that has been added.
 i.e. as soon as you find a normal working device you break out of the loop
 and stop looking for spares.

 I think the correct fix is simply:

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4332fc2..91e31e2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7088,6 +7088,7 @@ static int remove_and_add_spares(mddev_t *mddev)
 		list_for_each_entry(rdev, &mddev->disks, same_set) {
 			if (rdev->raid_disk >= 0 &&
 			    !test_bit(In_sync, &rdev->flags) &&
+			    !test_bit(Faulty, &rdev->flags) &&
 			    !test_bit(Blocked, &rdev->flags))
 				spares++;
 			if (rdev->raid_disk < 0


i.e. never consider a Faulty device to be a spare.

It looks like this bug was introduced by commit dfc70645000616777
in 2.6.26 when we allowed partially recovered devices to remain in the array
when a different device fails.

Can you please confirm that this patch removes your symptom?

Thanks,
NeilBrown


^ permalink raw reply related

* Re: Why do I get different results for 'mdadm --detail' & 'mdadm --examine' for the same array?
From: jeffs_linux @ 2011-06-16  1:47 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid
In-Reply-To: <20110616112243.22a20610@notabene.brown>

On Thu, 16 Jun 2011 11:22 +1000, "NeilBrown" <neilb@suse.de> wrote:
> > Ok.  I got beaten up for top posting once, and was told to 'keep things
> > in order'.  Next time, though -- question first, in BOTH the title and
> > the body.
> 
> :-)  You can't win, can you...

Not even checkers with my son ;-)

> When you use --examine, it tries to use the name that people might like.
> When you use --detail --scan it primarily has access to the names the
> kernel
> likes, so it used those.

Thanks for that.  Into my notes ...

> > cat /dev/.mdadm/map
> > ---------------------------------------------------------------
> > md126 0.90 19f2b21c:e54f9e1a:be5ad16e:9754ab5e /dev/md/0_0
> > md127 1.2 79fb7ad4:289bfae5:86c535ff:202960f2 /dev/md127
> > ---------------------------------------------------------------
> > 
> > mdadm --detail --scan
> >         ARRAY /dev/md127 metadata=1.2 name=jeffadm:jeffadm1
> >         UUID=d84afb64:e6fa2b64:ff21c975:f9765431
> >         ARRAY /dev/md/0_0 metadata=0.90
> >         UUID=19f2b21c:e54f9e1a:be5ad16e:9754ab5e
> 
> Yes, that it weird.  I don't know how they came to be out of sync.
> 
>   mdadm --incremental --rebuild-map
> 
> will fix it..

Ok.  This is officially the first time that I'll actually try to fix
anything on my 'production' array.

I'm reading the manpage -- again! -- and see both the "--incremental"
and "--rebuild-map" sections.  So I get what they do.

WHEN can/should I do it?  On my live running array while at runlevel 5? 
A lower runlevel?  From a separate boot disk?

jeff

^ permalink raw reply

* Re: md 3.2.1 and xfs kernel panic on Linux 2.6.38
From: NeilBrown @ 2011-06-16  1:55 UTC (permalink / raw)
  To: fibreraid@gmail.com; +Cc: linux-raid, linux-xfs
In-Reply-To: <BANLkTimCyX=fBuXfuYx30X7M-ninCULq4Q@mail.gmail.com>

On Sun, 12 Jun 2011 11:50:01 -0700 "fibreraid@gmail.com"
<fibreraid@gmail.com> wrote:

> Hi All,
> 
> I am benchmarking md RAID with XFS on a server running Linux 2.6.38
> kernel. The server has 24 x HDD's, dual 2.4GHz 6-core CPUs, and 24GB
> RAM.
> 
> I created an md0 array using RAID 5, 64k chunk, 23 active drives, and
> 1 hot-spare. I then created a LVM2 volume group from this md0, and
> created an LV out of it. The volume was formatted XFS as follows:
> 
> /sbin/mkfs.xfs -f -l lazy-count=1 -l size=128m -s size=4096
> /dev/mapper/pool1-vol1
> 
> I then mounted it as follows:
> 
> /dev/mapper/pool1-vol1 on /volumes/pool1/vol1 type xfs
> (rw,_netdev,noatime,nodiratime,osyncisdsync,nobarrier,logbufs=8,delaylog)
> 
> Once md synchronization was complete, I removed one of the active 23
> drives. After attempting some IO, the md0 array began to rebuild to
> the hot-spare. In a few hours, it was complete and the md0 array was
> listed as active and healthy again (though now lacking a hot-spare
> obviously).
> 
> As a test, I removed one more drive to see what would happen. As
> expected, mdadm reported the array as active but degraded, and since
> there was no hot-spare available, there was no rebuilding happening.
> 
....
> 
> What surprised me though is that I was no longer able to run IO on the
> md0 device. As a test, I am using fio to generate IO to the XFS
> mountpoint /volumes/pool1/vol1. However, IO failed. A few minutes
> later, I received the following kernel dumps in /var/log/messages. Any
> ideas?
> 
> 
> 
> Jun 12 11:33:54 TESTBA16 kernel: [59435.936575] fio             D
> ffff88060c6e1a50     0 30463      1 0x00000000
> Jun 12 11:33:54 TESTBA16 kernel: [59435.936578]  ffff880609887778
> 0000000000000086 0000000000000001 0000000000000086
> Jun 12 11:33:54 TESTBA16 kernel: [59435.936581]  0000000000011e40
> ffff88060c6e16c0 ffff88060c6e1a50 ffff880609887fd8
> Jun 12 11:33:54 TESTBA16 kernel: [59435.936583]  ffff88060c6e1a58
> 0000000000011e40 ffff880609886010 0000000000011e40
> Jun 12 11:33:54 TESTBA16 kernel: [59435.936586] Call Trace:
> Jun 12 11:33:54 TESTBA16 kernel: [59435.936594]  [<ffffffffa025e698>]
> make_request+0x138/0x3d0 [raid456]

> 
> The errors seem to be a combination of XFS and md related messages.
> Any insight into this issue would be greatly appreciated. Thanks!
> 

Very peculiar!

It appears that make_request in raid5.c is entering schedule() in an
uninterruptible wait.
There are 4 places where make_request calls schedule.
2 can only happen if the array is being reshaped (e.g. 5 drives to 6 drives)
but that does not appear to be happening.
1 causes an interruptible wait, so it cannot be that one.

That just leaves the one on line 4105.
This requires either that the stripe is being reshaped (which we already
decided isn't happening) or that md/raid5 has received overlapping requests.

i.e. while one request (either read or write) was pending, another request
(either read or write, not necessarily the same) arrives for a range of
sectors which over-laps the previous request.

When this happens (which it shouldn't because it would be dumb for a
filesystem to do that, but you never know) md/raid5 will wait for the first
request to be completely handled before letting the second proceed.
So we should be waiting here for at most a small fraction of a second.
Clearly we are waiting longer than that...

So this cannot possibly happen (as is so often the case when debugging :-)

Hmmm... maybe we are missing the wakeup call.  I can find where we wake-up
anyone waiting for an overlapping read request to complete, but I cannot find
where we wake-up someone waiting for when an overlapping write request
completes.  That should probably go in handle_stripe_clean_event.

Do you have the system still hanging in this state?  If not, can you get it
back into this state easily?
If so, you can force a wakeup with the magic incantation:

 cat /sys/block/mdXX/md/suspend_lo > /sys/block/mdXX/md/suspend_lo

(with 'XX' suitably substituted).

If that makes a difference, then I know I am on the right track

Thanks,
NeilBrown
--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: Why do I get different results for 'mdadm --detail' & 'mdadm --examine' for the same array?
From: NeilBrown @ 2011-06-16  2:00 UTC (permalink / raw)
  To: jeffs_linux; +Cc: linux-raid
In-Reply-To: <1308188856.14820.1463669409@webmail.messagingengine.com>

On Wed, 15 Jun 2011 18:47:36 -0700 jeffs_linux@123mail.org wrote:

> > > > cat /dev/.mdadm/map
> > > ---------------------------------------------------------------
> > > md126 0.90 19f2b21c:e54f9e1a:be5ad16e:9754ab5e /dev/md/0_0
> > > md127 1.2 79fb7ad4:289bfae5:86c535ff:202960f2 /dev/md127
> > > ---------------------------------------------------------------
> > > 
> > > mdadm --detail --scan
> > >         ARRAY /dev/md127 metadata=1.2 name=jeffadm:jeffadm1
> > >         UUID=d84afb64:e6fa2b64:ff21c975:f9765431
> > >         ARRAY /dev/md/0_0 metadata=0.90
> > >         UUID=19f2b21c:e54f9e1a:be5ad16e:9754ab5e
> > 
> > Yes, that it weird.  I don't know how they came to be out of sync.
> > 
> >   mdadm --incremental --rebuild-map
> > 
> > will fix it..
> 
> Ok.  This is officially the first time that I'll actually try to fix
> anything on my 'production' array.
> 
> I'm reading the manpage -- again! -- and see both the "--incremental"
> and "--rebuild-map" sections.  So I get what they do.
> 
> WHEN can/should I do it?  On my live running array while at runlevel 5? 
> A lower runlevel?  From a separate boot disk?
> 

Any time at all is fine.  The 'map' file is used to help with incremental
assembly of arrays.  When "mdadm -I" is given a device that looks like part of
an array it looks in the map file to find out if any of that array has already
been assembled.
So once everything is assembled it is not interesting any longer.

I think you can even just remove it.  If mdadm needs it and finds it doesn't
exist, it performs the equivalent of "mdadm --incremental --rebuild", then
tries again.

So it really is safe to run it at any time that you aren't actively rebooting
or plugging in new devices.

NeilBrown

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox