* [md PATCH 3/3] md: define mddev flags, recovery flags and r1bio state bits using enums
From: NeilBrown @ 2016-11-08 23:21 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <147864718560.1076.2148299631932240330.stgit@noble>
This is less error prone than using individual #defines.
Signed-off-by: NeilBrown <neilb@suse.com>
---
drivers/md/md.h | 76 +++++++++++++++++++++++++---------------------------
drivers/md/raid1.h | 18 +++++++-----
2 files changed, 46 insertions(+), 48 deletions(-)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 21bd94fad96a..af6b33c30d2d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -192,6 +192,25 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new);
struct md_cluster_info;
+enum mddev_flags {
+ MD_CHANGE_DEVS, /* Some device status has changed */
+ MD_CHANGE_CLEAN, /* transition to or from 'clean' */
+ MD_CHANGE_PENDING, /* switch from 'clean' to 'active' in progress */
+ MD_ARRAY_FIRST_USE, /* First use of array, needs initialization */
+ MD_CLOSING, /* If set, we are closing the array, do not open
+ * it then */
+ MD_JOURNAL_CLEAN, /* A raid with journal is already clean */
+ MD_HAS_JOURNAL, /* The raid array has journal feature set */
+ MD_RELOAD_SB, /* Reload the superblock because another node
+ * updated it.
+ */
+ MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node
+ * already took resync lock, need to
+ * release the lock */
+};
+#define MD_UPDATE_SB_FLAGS (BIT(MD_CHANGE_DEVS) | \
+ BIT(MD_CHANGE_CLEAN) | \
+ BIT(MD_CHANGE_PENDING)) /* If these are set, md_update_sb needed */
struct mddev {
void *private;
struct md_personality *pers;
@@ -199,21 +218,6 @@ struct mddev {
int md_minor;
struct list_head disks;
unsigned long flags;
-#define MD_CHANGE_DEVS 0 /* Some device status has changed */
-#define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */
-#define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */
-#define MD_UPDATE_SB_FLAGS (1 | 2 | 4) /* If these are set, md_update_sb needed */
-#define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */
-#define MD_CLOSING 4 /* If set, we are closing the array, do not open
- * it then */
-#define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */
-#define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */
-#define MD_RELOAD_SB 7 /* Reload the superblock because another node
- * updated it.
- */
-#define MD_CLUSTER_RESYNC_LOCKED 8 /* cluster raid only, which means node
- * already took resync lock, need to
- * release the lock */
int suspended;
atomic_t active_io;
@@ -307,31 +311,6 @@ struct mddev {
int parallel_resync;
int ok_start_degraded;
- /* recovery/resync flags
- * NEEDED: we might need to start a resync/recover
- * RUNNING: a thread is running, or about to be started
- * SYNC: actually doing a resync, not a recovery
- * RECOVER: doing recovery, or need to try it.
- * INTR: resync needs to be aborted for some reason
- * DONE: thread is done and is waiting to be reaped
- * REQUEST: user-space has requested a sync (used with SYNC)
- * CHECK: user-space request for check-only, no repair
- * RESHAPE: A reshape is happening
- * ERROR: sync-action interrupted because io-error
- *
- * If neither SYNC or RESHAPE are set, then it is a recovery.
- */
-#define MD_RECOVERY_RUNNING 0
-#define MD_RECOVERY_SYNC 1
-#define MD_RECOVERY_RECOVER 2
-#define MD_RECOVERY_INTR 3
-#define MD_RECOVERY_DONE 4
-#define MD_RECOVERY_NEEDED 5
-#define MD_RECOVERY_REQUESTED 6
-#define MD_RECOVERY_CHECK 7
-#define MD_RECOVERY_RESHAPE 8
-#define MD_RECOVERY_FROZEN 9
-#define MD_RECOVERY_ERROR 10
unsigned long recovery;
/* If a RAID personality determines that recovery (of a particular
@@ -445,6 +424,23 @@ struct mddev {
unsigned int good_device_nr; /* good device num within cluster raid */
};
+enum recovery_flags {
+ /*
+ * If neither SYNC or RESHAPE are set, then it is a recovery.
+ */
+ MD_RECOVERY_RUNNING, /* a thread is running, or about to be started */
+ MD_RECOVERY_SYNC, /* actually doing a resync, not a recovery */
+ MD_RECOVERY_RECOVER, /* doing recovery, or need to try it. */
+ MD_RECOVERY_INTR, /* resync needs to be aborted for some reason */
+ MD_RECOVERY_DONE, /* thread is done and is waiting to be reaped */
+ MD_RECOVERY_NEEDED, /* we might need to start a resync/recover */
+ MD_RECOVERY_REQUESTED, /* user-space has requested a sync (used with SYNC) */
+ MD_RECOVERY_CHECK, /* user-space request for check-only, no repair */
+ MD_RECOVERY_RESHAPE, /* A reshape is happening */
+ MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */
+ MD_RECOVERY_ERROR, /* sync-action interrupted because io-error */
+};
+
static inline int __must_check mddev_lock(struct mddev *mddev)
{
return mutex_lock_interruptible(&mddev->reconfig_mutex);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 61c39b390cd8..5ec19449779d 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -161,14 +161,15 @@ struct r1bio {
};
/* bits for r1bio.state */
-#define R1BIO_Uptodate 0
-#define R1BIO_IsSync 1
-#define R1BIO_Degraded 2
-#define R1BIO_BehindIO 3
+enum r1bio_state {
+ R1BIO_Uptodate,
+ R1BIO_IsSync,
+ R1BIO_Degraded,
+ R1BIO_BehindIO,
/* Set ReadError on bios that experience a readerror so that
* raid1d knows what to do with them.
*/
-#define R1BIO_ReadError 4
+ R1BIO_ReadError,
/* For write-behind requests, we call bi_end_io when
* the last non-write-behind device completes, providing
* any write was successful. Otherwise we call when
@@ -176,10 +177,11 @@ struct r1bio {
* with failure when last write completes (and all failed).
* Record that bi_end_io was called with this flag...
*/
-#define R1BIO_Returned 6
+ R1BIO_Returned,
/* If a write for this request means we can clear some
* known-bad-block records, we set this flag
*/
-#define R1BIO_MadeGood 7
-#define R1BIO_WriteError 8
+ R1BIO_MadeGood,
+ R1BIO_WriteError,
+};
#endif
^ permalink raw reply related
* [md PATCH 1/3] md/raid1: fix: IO can block resync indefinitely
From: NeilBrown @ 2016-11-08 23:21 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <147864718560.1076.2148299631932240330.stgit@noble>
While performing a resync/recovery, raid1 divides the
array space into three regions:
- before the resync
- at or shortly after the resync point
- much further ahead of the resync point.
Write requests to the first or third do not need to wait. Write
requests to the middle region do need to wait if resync requests are
pending.
If there are any active write requests in the middle region, resync
will wait for them.
Due to an accounting error, there is a small range of addresses,
between conf->next_resync and conf->start_next_window, where write
requests will *not* be blocked, but *will* be counted in the middle
region. This can effectively block resync indefinitely if filesystem
writes happen repeatedly to this region.
As ->next_window_requests is incremented when the sector is before
conf->start_next_window + NEXT_NORMALIO_DISTANCE
the same boundary should be used for determining when write requests
should wait.
Signed-off-by: NeilBrown <neilb@suse.com>
---
drivers/md/raid1.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index aac2a05cf8d1..9ac61cd85e5c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -834,7 +834,7 @@ static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
else if (conf->barrier && bio_data_dir(bio) == WRITE) {
if ((conf->mddev->curr_resync_completed
>= bio_end_sector(bio)) ||
- (conf->next_resync + NEXT_NORMALIO_DISTANCE
+ (conf->start_next_window + NEXT_NORMALIO_DISTANCE
<= bio->bi_iter.bi_sector))
wait = false;
else
^ permalink raw reply related
* [md PATCH 0/3] Three unrelated md patches.
From: NeilBrown @ 2016-11-08 23:21 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-raid
The first is a minor bug fix, the other two are just code improvements.
I have some patches to add "failfast" functionality, and I want to get
the small clean-ups out of the way first.
Thanks,
NeilBrown
---
NeilBrown (3):
md/raid1: fix: IO can block resync indefinitely
md: remove md_super_wait() call after bitmap_flush()
md: define mddev flags, recovery flags and r1bio state bits using enums
drivers/md/md.c | 1 -
drivers/md/md.h | 76 +++++++++++++++++++++++++---------------------------
drivers/md/raid1.c | 2 +
drivers/md/raid1.h | 18 +++++++-----
4 files changed, 47 insertions(+), 50 deletions(-)
--
Signature
^ permalink raw reply
* [md PATCH 2/3] md: remove md_super_wait() call after bitmap_flush()
From: NeilBrown @ 2016-11-08 23:21 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <147864718560.1076.2148299631932240330.stgit@noble>
bitmap_flush() finishes with bitmap_update_sb(), and that finishes
with write_page(..., 1), so write_page() will wait for all writes
to complete. So there is no point calling md_super_wait()
immediately afterwards.
Signed-off-by: NeilBrown <neilb@suse.com>
---
drivers/md/md.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f389d8abe137..1f1c7f007b68 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5472,7 +5472,6 @@ static void __md_stop_writes(struct mddev *mddev)
del_timer_sync(&mddev->safemode_timer);
bitmap_flush(mddev);
- md_super_wait(mddev);
if (mddev->ro == 0 &&
((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
^ permalink raw reply related
* Re: WARNING: mismatch_cnt is not 0 on <array device>
From: Wols Lists @ 2016-11-08 21:01 UTC (permalink / raw)
To: Phil Turmel, Benjammin2068, Linux-RAID
In-Reply-To: <1497737a-a307-4501-4158-9703a051ef67@turmel.org>
On 08/11/16 20:38, Phil Turmel wrote:
> Have you added up the peak current draws of your drives to make sure
> your power supply keeps up when all drives are writing simultaneously
> (common with parity raid)?
On that point, be aware that many power supplies quote the sum of the
power to all rails. It could well be that the supply is nominally plenty
powerful enough, but the load on an individual rail is too high.
Cheers,
Wol
^ permalink raw reply
* Re: WARNING: mismatch_cnt is not 0 on <array device>
From: Phil Turmel @ 2016-11-08 20:38 UTC (permalink / raw)
To: Benjammin2068, Linux-RAID
In-Reply-To: <287df6d6-3850-1142-5c69-c7b54a8a22d4@gmail.com>
On 11/08/2016 02:53 PM, Benjammin2068 wrote:
> On 11/08/2016 12:47 PM, Benjammin2068 wrote:
> Now that I think about it -- and have been talking out loud to myself (I don't think I'm crazy)...
>
> A parallel to all this is:
>
> I don't think the mismatch_cnt started showing up until I moved from RAID5 to RAID6.
>
> :O
>
> How painful is it to switch back to RAID5 to test that theory?
Don't. Sounds like raid6's stricter calculations are catching a real
problem. Do you have ECC RAM? If so, are you getting any machine check
exceptions? If not, have you done a thorough memtest any time in the
recent past?
If it's not memory, can you exercise the controller channels heavily to
see if they drop from errors?
Have you added up the peak current draws of your drives to make sure
your power supply keeps up when all drives are writing simultaneously
(common with parity raid)?
One more: do you have swap on top of md raid?
Phil
^ permalink raw reply
* Re: WARNING: mismatch_cnt is not 0 on <array device>
From: Benjammin2068 @ 2016-11-08 19:53 UTC (permalink / raw)
To: Linux-RAID
In-Reply-To: <0f6bd6f6-20ee-1720-23fc-27d206063bfc@gmail.com>
On 11/08/2016 12:47 PM, Benjammin2068 wrote:
> Hey all,
>
> I'm still trying to work through this..
>
> I've replaced the cables on the new AOC-SAS2LP-MV8 and the card itself (supermicro sent replacements)
>
>
> but I still occasionally have the mismatch_cnt error.. (like a week will go by and a weekend raid-check will happen with nothing.)
>
> So -- my question now (as recommended by someone else) is how to see if it's a drive issue somehow. (not that it's a "real" drive issue, but a card firmware issue that's aggravated by a drive's firmware somehow.)
>
> I've looked through the logs -- but how do I trace down a mismatch_cnt? I don't see anything in dmesg or messages....
>
> There are a couple 3Gb/s drives attached to this card.. maybe that's it? Who knows? But I don't have debug info to help me chase it down now that I'm trying to work it out with Supermicro tech support.
>
> Where do I look for more info about source or the event of the mismatch?
>
>
Now that I think about it -- and have been talking out loud to myself (I don't think I'm crazy)...
A parallel to all this is:
I don't think the mismatch_cnt started showing up until I moved from RAID5 to RAID6.
:O
How painful is it to switch back to RAID5 to test that theory?
-Ben
^ permalink raw reply
* Re: WARNING: mismatch_cnt is not 0 on <array device>
From: Benjammin2068 @ 2016-11-08 19:52 UTC (permalink / raw)
To: Linux-RAID
In-Reply-To: <0f6bd6f6-20ee-1720-23fc-27d206063bfc@gmail.com>
Hey all,
I'm still trying to work through this..
I've replaced the cables on the new AOC-SAS2LP-MV8 and the card itself (supermicro sent replacements)
but I still occasionally have the mismatch_cnt error.. (like a week will go by and a weekend raid-check will happen with nothing.)
So -- my question now (as recommended by someone else) is how to see if it's a drive issue somehow. (not that it's a "real" drive issue, but a card firmware issue that's aggravated by a drive's firmware somehow.)
I've looked through the logs -- but how do I trace down a mismatch_cnt? I don't see anything in dmesg or messages....
There are a couple 3Gb/s drives attached to this card.. maybe that's it? Who knows? But I don't have debug info to help me chase it down now that I'm trying to work it out with Supermicro tech support.
Where do I look for more info about source or the event of the mismatch?
Thanks,
-Ben
^ permalink raw reply
* Re: [PATCH 2/2] mdadm: raid10.c Remove near atomic break
From: Robert LeBlanc @ 2016-11-08 0:12 UTC (permalink / raw)
To: NeilBrown; +Cc: linux-raid
In-Reply-To: <87oa1snrch.fsf@notabene.neil.brown.name>
I'm not sure why 'near' performance can't be close to 'far'
performance. Here is the results from some tests I did today. These
are 6TB SAS drives that I filled with 5TB of fio data (took all day
Friday to fill them) so that I prevent short stroking drives.
The format of the name is
[NVME-]RAID(level)-(num_drives)[-parity_layout]. The NVME test was an
afterthought so there may be some variance between tests not seen in
the others. I usually do several tests and average the results and do
a distribution to get what is significant, but I didn't have a lot of
time.
Pre-patch clat (usec)
Seq io(MB) bw(KB/s) iops min max avg stdev
Single 12762 217801 54450 0 60462 17.71 156.30
RAID1-4 12903 220216 55053 0 42778 17.75 160.62
RAID10-4-n4 20057 342298 85574 0 50977 11.52 283.84
RAID10-4-f4 48711 831319 207829 0 74020 4.62 175.52
RAID10-3-n2 18439 314684 78671 0 61328 12.45 340.17
RAID10-3-f2 37169 634293 158573 0 65365 6.10 210.42
NVME-RAID10-4-n4 171950 2934682 733641 0 8016 1.16 14.15
NVME-RAID10-4-f4 172480 2943693 735903 0 7309 1.16 16.78
Post-patch
Seq
Single 12898 220118 55029 0 47805 17.85 159.62
RAID1-4 12895 220067 55016 0 51156 17.85 168.47
RAID10-4-n4 12797 218385 54596 0 65610 18.01 377.55
RAID10-4-f4 48751 832000 208000 0 90652 4.61 183.18
RAID10-3-n2 18656 318388 79596 0 62684 12.30 262.32
RAID10-3-f2 37181 634487 158621 0 72696 6.11 211.63
NVME-RAID10-4-n4 172738 2947174 737001 0 1057 1.16 13.08
NVME-RAID10-4-f4 188423 3215770 803926 0 1242 1.05 16.33
Pre-patch
Random
Single 19.5 333.3 83 1000 48000 12000 4010
RAID1-4 19.4 331.6 82 2000 49000 12060 4110
RAID10-4-n4 19.5 332.9 83 1000 38000 12010 4210
RAID10-4-f4 27.2 463.9 115 1000 50000 8620 3190
RAID10-3-n2 22.6 385.6 96 1000 44000 10370 3620
RAID10-3-f2 26.1 444.7 111 1000 366000 8990 5430
NVME-RAID10-4-n4 2458.3 41954.0 10488 77 414 94 10
NVME-RAID10-4-f4 2509.7 42830.0 10707 74 373 93 14
Post-patch
Random
Single 19.5 332.5 83 2000 37000 12020 4040
RAID1-4 19.4 331.0 82 2000 34000 12080 4070
RAID10-4-n4 27.0 460.6 115 178 50950 8678 3278
RAID10-4-f4 27.0 460.1 115 1000 43000 8690 3260
RAID10-3-n2 25.3 431.6 107 1000 46000 9260 3330
RAID10-3-f2 26.1 445.4 111 1000 44000 8970 3270
NVME-RAID10-4-n4 2334.5 39840.0 9960 47 308 100 13
NVME-RAID10-4-f4 2376.6 40551.0 10137 73 2675 97 18
With this patch, 'near' performance is almost exactly 'far'
performance for random reads. The sequential reads suffer from this
patch, but not worse than the RAID1 or bare drive. RAID10-4-n4 has
38% random performance increase, RAID10-3-n2 has 12% random read
performance increase and RAID10-4-n4 has 36% sequential performance
degradation where the RAID10-3-n2 seq performance has a 1% performance
increase (probably insignificant).
Interesting note:
Pre-patch Seq RAID10-4-n4 split the reads between the drives pretty
good and pre-patch random RAID10-4-n4 has all I/O going to one drive.
Post-patch these results are swapped with Seq RAID10-4-n4 being
serviced from a single drive and random RAID10-4-n4 spreading I/O to
all drives.
The patch doesn't really seem to impact NVME, there is possibly some
error in this test that throws doubts on the results in my mind since
both 'far' and 'near' have the same amount of change (~5%).
I hope this helps explain my reasoning. Just need to keep/improve the
original seq performance but get the improved random performance.
Robert LeBlanc
^ permalink raw reply
* Re: [md PATCH 4/4] md/bitmap: Don't write bitmap while earlier writes might be in-flight
From: Shaohua Li @ 2016-11-07 22:57 UTC (permalink / raw)
To: NeilBrown; +Cc: linux-raid
In-Reply-To: <87d1i7niva.fsf@notabene.neil.brown.name>
On Tue, Nov 08, 2016 at 07:19:05AM +1100, Neil Brown wrote:
> On Tue, Nov 08 2016, Shaohua Li wrote:
>
> > On Mon, Nov 07, 2016 at 09:53:42AM +1100, Neil Brown wrote:
> >> On Sat, Nov 05 2016, Shaohua Li wrote:
> >>
> >> > On Fri, Nov 04, 2016 at 04:46:03PM +1100, Neil Brown wrote:
> >> >> As we don't wait for writes to complete in bitmap_daemon_work, they
> >> >> could still be in-flight when bitmap_unplug writes again. Or when
> >> >> bitmap_daemon_work tries to write again.
> >> >> This can be confusing and could risk the wrong data being written last.
> >> >
> >> > Applied the first 3 patches, thanks!
> >> >
> >> > This one seems not completely solving the race condition. It's still possible
> >> > bitmap_daemon_work clears BITMAP_PAGE_NEEDWRITE but hasn't dispatch the IO yet,
> >> > bitmap_unplug then does nothing and thinks bitmap is updated to disk. Why don't
> >> > we add locking here?
> >>
> >> Thanks for the review!
> >>
> >> BITMAP_PAGE_NEEDWRITE is set for pages that need to be written out in
> >> order to clear bits that don't need to be set any more. There is never
> >> any urgency to do this.
> >> BITMAP_PAGE_DIRTY is set of pages that need to be written out in order
> >> to set bits representing regions that are about to be written to. These
> >> have to be flushed by bitmap_unplug().
> >> Pages can have both bits set, in which case bitmap_daemon_work() will
> >> leave them for bitmap_unplug() to deal with.
> >>
> >> So if bitmap_daemon_work() clears BITMAP_PAGE_PENDING on a page, then it
> >> is a page that bitmap_unplug() doesn't need to wait for.
> >
> > Oops, I misread the code. Yes, this is very clear, thanks for the explaination.
> >
> > So I can understand the patch fixes the confusion. when is there risk wrong
> > data is written?
>
> The goal is to avoid the possibility that two writes to the same
> location are in flight at the same time.
> It isn't clear that this would cause a problem, but it isn't clear that
> it is completely safe either.
> In general, this doesn't happen - certainly not from the page cache - so
> drivers will not be prepared for it.
> As an extreme example, suppose that the target device is a multi-path
> device.
> One write is sent and just after it is DMAed to one path, the in-memory
> page is changed and a second write is submitted, and the data is DMAed
> down the other path. Are we certain the two will be committed to
> storage in the intended order?
> Probably they will be. But maybe with a more complex stack, the chance
> of one IO overtaking the other (even after the first DMA) increases.
>
> So I cannot argue very strongly for this code, but it seems like a good
> idea and is reasonably simple....
I didn't try to argue about the patch, it makes things clear actually. Just
want to know where the data corruption comes from. I think I have an answer
now.
I'll add this patch.
Thanks,
Shaohua
^ permalink raw reply
* Re: [BUG 4.4.26] bio->bi_bdev == NULL in raid6 return_io()
From: Konstantin Khlebnikov @ 2016-11-07 20:34 UTC (permalink / raw)
To: Shaohua Li
Cc: Konstantin Khlebnikov, linux-kernel@vger.kernel.org, linux-raid,
linux-block, Neil Brown, Jens Axboe
In-Reply-To: <20161107194627.hsdk7zqoxznxdixl@kernel.org>
On Mon, Nov 7, 2016 at 10:46 PM, Shaohua Li <shli@kernel.org> wrote:
> On Sat, Nov 05, 2016 at 01:48:45PM +0300, Konstantin Khlebnikov wrote:
>> return_io() resolves request_queue even if trace point isn't active:
>>
>> static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
>> {
>> return bdev->bd_disk->queue; /* this is never NULL */
>> }
>>
>> static void return_io(struct bio_list *return_bi)
>> {
>> struct bio *bi;
>> while ((bi = bio_list_pop(return_bi)) != NULL) {
>> bi->bi_iter.bi_size = 0;
>> trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
>> bi, 0);
>> bio_endio(bi);
>> }
>> }
>
> I can't see how this could happen. What kind of tests/environment are these running?
That was a random piece of production somewhere.
According to the timing, all crashes happened soon after reboot.
There're several raids, probably some of them were still under resync.
For now we have only a few machines with this kernel. But I'm sure that
I'll get much more soon =)
>
> Thanks,
> Shaohua
>
>> kernel build with gcc version 4.6.3 (Ubuntu/Linaro 4.6.3-1ubuntu5) from ubuntu precise
>>
>> <6>[ 1659.710716] md: md2: resync done.
>> <6>[ 1659.968273] md: resync of RAID array md0
>> <6>[ 1659.968281] md: minimum _guaranteed_ speed: 1000 KB/sec/disk.
>> <6>[ 1659.968284] md: using maximum available idle IO bandwidth (but not more than 200000 KB/sec) for resync.
>> <6>[ 1659.968310] md: using 128k window, over a total of 16770816k.
>> <6>[ 1659.968311] md: resuming resync of md0 from checkpoint.
>> <7>[ 1659.968674] RAID conf printout:
>> <7>[ 1659.968678] --- level:6 rd:6 wd:6
>> <7>[ 1659.968680] disk 0, o:1, dev:sda3
>> <7>[ 1659.968682] disk 1, o:1, dev:sdc3
>> <7>[ 1659.968683] disk 2, o:1, dev:sdb3
>> <7>[ 1659.968684] disk 3, o:1, dev:sdd3
>> <7>[ 1659.968685] disk 4, o:1, dev:sde3
>> <7>[ 1659.968686] disk 5, o:1, dev:sdf3
>> <7>[ 1779.468199] RAID conf printout:
>> <7>[ 1779.468204] --- level:6 rd:6 wd:6
>> <7>[ 1779.468206] disk 0, o:1, dev:sda1
>> <7>[ 1779.468208] disk 1, o:1, dev:sdc1
>> <7>[ 1779.468209] disk 2, o:1, dev:sdb1
>> <7>[ 1779.468210] disk 3, o:1, dev:sdd1
>> <7>[ 1779.468211] disk 4, o:1, dev:sde1
>> <7>[ 1779.468212] disk 5, o:1, dev:sdf1
>> <1>[ 4658.730260] IP: return_io (include/linux/blkdev.h:825 drivers/md/raid5.c:231) raid456
>> <4>[ 4658.737189] PGD 0
>> <4>[ 4658.739452] Oops: 0000 [#1] SMP
>> <4>[ 4658.743080] Modules linked in: netconsole(E) configfs(E) unix_diag(E)
>> tcp_diag(E) inet_diag(E) ip6t_REJECT(E) nf_reject_ipv6(E) ip6table_filter(E)
>> ip6table_mangle(E) ip6_tables(E) ipt_R
>> EJECT(E) nf_reject_ipv4(E) iptable_filter(E) iptable_mangle(E) ip_tables(E)
>> x_tables(E) ipmi_devintf(E) nfsd(E) nfs_acl(E) auth_rpcgss(E) nfs(E)
>> fscache(E) lockd(E) sunrpc(E) grace(E) cls_u32
>> (E) sch_prio(E) ipmi_ssif(E) intel_rapl(E) iosf_mbi(E)
>> x86_pkg_temp_thermal(E) intel_powerclamp(E) 8021q(E) coretemp(E) mrp(E)
>> garp(E) stp(E) kvm_intel(E) llc(E) kvm(E) irqbypass(E) crc32_pcl
>> mul(E) sb_edac(E) serio_raw(E) joydev(E) input_leds(E) edac_core(E)
>> mei_me(E) mei(E) ioatdma(E) lpc_ich(E) ipmi_si(E) 8250_fintek(E)
>> ipmi_msghandler(E) shpchp(E) wmi(E) mac_hid(E) ip6_tunnel(
>> E) tunnel6(E) ipip(E) ip_tunnel(E) tunnel4(E) xfs(E)<4>[ 4658.822823]
>> raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E) async_pq(E)
>> async_xor(E) async_tx(E) xor(E) hid_generic(E) usb
>> hid(E) raid6_pq(E) libcrc32c(E) hid(E) igb(E) i2c_algo_bit(E) raid1(E)
>> isci(E) dca(E) raid0(E) ptp(E) multipath(E) libsas(E) ahci(E) pps_core(E)
>> scsi_transport_sas(E) psmouse(E) libahci(E) fj
>> es(E) linear(E)
>> <4>[ 4658.855131] CPU: 14 PID: 501 Comm: md2_raid6 Tainted: G E 4.4.26-9 #1
>> <4>[ 4658.863621] Hardware name: Supermicro X9DRW/X9DRW, BIOS 3.00 07/05/2013
>> <4>[ 4658.871041] task: ffff882035781a80 ti: ffff882033c08000 task.ti: ffff882033c08000
>> <4>[ 4658.879455] RIP: return_io (include/linux/blkdev.h:825 drivers/md/raid5.c:231) raid456
>> <4>[ 4658.889155] RSP: 0018:ffff882033c0bb18 EFLAGS: 00010246
>> <4>[ 4658.895118] RAX: 0000000000000000 RBX: ffff881ff22af2c0 RCX: ffff881ff22af4e0
>> <4>[ 4658.903122] RDX: 0000000000000000 RSI: ffff881ff22af2c0 RDI: ffff882033c0bc28
>> <4>[ 4658.911127] RBP: ffff882033c0bb48 R08: 0000000000000000 R09: 0000000000000000
>> <4>[ 4658.919130] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88203643db00
>> <4>[ 4658.927134] R13: 0000000000000006 R14: 0000000000000004 R15: ffff882033c0bc28
>> <4>[ 4658.935139] FS: 0000000000000000(0000) GS:ffff88203f380000(0000) knlGS:0000000000000000
>> <4>[ 4658.944233] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> <4>[ 4658.950683] CR2: 0000000000000098 CR3: 0000000001e0b000 CR4: 00000000000406e0
>> <4>[ 4658.958682] Stack:
>> <4>[ 4658.960952] ffff882033c0bb78 ffff881ff22af2c0 ffff8820354b0800 0000000000000006
>> <4>[ 4658.969329] 0000000000000004 0000000000000006 ffff882033c0bc88 ffffffffa015c0dd
>> <4>[ 4658.977697] 0000000000000000 ffff8820354b0a78 0000000000000000 0000000000000000
>> <4>[ 4658.986067] Call Trace:
>> <4>[ 4658.988827] handle_stripe (drivers/md/raid5.c:4635) raid456
>> <4>[ 4658.996156] ? default_wake_function (kernel/sched/core.c:3376)
>> <4>[ 4659.003190] ? autoremove_wake_function (kernel/sched/wait.c:295)
>> <4>[ 4659.010516] ? __wake_up_common (kernel/sched/wait.c:73)
>> <4>[ 4659.017065] handle_active_stripes.isra.49 (drivers/md/raid5.c:5776) raid456
>> <4>[ 4659.025877] raid5d (drivers/md/raid5.c:5889) raid456
>> <4>[ 4659.032428] ? _raw_spin_lock_irqsave
>> (./arch/x86/include/asm/paravirt.h:696 ./arch/x86/include/asm/qspinlock.h:28
>> include/asm-generic/qspinlock.h:102 include/linux/spinlock.h:155
>> include/linux/spinlock_api_smp.h:121 kernel/locking/spinlock.c:159)
>> <4>[ 4659.039559] md_thread (drivers/md/md.c:7099)
>> <4>[ 4659.045434] ? add_wait_queue (kernel/sched/wait.c:292)
>> <4>[ 4659.051786] ? md_rdev_init (drivers/md/md.c:7083)
>> <4>[ 4659.058140] kthread (kernel/kthread.c:209)
>> <4>[ 4659.063618] ? flush_kthread_worker (kernel/kthread.c:178)
>> <4>[ 4659.070554] ret_from_fork (arch/x86/entry/entry_64.S:469)
>> <4>[ 4659.076619] ? flush_kthread_worker (kernel/kthread.c:178)
>> <4>[ 4659.083553] Code: 83 ec 08 eb 41 49 8b 04 24 48 85 c0 49 89 07 0f 84
>> a3 00 00 00 49 8b 44 24 08 49 c7 04 24 00 00 00 00 41 c7 44 24 28 00 00 00
>> 00 <48> 8b 80 98 00 00 00 4c 8b a8 c0 03 00 00 66 66 66 66 90 4c 89
>> All code
>> ========
>> 0: 83 ec 08 sub $0x8,%esp
>> 3: eb 41 jmp 0x46
>> 5: 49 8b 04 24 mov (%r12),%rax
>> 9: 48 85 c0 test %rax,%rax
>> c: 49 89 07 mov %rax,(%r15)
>> f: 0f 84 a3 00 00 00 je 0xb8
>> 15: 49 8b 44 24 08 mov 0x8(%r12),%rax
>> 1a: 49 c7 04 24 00 00 00 movq $0x0,(%r12)
>> 21: 00
>> 22: 41 c7 44 24 28 00 00 movl $0x0,0x28(%r12)
>> 29: 00 00
>> 2b:* 48 8b 80 98 00 00 00 mov 0x98(%rax),%rax <-- trapping instruction
>> 32: 4c 8b a8 c0 03 00 00 mov 0x3c0(%rax),%r13
>> 39: 66 66 66 66 90 data32 data32 data32 xchg %ax,%ax
>> 3e: 4c rex.WR
>> 3f: 89 .byte 0x89
>>
>> Code starting with the faulting instruction
>> ===========================================
>> 0: 48 8b 80 98 00 00 00 mov 0x98(%rax),%rax
>> 7: 4c 8b a8 c0 03 00 00 mov 0x3c0(%rax),%r13
>> e: 66 66 66 66 90 data32 data32 data32 xchg %ax,%ax
>> 13: 4c rex.WR
>> 14: 89 .byte 0x89
>> <1>[ 4659.105577] RIP return_io (include/linux/blkdev.h:825 drivers/md/raid5.c:231) raid456
>>
>>
>> Couple times kernel failed second dereference
>>
>> <6>[ 1815.549178] md: md2: resync done.
>> <7>[ 1815.675433] RAID conf printout:
>> <7>[ 1815.675439] --- level:6 rd:6 wd:6
>> <7>[ 1815.675441] disk 0, o:1, dev:sda3
>> <7>[ 1815.675442] disk 1, o:1, dev:sdb3
>> <7>[ 1815.675443] disk 2, o:1, dev:sdc3
>> <7>[ 1815.675444] disk 3, o:1, dev:sdd3
>> <7>[ 1815.675445] disk 4, o:1, dev:sde3
>> <7>[ 1815.675446] disk 5, o:1, dev:sdf3
>>
>> <1>[ 2698.718595] IP: return_io (include/linux/blkdev.h:825 drivers/md/raid5.c:231) raid456
>> <4>[ 2698.725521] PGD 0
>> <4>[ 2698.727774] Oops: 0000 [#1] SMP
>> <4>[ 2698.731409] Modules linked in: netconsole(E) configfs(E) unix_diag(E)
>> tcp_diag(E) inet_diag(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E)
>> ip_tables(E) x_tables(E) ipmi_devintf(E
>> ) nfsd(E) nfs_acl(E) auth_rpcgss(E) nfs(E) fscache(E) lockd(E) sunrpc(E)
>> grace(E) cls_u32(E) sch_prio(E) ipmi_ssif(E) intel_rapl(E) iosf_mbi(E)
>> x86_pkg_temp_thermal(E) intel_powerclamp(E) cor
>> etemp(E) kvm_intel(E) kvm(E) 8021q(E) irqbypass(E) mrp(E) garp(E)
>> crc32_pclmul(E) stp(E) llc(E) serio_raw(E) input_leds(E) joydev(E)
>> sb_edac(E) edac_core(E) mei_me(E) lpc_ich(E) mei(E) ipmi_s
>> i(E) ioatdma(E) ipmi_msghandler(E) 8250_fintek(E) shpchp(E) wmi(E)
>> mac_hid(E) ip6_tunnel(E) tunnel6(E) ipip(E) ip_tunnel(E) tunnel4(E) xfs(E)
>> raid10(E) raid456(E) async_raid6_recov(E) async_m
>> emcpy(E) async_pq(E) async_xor(E) async_tx(E) xor(E)<4>[ 2698.811146]
>> hid_generic(E) raid6_pq(E) libcrc32c(E) usbhid(E) igb(E) hid(E)
>> i2c_algo_bit(E) raid1(E) isci(E) dca(E) raid0(E) libsas(
>> E) ahci(E) ptp(E) multipath(E) psmouse(E) libahci(E) scsi_transport_sas(E) pps_core(E) linear(E) fjes(E)
>> <4>[ 2698.833480] CPU: 2 PID: 514 Comm: md2_raid6 Tainted: G E 4.4.26-9 #1
>> <4>[ 2698.841845] Hardware name: Supermicro X9DRW/X9DRW, BIOS 3.0c 10/30/2014
>> <4>[ 2698.849241] task: ffff882033ec1a80 ti: ffff882033ef4000 task.ti: ffff882033ef4000
>> <4>[ 2698.857656] RIP: return_io (include/linux/blkdev.h:825 drivers/md/raid5.c:231) raid456
>> <4>[ 2698.867346] RSP: 0018:ffff882033ef7b18 EFLAGS: 00010246
>> <4>[ 2698.873307] RAX: 0000000000000000 RBX: ffff881fef26afd0 RCX: ffff881fef26b1f0
>> <4>[ 2698.881311] RDX: 0000000000000000 RSI: ffff881fef26afd0 RDI: ffff882033ef7c28
>> <4>[ 2698.889314] RBP: ffff882033ef7b48 R08: 0000000000000000 R09: 0000000000000000
>> <4>[ 2698.897319] R10: 0000000000000000 R11: 0000000000000000 R12: ffff880f9880cd00
>> <4>[ 2698.905322] R13: 0000000000000006 R14: 0000000000000004 R15: ffff882033ef7c28
>> <4>[ 2698.913327] FS: 0000000000000000(0000) GS:ffff88103fa80000(0000) knlGS:0000000000000000
>> <4>[ 2698.922420] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>> <4>[ 2698.928869] CR2: 00000000000003c0 CR3: 0000000001e0b000 CR4: 00000000000406e0
>> <4>[ 2698.936871] Stack:
>> <4>[ 2698.939153] ffff882033ef7b78 ffff881fef26afd0 ffff8810355cbc00 0000000000000006
>> <4>[ 2698.947512] 0000000000000004 0000000000000006 ffff882033ef7c88 ffffffffa01970dd
>> <4>[ 2698.955867] 0000000000000000 ffff8810355cbe78 0000000000000000 0000000000000000
>> <4>[ 2698.964222] Call Trace:
>> <4>[ 2698.966982] handle_stripe (drivers/md/raid5.c:4635) raid456
>> <4>[ 2698.974318] ? default_wake_function (kernel/sched/core.c:3376)
>> <4>[ 2698.981350] ? autoremove_wake_function (kernel/sched/wait.c:295)
>> <4>[ 2698.988665] ? __wake_up_common (kernel/sched/wait.c:73)
>> <4>[ 2698.995214] handle_active_stripes.isra.49 (drivers/md/raid5.c:5776) raid456
>> <4>[ 2699.004020] raid5d (drivers/md/raid5.c:5889) raid456
>> <4>[ 2699.010571] ? _raw_spin_lock_irqsave
>> (./arch/x86/include/asm/paravirt.h:696 ./arch/x86/include/asm/qspinlock.h:28
>> include/asm-generic/qspinlock.h:102 include/linux/spinlock.h:155
>> include/linux/spinlock_api_smp.h:121 kernel/locking/spinlock.c:159)
>> <4>[ 2699.017711] md_thread (drivers/md/md.c:7099)
>> <4>[ 2699.023592] ? add_wait_queue (kernel/sched/wait.c:292)
>> <4>[ 2699.029952] ? md_rdev_init (drivers/md/md.c:7083)
>> <4>[ 2699.036305] kthread (kernel/kthread.c:209)
>> <4>[ 2699.041783] ? flush_kthread_worker (kernel/kthread.c:178)
>> <4>[ 2699.048719] ret_from_fork (arch/x86/entry/entry_64.S:469)
>> <4>[ 2699.054779] ? flush_kthread_worker (kernel/kthread.c:178)
>> <4>[ 2699.061713] Code: 04 24 48 85 c0 49 89 07 0f 84 a3 00 00 00 49 8b 44
>> 24 08 49 c7 04 24 00 00 00 00 41 c7 44 24 28 00 00 00 00 48 8b 80 98 00 00
>> 00 <4c> 8b a8 c0 03 00 00 66 66 66 66 90 4c 89 e7 e8 f4 09 20 e1 4d
>> All code
>> ========
>> 0: 04 24 add $0x24,%al
>> 2: 48 85 c0 test %rax,%rax
>> 5: 49 89 07 mov %rax,(%r15)
>> 8: 0f 84 a3 00 00 00 je 0xb1
>> e: 49 8b 44 24 08 mov 0x8(%r12),%rax
>> 13: 49 c7 04 24 00 00 00 movq $0x0,(%r12)
>> 1a: 00
>> 1b: 41 c7 44 24 28 00 00 movl $0x0,0x28(%r12)
>> 22: 00 00
>> 24: 48 8b 80 98 00 00 00 mov 0x98(%rax),%rax
>> 2b:* 4c 8b a8 c0 03 00 00 mov 0x3c0(%rax),%r13 <-- trapping instruction
>> 32: 66 66 66 66 90 data32 data32 data32 xchg %ax,%ax
>> 37: 4c 89 e7 mov %r12,%rdi
>> 3a: e8 f4 09 20 e1 callq 0xffffffffe1200a33
>> 3f: 4d rex.WRB
>>
>> Code starting with the faulting instruction
>> ===========================================
>> 0: 4c 8b a8 c0 03 00 00 mov 0x3c0(%rax),%r13
>> 7: 66 66 66 66 90 data32 data32 data32 xchg %ax,%ax
>> c: 4c 89 e7 mov %r12,%rdi
>> f: e8 f4 09 20 e1 callq 0xffffffffe1200a08
>> 14: 4d rex.WRB
>>
>>
>> --
>> Konstantin
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [md PATCH 4/4] md/bitmap: Don't write bitmap while earlier writes might be in-flight
From: NeilBrown @ 2016-11-07 20:19 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <20161107191947.mzdkby6kbqpqssbv@kernel.org>
[-- Attachment #1: Type: text/plain, Size: 2622 bytes --]
On Tue, Nov 08 2016, Shaohua Li wrote:
> On Mon, Nov 07, 2016 at 09:53:42AM +1100, Neil Brown wrote:
>> On Sat, Nov 05 2016, Shaohua Li wrote:
>>
>> > On Fri, Nov 04, 2016 at 04:46:03PM +1100, Neil Brown wrote:
>> >> As we don't wait for writes to complete in bitmap_daemon_work, they
>> >> could still be in-flight when bitmap_unplug writes again. Or when
>> >> bitmap_daemon_work tries to write again.
>> >> This can be confusing and could risk the wrong data being written last.
>> >
>> > Applied the first 3 patches, thanks!
>> >
>> > This one seems not completely solving the race condition. It's still possible
>> > bitmap_daemon_work clears BITMAP_PAGE_NEEDWRITE but hasn't dispatch the IO yet,
>> > bitmap_unplug then does nothing and thinks bitmap is updated to disk. Why don't
>> > we add locking here?
>>
>> Thanks for the review!
>>
>> BITMAP_PAGE_NEEDWRITE is set for pages that need to be written out in
>> order to clear bits that don't need to be set any more. There is never
>> any urgency to do this.
>> BITMAP_PAGE_DIRTY is set of pages that need to be written out in order
>> to set bits representing regions that are about to be written to. These
>> have to be flushed by bitmap_unplug().
>> Pages can have both bits set, in which case bitmap_daemon_work() will
>> leave them for bitmap_unplug() to deal with.
>>
>> So if bitmap_daemon_work() clears BITMAP_PAGE_PENDING on a page, then it
>> is a page that bitmap_unplug() doesn't need to wait for.
>
> Oops, I misread the code. Yes, this is very clear, thanks for the explaination.
>
> So I can understand the patch fixes the confusion. when is there risk wrong
> data is written?
The goal is to avoid the possibility that two writes to the same
location are in flight at the same time.
It isn't clear that this would cause a problem, but it isn't clear that
it is completely safe either.
In general, this doesn't happen - certainly not from the page cache - so
drivers will not be prepared for it.
As an extreme example, suppose that the target device is a multi-path
device.
One write is sent and just after it is DMAed to one path, the in-memory
page is changed and a second write is submitted, and the data is DMAed
down the other path. Are we certain the two will be committed to
storage in the intended order?
Probably they will be. But maybe with a more complex stack, the chance
of one IO overtaking the other (even after the first DMA) increases.
So I cannot argue very strongly for this code, but it seems like a good
idea and is reasonably simple....
Thanks,
NeilBrown
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]
^ permalink raw reply
* Re: [BUG 4.4.26] bio->bi_bdev == NULL in raid6 return_io()
From: Shaohua Li @ 2016-11-07 19:46 UTC (permalink / raw)
To: Konstantin Khlebnikov
Cc: linux-kernel@vger.kernel.org, linux-raid, linux-block, Neil Brown,
Jens Axboe
In-Reply-To: <251e243a-ebcd-ae83-0850-a2143d2423ca@yandex-team.ru>
On Sat, Nov 05, 2016 at 01:48:45PM +0300, Konstantin Khlebnikov wrote:
> return_io() resolves request_queue even if trace point isn't active:
>
> static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
> {
> return bdev->bd_disk->queue; /* this is never NULL */
> }
>
> static void return_io(struct bio_list *return_bi)
> {
> struct bio *bi;
> while ((bi = bio_list_pop(return_bi)) != NULL) {
> bi->bi_iter.bi_size = 0;
> trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
> bi, 0);
> bio_endio(bi);
> }
> }
I can't see how this could happen. What kind of tests/environment are these running?
Thanks,
Shaohua
> kernel build with gcc version 4.6.3 (Ubuntu/Linaro 4.6.3-1ubuntu5) from ubuntu precise
>
> <6>[ 1659.710716] md: md2: resync done.
> <6>[ 1659.968273] md: resync of RAID array md0
> <6>[ 1659.968281] md: minimum _guaranteed_ speed: 1000 KB/sec/disk.
> <6>[ 1659.968284] md: using maximum available idle IO bandwidth (but not more than 200000 KB/sec) for resync.
> <6>[ 1659.968310] md: using 128k window, over a total of 16770816k.
> <6>[ 1659.968311] md: resuming resync of md0 from checkpoint.
> <7>[ 1659.968674] RAID conf printout:
> <7>[ 1659.968678] --- level:6 rd:6 wd:6
> <7>[ 1659.968680] disk 0, o:1, dev:sda3
> <7>[ 1659.968682] disk 1, o:1, dev:sdc3
> <7>[ 1659.968683] disk 2, o:1, dev:sdb3
> <7>[ 1659.968684] disk 3, o:1, dev:sdd3
> <7>[ 1659.968685] disk 4, o:1, dev:sde3
> <7>[ 1659.968686] disk 5, o:1, dev:sdf3
> <7>[ 1779.468199] RAID conf printout:
> <7>[ 1779.468204] --- level:6 rd:6 wd:6
> <7>[ 1779.468206] disk 0, o:1, dev:sda1
> <7>[ 1779.468208] disk 1, o:1, dev:sdc1
> <7>[ 1779.468209] disk 2, o:1, dev:sdb1
> <7>[ 1779.468210] disk 3, o:1, dev:sdd1
> <7>[ 1779.468211] disk 4, o:1, dev:sde1
> <7>[ 1779.468212] disk 5, o:1, dev:sdf1
> <1>[ 4658.730260] IP: return_io (include/linux/blkdev.h:825 drivers/md/raid5.c:231) raid456
> <4>[ 4658.737189] PGD 0
> <4>[ 4658.739452] Oops: 0000 [#1] SMP
> <4>[ 4658.743080] Modules linked in: netconsole(E) configfs(E) unix_diag(E)
> tcp_diag(E) inet_diag(E) ip6t_REJECT(E) nf_reject_ipv6(E) ip6table_filter(E)
> ip6table_mangle(E) ip6_tables(E) ipt_R
> EJECT(E) nf_reject_ipv4(E) iptable_filter(E) iptable_mangle(E) ip_tables(E)
> x_tables(E) ipmi_devintf(E) nfsd(E) nfs_acl(E) auth_rpcgss(E) nfs(E)
> fscache(E) lockd(E) sunrpc(E) grace(E) cls_u32
> (E) sch_prio(E) ipmi_ssif(E) intel_rapl(E) iosf_mbi(E)
> x86_pkg_temp_thermal(E) intel_powerclamp(E) 8021q(E) coretemp(E) mrp(E)
> garp(E) stp(E) kvm_intel(E) llc(E) kvm(E) irqbypass(E) crc32_pcl
> mul(E) sb_edac(E) serio_raw(E) joydev(E) input_leds(E) edac_core(E)
> mei_me(E) mei(E) ioatdma(E) lpc_ich(E) ipmi_si(E) 8250_fintek(E)
> ipmi_msghandler(E) shpchp(E) wmi(E) mac_hid(E) ip6_tunnel(
> E) tunnel6(E) ipip(E) ip_tunnel(E) tunnel4(E) xfs(E)<4>[ 4658.822823]
> raid10(E) raid456(E) async_raid6_recov(E) async_memcpy(E) async_pq(E)
> async_xor(E) async_tx(E) xor(E) hid_generic(E) usb
> hid(E) raid6_pq(E) libcrc32c(E) hid(E) igb(E) i2c_algo_bit(E) raid1(E)
> isci(E) dca(E) raid0(E) ptp(E) multipath(E) libsas(E) ahci(E) pps_core(E)
> scsi_transport_sas(E) psmouse(E) libahci(E) fj
> es(E) linear(E)
> <4>[ 4658.855131] CPU: 14 PID: 501 Comm: md2_raid6 Tainted: G E 4.4.26-9 #1
> <4>[ 4658.863621] Hardware name: Supermicro X9DRW/X9DRW, BIOS 3.00 07/05/2013
> <4>[ 4658.871041] task: ffff882035781a80 ti: ffff882033c08000 task.ti: ffff882033c08000
> <4>[ 4658.879455] RIP: return_io (include/linux/blkdev.h:825 drivers/md/raid5.c:231) raid456
> <4>[ 4658.889155] RSP: 0018:ffff882033c0bb18 EFLAGS: 00010246
> <4>[ 4658.895118] RAX: 0000000000000000 RBX: ffff881ff22af2c0 RCX: ffff881ff22af4e0
> <4>[ 4658.903122] RDX: 0000000000000000 RSI: ffff881ff22af2c0 RDI: ffff882033c0bc28
> <4>[ 4658.911127] RBP: ffff882033c0bb48 R08: 0000000000000000 R09: 0000000000000000
> <4>[ 4658.919130] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88203643db00
> <4>[ 4658.927134] R13: 0000000000000006 R14: 0000000000000004 R15: ffff882033c0bc28
> <4>[ 4658.935139] FS: 0000000000000000(0000) GS:ffff88203f380000(0000) knlGS:0000000000000000
> <4>[ 4658.944233] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> <4>[ 4658.950683] CR2: 0000000000000098 CR3: 0000000001e0b000 CR4: 00000000000406e0
> <4>[ 4658.958682] Stack:
> <4>[ 4658.960952] ffff882033c0bb78 ffff881ff22af2c0 ffff8820354b0800 0000000000000006
> <4>[ 4658.969329] 0000000000000004 0000000000000006 ffff882033c0bc88 ffffffffa015c0dd
> <4>[ 4658.977697] 0000000000000000 ffff8820354b0a78 0000000000000000 0000000000000000
> <4>[ 4658.986067] Call Trace:
> <4>[ 4658.988827] handle_stripe (drivers/md/raid5.c:4635) raid456
> <4>[ 4658.996156] ? default_wake_function (kernel/sched/core.c:3376)
> <4>[ 4659.003190] ? autoremove_wake_function (kernel/sched/wait.c:295)
> <4>[ 4659.010516] ? __wake_up_common (kernel/sched/wait.c:73)
> <4>[ 4659.017065] handle_active_stripes.isra.49 (drivers/md/raid5.c:5776) raid456
> <4>[ 4659.025877] raid5d (drivers/md/raid5.c:5889) raid456
> <4>[ 4659.032428] ? _raw_spin_lock_irqsave
> (./arch/x86/include/asm/paravirt.h:696 ./arch/x86/include/asm/qspinlock.h:28
> include/asm-generic/qspinlock.h:102 include/linux/spinlock.h:155
> include/linux/spinlock_api_smp.h:121 kernel/locking/spinlock.c:159)
> <4>[ 4659.039559] md_thread (drivers/md/md.c:7099)
> <4>[ 4659.045434] ? add_wait_queue (kernel/sched/wait.c:292)
> <4>[ 4659.051786] ? md_rdev_init (drivers/md/md.c:7083)
> <4>[ 4659.058140] kthread (kernel/kthread.c:209)
> <4>[ 4659.063618] ? flush_kthread_worker (kernel/kthread.c:178)
> <4>[ 4659.070554] ret_from_fork (arch/x86/entry/entry_64.S:469)
> <4>[ 4659.076619] ? flush_kthread_worker (kernel/kthread.c:178)
> <4>[ 4659.083553] Code: 83 ec 08 eb 41 49 8b 04 24 48 85 c0 49 89 07 0f 84
> a3 00 00 00 49 8b 44 24 08 49 c7 04 24 00 00 00 00 41 c7 44 24 28 00 00 00
> 00 <48> 8b 80 98 00 00 00 4c 8b a8 c0 03 00 00 66 66 66 66 90 4c 89
> All code
> ========
> 0: 83 ec 08 sub $0x8,%esp
> 3: eb 41 jmp 0x46
> 5: 49 8b 04 24 mov (%r12),%rax
> 9: 48 85 c0 test %rax,%rax
> c: 49 89 07 mov %rax,(%r15)
> f: 0f 84 a3 00 00 00 je 0xb8
> 15: 49 8b 44 24 08 mov 0x8(%r12),%rax
> 1a: 49 c7 04 24 00 00 00 movq $0x0,(%r12)
> 21: 00
> 22: 41 c7 44 24 28 00 00 movl $0x0,0x28(%r12)
> 29: 00 00
> 2b:* 48 8b 80 98 00 00 00 mov 0x98(%rax),%rax <-- trapping instruction
> 32: 4c 8b a8 c0 03 00 00 mov 0x3c0(%rax),%r13
> 39: 66 66 66 66 90 data32 data32 data32 xchg %ax,%ax
> 3e: 4c rex.WR
> 3f: 89 .byte 0x89
>
> Code starting with the faulting instruction
> ===========================================
> 0: 48 8b 80 98 00 00 00 mov 0x98(%rax),%rax
> 7: 4c 8b a8 c0 03 00 00 mov 0x3c0(%rax),%r13
> e: 66 66 66 66 90 data32 data32 data32 xchg %ax,%ax
> 13: 4c rex.WR
> 14: 89 .byte 0x89
> <1>[ 4659.105577] RIP return_io (include/linux/blkdev.h:825 drivers/md/raid5.c:231) raid456
>
>
> Couple times kernel failed second dereference
>
> <6>[ 1815.549178] md: md2: resync done.
> <7>[ 1815.675433] RAID conf printout:
> <7>[ 1815.675439] --- level:6 rd:6 wd:6
> <7>[ 1815.675441] disk 0, o:1, dev:sda3
> <7>[ 1815.675442] disk 1, o:1, dev:sdb3
> <7>[ 1815.675443] disk 2, o:1, dev:sdc3
> <7>[ 1815.675444] disk 3, o:1, dev:sdd3
> <7>[ 1815.675445] disk 4, o:1, dev:sde3
> <7>[ 1815.675446] disk 5, o:1, dev:sdf3
>
> <1>[ 2698.718595] IP: return_io (include/linux/blkdev.h:825 drivers/md/raid5.c:231) raid456
> <4>[ 2698.725521] PGD 0
> <4>[ 2698.727774] Oops: 0000 [#1] SMP
> <4>[ 2698.731409] Modules linked in: netconsole(E) configfs(E) unix_diag(E)
> tcp_diag(E) inet_diag(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E)
> ip_tables(E) x_tables(E) ipmi_devintf(E
> ) nfsd(E) nfs_acl(E) auth_rpcgss(E) nfs(E) fscache(E) lockd(E) sunrpc(E)
> grace(E) cls_u32(E) sch_prio(E) ipmi_ssif(E) intel_rapl(E) iosf_mbi(E)
> x86_pkg_temp_thermal(E) intel_powerclamp(E) cor
> etemp(E) kvm_intel(E) kvm(E) 8021q(E) irqbypass(E) mrp(E) garp(E)
> crc32_pclmul(E) stp(E) llc(E) serio_raw(E) input_leds(E) joydev(E)
> sb_edac(E) edac_core(E) mei_me(E) lpc_ich(E) mei(E) ipmi_s
> i(E) ioatdma(E) ipmi_msghandler(E) 8250_fintek(E) shpchp(E) wmi(E)
> mac_hid(E) ip6_tunnel(E) tunnel6(E) ipip(E) ip_tunnel(E) tunnel4(E) xfs(E)
> raid10(E) raid456(E) async_raid6_recov(E) async_m
> emcpy(E) async_pq(E) async_xor(E) async_tx(E) xor(E)<4>[ 2698.811146]
> hid_generic(E) raid6_pq(E) libcrc32c(E) usbhid(E) igb(E) hid(E)
> i2c_algo_bit(E) raid1(E) isci(E) dca(E) raid0(E) libsas(
> E) ahci(E) ptp(E) multipath(E) psmouse(E) libahci(E) scsi_transport_sas(E) pps_core(E) linear(E) fjes(E)
> <4>[ 2698.833480] CPU: 2 PID: 514 Comm: md2_raid6 Tainted: G E 4.4.26-9 #1
> <4>[ 2698.841845] Hardware name: Supermicro X9DRW/X9DRW, BIOS 3.0c 10/30/2014
> <4>[ 2698.849241] task: ffff882033ec1a80 ti: ffff882033ef4000 task.ti: ffff882033ef4000
> <4>[ 2698.857656] RIP: return_io (include/linux/blkdev.h:825 drivers/md/raid5.c:231) raid456
> <4>[ 2698.867346] RSP: 0018:ffff882033ef7b18 EFLAGS: 00010246
> <4>[ 2698.873307] RAX: 0000000000000000 RBX: ffff881fef26afd0 RCX: ffff881fef26b1f0
> <4>[ 2698.881311] RDX: 0000000000000000 RSI: ffff881fef26afd0 RDI: ffff882033ef7c28
> <4>[ 2698.889314] RBP: ffff882033ef7b48 R08: 0000000000000000 R09: 0000000000000000
> <4>[ 2698.897319] R10: 0000000000000000 R11: 0000000000000000 R12: ffff880f9880cd00
> <4>[ 2698.905322] R13: 0000000000000006 R14: 0000000000000004 R15: ffff882033ef7c28
> <4>[ 2698.913327] FS: 0000000000000000(0000) GS:ffff88103fa80000(0000) knlGS:0000000000000000
> <4>[ 2698.922420] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> <4>[ 2698.928869] CR2: 00000000000003c0 CR3: 0000000001e0b000 CR4: 00000000000406e0
> <4>[ 2698.936871] Stack:
> <4>[ 2698.939153] ffff882033ef7b78 ffff881fef26afd0 ffff8810355cbc00 0000000000000006
> <4>[ 2698.947512] 0000000000000004 0000000000000006 ffff882033ef7c88 ffffffffa01970dd
> <4>[ 2698.955867] 0000000000000000 ffff8810355cbe78 0000000000000000 0000000000000000
> <4>[ 2698.964222] Call Trace:
> <4>[ 2698.966982] handle_stripe (drivers/md/raid5.c:4635) raid456
> <4>[ 2698.974318] ? default_wake_function (kernel/sched/core.c:3376)
> <4>[ 2698.981350] ? autoremove_wake_function (kernel/sched/wait.c:295)
> <4>[ 2698.988665] ? __wake_up_common (kernel/sched/wait.c:73)
> <4>[ 2698.995214] handle_active_stripes.isra.49 (drivers/md/raid5.c:5776) raid456
> <4>[ 2699.004020] raid5d (drivers/md/raid5.c:5889) raid456
> <4>[ 2699.010571] ? _raw_spin_lock_irqsave
> (./arch/x86/include/asm/paravirt.h:696 ./arch/x86/include/asm/qspinlock.h:28
> include/asm-generic/qspinlock.h:102 include/linux/spinlock.h:155
> include/linux/spinlock_api_smp.h:121 kernel/locking/spinlock.c:159)
> <4>[ 2699.017711] md_thread (drivers/md/md.c:7099)
> <4>[ 2699.023592] ? add_wait_queue (kernel/sched/wait.c:292)
> <4>[ 2699.029952] ? md_rdev_init (drivers/md/md.c:7083)
> <4>[ 2699.036305] kthread (kernel/kthread.c:209)
> <4>[ 2699.041783] ? flush_kthread_worker (kernel/kthread.c:178)
> <4>[ 2699.048719] ret_from_fork (arch/x86/entry/entry_64.S:469)
> <4>[ 2699.054779] ? flush_kthread_worker (kernel/kthread.c:178)
> <4>[ 2699.061713] Code: 04 24 48 85 c0 49 89 07 0f 84 a3 00 00 00 49 8b 44
> 24 08 49 c7 04 24 00 00 00 00 41 c7 44 24 28 00 00 00 00 48 8b 80 98 00 00
> 00 <4c> 8b a8 c0 03 00 00 66 66 66 66 90 4c 89 e7 e8 f4 09 20 e1 4d
> All code
> ========
> 0: 04 24 add $0x24,%al
> 2: 48 85 c0 test %rax,%rax
> 5: 49 89 07 mov %rax,(%r15)
> 8: 0f 84 a3 00 00 00 je 0xb1
> e: 49 8b 44 24 08 mov 0x8(%r12),%rax
> 13: 49 c7 04 24 00 00 00 movq $0x0,(%r12)
> 1a: 00
> 1b: 41 c7 44 24 28 00 00 movl $0x0,0x28(%r12)
> 22: 00 00
> 24: 48 8b 80 98 00 00 00 mov 0x98(%rax),%rax
> 2b:* 4c 8b a8 c0 03 00 00 mov 0x3c0(%rax),%r13 <-- trapping instruction
> 32: 66 66 66 66 90 data32 data32 data32 xchg %ax,%ax
> 37: 4c 89 e7 mov %r12,%rdi
> 3a: e8 f4 09 20 e1 callq 0xffffffffe1200a33
> 3f: 4d rex.WRB
>
> Code starting with the faulting instruction
> ===========================================
> 0: 4c 8b a8 c0 03 00 00 mov 0x3c0(%rax),%r13
> 7: 66 66 66 66 90 data32 data32 data32 xchg %ax,%ax
> c: 4c 89 e7 mov %r12,%rdi
> f: e8 f4 09 20 e1 callq 0xffffffffe1200a08
> 14: 4d rex.WRB
>
>
> --
> Konstantin
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* Re: [md PATCH 4/4] md/bitmap: Don't write bitmap while earlier writes might be in-flight
From: Shaohua Li @ 2016-11-07 19:19 UTC (permalink / raw)
To: NeilBrown; +Cc: linux-raid
In-Reply-To: <87r36onrt5.fsf@notabene.neil.brown.name>
On Mon, Nov 07, 2016 at 09:53:42AM +1100, Neil Brown wrote:
> On Sat, Nov 05 2016, Shaohua Li wrote:
>
> > On Fri, Nov 04, 2016 at 04:46:03PM +1100, Neil Brown wrote:
> >> As we don't wait for writes to complete in bitmap_daemon_work, they
> >> could still be in-flight when bitmap_unplug writes again. Or when
> >> bitmap_daemon_work tries to write again.
> >> This can be confusing and could risk the wrong data being written last.
> >
> > Applied the first 3 patches, thanks!
> >
> > This one seems not completely solving the race condition. It's still possible
> > bitmap_daemon_work clears BITMAP_PAGE_NEEDWRITE but hasn't dispatch the IO yet,
> > bitmap_unplug then does nothing and thinks bitmap is updated to disk. Why don't
> > we add locking here?
>
> Thanks for the review!
>
> BITMAP_PAGE_NEEDWRITE is set for pages that need to be written out in
> order to clear bits that don't need to be set any more. There is never
> any urgency to do this.
> BITMAP_PAGE_DIRTY is set of pages that need to be written out in order
> to set bits representing regions that are about to be written to. These
> have to be flushed by bitmap_unplug().
> Pages can have both bits set, in which case bitmap_daemon_work() will
> leave them for bitmap_unplug() to deal with.
>
> So if bitmap_daemon_work() clears BITMAP_PAGE_PENDING on a page, then it
> is a page that bitmap_unplug() doesn't need to wait for.
Oops, I misread the code. Yes, this is very clear, thanks for the explanation.
So I understand that the patch fixes the confusion. When is there a risk that
the wrong data is written?
Thanks,
Shaohua
^ permalink raw reply
* Re: RAID10 with 2 drives auto-assembled as RAID1
From: Darko Luketic @ 2016-11-07 17:13 UTC (permalink / raw)
To: Phil Turmel, linux-raid
In-Reply-To: <c2c0659b-9dd6-1f5f-9309-8a47ea3400f4@turmel.org>
On 11/07/2016 02:49 PM, Phil Turmel wrote:
> Good morning,
>
Good morning Phil
> On 11/07/2016 08:36 AM, Darko Luketic wrote:
>> On 11/05/2016 11:54 PM, Phil Turmel wrote:
>>> Assuming you had an ext2/3/4 filesystem in the array, try this
>>> one-liner:
>>>
>>> for x in /dev/sd[ef]1; do echo -e "\nDevice $x"; dd if=$x bs=1M
>>> count=16k |hexdump -C |egrep '^[0-9a-f]+30 .+ 53 ef'; done
>
>> There are many lines of output.
>> Meanwhile I went the commercial route. Managed so save all important
>> stuff but the software (rdata product version 6 for Windows) didn't do
>> it properly.
>> I'll have to start a new restore.
>> I also used a free product for gnu/linux called R-Linux which is able to
>> find most dirs and files but hangs at 1.03TB and is unable to find the
>> most important .thunderbird dir .
>> Also tried sleuthkit with autopsy2 without success.
>
> Ok.
>
>> For now I can cope with the 128GB ssd as a temporary solution, but life
>> must go on and in the long run I need more space and to use the drives.
>>
>> What was I searching for with this chain of commands?
>
> Magic number for an ext2/3/4 filesystem at a suitable offset within a
> sector.
>
>> Device /dev/sde1
>> 07f00430 b5 9b 1c 58 7a 02 ff ff 53 ef 01 00 01 00 00 00
>> |...Xz...S.......|
>
> This is a candidate, with a last write timestamp of Fri, 04 Nov 2016
> 14:31:17 GMT.
>
>
>> 0ff00030 c4 36 78 52 00 00 ff ff 53 ef 00 00 01 00 00 00
>> |.6xR....S.......|
>
> And another candidate with a last write @ Tue, 05 Nov 2013 00:07:32 GMT.
>
>> 1ff00030 c4 36 78 52 00 00 ff ff 53 ef 00 00 01 00 00 00
>> |.6xR....S.......|
>
> And this one appears to be a backup superblock for the 2013 filesystem.
> The spacing between these two make me wonder if they are both backups
> for a superblock that's been overwritten.
>
> I'd suggest sharing these emails with the Ext4 mailing list to see if
> you can get some more specific recovery help. I'd say the odds are fair
> to good.
>
> Phil
Yes (probably) the array was created in 2013.
Thank you so much for your help Phil.
Darko
^ permalink raw reply
* Re:
From: Wols Lists @ 2016-11-07 17:13 UTC (permalink / raw)
To: Dennis Dataopslag, linux-raid
In-Reply-To: <5820B0F0.2050306@youngman.org.uk>
On 07/11/16 16:50, Wols Lists wrote:
> You're looking at a big forensic job. I've moved the relevant page to
> the archaeology area - probably a bit too soon - but you need to read
> the following page
>
> https://raid.wiki.kernel.org/index.php/Reconstruction
>
> Especially the bit about overlays. And wait for the experts to chime in
> about how to do a hexdump and work out the values you need to pass to
> mdadm to get the array back. It's a lot of work and you could be looking
> at a week what with the delays as you wait for replies.
Whoops, sorry. Wrong page, you need this one ...
https://raid.wiki.kernel.org/index.php/Recovering_a_failed_software_RAID
Cheers,
Wol
^ permalink raw reply
* Re:
From: Wols Lists @ 2016-11-07 16:50 UTC (permalink / raw)
To: Dennis Dataopslag, linux-raid
In-Reply-To: <CAGELip+rb=Ztv2K8jPdn608Li_RakEq9Gp5Jyof9eC_bO5QMrQ@mail.gmail.com>
On 06/11/16 21:00, Dennis Dataopslag wrote:
> Help wanted very much!
Quick response ...
>
> My setup:
> Thecus N5550 NAS with 5 1TB drives installed.
>
> MD0: RAID 5 config of 4 drives (SD[ABCD]2)
> MD10: RAID 1 config of all 5 drives (SD..1), system generated array
> MD50: RAID 1 config of 4 drives (SD[ABCD]3), system generated array
>
> 1 drive (SDE) set as global hot spare.
>
Bit late now, but you would probably have been better with raid-6.
>
> What happened:
> This weekend I thought it might be a good idea to do a SMART test for
> the drives in my NAS.
> I started the test on 1 drive and after it ran for a while I started
> the other ones.
> While the test was running drive 3 failed. I got a message the RAID
> was degraded and started rebuilding. (My assumption is that at this
> moment the global hot spare will automatically be added to the array)
>
> I stopped the SMART tests of all drives at this moment since it seemed
> logical to me the SMART test (or the outcomes) made the drive fail.
> In stopping the tests, drive 1 also failed!!
> I let it for a little but the admin interface kept telling me it was
> degraded, did not seem to take any actions to start rebuilding.
It can't - there's no spare drive to rebuild on, and there aren't enough
drives to build a working array.
> At this point I started googling and found I should remove and reseat
> the drives. This is also what I did but nothing seemd to happen.
> The turned up as new drives in the admin interface and I re-added them
> to the array, they were added as spares.
> Even after adding them the array didn't start rebuilding.
> I checked stat in mdadm and it told me clean FAILED opposed to the
> degraded in the admin interface.
Yup. You've only got two drives of a four-drive raid 5.
Where did you google? Did you read the linux raid wiki?
https://raid.wiki.kernel.org/index.php/Linux_Raid
>
> I rebooted the NAS since it didn't seem to be doing anything I might interrupt.
> after rebooting it seemed as if the entire array had disappeared!!
> I started looking for options in MDADM and tried every "normal"option
> to rebuild the array (--assemble --scan for example)
> Unfortunately I cannot produce a complete list since I cannot find how
> to get it from the logging.
>
> Finally I mdadm --create a new array with the original 4 drives with
> all the right settings. (Got them from 1 of the original volumes)
OUCH OUCH OUCH!
Are you sure you've got the right settings? A lot of "hidden" settings
have changed their values over the years. Do you know which mdadm was
used to create the array in the first place?
> The creation worked but after creation it doesn't seem to have a valid
> partition table. This is the point where I realized I probably fucked
> it up big-time and should call in the help squad!!!
> What I think went wrong is that I re-created an array with the
> original 4 drives from before the first failure but the hot-spare was
> already added?
Nope. You've probably used a newer version of mdadm. That's assuming the
array is still all the original drives. If some of them have been
replaced you've got a still messier problem.
>
> The most important data from the array is saved in an offline backup
> luckily but I would very much like it if there is any way I could
> restore the data from the array.
>
> Is there any way I could get it back online?
You're looking at a big forensic job. I've moved the relevant page to
the archaeology area - probably a bit too soon - but you need to read
the following page
https://raid.wiki.kernel.org/index.php/Reconstruction
Especially the bit about overlays. And wait for the experts to chime in
about how to do a hexdump and work out the values you need to pass to
mdadm to get the array back. It's a lot of work and you could be looking
at a week what with the delays as you wait for replies.
I think it's recoverable. Is it worth it?
Cheers,
Wol
^ permalink raw reply
* Re: RAID10 with 2 drives auto-assembled as RAID1
From: Phil Turmel @ 2016-11-07 13:49 UTC (permalink / raw)
To: Darko Luketic, linux-raid
In-Reply-To: <904643ba-b712-0c8a-249d-a78e7fc68876@luketic.de>
Good morning,
On 11/07/2016 08:36 AM, Darko Luketic wrote:
> On 11/05/2016 11:54 PM, Phil Turmel wrote:
>> Assuming you had an ext2/3/4 filesystem in the array, try this
>> one-liner:
>>
>> for x in /dev/sd[ef]1; do echo -e "\nDevice $x"; dd if=$x bs=1M
>> count=16k |hexdump -C |egrep '^[0-9a-f]+30 .+ 53 ef'; done
> There are many lines of output.
> Meanwhile I went the commercial route. Managed so save all important
> stuff but the software (rdata product version 6 for Windows) didn't do
> it properly.
> I'll have to start a new restore.
> I also used a free product for gnu/linux called R-Linux which is able to
> find most dirs and files but hangs at 1.03TB and is unable to find the
> most important .thunderbird dir .
> Also tried sleuthkit with autopsy2 without success.
Ok.
> For now I can cope with the 128GB ssd as a temporary solution, but life
> must go on and in the long run I need more space and to use the drives.
>
> What was I searching for with this chain of commands?
Magic number for an ext2/3/4 filesystem at a suitable offset within a
sector.
> Device /dev/sde1
> 07f00430 b5 9b 1c 58 7a 02 ff ff 53 ef 01 00 01 00 00 00
> |...Xz...S.......|
This is a candidate, with a last write timestamp of Fri, 04 Nov 2016
14:31:17 GMT.
> 0ff00030 c4 36 78 52 00 00 ff ff 53 ef 00 00 01 00 00 00
> |.6xR....S.......|
And another candidate with a last write @ Tue, 05 Nov 2013 00:07:32 GMT.
> 1ff00030 c4 36 78 52 00 00 ff ff 53 ef 00 00 01 00 00 00
> |.6xR....S.......|
And this one appears to be a backup superblock for the 2013 filesystem.
The spacing between these two makes me wonder if they are both backups
for a superblock that's been overwritten.
I'd suggest sharing these emails with the Ext4 mailing list to see if
you can get some more specific recovery help. I'd say the odds are fair
to good.
Phil
^ permalink raw reply
* Re: MD Remnants After --stop
From: NeilBrown @ 2016-11-07 5:44 UTC (permalink / raw)
To: Marc Smith, linux-raid
In-Reply-To: <CAHkw+Lf1SErbGro4bq5MM5LyB-Zqyqi4E90R7c+uAZHv1WgSrA@mail.gmail.com>
[-- Attachment #1: Type: text/plain, Size: 1989 bytes --]
On Sat, Nov 05 2016, Marc Smith wrote:
> Hi,
>
> It may be that I've never noticed this before, so maybe its not a
> problem... after using '--stop' to deactivate/stop an MD array, there
> are remnants of it lingering, namely an entry in /sys/block (eg,
> /sys/block/md127) and the device node in /dev remains (eg,
> /dev/md127).
>
> Is this normal? Like I said, it probably is, and I've just never
> noticed it before. I assume its not going to hurt anything, but is
> there a way to clean it up, without rebooting? Obviously I could
> remove the /dev entry, but what about /sys/block?
>
You can remove them both by running
mdadm -S /dev/md127
but they'll probably just reappear again.
This seems to be an on-going battle between md and udev. I've "fixed"
it at least once, but it keeps coming back.
When md removes the md127 device, a message is sent to udev.
As part of its response to this message, udev tries to open /dev/md127.
Because of the rather unusual way that md devices are created (it made
sense nearly 20 years ago when it was designed), opening /dev/md127
causes md to create device md127 again.
You could
mv /dev/md127 /dev/md127X
mdadm -S /dev/md127X
rm /dev/md127X
that stops udev from opening /dev/md127. It seems to work reliably.
md used to generate a CHANGE event before the REMOVE event, and only the
CHANGE event caused udev to open the device file. I removed that and
the problem went away. Apparently some change has happened to udev and
now it opens the file in response to REMOVE as well.
So to "fix" it again, you need to figure out what udev is doing and fix
that.
Alternately... place "create names=yes" in your mdadm.conf
and always use names, not numbers, to work with md arrays.
e.g. /dev/md/home /dev/md/root /dev/md/scratch etc.
This will trigger the use of an alternate scheme for creating md devices
(using minor numbers >= 512) which udev cannot break so easily. When it
tries to open /dev/md_home, that will fail.
NeilBrown
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]
^ permalink raw reply
* Re: clearing blocks wrongfully marked as bad if --update=no-bbl can't be used?
From: Phil Turmel @ 2016-11-07 3:36 UTC (permalink / raw)
To: Marc MERLIN, NeilBrown
Cc: Roman Mamedov, Neil Brown, Andreas Klauer, linux-raid
In-Reply-To: <20161107011342.fld53ntd3djrctb2@merlins.org>
On 11/06/2016 08:13 PM, Marc MERLIN wrote:
> On Mon, Nov 07, 2016 at 11:16:56AM +1100, NeilBrown wrote:
>> Is this a 32bit system you are using? Such systems can only support
>> buffered IO up to 8TB. If you use iflags=direct to avoid buffering, you
>> should get access to the whole device.
>
> You found the problem, and you also found the reason why btrfs_tools
> also fails past 8GB. It is indeed a 32bit distro. If I put a 64bit
> kernel with the 32bit userland, there is a weird problem with a sound
> driver/video driver sync, so I've stuck with 32bits.
Huh. Learn something new every day, I suppose. Never would have
thought of this. Thanks, Neil.
Phil
^ permalink raw reply
* Re: clearing blocks wrongfully marked as bad if --update=no-bbl can't be used?
From: Marc MERLIN @ 2016-11-07 1:13 UTC (permalink / raw)
To: NeilBrown
Cc: Roman Mamedov, Phil Turmel, Neil Brown, Andreas Klauer,
linux-raid
In-Reply-To: <87lgwwnnyf.fsf@notabene.neil.brown.name>
On Mon, Nov 07, 2016 at 11:16:56AM +1100, NeilBrown wrote:
> On Sat, Nov 05 2016, Marc MERLIN wrote:
> >
> > What's interesting is that it started exactly at 50%, which is also
> > likely where my reads were failing.
> >
> > myth:/sys/block/md5/md# echo repair > sync_action
> >
> > md5 : active raid5 sdg1[0] sdd1[5] sde1[3] sdf1[2] sdh1[6]
> > 15627542528 blocks super 1.2 level 5, 512k chunk, algorithm 2 [5/5] [UUUUU]
> > [==========>..........] resync = 50.0% (1953925916/3906885632) finish=1899.1min speed=17138K/sec
> > bitmap: 0/30 pages [0KB], 65536KB chunk
>
> Yep, that is weird.
>
> You can cause that to happen by e.g
> echo 7813771264 > /sys/block/md5/md/sync_min
>
> but you are unlikely to have done that deliberately.
I might have done this by mistake instead of sync_speed_min, but as you
say, unlikely. Then again, this is not the main problem and I think you
did find the reason below.
> s_maxbytes will be MAX_LFS_FILESIZE which, on a 32bit system, is
>
> #define MAX_LFS_FILESIZE (((loff_t)PAGE_SIZE << (BITS_PER_LONG-1))-1)
>
> That is 2^(12+31) or 2^43 or 8TB.
>
> Is this a 32bit system you are using? Such systems can only support
> buffered IO up to 8TB. If you use iflags=direct to avoid buffering, you
> should get access to the whole device.
You found the problem, and you also found the reason why btrfs_tools
also fails past 8TB. It is indeed a 32bit distro. If I put a 64bit
kernel with the 32bit userland, there is a weird problem with a sound
driver/video driver sync, so I've stuck with 32bits.
This also explains why my btrfs filesystem mounts perfectly because the
kernel knows how to deal with it, but as soon as I use btrfs check
(32bits), it fails to access data past the 8TB limit, and falls on its
face too.
myth:/sys/block/md5/md# dd if=/dev/md5 of=/dev/null bs=1GiB skip=8190
dd: reading `/dev/md5': Invalid argument
2+0 records in
2+0 records out
2147483648 bytes (2.1 GB) copied, 37.0785 s, 57.9 MB/s
myth:/sys/block/md5/md# dd if=/dev/md5 of=/dev/null bs=1GiB skip=8190 count=3 iflag=direct
3+0 records in
3+0 records out
3221225472 bytes (3.2 GB) copied, 41.0663 s, 78.4 MB/s
So a big thanks for solving this mystery.
Marc
--
"A mouse is a device used to point at the xterm you want to type in" - A.S.R.
Microsoft is to operating systems ....
.... what McDonalds is to gourmet cooking
Home page: http://marc.merlins.org/ | PGP 1024R/763BE901
^ permalink raw reply
* Re: clearing blocks wrongfully marked as bad if --update=no-bbl can't be used?
From: NeilBrown @ 2016-11-07 0:16 UTC (permalink / raw)
To: Marc MERLIN, Roman Mamedov
Cc: Phil Turmel, Neil Brown, Andreas Klauer, linux-raid
In-Reply-To: <20161104195127.ymenm7ezmhscbzn6@merlins.org>
[-- Attachment #1: Type: text/plain, Size: 1652 bytes --]
On Sat, Nov 05 2016, Marc MERLIN wrote:
>
> What's interesting is that it started exactly at 50%, which is also
> likely where my reads were failing.
>
> myth:/sys/block/md5/md# echo repair > sync_action
>
> md5 : active raid5 sdg1[0] sdd1[5] sde1[3] sdf1[2] sdh1[6]
> 15627542528 blocks super 1.2 level 5, 512k chunk, algorithm 2 [5/5] [UUUUU]
> [==========>..........] resync = 50.0% (1953925916/3906885632) finish=1899.1min speed=17138K/sec
> bitmap: 0/30 pages [0KB], 65536KB chunk
Yep, that is weird.
You can cause that to happen by e.g
echo 7813771264 > /sys/block/md5/md/sync_min
but you are unlikely to have done that deliberately.
>
> That said, as this resync is processing, I'd think/hope it would move
> the error forward, but it does not seem to:
> myth:/sys/block/md5/md# dd if=/dev/md5 of=/dev/null bs=1GiB skip=8190
> dd: reading `/dev/md5': Invalid argument
> 2+0 records in
> 2+0 records out
> 2147483648 bytes (2.1 GB) copied, 27.8491 s, 77.1 MB/s
EINVAL from a read() system call is surprising in this context.....
do_generic_file_read can return it:
if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
return -EINVAL;
s_maxbytes will be MAX_LFS_FILESIZE which, on a 32bit system, is
#define MAX_LFS_FILESIZE (((loff_t)PAGE_SIZE << (BITS_PER_LONG-1))-1)
That is 2^(12+31) or 2^43 or 8TB.
Is this a 32bit system you are using? Such systems can only support
buffered IO up to 8TB. If you use iflag=direct to avoid buffering, you
should get access to the whole device.
If this is a 64bit system, then the problem must be elsewhere.
NeilBrown
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]
^ permalink raw reply
* Re: [PATCH 2/2] mdadm: raid10.c Remove near atomic break
From: NeilBrown @ 2016-11-06 23:03 UTC (permalink / raw)
To: Robert LeBlanc; +Cc: linux-raid
In-Reply-To: <CAANLjFrXjS3HX-qv_FAXBapD6PE-n+y=d=6kzCwO3pSjajafWQ@mail.gmail.com>
[-- Attachment #1: Type: text/plain, Size: 2916 bytes --]
On Fri, Nov 04 2016, Robert LeBlanc wrote:
> On Thu, Nov 3, 2016 at 10:01 PM, NeilBrown <neilb@suse.com> wrote:
>> On Fri, Nov 04 2016, Robert LeBlanc wrote:
>>
>>> This is always triggered for small reads preventing spreading the reads
>>> across all available drives. The comments are also confusing as it is
>>> supposed to apply only to 'far' layouts, but really only applies to 'near'
>>> layouts. Since there isn't problems with 'far' layouts, there shouldn't
>>> be a problem for 'near' layouts either. This change fairly distributes
>>> reads across all drives where before only came from the first drive.
>>
>> Why is "fairness" an issue?
>> The current code will use a device if it finds that it is completely
>> idle. i.e. if nr_pending is 0.
>> Why is that ever the wrong thing to do?
>
> The code also looks for a drive that is closest to the requested
> sector which doesn't get a chance to happen without this patch. The
> way this part of code is written, as soon as it finds a good disk, it
> cuts out of the loop searching for a better disk. So it doesn't even
> look for another disk. In a healthy array with array-disks X and -p
> nX, this means that the first disk gets all the reads for small I/O.
> Where nY is less than X, it may be covered up because the data is
> naturally striped, but it still may be picking a disk that is farther
> away from the selected sector causing extra head seeks.
>
>> Does your testing show that overall performance is improved? If so,
>> that would certainly be useful.
>> But it isn't clear (to me) that simply spreading the load more "fairly"
>> is a worthy goal.
>
> I'll see if I have some mechanical drives somewhere to test (I've been
> testing four loopback devices on a single NVME drive so you don't see
> an improvement). You can see from the fio I posted [1] that before the
> patch, one drive had all the I/O and after the patch the I/O was
> distributed between all the drives (it doesn't have to be exactly
> even, just not as skewed as it was before is good enough). I would
> expect similar results to the 'far' tests done here [0]. Based on the
> previous tests I did, when I saw this code, it just made complete
> sense to me why we had great performance with 'far' and subpar
> performance with 'near'. I'll come back with some results tomorrow.
The whole point of the "far" layout is to provide better read
performance than "near" because reads get spread out over all devices
like they do with RAID0. So I wouldn't expect this to make "near" close to
"far", but it certainly doesn't hurt to improve it.
As Shaohua notes, we use two different algorithms in RAID1, depending on
whether there are rotational devices or not. Introducing similar logic
to RAID10 would seem to make sense.
Patches for performance improvements are always more convincing when
they come with numbers. I'll look forward to your test results.
Thanks,
NeilBrown
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]
^ permalink raw reply
* Re: [md PATCH 4/4] md/bitmap: Don't write bitmap while earlier writes might be in-flight
From: NeilBrown @ 2016-11-06 22:53 UTC (permalink / raw)
To: Shaohua Li; +Cc: linux-raid
In-Reply-To: <20161105003325.qnmcu363ocbli74x@kernel.org>
[-- Attachment #1: Type: text/plain, Size: 1345 bytes --]
On Sat, Nov 05 2016, Shaohua Li wrote:
> On Fri, Nov 04, 2016 at 04:46:03PM +1100, Neil Brown wrote:
>> As we don't wait for writes to complete in bitmap_daemon_work, they
>> could still be in-flight when bitmap_unplug writes again. Or when
>> bitmap_daemon_work tries to write again.
>> This can be confusing and could risk the wrong data being written last.
>
> Applied the first 3 patches, thanks!
>
> This one seems not completely solving the race condition. It's still possible
> bitmap_daemon_work clears BITMAP_PAGE_NEEDWRITE but hasn't dispatch the IO yet,
> bitmap_unplug then does nothing and thinks bitmap is updated to disk. Why don't
> we add locking here?
Thanks for the review!
BITMAP_PAGE_NEEDWRITE is set for pages that need to be written out in
order to clear bits that don't need to be set any more. There is never
any urgency to do this.
BITMAP_PAGE_DIRTY is set for pages that need to be written out in order
to set bits representing regions that are about to be written to. These
have to be flushed by bitmap_unplug().
Pages can have both bits set, in which case bitmap_daemon_work() will
leave them for bitmap_unplug() to deal with.
So if bitmap_daemon_work() clears BITMAP_PAGE_PENDING on a page, then it
is a page that bitmap_unplug() doesn't need to wait for.
Does that answer your concerns?
Thanks,
NeilBrown
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]
^ permalink raw reply
* (unknown),
From: Dennis Dataopslag @ 2016-11-06 21:00 UTC (permalink / raw)
To: linux-raid
Help wanted very much!
My setup:
Thecus N5550 NAS with 5 1TB drives installed.
MD0: RAID 5 config of 4 drives (SD[ABCD]2)
MD10: RAID 1 config of all 5 drives (SD..1), system generated array
MD50: RAID 1 config of 4 drives (SD[ABCD]3), system generated array
1 drive (SDE) set as global hot spare.
What happened:
This weekend I thought it might be a good idea to do a SMART test for
the drives in my NAS.
I started the test on 1 drive and after it ran for a while I started
the other ones.
While the test was running drive 3 failed. I got a message the RAID
was degraded and started rebuilding. (My assumption is that at this
moment the global hot spare will automatically be added to the array)
I stopped the SMART tests of all drives at this moment since it seemed
logical to me the SMART test (or the outcomes) made the drive fail.
In stopping the tests, drive 1 also failed!!
I left it for a little while but the admin interface kept telling me it was
degraded, did not seem to take any actions to start rebuilding.
At this point I started googling and found I should remove and reseat
the drives. This is also what I did but nothing seemed to happen.
They turned up as new drives in the admin interface and I re-added them
to the array, they were added as spares.
Even after adding them the array didn't start rebuilding.
I checked stat in mdadm and it told me clean FAILED as opposed to the
degraded in the admin interface.
I rebooted the NAS since it didn't seem to be doing anything I might interrupt.
after rebooting it seemed as if the entire array had disappeared!!
I started looking for options in MDADM and tried every "normal" option
to rebuild the array (--assemble --scan for example)
Unfortunately I cannot produce a complete list since I cannot find how
to get it from the logging.
Finally I mdadm --create a new array with the original 4 drives with
all the right settings. (Got them from 1 of the original volumes)
The creation worked but after creation it doesn't seem to have a valid
partition table. This is the point where I realized I probably fucked
it up big-time and should call in the help squad!!!
What I think went wrong is that I re-created an array with the
original 4 drives from before the first failure but the hot-spare was
already added?
The most important data from the array is saved in an offline backup
luckily but I would very much like it if there is any way I could
restore the data from the array.
Is there any way I could get it back online?
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox