Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [PATCH 3/5] r5cache: naive reclaim approach
From: Song Liu @ 2016-08-31 22:18 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb, shli, kernel-team, dan.j.williams, hch, Song Liu
In-Reply-To: <1472681902-1172317-1-git-send-email-songliubraving@fb.com>

This patch adds a naive reclaim for r5c cache.

There are two limited resources: stripe cache and journal disk space.
For better performance, we prioritize reclaim of stripes with more data
in cache. To free up more journal space, we free the earliest data on
the journal.

In current implementation, reclaim decision is made in two places:
at the end of cached write, and from r5l_reclaim_thread.

At the end of every cached write, we check whether we should reclaim
this stripe. Specifically, the stripe is reclaimed if any of these holds:
 1. it is a full stripe
 2. more than 50% of the stripe cache space is used by cached stripes
 3. it is occupying a large chunk of journal space

The reclaim thread (r5l_reclaim_thread) wakes up every 5 seconds. In
this thread, r5c_do_reclaim reclaims stripe cache space, while
r5l_do_reclaim reclaims journal space.

When resources are not limited, r5c_do_reclaim will do nothing.
Otherwise, r5c_do_reclaim walks through r5c_cached_list and freezes
up to R5C_RECLAIM_STRIPE_GROUP (set to 8) stripes.

r5c_cache keeps all data in cache (not fully committed to RAID) in
a list (stripe_in_cache). These stripes are in the order of their
first appearance on the journal. So the log tail (last_checkpoint)
should point to the journal_start of the first item in the list.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c | 167 ++++++++++++++++++++++++++++++++++++++++++-----
 drivers/md/raid5.c       |  14 +++-
 drivers/md/raid5.h       |   2 +
 3 files changed, 166 insertions(+), 17 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 78eeb6df..68f1470 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -34,6 +34,10 @@
 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
 
+/* wake up reclaim thread periodically */
+#define R5C_RECLAIM_WAKEUP_INTERVAL (5 * HZ)
+/* reclaim stripes in groups */
+#define R5C_RECLAIM_STRIPE_GROUP  8
 /*
  * We only need 2 bios per I/O unit to make progress, but ensure we
  * have a few more available to not get too tight.
@@ -109,6 +113,9 @@ struct r5l_log {
 
 	/* for r5c_cache */
 	enum r5c_state r5c_state;
+	struct list_head stripe_in_cache; /* all stripes in the cache, with
+					   * sh->log_start in order */
+	spinlock_t stripe_in_cache_lock;  /* lock for stripe_in_cache */
 };
 
 /*
@@ -462,6 +469,7 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 	int meta_size;
 	int ret;
 	struct r5l_io_unit *io;
+	unsigned long flags;
 
 	meta_size =
 		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
@@ -505,6 +513,14 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 	atomic_inc(&io->pending_stripe);
 	sh->log_io = io;
 
+	if (sh->log_start == MaxSector) {
+		BUG_ON(!list_empty(&sh->r5c));
+		sh->log_start = io->log_start;
+		spin_lock_irqsave(&log->stripe_in_cache_lock, flags);
+		list_add_tail(&sh->r5c,
+			      &log->stripe_in_cache);
+		spin_unlock_irqrestore(&log->stripe_in_cache_lock, flags);
+	}
 	return 0;
 }
 
@@ -705,15 +721,69 @@ static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
 	wake_up(&log->iounit_wait);
 }
 
+/*
+ * Check whether we want to reclaim this stripe.
+ * Return true if the stripe should be frozen.
+ *
+ * We would like to reclaim the stripe if any of the following holds:
+ * 1. it is a full stripe
+ * 2. more than 50% of the stripe cache space is used by cached stripes
+ * 3. it is occupying a large chunk of journal space
+ */
+static bool r5c_check_stripe_for_reclaim(struct stripe_head *sh,
+					 sector_t log_start)
+{
+	struct r5conf *conf = sh->raid_conf;
+	struct r5l_log *log = conf->log;
+	bool ret = false;
+
+	/* only check active stripe (STRIPE_ACTIVE) or
+	 * stripe in r5c_cached_list */
+	if (!test_bit(STRIPE_ACTIVE, &sh->state)) {
+		assert_spin_locked(&conf->device_lock);
+		WARN_ON(list_empty(&sh->r5c));
+	}
+
+	if (atomic_read(&sh->dev_in_cache) ==
+	    conf->raid_disks - conf->max_degraded) {
+		pr_debug("%s: freeze stripe for full stripe\n", __func__);
+		return true;
+	}
+
+	if (atomic_read(&conf->r5c_cached_stripes) * 2 >
+	    conf->min_nr_stripes) {
+		pr_debug("%s: freeze stripe for stripe cache\n", __func__);
+		return true;
+	}
+
+	/* TODO: do we need protection reading log->log_start? */
+	if (r5l_ring_distance(log, sh->log_start, log_start) >
+		   log->max_free_space) {
+		pr_debug("%s: freeze stripe for journal space\n", __func__);
+		ret = true;
+	}
+	return ret;
+}
+
 void r5l_stripe_write_finished(struct stripe_head *sh)
 {
+	struct r5conf *conf = sh->raid_conf;
+	struct r5l_log *log = conf->log;
 	struct r5l_io_unit *io;
+	sector_t log_start;
 
 	io = sh->log_io;
 	sh->log_io = NULL;
 
 	if (io && atomic_dec_and_test(&io->pending_stripe))
 		__r5l_stripe_write_finished(io);
+
+	mutex_lock(&log->io_mutex);
+	log_start = log->log_start;
+	mutex_unlock(&log->io_mutex);
+	if (!test_bit(STRIPE_R5C_FROZEN, &sh->state))
+		if (r5c_check_stripe_for_reclaim(sh, log_start))
+			r5c_freeze_stripe_for_reclaim(sh);
 }
 
 static void r5l_log_flush_endio(struct bio *bio)
@@ -817,6 +887,10 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
 		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
 				GFP_NOIO, 0);
 	}
+	mutex_lock(&log->io_mutex);
+	log->last_checkpoint = end;
+	log->last_cp_seq = log->next_cp_seq;
+	mutex_unlock(&log->io_mutex);
 }
 
 static void r5l_do_reclaim(struct r5l_log *log)
@@ -855,19 +929,30 @@ static void r5l_do_reclaim(struct r5l_log *log)
 	if (reclaimable == 0)
 		return;
 
-	/*
-	 * write_super will flush cache of each raid disk. We must write super
-	 * here, because the log area might be reused soon and we don't want to
-	 * confuse recovery
-	 */
-	r5l_write_super_and_discard_space(log, next_checkpoint);
+	r5l_run_no_space_stripes(log);
+}
 
-	mutex_lock(&log->io_mutex);
-	log->last_checkpoint = next_checkpoint;
-	log->last_cp_seq = next_cp_seq;
-	mutex_unlock(&log->io_mutex);
+static void r5c_update_super(struct r5conf *conf)
+{
+	struct stripe_head *sh;
+	struct r5l_log *log = conf->log;
+	sector_t end = MaxSector;
+	unsigned long flags;
 
-	r5l_run_no_space_stripes(log);
+	spin_lock_irqsave(&log->stripe_in_cache_lock, flags);
+	if (list_empty(&conf->log->stripe_in_cache)) {
+		/* all stripes flushed */
+		spin_unlock_irqrestore(&log->stripe_in_cache_lock, flags);
+		r5l_write_super_and_discard_space(log, log->next_checkpoint);
+		return;
+	}
+	sh = list_first_entry(&conf->log->stripe_in_cache,
+			      struct stripe_head, r5c);
+	end = sh->log_start;
+	spin_unlock_irqrestore(&log->stripe_in_cache_lock, flags);
+
+	if (end != log->last_checkpoint && end != MaxSector)
+		r5l_write_super_and_discard_space(log, end);
 }
 
 static void r5l_reclaim_thread(struct md_thread *thread)
@@ -878,7 +963,10 @@ static void r5l_reclaim_thread(struct md_thread *thread)
 
 	if (!log)
 		return;
+	r5c_do_reclaim(conf);
 	r5l_do_reclaim(log);
+	r5c_update_super(conf);
+	md_wakeup_thread(mddev->thread);
 }
 
 void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
@@ -913,9 +1001,10 @@ void r5l_quiesce(struct r5l_log *log, int state)
 		/* make sure r5l_write_super_and_discard_space exits */
 		mddev = log->rdev->mddev;
 		wake_up(&mddev->sb_wait);
-		r5l_wake_reclaim(log, -1L);
+		r5l_wake_reclaim(log, MaxSector);
 		md_unregister_thread(&log->reclaim_thread);
 		r5l_do_reclaim(log);
+		r5c_update_super(log->rdev->mddev->private);
 	}
 }
 
@@ -1194,6 +1283,7 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp)
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 }
 
+
 static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
 {
 	list_del_init(&sh->lru);
@@ -1326,6 +1416,7 @@ void r5c_handle_stripe_written(struct r5conf *conf,
 			       struct stripe_head *sh) {
 	int i;
 	int do_wakeup = 0;
+	unsigned long flags;
 
 	if (test_and_clear_bit(STRIPE_R5C_WRITTEN, &sh->state)) {
 		WARN_ON(!test_bit(STRIPE_R5C_FROZEN, &sh->state));
@@ -1338,6 +1429,10 @@ void r5c_handle_stripe_written(struct r5conf *conf,
 			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
 				do_wakeup = 1;
 		}
+		spin_lock_irqsave(&conf->log->stripe_in_cache_lock, flags);
+		list_del_init(&sh->r5c);
+		spin_unlock_irqrestore(&conf->log->stripe_in_cache_lock, flags);
+		sh->log_start = MaxSector;
 	}
 
 	if (do_wakeup)
@@ -1413,13 +1508,49 @@ void r5c_do_reclaim(struct r5conf *conf)
 {
 	struct stripe_head *sh, *next;
 	struct r5l_log *log = conf->log;
-
-	assert_spin_locked(&conf->device_lock);
+	int count = 0;
+	unsigned long flags;
+	bool skip_reclaim = true;
+	sector_t log_start;
 
 	if (!log)
 		return;
-	list_for_each_entry_safe(sh, next, &conf->r5c_cached_list, lru)
-		r5c_flush_stripe(conf, sh);
+	if (atomic_read(&conf->r5c_cached_stripes) +
+	    atomic_read(&conf->active_stripes) > conf->min_nr_stripes * 3 / 4)
+		skip_reclaim = false;
+	else {
+		struct list_head *l;
+
+		spin_lock_irqsave(&log->stripe_in_cache_lock, flags);
+		if (!list_empty(&log->stripe_in_cache)) {
+			l = log->stripe_in_cache.next;
+			sh = list_entry(l, struct stripe_head, r5c);
+			if (r5l_ring_distance(log, sh->log_start, log->log_start) >
+			    log->max_free_space)
+				skip_reclaim = false;
+		}
+		spin_unlock_irqrestore(&log->stripe_in_cache_lock, flags);
+	}
+	if (skip_reclaim)
+		return;
+
+	/* lock io_mutex and get log->log_start before holding device_lock*/
+	mutex_lock(&log->io_mutex);
+	log_start = log->log_start;
+	mutex_unlock(&log->io_mutex);
+
+	spin_lock_irqsave(&conf->device_lock, flags);
+	list_for_each_entry_safe(sh, next, &conf->r5c_cached_list, lru) {
+		if (r5c_check_stripe_for_reclaim(sh, log_start)) {
+			count++;
+			r5c_flush_stripe(conf, sh);
+		}
+		if (count >= R5C_RECLAIM_STRIPE_GROUP)
+			break;
+	}
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+	if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
+		wake_up(&conf->wait_for_stripe);
 }
 
 static int r5l_load_log(struct r5l_log *log)
@@ -1534,6 +1665,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 						 log->rdev->mddev, "reclaim");
 	if (!log->reclaim_thread)
 		goto reclaim_thread;
+	log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
+
 	init_waitqueue_head(&log->iounit_wait);
 
 	INIT_LIST_HEAD(&log->no_mem_stripes);
@@ -1543,6 +1676,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 
 	/* flush full stripe */
 	log->r5c_state = R5C_STATE_WRITE_BACK;
+	INIT_LIST_HEAD(&log->stripe_in_cache);
+	spin_lock_init(&log->stripe_in_cache_lock);
 
 	if (r5l_load_log(log))
 		goto error;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7956d13..af6875b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -691,6 +691,8 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
 			if (!sh) {
 				set_bit(R5_INACTIVE_BLOCKED,
 					&conf->cache_state);
+				if (conf->log)
+					r5l_wake_reclaim(conf->log, 0);
 				wait_event_lock_irq(
 					conf->wait_for_stripe,
 					!list_empty(conf->inactive_list + hash) &&
@@ -729,6 +731,15 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
 	} while (sh == NULL);
 
 	spin_unlock_irq(conf->hash_locks + hash);
+
+	if (conf->log &&
+	    (atomic_read(&conf->active_stripes) +
+	     atomic_read(&conf->r5c_cached_stripes) >
+	     conf->max_nr_stripes * 3 / 4)) {
+		set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
+		r5l_wake_reclaim(conf->log, 0);
+	}
+
 	return sh;
 }
 
@@ -2036,8 +2047,10 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
 		spin_lock_init(&sh->batch_lock);
 		INIT_LIST_HEAD(&sh->batch_list);
 		INIT_LIST_HEAD(&sh->lru);
+		INIT_LIST_HEAD(&sh->r5c);
 		atomic_set(&sh->count, 1);
 		atomic_set(&sh->dev_in_cache, 0);
+		sh->log_start = MaxSector;
 		for (i = 0; i < disks; i++) {
 			struct r5dev *dev = &sh->dev[i];
 
@@ -6029,7 +6042,6 @@ static void raid5d(struct md_thread *thread)
 			md_check_recovery(mddev);
 			spin_lock_irq(&conf->device_lock);
 		}
-		r5c_do_reclaim(conf);
 	}
 	pr_debug("%d stripes handled\n", handled);
 
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index dbc128e..901fd41 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -227,6 +227,8 @@ struct stripe_head {
 	struct r5l_io_unit	*log_io;
 	struct list_head	log_list;
 	atomic_t		dev_in_cache;
+	sector_t		log_start; /* first meta block on the journal */
+	struct list_head	r5c; /* for r5c_cache->stripe_in_cache */
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
-- 
2.8.0.rc2


^ permalink raw reply related

* [PATCH 4/5] r5cache: r5c recovery
From: Song Liu @ 2016-08-31 22:18 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb, shli, kernel-team, dan.j.williams, hch, Song Liu
In-Reply-To: <1472681902-1172317-1-git-send-email-songliubraving@fb.com>

This is the recovery part of raid5-cache.

With cache feature, there are 2 different scenarios of recovery:
1. Data-Parity stripe: a stripe with complete parity in journal.
2. Data-Only stripe: a stripe with only data in journal (or partial
   parity).

The code differentiates Data-Parity stripes from Data-Only stripes with
a flag (STRIPE_R5C_WRITTEN).

For Data-Parity stripes, we use the same procedure as raid5 journal,
where all the data and parity are replayed to the RAID devices.

For Data-Only stripes, we need to finish calculating parity and then
complete the full reconstruct write or RMW write. For simplicity, in
the recovery, we load the stripe to stripe cache. Once the array is
started, the stripe cache state machine will handle these stripes
through normal write path.

r5c_recovery_flush_log contains the main procedure of recovery. The
recovery code first scans through the journal and loads data to
stripe cache. The code keeps track of all these stripes in a list
(using sh->lru and ctx->cached_list); stripes in the list are
organized in the order of their first appearance on the journal.
During the scan, the recovery code assesses each stripe as
Data-Parity or Data-Only.

During scan, the array may run out of stripe cache. In these cases,
the recovery code tries to release some stripe head by replaying
existing Data-Parity stripes. Once these replays are done, these
stripes can be released. When releasing Data-Parity stripes is not
enough, the recovery code will also call raid5_set_cache_size to
increase stripe cache size.

At the end of scan, the recovery code replays all Data-Parity
stripes, and sets proper states for Data-Only stripes. The recovery
code also increases seq number by 10 and rewrites all Data-Only
stripes to journal. This is to avoid confusion after repeated
crashes. More details are explained in raid5-cache.c before
r5c_recovery_rewrite_data_only_stripes().

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c | 678 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 544 insertions(+), 134 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 68f1470..7214595 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2015 Shaohua Li <shli@fb.com>
+ * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -1029,10 +1030,13 @@ struct r5l_recovery_ctx {
 	sector_t meta_total_blocks;	/* total size of current meta and data */
 	sector_t pos;			/* recovery position */
 	u64 seq;			/* recovery position seq */
+	int data_parity_stripes;	/* number of data_parity stripes */
+	int data_only_stripes;		/* number of data_only stripes */
+	struct list_head cached_list;
 };
 
-static int r5l_read_meta_block(struct r5l_log *log,
-			       struct r5l_recovery_ctx *ctx)
+static int r5l_recovery_read_meta_block(struct r5l_log *log,
+					struct r5l_recovery_ctx *ctx)
 {
 	struct page *page = ctx->meta_page;
 	struct r5l_meta_block *mb;
@@ -1064,170 +1068,574 @@ static int r5l_read_meta_block(struct r5l_log *log,
 	return 0;
 }
 
-static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
-					 struct r5l_recovery_ctx *ctx,
-					 sector_t stripe_sect,
-					 int *offset, sector_t *log_offset)
+/*
+ * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite
+ * to mark valid (potentially not flushed) data in the journal.
+ *
+ * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb,
+ * so there should not be any mismatch here.
+ */
+static void r5l_recovery_load_data(struct r5l_log *log,
+				   struct stripe_head *sh,
+				   struct r5l_recovery_ctx *ctx,
+				   struct r5l_payload_data_parity *payload,
+				   sector_t log_offset)
 {
-	struct r5conf *conf = log->rdev->mddev->private;
-	struct stripe_head *sh;
-	struct r5l_payload_data_parity *payload;
+	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
 	int disk_index;
 
-	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
-	while (1) {
-		payload = page_address(ctx->meta_page) + *offset;
+	raid5_compute_sector(conf,
+			     le64_to_cpu(payload->location), 0,
+			     &disk_index, sh);
+	sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+		     sh->dev[disk_index].page, REQ_OP_READ, 0, false);
+	sh->dev[disk_index].log_checksum =
+		le32_to_cpu(payload->checksum[0]);
+	ctx->meta_total_blocks += BLOCK_SECTORS;
 
-		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
-			raid5_compute_sector(conf,
-					     le64_to_cpu(payload->location), 0,
-					     &disk_index, sh);
+	set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
+}
 
-			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
-				     sh->dev[disk_index].page, REQ_OP_READ, 0,
-				     false);
-			sh->dev[disk_index].log_checksum =
-				le32_to_cpu(payload->checksum[0]);
-			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
-			ctx->meta_total_blocks += BLOCK_SECTORS;
-		} else {
-			disk_index = sh->pd_idx;
-			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
-				     sh->dev[disk_index].page, REQ_OP_READ, 0,
-				     false);
-			sh->dev[disk_index].log_checksum =
-				le32_to_cpu(payload->checksum[0]);
-			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
-
-			if (sh->qd_idx >= 0) {
-				disk_index = sh->qd_idx;
-				sync_page_io(log->rdev,
-					     r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
-					     PAGE_SIZE, sh->dev[disk_index].page,
-					     REQ_OP_READ, 0, false);
-				sh->dev[disk_index].log_checksum =
-					le32_to_cpu(payload->checksum[1]);
-				set_bit(R5_Wantwrite,
-					&sh->dev[disk_index].flags);
-			}
-			ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
-		}
+static void r5l_recovery_load_parity(struct r5l_log *log,
+				     struct stripe_head *sh,
+				     struct r5l_recovery_ctx *ctx,
+				     struct r5l_payload_data_parity *payload,
+				     sector_t log_offset)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
 
-		*log_offset = r5l_ring_add(log, *log_offset,
-					   le32_to_cpu(payload->size));
-		*offset += sizeof(struct r5l_payload_data_parity) +
-			sizeof(__le32) *
-			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
-		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
-			break;
+	ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
+	sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+		     sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
+	sh->dev[sh->pd_idx].log_checksum =
+		le32_to_cpu(payload->checksum[0]);
+	set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
+
+	if (sh->qd_idx >= 0) {
+		sync_page_io(log->rdev,
+			     r5l_ring_add(log, log_offset, BLOCK_SECTORS),
+			     PAGE_SIZE, sh->dev[sh->qd_idx].page,
+			     REQ_OP_READ, 0, false);
+		sh->dev[sh->qd_idx].log_checksum =
+			le32_to_cpu(payload->checksum[1]);
+		set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
 	}
+	set_bit(STRIPE_R5C_WRITTEN, &sh->state);
+}
 
-	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
-		void *addr;
-		u32 checksum;
+static void r5l_recovery_reset_stripe(struct stripe_head *sh)
+{
+	int i;
+
+	sh->state = 0;
+	sh->log_start = MaxSector;
+	for (i = sh->disks; i--; )
+		sh->dev[i].flags = 0;
+}
 
+static void
+r5l_recovery_replay_one_stripe(struct r5conf *conf,
+			       struct stripe_head *sh,
+			       struct r5l_recovery_ctx *ctx)
+{
+	struct md_rdev *rdev, *rrdev;
+	int disk_index;
+	int data_count = 0;
+
+	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
 		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
 			continue;
-		addr = kmap_atomic(sh->dev[disk_index].page);
-		checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
-		kunmap_atomic(addr);
-		if (checksum != sh->dev[disk_index].log_checksum)
-			goto error;
+		if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
+			continue;
+		data_count++;
 	}
+	/* stripes that only have parity are already flushed to RAID */
+	if (data_count == 0)
+		goto out;
 
 	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
-		struct md_rdev *rdev, *rrdev;
-
-		if (!test_and_clear_bit(R5_Wantwrite,
-					&sh->dev[disk_index].flags))
+		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
 			continue;
 
 		/* in case device is broken */
 		rdev = rcu_dereference(conf->disks[disk_index].rdev);
 		if (rdev)
-			sync_page_io(rdev, stripe_sect, PAGE_SIZE,
+			sync_page_io(rdev, sh->sector, PAGE_SIZE,
 				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
 				     false);
 		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
 		if (rrdev)
-			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
+			sync_page_io(rrdev, sh->sector, PAGE_SIZE,
 				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
 				     false);
 	}
-	raid5_release_stripe(sh);
+	ctx->data_parity_stripes++;
+out:
+	r5l_recovery_reset_stripe(sh);
+}
+
+static void
+r5l_recovery_create_emtpy_meta_block(struct r5l_log *log,
+				     struct page *page,
+				     sector_t pos, u64 seq)
+{
+	struct r5l_meta_block *mb;
+	u32 crc;
+
+	mb = page_address(page);
+	clear_page(mb);
+	mb->magic = cpu_to_le32(R5LOG_MAGIC);
+	mb->version = R5LOG_VERSION;
+	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
+	mb->seq = cpu_to_le64(seq);
+	mb->position = cpu_to_le64(pos);
+	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
+	mb->checksum = cpu_to_le32(crc);
+}
+
+static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
+					  u64 seq)
+{
+	struct page *page;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+	r5l_recovery_create_emtpy_meta_block(log, page, pos, seq);
+	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
+			  WRITE_FUA, false)) {
+		__free_page(page);
+		return -EIO;
+	}
+	__free_page(page);
 	return 0;
+}
 
-error:
-	for (disk_index = 0; disk_index < sh->disks; disk_index++)
-		sh->dev[disk_index].flags = 0;
-	raid5_release_stripe(sh);
-	return -EINVAL;
+static struct stripe_head *
+r5c_recovery_alloc_stripe(struct r5conf *conf,
+			  struct list_head *recovery_list,
+			  sector_t stripe_sect,
+			  sector_t log_start)
+{
+	struct stripe_head *sh;
+
+	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0);
+	if (!sh)
+		return NULL;  /* no more stripe available */
+
+	r5l_recovery_reset_stripe(sh);
+	sh->log_start = log_start;
+
+	return sh;
 }
 
-static int r5l_recovery_flush_one_meta(struct r5l_log *log,
-				       struct r5l_recovery_ctx *ctx)
+static struct stripe_head *
+r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
 {
-	struct r5conf *conf = log->rdev->mddev->private;
+	struct stripe_head *sh;
+
+	list_for_each_entry(sh, list, lru)
+		if (sh->sector == sect)
+			return sh;
+	return NULL;
+}
+
+static void
+r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
+			    struct r5l_recovery_ctx *ctx)
+{
+	struct stripe_head *sh, *next;
+
+	list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
+		if (test_bit(STRIPE_R5C_WRITTEN, &sh->state)) {
+			r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
+			list_del_init(&sh->lru);
+			raid5_release_stripe(sh);
+		}
+}
+
+/* returns 0 for match; 1 for mismatch */
+static int
+r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page,
+				  sector_t log_offset, __le32 log_checksum)
+{
+	void *addr;
+	u32 checksum;
+
+	sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+		     page, REQ_OP_READ, 0, false);
+	addr = kmap_atomic(page);
+	checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
+	kunmap_atomic(addr);
+	return le32_to_cpu(log_checksum) != checksum;
+}
+
+/*
+ * Before loading data to stripe cache, verify the checksum of all data.
+ * If any data page mismatches, we drop all data in the meta block.
+ */
+static int
+r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
+					 struct r5l_recovery_ctx *ctx)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
+	struct r5l_meta_block *mb = page_address(ctx->meta_page);
+	sector_t mb_offset = sizeof(struct r5l_meta_block);
+	sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
+	struct page *page;
 	struct r5l_payload_data_parity *payload;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	while (mb_offset < le32_to_cpu(mb->meta_size)) {
+		payload = (void *)mb + mb_offset;
+
+		if (payload->header.type == R5LOG_PAYLOAD_DATA) {
+			if (r5l_recovery_verify_data_checksum(
+				    log, page, log_offset,
+				    payload->checksum[0]))
+				goto mismatch;
+		} else if (payload->header.type == R5LOG_PAYLOAD_PARITY) {
+			if (r5l_recovery_verify_data_checksum(
+				    log, page, log_offset,
+				    payload->checksum[0]))
+				goto mismatch;
+			if (conf->max_degraded == 2 && /* q for RAID 6 */
+			    r5l_recovery_verify_data_checksum(
+				    log, page,
+				    r5l_ring_add(log, log_offset,
+						 BLOCK_SECTORS),
+				    payload->checksum[1]))
+				goto mismatch;
+		} else
+			goto mismatch;
+
+		log_offset = r5l_ring_add(log, log_offset,
+					  le32_to_cpu(payload->size));
+
+		mb_offset += sizeof(struct r5l_payload_data_parity) +
+			sizeof(__le32) *
+			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+	}
+
+	put_page(page);
+	return 0;
+
+mismatch:
+	put_page(page);
+	return -EINVAL;
+}
+
+static int
+r5c_recovery_analyze_meta_block(struct r5l_log *log,
+				struct r5l_recovery_ctx *ctx,
+				struct list_head *cached_stripe_list)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
 	struct r5l_meta_block *mb;
-	int offset;
+	struct r5l_payload_data_parity *payload;
+	int mb_offset;
 	sector_t log_offset;
-	sector_t stripe_sector;
+	sector_t stripe_sect;
+	struct stripe_head *sh;
+	int ret;
+
+	/* for mismatch in data blocks, we will drop all data in this mb, but
+	 * we will still read next mb for other data with FLUSH flag, as
+	 * io_unit could finish out of order.
+	 */
+	ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
+	if (ret == -EINVAL)
+		return -EAGAIN;
+	else if (ret)
+		return ret;
 
 	mb = page_address(ctx->meta_page);
-	offset = sizeof(struct r5l_meta_block);
+	mb_offset = sizeof(struct r5l_meta_block);
 	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
 
-	while (offset < le32_to_cpu(mb->meta_size)) {
+	while (mb_offset < le32_to_cpu(mb->meta_size)) {
 		int dd;
 
-		payload = (void *)mb + offset;
-		stripe_sector = raid5_compute_sector(conf,
-						     le64_to_cpu(payload->location), 0, &dd, NULL);
-		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
-						  &offset, &log_offset))
+		payload = (void *)mb + mb_offset;
+		stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
+			raid5_compute_sector(
+				conf, le64_to_cpu(payload->location), 0, &dd,
+				NULL)
+			: le64_to_cpu(payload->location);
+
+		sh = r5c_recovery_lookup_stripe(cached_stripe_list,
+						stripe_sect);
+
+		if (!sh) {
+			sh = r5c_recovery_alloc_stripe(conf, cached_stripe_list,
+						       stripe_sect, ctx->pos);
+			/* cannot get stripe from raid5_get_active_stripe
+			 * try replay some stripes
+			 */
+			if (!sh) {
+				r5c_recovery_replay_stripes(
+					cached_stripe_list, ctx);
+				sh = r5c_recovery_alloc_stripe(
+					conf, cached_stripe_list,
+					stripe_sect, ctx->pos);
+			}
+			if (!sh) {
+				raid5_set_cache_size(mddev,
+						     conf->min_nr_stripes * 2);
+				sh = r5c_recovery_alloc_stripe(
+					conf, cached_stripe_list, stripe_sect,
+					ctx->pos);
+			}
+			if (!sh) {
+				pr_err("md/raid:%s: Cannot get enough stripe_cache. Recovery interrupted.\n",
+				       mdname(mddev));
+				return -ENOMEM;
+			}
+			list_add_tail(&sh->lru, cached_stripe_list);
+		}
+		if (!sh)
+			return -ENOMEM;
+
+		if (payload->header.type == R5LOG_PAYLOAD_DATA) {
+			if (test_bit(STRIPE_R5C_WRITTEN, &sh->state)) {
+				r5l_recovery_reset_stripe(sh);
+				sh->log_start = ctx->pos;
+				list_move_tail(&sh->lru, cached_stripe_list);
+			}
+			r5l_recovery_load_data(log, sh, ctx, payload,
+					       log_offset);
+		} else if (payload->header.type == R5LOG_PAYLOAD_PARITY)
+			r5l_recovery_load_parity(log, sh, ctx, payload,
+						 log_offset);
+		else
 			return -EINVAL;
+
+		log_offset = r5l_ring_add(log, log_offset,
+					  le32_to_cpu(payload->size));
+
+		mb_offset += sizeof(struct r5l_payload_data_parity) +
+			sizeof(__le32) *
+			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
 	}
+
 	return 0;
 }
 
-/* copy data/parity from log to raid disks */
-static void r5l_recovery_flush_log(struct r5l_log *log,
+/*
+ * Load the stripe into cache. The stripe will be written out later by
+ * the stripe cache state machine.
+ */
+static void r5c_recovery_load_one_stripe(struct r5l_log *log,
+					 struct stripe_head *sh)
+{
+	struct r5conf *conf = sh->raid_conf;
+	struct r5dev *dev;
+	int i;
+
+	atomic_set(&sh->dev_in_cache, 0);
+	for (i = sh->disks; i--; ) {
+		dev = sh->dev + i;
+		if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
+			set_bit(R5_InCache, &dev->flags);
+			atomic_inc(&sh->dev_in_cache);
+		}
+	}
+	set_bit(STRIPE_IN_R5C_CACHE, &sh->state);
+	atomic_inc(&conf->r5c_cached_stripes);
+	list_add_tail(&sh->r5c, &log->stripe_in_cache);
+}
+
+/*
+ * Scan through the log for all to-be-flushed data
+ *
+ * For stripes with data and parity, namely Data-Parity stripe
+ * (STRIPE_R5C_WRITTEN == 1), we simply replay all the writes.
+ *
+ * For stripes with only data, namely Data-Only stripe
+ * (STRIPE_R5C_WRITTEN == 0), we load them to stripe cache state machine.
+ *
+ * For a stripe, if we see data after parity, we should discard all previous
+ * data and parity for this stripe, as these data are already flushed to
+ * the array.
+ *
+ * At the end of the scan, we return the new journal_tail, which points to
+ * first data-only stripe on the journal device, or next invalid meta block.
+ */
+static void r5c_recovery_flush_log(struct r5l_log *log,
 				   struct r5l_recovery_ctx *ctx)
 {
+	struct stripe_head *sh, *next;
+	int ret;
+
+	/* scan through the log */
 	while (1) {
-		if (r5l_read_meta_block(log, ctx))
-			return;
-		if (r5l_recovery_flush_one_meta(log, ctx))
-			return;
+		if (r5l_recovery_read_meta_block(log, ctx))
+			break;
+
+		ret = r5c_recovery_analyze_meta_block(log, ctx,
+						      &ctx->cached_list);
+		/* -EAGAIN means mismatch in data block, in this case, we still
+		 * try scan the next metablock
+		 */
+		if (ret && ret != -EAGAIN)
+			break;
 		ctx->seq++;
 		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
 	}
+
+	/* replay data-parity stripes */
+	r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
+
+	/* load data-only stripes to stripe cache */
+	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+		WARN_ON(test_bit(STRIPE_R5C_WRITTEN, &sh->state));
+		r5c_recovery_load_one_stripe(log, sh);
+		list_del_init(&sh->lru);
+		raid5_release_stripe(sh);
+		ctx->data_only_stripes++;
+	}
+
+	return;
 }
 
-static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
-					  u64 seq)
+/*
+ * we did a recovery. Now ctx.pos points to an invalid meta block. New
+ * log will start here. but we can't let superblock point to last valid
+ * meta block. The log might look like:
+ * | meta 1| meta 2| meta 3|
+ * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If
+ * superblock points to meta 1, we write a new valid meta 2n.  if crash
+ * happens again, new recovery will start from meta 1. Since meta 2n is
+ * valid now, recovery will think meta 3 is valid, which is wrong.
+ * The solution is we create a new meta in meta2 with its seq == meta
+ * 1's seq + 10 and let superblock point to meta2. The same recovery will
+ * not think meta 3 is a valid meta, because its seq doesn't match
+ */
+
+/*
+ * Before recovery, the log looks like the following
+ *
+ *   ---------------------------------------------
+ *   |           valid log        | invalid log  |
+ *   ---------------------------------------------
+ *   ^
+ *   |- log->last_checkpoint
+ *   |- log->last_cp_seq
+ *
+ * Now we scan through the log until we see invalid entry
+ *
+ *   ---------------------------------------------
+ *   |           valid log        | invalid log  |
+ *   ---------------------------------------------
+ *   ^                            ^
+ *   |- log->last_checkpoint      |- ctx->pos
+ *   |- log->last_cp_seq          |- ctx->seq
+ *
+ * From this point, we need to increase seq number by 10 to avoid
+ * confusing next recovery.
+ *
+ *   ---------------------------------------------
+ *   |           valid log        | invalid log  |
+ *   ---------------------------------------------
+ *   ^                              ^
+ *   |- log->last_checkpoint        |- ctx->pos+1
+ *   |- log->last_cp_seq            |- ctx->seq+11
+ *
+ * However, it is not safe to start the state machine yet, because data-only
+ * stripes are not yet secured in RAID. To save these data-only stripes, we
+ * rewrite them from seq+11.
+ *
+ *   -----------------------------------------------------------------
+ *   |           valid log        | data only stripes | invalid log  |
+ *   -----------------------------------------------------------------
+ *   ^                                                ^
+ *   |- log->last_checkpoint                          |- ctx->pos+n
+ *   |- log->last_cp_seq                              |- ctx->seq+10+n
+ *
+ * If failure happens again during this process, the recovery can safely
+ * start again from log->last_checkpoint.
+ *
+ * Once data only stripes are rewritten to journal, we move log_tail
+ *
+ *   -----------------------------------------------------------------
+ *   |     old log        |    data only stripes    | invalid log  |
+ *   -----------------------------------------------------------------
+ *                        ^                         ^
+ *                        |- log->last_checkpoint   |- ctx->pos+n
+ *                        |- log->last_cp_seq       |- ctx->seq+10+n
+ *
+ * Then we can safely start the state machine. If failure happens from this
+ * point on, the recovery will start from new log->last_checkpoint.
+ */
+static int
+r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
+				       struct r5l_recovery_ctx *ctx)
 {
+	struct stripe_head *sh;
+	struct mddev *mddev = log->rdev->mddev;
 	struct page *page;
-	struct r5l_meta_block *mb;
-	u32 crc;
 
-	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-	if (!page)
+	page = alloc_page(GFP_KERNEL);
+	if (!page) {
+		pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
+		       mdname(mddev));
 		return -ENOMEM;
-	mb = page_address(page);
-	mb->magic = cpu_to_le32(R5LOG_MAGIC);
-	mb->version = R5LOG_VERSION;
-	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
-	mb->seq = cpu_to_le64(seq);
-	mb->position = cpu_to_le64(pos);
-	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
-	mb->checksum = cpu_to_le32(crc);
+	}
 
-	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
-			  WRITE_FUA, false)) {
-		__free_page(page);
-		return -EIO;
+	ctx->seq += 10;
+	list_for_each_entry(sh, &ctx->cached_list, lru) {
+		struct r5l_meta_block *mb;
+		int i;
+		int offset;
+		sector_t write_pos;
+
+		WARN_ON(test_bit(STRIPE_R5C_WRITTEN, &sh->state));
+		r5l_recovery_create_emtpy_meta_block(log, page,
+						     ctx->pos, ctx->seq);
+		mb = page_address(page);
+		offset = le32_to_cpu(mb->meta_size);
+		write_pos = ctx->pos + BLOCK_SECTORS;
+
+		for (i = sh->disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			struct r5l_payload_data_parity *payload;
+			void *addr;
+
+			if (test_bit(R5_InCache, &dev->flags)) {
+				payload = (void *)mb + offset;
+				payload->header.type = cpu_to_le16(
+					R5LOG_PAYLOAD_DATA);
+				payload->size = BLOCK_SECTORS;
+				payload->location = cpu_to_le64(
+					raid5_compute_blocknr(sh, i, 0));
+				addr = kmap_atomic(dev->page);
+				payload->checksum[0] = cpu_to_le32(
+					crc32c_le(log->uuid_checksum, addr,
+						  PAGE_SIZE));
+				kunmap_atomic(addr);
+				sync_page_io(log->rdev, write_pos, PAGE_SIZE,
+					     dev->page, REQ_OP_WRITE, 0, false);
+				write_pos = r5l_ring_add(log, write_pos,
+							 BLOCK_SECTORS);
+				offset += sizeof(__le32) +
+					sizeof(struct r5l_payload_data_parity);
+
+			}
+		}
+		mb->meta_size = cpu_to_le32(offset);
+		mb->checksum = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
+		sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
+			     REQ_OP_WRITE, WRITE_FUA, false);
+		sh->log_start = ctx->pos;
+		ctx->pos = write_pos;
+		ctx->seq += 1;
 	}
 	__free_page(page);
 	return 0;
@@ -1235,43 +1643,45 @@ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
 
 static int r5l_recovery_log(struct r5l_log *log)
 {
+	struct mddev *mddev = log->rdev->mddev;
 	struct r5l_recovery_ctx ctx;
 
 	ctx.pos = log->last_checkpoint;
 	ctx.seq = log->last_cp_seq;
 	ctx.meta_page = alloc_page(GFP_KERNEL);
+	ctx.data_only_stripes = 0;
+	ctx.data_parity_stripes = 0;
+	INIT_LIST_HEAD(&ctx.cached_list);
+
 	if (!ctx.meta_page)
 		return -ENOMEM;
 
-	r5l_recovery_flush_log(log, &ctx);
+	r5c_recovery_flush_log(log, &ctx);
+
 	__free_page(ctx.meta_page);
 
-	/*
-	 * we did a recovery. Now ctx.pos points to an invalid meta block. New
-	 * log will start here. but we can't let superblock point to last valid
-	 * meta block. The log might looks like:
-	 * | meta 1| meta 2| meta 3|
-	 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If
-	 * superblock points to meta 1, we write a new valid meta 2n.  if crash
-	 * happens again, new recovery will start from meta 1. Since meta 2n is
-	 * valid now, recovery will think meta 3 is valid, which is wrong.
-	 * The solution is we create a new meta in meta2 with its seq == meta
-	 * 1's seq + 10 and let superblock points to meta2. The same recovery will
-	 * not think meta 3 is a valid meta, because its seq doesn't match
-	 */
-	if (ctx.seq > log->last_cp_seq + 1) {
-		int ret;
-
-		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
-		if (ret)
-			return ret;
-		log->seq = ctx.seq + 11;
-		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
-		r5l_write_super(log, ctx.pos);
-	} else {
-		log->log_start = ctx.pos;
-		log->seq = ctx.seq;
+	if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
+		pr_info("md/raid:%s: starting from clean shutdown\n",
+			mdname(mddev));
+	else {
+		pr_info("md/raid:%s: recoverying %d data-only stripes and %d data-parity stripes\n",
+			mdname(mddev), ctx.data_only_stripes,
+			ctx.data_parity_stripes);
+
+		if (ctx.data_only_stripes > 0)
+			if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
+				pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
+				       mdname(mddev));
+				return -EIO;
+			}
 	}
+
+	log->log_start = ctx.pos;
+	log->next_checkpoint = ctx.pos;
+	log->seq = ctx.seq;
+	r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq);
+	r5l_write_super(log, ctx.pos);
+
 	return 0;
 }
 
-- 
2.8.0.rc2


^ permalink raw reply related

* [PATCH 5/5] r5cache: handle SYNC and FUA
From: Song Liu @ 2016-08-31 22:18 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb, shli, kernel-team, dan.j.williams, hch, Song Liu
In-Reply-To: <1472681902-1172317-1-git-send-email-songliubraving@fb.com>

With raid5 cache, we commit data from the journal device. When
there is a flush request, we need to flush the journal device's cache.
This was not needed in raid5 journal, because we flush the
journal before committing data to raid disks.

This is similar to FUA, except that we also need to flush the journal
for FUA. Otherwise, corruption in earlier metadata will stop recovery
from reaching FUA data.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c | 134 +++++++++++++++++++++++++++++++++++++++++++----
 drivers/md/raid5.c       |   8 +++
 drivers/md/raid5.h       |   1 +
 3 files changed, 133 insertions(+), 10 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 7214595..3446bcc 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -19,6 +19,7 @@
 #include <linux/raid/md_p.h>
 #include <linux/crc32c.h>
 #include <linux/random.h>
+#include <trace/events/block.h>
 #include "md.h"
 #include "raid5.h"
 
@@ -117,6 +118,9 @@ struct r5l_log {
 	struct list_head stripe_in_cache; /* all stripes in the cache, with
 					   * sh->log_start in order */
 	spinlock_t stripe_in_cache_lock;  /* lock for stripe_in_cache */
+
+	/* to submit async io_units, to fulfill ordering of flush */
+	struct work_struct deferred_io_work;
 };
 
 /*
@@ -143,6 +147,18 @@ struct r5l_io_unit {
 
 	int state;
 	bool need_split_bio;
+	struct bio *split_bio;
+
+	unsigned int has_flush:1;      /* include flush request */
+	unsigned int has_fua:1;        /* include fua request */
+	unsigned int has_null_flush:1; /* include empty flush request */
+	/*
+	 * io isn't sent yet, flush/fua request can only be submitted till it's
+	 * the first IO in running_ios list
+	 */
+	unsigned int io_deferred:1;
+
+	struct bio_list flush_barriers;   /* size == 0 flush bios */
 };
 
 /* r5l_io_unit state */
@@ -291,9 +307,11 @@ static void r5l_move_to_end_ios(struct r5l_log *log)
 	}
 }
 
+static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
 static void r5l_log_endio(struct bio *bio)
 {
 	struct r5l_io_unit *io = bio->bi_private;
+	struct r5l_io_unit *io_deferred;
 	struct r5l_log *log = io->log;
 	unsigned long flags;
 
@@ -309,18 +327,89 @@ static void r5l_log_endio(struct bio *bio)
 		r5l_move_to_end_ios(log);
 	else
 		r5l_log_run_stripes(log);
+	if (!list_empty(&log->running_ios)) {
+		/*
+		 * FLUSH/FUA io_unit is deferred because of ordering, now we
+		 * can dispatch it
+		 */
+		io_deferred = list_first_entry(&log->running_ios,
+					       struct r5l_io_unit, log_sibling);
+		if (io_deferred->io_deferred)
+			schedule_work(&log->deferred_io_work);
+	}
+
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
 
 	if (log->need_cache_flush)
 		md_wakeup_thread(log->rdev->mddev->thread);
+
+	if (io->has_null_flush) {
+		struct bio *bi;
+
+		WARN_ON(bio_list_empty(&io->flush_barriers));
+		while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
+			bio_endio(bi);
+			atomic_dec(&io->pending_stripe);
+		}
+		if (atomic_read(&io->pending_stripe) == 0)
+			__r5l_stripe_write_finished(io);
+	}
+}
+
+static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&log->io_list_lock, flags);
+	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+	spin_unlock_irqrestore(&log->io_list_lock, flags);
+
+	if (io->has_flush)
+		bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FLUSH);
+	if (io->has_fua)
+		bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FUA);
+	submit_bio(io->current_bio);
+
+	if (!io->split_bio)
+		return;
+
+	if (io->has_flush)
+		bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FLUSH);
+	if (io->has_fua)
+		bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FUA);
+	submit_bio(io->split_bio);
+}
+
+/* deferred io_unit will be dispatched here */
+static void r5l_submit_io_async(struct work_struct *work)
+{
+	struct r5l_log *log = container_of(work, struct r5l_log,
+					   deferred_io_work);
+	struct r5l_io_unit *io = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&log->io_list_lock, flags);
+	if (!list_empty(&log->running_ios)) {
+		io = list_first_entry(&log->running_ios, struct r5l_io_unit,
+				      log_sibling);
+		if (!io->io_deferred)
+			io = NULL;
+		else
+			io->io_deferred = 0;
+	}
+	spin_unlock_irqrestore(&log->io_list_lock, flags);
+	if (io)
+		r5l_do_submit_io(log, io);
 }
 
 static void r5l_submit_current_io(struct r5l_log *log)
 {
 	struct r5l_io_unit *io = log->current_io;
+	struct bio *bio;
 	struct r5l_meta_block *block;
 	unsigned long flags;
 	u32 crc;
+	bool do_submit = true;
 
 	if (!io)
 		return;
@@ -329,13 +418,20 @@ static void r5l_submit_current_io(struct r5l_log *log)
 	block->meta_size = cpu_to_le32(io->meta_offset);
 	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
 	block->checksum = cpu_to_le32(crc);
+	bio = io->current_bio;
 
 	log->current_io = NULL;
 	spin_lock_irqsave(&log->io_list_lock, flags);
-	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+	if (io->has_flush || io->has_fua) {
+		if (io != list_first_entry(&log->running_ios,
+					   struct r5l_io_unit, log_sibling)) {
+			io->io_deferred = 1;
+			do_submit = false;
+		}
+	}
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
-
-	submit_bio(io->current_bio);
+	if (do_submit)
+		r5l_do_submit_io(log, io);
 }
 
 static struct bio *r5l_bio_alloc(struct r5l_log *log)
@@ -379,6 +475,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 	io->log = log;
 	INIT_LIST_HEAD(&io->log_sibling);
 	INIT_LIST_HEAD(&io->stripe_list);
+	bio_list_init(&io->flush_barriers);
 	io->state = IO_UNIT_RUNNING;
 
 	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
@@ -449,12 +546,11 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
 	struct r5l_io_unit *io = log->current_io;
 
 	if (io->need_split_bio) {
-		struct bio *prev = io->current_bio;
-
+		BUG_ON(io->split_bio);
+		io->split_bio = io->current_bio;
 		io->current_bio = r5l_bio_alloc(log);
-		bio_chain(io->current_bio, prev);
-
-		submit_bio(prev);
+		bio_chain(io->current_bio, io->split_bio);
+		io->need_split_bio = false;
 	}
 
 	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
@@ -484,12 +580,22 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 
 	io = log->current_io;
 
+	if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
+		io->has_flush = 1;
+
 	for (i = 0; i < sh->disks; i++) {
 		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) &&
 		    !test_bit(R5_Wantcache, &sh->dev[i].flags))
 			continue;
 		if (i == sh->pd_idx || i == sh->qd_idx)
 			continue;
+		if (test_bit(R5_WantFUA, &sh->dev[i].flags)) {
+			io->has_fua = 1;
+			/* we need to flush journal to make sure recovery can
+			 * reach the data with fua flag
+			 */
+			io->has_flush = 1;
+		}
 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
 					raid5_compute_blocknr(sh, i, 0),
 					sh->dev[i].log_checksum, 0, false);
@@ -633,10 +739,16 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
 	 * don't need to flush again
 	 */
 	if (bio->bi_iter.bi_size == 0) {
-		bio_endio(bio);
+		mutex_lock(&log->io_mutex);
+		r5l_get_meta(log, 0);
+		bio_list_add(&log->current_io->flush_barriers, bio);
+		log->current_io->has_flush = 1;
+		log->current_io->has_null_flush = 1;
+		atomic_inc(&log->current_io->pending_stripe);
+		r5l_submit_current_io(log);
+		mutex_unlock(&log->io_mutex);
 		return 0;
 	}
-	bio->bi_opf &= ~REQ_PREFLUSH;
 	return -EAGAIN;
 }
 
@@ -2084,6 +2196,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	INIT_LIST_HEAD(&log->no_space_stripes);
 	spin_lock_init(&log->no_space_stripes_lock);
 
+	INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
+
 	/* flush full stripe */
 	log->r5c_state = R5C_STATE_WRITE_BACK;
 	INIT_LIST_HEAD(&log->stripe_in_cache);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index af6875b..509b1c8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5297,6 +5297,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	int remaining;
 	DEFINE_WAIT(w);
 	bool do_prepare;
+	bool do_flush = false;
 
 	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
 		int ret = r5l_handle_flush_request(conf->log, bi);
@@ -5308,6 +5309,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 			return;
 		}
 		/* ret == -EAGAIN, fallback */
+		do_flush = true;
 	}
 
 	md_write_start(mddev, bi);
@@ -5446,6 +5448,12 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 				do_prepare = true;
 				goto retry;
 			}
+			if (do_flush) {
+				set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
+				/* we only need flush for one stripe */
+				do_flush = false;
+			}
+
 			set_bit(STRIPE_HANDLE, &sh->state);
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			if ((!sh->batch_head || sh == sh->batch_head) &&
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 901fd41..e465409 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -356,6 +356,7 @@ enum {
 				 * in conf->r5c_cached_list) */
 	STRIPE_R5C_FROZEN,      /* r5c_cache frozen and being written out */
 	STRIPE_R5C_WRITTEN,	/* ready for r5c_handle_stripe_written() */
+	STRIPE_R5C_PREFLUSH,	/* need to flush journal device */
 };
 
 #define STRIPE_EXPAND_SYNC_FLAGS \
-- 
2.8.0.rc2


^ permalink raw reply related

* Re: md-cluster Module Requirement
From: Guoqing Jiang @ 2016-09-01  2:18 UTC (permalink / raw)
  To: Marc Smith, linux-raid
In-Reply-To: <CAHkw+Lf82YEZ3qm9KJ_b_2pUiNFh=nDeEBXYV5PdgtEUtij8eg@mail.gmail.com>



On 08/26/2016 10:40 AM, Marc Smith wrote:
> Hi,
>
> I'm attempting to use md-cluster from Linux 4.5.2 with mdadm 3.4 and
> I'm running into this when attempting to create a RAID1 device with
> the clustered bitmap:
>
> --snip--
> [64782.619968] md: bind<dm-4>
> [64782.629336] md: bind<dm-3>
> [64782.630008] md/raid1:md127: active with 2 out of 2 mirrors
> [64782.630528] md-cluster module not found.
> [64782.630530] md127: Could not setup cluster service (-2)
> [64782.630531] md127: bitmap file superblock:
> [64782.630532]          magic: 6d746962
> [64782.630533]        version: 5
> [64782.630534]           uuid: 10fee18f.f553d7f2.deb926f1.c7c4bd4b
> [64782.630534]         events: 0
> [64782.630535] events cleared: 0
> [64782.630536]          state: 00000000
> [64782.630537]      chunksize: 67108864 B
> [64782.630537]   daemon sleep: 5s
> [64782.630538]      sync size: 878956288 KB
> [64782.630539] max write behind: 0
> [64782.630541] md127: failed to create bitmap (-2)
> [64782.630577] md: md127 stopped.
> [64782.630581] md: unbind<dm-3>
> [64782.635133] md: export_rdev(dm-3)
> [64782.635145] md: unbind<dm-4>
> [64782.643111] md: export_rdev(dm-4)
> --snip--
>
> I'm using md-cluster built-in, not as a module:
> # zcat /proc/config.gz | grep MD_CLUSTER
> CONFIG_MD_CLUSTER=y
>
> It seems the driver is attempting to load the 'md-cluster' module
> (from drivers/md/md.c):
> --snip--
>          err = request_module("md-cluster");
>          if (err) {
>                  pr_err("md-cluster module not found.\n");
>                  return -ENOENT;
>          }
> --snip--
>
> I looked at linux-next and it appears this code is the same; is there
> a test we can do before attempting to load the module in the case that
> its built-in, or is there some other requirement that md-cluster needs
> to be built as a module?

Yes, we need some additional modules corosync/pacemaker and dlm,
pls refer to http://www.spinics.net/lists/raid/msg47863.html, HTH.

Regards,
Guoqing


^ permalink raw reply

* Re: md-cluster Module Requirement
From: NeilBrown @ 2016-09-01  5:52 UTC (permalink / raw)
  To: Guoqing Jiang, Marc Smith, linux-raid
In-Reply-To: <57C79011.9090404@suse.com>

[-- Attachment #1: Type: text/plain, Size: 3169 bytes --]

On Thu, Sep 01 2016, Guoqing Jiang wrote:

> On 08/26/2016 10:40 AM, Marc Smith wrote:
>> Hi,
>>
>> I'm attempting to use md-cluster from Linux 4.5.2 with mdadm 3.4 and
>> I'm running into this when attempting to create a RAID1 device with
>> the clustered bitmap:
>>
>> --snip--
>> [64782.619968] md: bind<dm-4>
>> [64782.629336] md: bind<dm-3>
>> [64782.630008] md/raid1:md127: active with 2 out of 2 mirrors
>> [64782.630528] md-cluster module not found.
>> [64782.630530] md127: Could not setup cluster service (-2)
>> [64782.630531] md127: bitmap file superblock:
>> [64782.630532]          magic: 6d746962
>> [64782.630533]        version: 5
>> [64782.630534]           uuid: 10fee18f.f553d7f2.deb926f1.c7c4bd4b
>> [64782.630534]         events: 0
>> [64782.630535] events cleared: 0
>> [64782.630536]          state: 00000000
>> [64782.630537]      chunksize: 67108864 B
>> [64782.630537]   daemon sleep: 5s
>> [64782.630538]      sync size: 878956288 KB
>> [64782.630539] max write behind: 0
>> [64782.630541] md127: failed to create bitmap (-2)
>> [64782.630577] md: md127 stopped.
>> [64782.630581] md: unbind<dm-3>
>> [64782.635133] md: export_rdev(dm-3)
>> [64782.635145] md: unbind<dm-4>
>> [64782.643111] md: export_rdev(dm-4)
>> --snip--
>>
>> I'm using md-cluster built-in, not as a module:
>> # zcat /proc/config.gz | grep MD_CLUSTER
>> CONFIG_MD_CLUSTER=y
>>
>> It seems the driver is attempting to load the 'md-cluster' module
>> (from drivers/md/md.c):
>> --snip--
>>          err = request_module("md-cluster");
>>          if (err) {
>>                  pr_err("md-cluster module not found.\n");
>>                  return -ENOENT;
>>          }
>> --snip--

I think this code is wrong.  It should be more like:

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d646f6e444f0..09036add7f33 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7612,15 +7612,13 @@ int md_setup_cluster(struct mddev *mddev, int nodes)
 {
 	int err;
 
-	err = request_module("md-cluster");
-	if (err) {
-		pr_err("md-cluster module not found.\n");
-		return -ENOENT;
-	}
+	if (!md_cluster_ops)
+		request_module("md-cluster");
 
 	spin_lock(&pers_lock);
 	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
 		spin_unlock(&pers_lock);
+		pr_err("md-cluster module not found.\n");
 		return -ENOENT;
 	}
 	spin_unlock(&pers_lock);

>>
>> I looked at linux-next and it appears this code is the same; is there
>> a test we can do before attempting to load the module in the case that
>> its built-in, or is there some other requirement that md-cluster needs
>> to be built as a module?
>
> Yes, we need some additional modules corosync/pacemaker and dlm,

That doesn't explain the error message though.
If MD_CLUSTER is built in, then DLM must be too.

NeilBrown


> pls refer to http://www.spinics.net/lists/raid/msg47863.html, HTH.
>
> Regards,
> Guoqing
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]

^ permalink raw reply related

* Re: md-cluster Module Requirement
From: Guoqing Jiang @ 2016-09-01  9:30 UTC (permalink / raw)
  To: NeilBrown, Marc Smith, linux-raid
In-Reply-To: <87h9a0cgkw.fsf@notabene.neil.brown.name>



On 09/01/2016 01:52 AM, NeilBrown wrote:
> On Thu, Sep 01 2016, Guoqing Jiang wrote:
>
>> On 08/26/2016 10:40 AM, Marc Smith wrote:
>>> Hi,
>>>
>>> I'm attempting to use md-cluster from Linux 4.5.2 with mdadm 3.4 and
>>> I'm running into this when attempting to create a RAID1 device with
>>> the clustered bitmap:
>>>
>>> --snip--
>>> [64782.619968] md: bind<dm-4>
>>> [64782.629336] md: bind<dm-3>
>>> [64782.630008] md/raid1:md127: active with 2 out of 2 mirrors
>>> [64782.630528] md-cluster module not found.
>>> [64782.630530] md127: Could not setup cluster service (-2)
>>> [64782.630531] md127: bitmap file superblock:
>>> [64782.630532]          magic: 6d746962
>>> [64782.630533]        version: 5
>>> [64782.630534]           uuid: 10fee18f.f553d7f2.deb926f1.c7c4bd4b
>>> [64782.630534]         events: 0
>>> [64782.630535] events cleared: 0
>>> [64782.630536]          state: 00000000
>>> [64782.630537]      chunksize: 67108864 B
>>> [64782.630537]   daemon sleep: 5s
>>> [64782.630538]      sync size: 878956288 KB
>>> [64782.630539] max write behind: 0
>>> [64782.630541] md127: failed to create bitmap (-2)
>>> [64782.630577] md: md127 stopped.
>>> [64782.630581] md: unbind<dm-3>
>>> [64782.635133] md: export_rdev(dm-3)
>>> [64782.635145] md: unbind<dm-4>
>>> [64782.643111] md: export_rdev(dm-4)
>>> --snip--
>>>
>>> I'm using md-cluster built-in, not as a module:
>>> # zcat /proc/config.gz | grep MD_CLUSTER
>>> CONFIG_MD_CLUSTER=y
>>>
>>> It seems the driver is attempting to load the 'md-cluster' module
>>> (from drivers/md/md.c):
>>> --snip--
>>>           err = request_module("md-cluster");
>>>           if (err) {
>>>                   pr_err("md-cluster module not found.\n");
>>>                   return -ENOENT;
>>>           }
>>> --snip--
> I think this code is wrong.  It should be more like:
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index d646f6e444f0..09036add7f33 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -7612,15 +7612,13 @@ int md_setup_cluster(struct mddev *mddev, int nodes)
>   {
>   	int err;
>   
> -	err = request_module("md-cluster");
> -	if (err) {
> -		pr_err("md-cluster module not found.\n");
> -		return -ENOENT;
> -	}
> +	if (!md_cluster_ops)
> +		request_module("md-cluster");
>   
>   	spin_lock(&pers_lock);
>   	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
>   		spin_unlock(&pers_lock);
> +		pr_err("md-cluster module not found.\n");
>   		return -ENOENT;
>   	}
>   	spin_unlock(&pers_lock);

Thanks, how about below changes?

--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7619,20 +7619,19 @@ EXPORT_SYMBOL(unregister_md_cluster_operations);

  int md_setup_cluster(struct mddev *mddev, int nodes)
  {
-       int err;
-
-       err = request_module("md-cluster");
-       if (err) {
-               pr_err("md-cluster module not found.\n");
-               return -ENOENT;
-       }
-
         spin_lock(&pers_lock);
-       if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
-               spin_unlock(&pers_lock);
-               return -ENOENT;
+       if (!md_cluster_ops) {
+               /* load module and ensure it won't be unloaded */
+               if (!request_module("md-cluster") &&
+                   !try_module_get(md_cluster_mod)) {
+                       pr_err("md-cluster module found.\n");
+                       spin_unlock(&pers_lock);
+               } else {
+                       pr_err("md-cluster module not found.\n");
+                       spin_unlock(&pers_lock);
+                       return -ENODEV;
+               }
         }
-       spin_unlock(&pers_lock);

         return md_cluster_ops->join(mddev, nodes);
  }

>>> I looked at linux-next and it appears this code is the same; is there
>>> a test we can do before attempting to load the module in the case that
>>> its built-in, or is there some other requirement that md-cluster needs
>>> to be built as a module?
>> Yes, we need some additional modules corosync/pacemaker and dlm,
> That doesn't explain the error message though.

You are right, I should read carefully :(

> If MD_CLUSTER is built in, then DLM must be too.

Sure, then we only need to ensure the cluster is configured correctly.

Best Regards,
Guoqing

^ permalink raw reply

* Re: md-cluster Module Requirement
From: NeilBrown @ 2016-09-01 10:51 UTC (permalink / raw)
  To: Guoqing Jiang, Marc Smith, linux-raid
In-Reply-To: <57C7F534.3040400@suse.com>

[-- Attachment #1: Type: text/plain, Size: 2098 bytes --]

On Thu, Sep 01 2016, Guoqing Jiang wrote:

>
> Thanks, how about below changes?
>
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -7619,20 +7619,19 @@ EXPORT_SYMBOL(unregister_md_cluster_operations);
>
>   int md_setup_cluster(struct mddev *mddev, int nodes)
>   {
> -       int err;
> -
> -       err = request_module("md-cluster");
> -       if (err) {
> -               pr_err("md-cluster module not found.\n");
> -               return -ENOENT;
> -       }
> -
>          spin_lock(&pers_lock);
> -       if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
> -               spin_unlock(&pers_lock);
> -               return -ENOENT;
> +       if (!md_cluster_ops) {
> +               /* load module and ensure it won't be unloaded */
> +               if (!request_module("md-cluster") &&

Calling request_module() under a spin_lock is not OK.  request_module()
needs to wait while the module is loaded.

Just call request_module, ignore the error, then take the spinlock and
see if the module is registered.

NeilBrown

> +                   !try_module_get(md_cluster_mod)) {
> +                       pr_err("md-cluster module found.\n");
> +                       spin_unlock(&pers_lock);
> +               } else {
> +                       pr_err("md-cluster module not found.\n");
> +                       spin_unlock(&pers_lock);
> +                       return -ENODEV;
> +               }
>          }
> -       spin_unlock(&pers_lock);
>
>          return md_cluster_ops->join(mddev, nodes);
>   }
>
>>>> I looked at linux-next and it appears this code is the same; is there
>>>> a test we can do before attempting to load the module in the case that
>>>> its built-in, or is there some other requirement that md-cluster needs
>>>> to be built as a module?
>>> Yes, we need some additional modules corosync/pacemaker and dlm,
>> That doesn't explain the error message though.
>
> You are right, I should read carefully :(
>
>> If MD_CLUSTER is built in, then DLM must be too.
>
> Sure,  then only need to ensure cluster is configured right.
>
> Best Regards,
> Guoqing

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]

^ permalink raw reply

* Re: md-cluster Module Requirement
From: Marc Smith @ 2016-09-01 15:45 UTC (permalink / raw)
  To: NeilBrown; +Cc: Guoqing Jiang, linux-raid
In-Reply-To: <877favdh9z.fsf@notabene.neil.brown.name>

Thanks; I assume an updated patch will ensue? If so, I'll hang tight
for that and test it once posted.

--Marc

On Thu, Sep 1, 2016 at 6:51 AM, NeilBrown <neilb@suse.com> wrote:
> On Thu, Sep 01 2016, Guoqing Jiang wrote:
>
>>
>> Thanks, how about below changes?
>>
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -7619,20 +7619,19 @@ EXPORT_SYMBOL(unregister_md_cluster_operations);
>>
>>   int md_setup_cluster(struct mddev *mddev, int nodes)
>>   {
>> -       int err;
>> -
>> -       err = request_module("md-cluster");
>> -       if (err) {
>> -               pr_err("md-cluster module not found.\n");
>> -               return -ENOENT;
>> -       }
>> -
>>          spin_lock(&pers_lock);
>> -       if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
>> -               spin_unlock(&pers_lock);
>> -               return -ENOENT;
>> +       if (!md_cluster_ops) {
>> +               /* load module and ensure it won't be unloaded */
>> +               if (!request_module("md-cluster") &&
>
> Calling request_module() under a spin_lock is not OK.  request_module()
> needs to wait while the module is loaded.
>
> Just call request_module, ignore the error, then take the spinlock and
> see if the module is registered.
>
> NeilBrown
>
>> +                   !try_module_get(md_cluster_mod)) {
>> +                       pr_err("md-cluster module found.\n");
>> +                       spin_unlock(&pers_lock);
>> +               } else {
>> +                       pr_err("md-cluster module not found.\n");
>> +                       spin_unlock(&pers_lock);
>> +                       return -ENODEV;
>> +               }
>>          }
>> -       spin_unlock(&pers_lock);
>>
>>          return md_cluster_ops->join(mddev, nodes);
>>   }
>>
>>>>> I looked at linux-next and it appears this code is the same; is there
>>>>> a test we can do before attempting to load the module in the case that
>>>>> its built-in, or is there some other requirement that md-cluster needs
>>>>> to be built as a module?
>>>> Yes, we need some additional modules corosync/pacemaker and dlm,
>>> That doesn't explain the error message though.
>>
>> You are right, I should read carefully :(
>>
>>> If MD_CLUSTER is built in, then DLM must be too.
>>
>> Sure,  then only need to ensure cluster is configured right.
>>
>> Best Regards,
>> Guoqing

^ permalink raw reply

* Re: bootsect replicated in p1, RAID enclosure suggestions?
From: Wols Lists @ 2016-09-01 17:22 UTC (permalink / raw)
  To: linux-raid
In-Reply-To: <20160823050947.GL32250@subspacefield.org>

On 23/08/16 06:09, travis+ml-linux-raid@subspacefield.org wrote:
> Hello all,
> 
> So I have an Intel NUC (for low power Linux) plugged via USB into a 4
> bay enclosure doing linear (yeah I know; it's the backup server, the
> primary is raid10).
> 
> And every once in a while, this happens (*see end).  The partition 1
> that would normally contain a MD slice ends up being a replica of the
> boot cylinder.  I can't tell if it's the mdraid linear impl, the
> kernel doing something weird, the USB drivers, the enclosure firmware,
> or what.

Interesting snippet from LWN ...

The Btrfs CRC checking means that a read from a corrupted sector will
cause an I/O error rather than return garbage. Facebook had some storage
devices that would appear to store data correctly in a set of logical
block addresses (LBAs) until the next reboot, at which point reads to
those blocks would return GUID partition table (GPT) data instead. He
did not name the device maker because it turned out to actually be a
BIOS problem. In any case, the CRCs allowed the Facebook team to quickly
figure out that the problem was not in Btrfs when it affected thousands
of machines as they were rebooted for a kernel upgrade.

The article itself is

https://lwn.net/Articles/698090/

Cheers,
Wol

^ permalink raw reply

* Re: bootsect replicated in p1, RAID enclosure suggestions?
From: Chris Murphy @ 2016-09-01 23:10 UTC (permalink / raw)
  To: Wols Lists; +Cc: Linux-RAID
In-Reply-To: <57C863D2.2010802@youngman.org.uk>

On Thu, Sep 1, 2016 at 11:22 AM, Wols Lists <antlists@youngman.org.uk> wrote:
> On 23/08/16 06:09, travis+ml-linux-raid@subspacefield.org wrote:
>> Hello all,
>>
>> So I have an Intel NUC (for low power Linux) plugged via USB into a 4
>> bay enclosure doing linear (yeah I know; it's the backup server, the
>> primary is raid10).
>>
>> And every once in a while, this happens (*see end).  The partition 1
>> that would normally contain a MD slice ends up being a replica of the
>> boot cylinder.  I can't tell if it's the mdraid linear impl, the
>> kernel doing something weird, the USB drivers, the enclosure firmware,
>> or what.
>
> Interesting snippet from LWN ...
>
> The Btrfs CRC checking means that a read from a corrupted sector will
> cause an I/O error rather than return garbage. Facebook had some storage
> devices that would appear to store data correctly in a set of logical
> block addresses (LBAs) until the next reboot, at which point reads to
> those blocks would return GUID partition table (GPT) data instead.

Wow that's right in between bizarre and hilarious. Maybe Travis should
check for firmware updates (for the computer, and the enclosure if it
offers such a thing, and maybe even the drives).

> He
> did not name the device maker because it turned out to actually be a
> BIOS problem. In any case, the CRCs allowed the Facebook team to quickly
> figure out that the problem was not in Btrfs when it affected thousands
> of machines as they were rebooted for a kernel upgrade.

Yeah even in a recent case on linux-btrfs where there's two drives
with bad sectors causing grief, the volume (somewhat surprisingly)
mounted ro,degraded and appears to be mostly recoverable, but the main
thing is that even in that case, other than nocow files, it's expected
anything that copies over (cp, rsync, btrfs send) is not corrupt. If
it were corrupt even after reconstruction from parity (even bad
parity), Btrfs will give an I/O error and not submit the data to user
space.

-- 
Chris Murphy

^ permalink raw reply

* Re: bootsect replicated in p1, RAID enclosure suggestions?
From: travis+ml-linux-raid @ 2016-09-02  2:18 UTC (permalink / raw)
  To: Chris Murphy; +Cc: Linux-RAID
In-Reply-To: <CAJCQCtQLLJEoovBtks0ft9XArHwwZTMqBGobW5WavsQydbqSBQ@mail.gmail.com>

On Thu, Aug 25, 2016 at 10:25:35PM -0600, Chris Murphy wrote:
> Well that file does seem stale, because those partitions aren't
> actually part of LVM. They're members of an mdadm array. I don't know
> where LVM comes into this because we don't have the complete layout.

md127 = /dev/sd{b,c,d,e}1
LUKS on that
PV/VG/LV on that.

/dev/sda5 is also a LUKS partition with LVM on it for root.

I wonder if it's possible that whatever restored a GPT also restored a
LVM header, and somehow that picked it up?

Anyway, after doing bitwise backups of disks, I did a create
--assume-clean with --level=raid0 and the thing seems fine.

# fsck /dev/V_hostname/L_bu
fsck from util-linux 2.20.1
e2fsck 1.42 (29-Nov-2011)
/dev/mapper/V_hostname-L_bu contains a file system with errors, check forced.
Pass 1: Checking inodes, blocks, and sizes
Pass 2: Checking directory structure
Pass 3: Checking directory connectivity
Pass 3A: Optimizing directories
Pass 4: Checking reference counts
Pass 5: Checking group summary information
Free blocks count wrong for group #67095 (65535, counted=0).
Fix<y>? yes

Free blocks count wrong (496207637, counted=496207638).
Fix<y>? yes

And that was pretty much it.
-- 
http://www.subspacefield.org/~travis/ | if spammer then john@subspacefield.org
"Computer crime, the glamor crime of the 1970s, will become in the
1980s one of the greatest sources of preventable business loss."
John M. Carroll, "Computer Security", first edition cover flap, 1977

^ permalink raw reply

* [PATCH] md-cluster: make md-cluster also can work when compiled into kernel
From: Guoqing Jiang @ 2016-09-02 10:51 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, Guoqing Jiang, stable, NeilBrown

md-cluster is compiled as a module by default.
If it is built into the kernel instead, then we
can't make md-cluster work.

[64782.630008] md/raid1:md127: active with 2 out of 2 mirrors
[64782.630528] md-cluster module not found.
[64782.630530] md127: Could not setup cluster service (-2)

Fixes: edb39c9 ("Introduce md_cluster_operations to handle cluster functions")
Cc: stable@vger.kernel.org # v4.1+
Cc: NeilBrown <neilb@suse.com>
Reported-by: Marc Smith <marc.smith@mcc.edu>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
 drivers/md/md.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index bdbbb6e1..3b19d21 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7619,20 +7619,17 @@ EXPORT_SYMBOL(unregister_md_cluster_operations);
 
 int md_setup_cluster(struct mddev *mddev, int nodes)
 {
-	int err;
-
-	err = request_module("md-cluster");
-	if (err) {
-		pr_err("md-cluster module not found.\n");
-		return -ENOENT;
-	}
-
-	spin_lock(&pers_lock);
-	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
+	if (!md_cluster_ops) {
+		/* load module and ensure it won't be unloaded */
+		request_module("md-cluster");
+		spin_lock(&pers_lock);
+		if (!try_module_get(md_cluster_mod)) {
+			pr_err("can't get md-cluster module reference.\n");
+			spin_unlock(&pers_lock);
+			return -ENODEV;
+		}
 		spin_unlock(&pers_lock);
-		return -ENOENT;
 	}
-	spin_unlock(&pers_lock);
 
 	return md_cluster_ops->join(mddev, nodes);
 }
-- 
2.6.6


^ permalink raw reply related

* Re: [PATCH] md-cluster: make md-cluster also can work when compiled into kernel
From: NeilBrown @ 2016-09-02 12:56 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, Guoqing Jiang
In-Reply-To: <1472813500-4921-1-git-send-email-gqjiang@suse.com>

[-- Attachment #1: Type: text/plain, Size: 1877 bytes --]

On Fri, Sep 02 2016, Guoqing Jiang wrote:

> The md-cluster is compiled as module by default,
> if it is compiled by built-in way, then we can't
> make md-cluster works.
>
> [64782.630008] md/raid1:md127: active with 2 out of 2 mirrors
> [64782.630528] md-cluster module not found.
> [64782.630530] md127: Could not setup cluster service (-2)
>
> Fixes: edb39c9 ("Introduce md_cluster_operations to handle cluster functions")
> Cc: stable@vger.kernel.org # v4.1+
> Cc: NeilBrown <neilb@suse.com>
> Reported-by: Marc Smith <marc.smith@mcc.edu>
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> ---
>  drivers/md/md.c | 21 +++++++++------------
>  1 file changed, 9 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index bdbbb6e1..3b19d21 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -7619,20 +7619,17 @@ EXPORT_SYMBOL(unregister_md_cluster_operations);
>  
>  int md_setup_cluster(struct mddev *mddev, int nodes)
>  {
> -	int err;
> -
> -	err = request_module("md-cluster");
> -	if (err) {
> -		pr_err("md-cluster module not found.\n");
> -		return -ENOENT;
> -	}
> -
> -	spin_lock(&pers_lock);
> -	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
> +	if (!md_cluster_ops) {
> +		/* load module and ensure it won't be unloaded */
> +		request_module("md-cluster");
> +		spin_lock(&pers_lock);
> +		if (!try_module_get(md_cluster_mod)) {
> +			pr_err("can't get md-cluster module reference.\n");
> +			spin_unlock(&pers_lock);
> +			return -ENODEV;
> +		}
>  		spin_unlock(&pers_lock);
> -		return -ENOENT;
>  	}
> -	spin_unlock(&pers_lock);
>  
>  	return md_cluster_ops->join(mddev, nodes);
>  }
> -- 
> 2.6.6

No good.  If md_cluster_ops is set, try_module_get() won't be called, so
it will be possible to unload the module while it is in use.

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]

^ permalink raw reply

* [PATCH V2] md-cluster: make md-cluster also can work when compiled into kernel
From: Guoqing Jiang @ 2016-09-03 14:03 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, Guoqing Jiang, stable, NeilBrown
In-Reply-To: <1472813500-4921-1-git-send-email-gqjiang@suse.com>

md-cluster is compiled as a module by default.
If it is built into the kernel instead, then we
can't make md-cluster work.

[64782.630008] md/raid1:md127: active with 2 out of 2 mirrors
[64782.630528] md-cluster module not found.
[64782.630530] md127: Could not setup cluster service (-2)

Fixes: edb39c9 ("Introduce md_cluster_operations to handle cluster functions")
Cc: stable@vger.kernel.org # v4.1+
Cc: NeilBrown <neilb@suse.com>
Reported-by: Marc Smith <marc.smith@mcc.edu>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
Changes:
1. call try_module_get if md_cluster_ops is already set,
otherwise try_module_get/module_put are unbalanced.

 drivers/md/md.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 67642ba..6ac5abe 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7610,16 +7610,12 @@ EXPORT_SYMBOL(unregister_md_cluster_operations);
 
 int md_setup_cluster(struct mddev *mddev, int nodes)
 {
-	int err;
-
-	err = request_module("md-cluster");
-	if (err) {
-		pr_err("md-cluster module not found.\n");
-		return -ENOENT;
-	}
-
+	if (!md_cluster_ops)
+		request_module("md-cluster");
 	spin_lock(&pers_lock);
-	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
+	/* ensure module won't be unloaded */
+	if (!try_module_get(md_cluster_mod)) {
+		pr_err("can't get md-cluster module reference.\n");
 		spin_unlock(&pers_lock);
 		return -ENOENT;
 	}
-- 
2.6.6


^ permalink raw reply related

* 澳门金沙集团，电子游艺等您来战，谁是真正的打“虎”英雄！(AD)
From: nidingshi @ 2016-09-04  0:22 UTC (permalink / raw)
  To: linux-raid

Du har fått denne meldingen fordi "nidingshi <maxautom@tf1.abstractdns.com>" tror siden "– Sjefsrollen forsvinner i fremtiden" på www.pressenytt.no vil være interessant for deg.

Dette er linken til siden:
http://www.pressenytt.no/nor/Artikler/Jobb-og-utdanning/Sjefsrollen-forsvinner-i-fremtiden

Kommentar fra "nidingshi <maxautom@tf1.abstractdns.com>":
★2016澳门金沙集团全新改版，手机畅玩：【BB、MG、GNS】电子游艺、【BB、AG、沙龙、欧博、OG、GD】视讯、彩票投注。【微信、支付宝】二维码扫一扫秒到，最低存款10元，入款即送1.2%，品牌信誉，大额无忧！ 
五大联赛来了，新增【SBT体育】。联合【bb体育、体育投注】，为您带来更高的投注体验。
电子游艺周投注5000洗码量以上，即可获得高达1288元奖金回馈，VIP开户网址：http://www.855369.com/?linux-raid@vger.kernel.org 


------------------------------------------
                    生命对某些人来说是美丽的，这些人的一生都为某个奋斗。


^ permalink raw reply

* RAID6 - CPU At 100% Usage After Reassembly
From: Francisco Parada @ 2016-09-04  2:56 UTC (permalink / raw)
  To: linux-raid

Hello everyone,

I know this gets a ton of visibility, so I'll keep it as concise as possible.

I'm running Ubuntu 16.04.1 and I have (read had) a 7 drive RAID6
array.  I attempted to grow the array by adding 3 additional drives
for a total of 10, but it seems that one of the brand new drives had
60+ bad blocks (according to "badblocks -vw").  I came to this
conclusion, because I had a power outage during the grow that lasted
longer than my 1500VA battery backup could withstand, so when I
attempted to continue the reshape, I noticed that the assemble
wouldn't start upon reboot.  All drives were marked as spares:

=================================================================================================================
# cat /proc/mdstat
Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5]
[raid4] [raid10]
md127 : inactive sdi[0](S) sdh[2](S) sdj[3](S) sdf[7](S) sdg[9](S)
sdd[10](S) sde[11](S) sdb[13](S) sdc[12](S)
      26371219608 blocks super 1.2
=================================================================================================================

Notice above, that there's only 9 drives instead of 10, which I was
supposed to have.  The drive that's missing is "sdk", but that's
because using "badblocks -vw" has wiped out the drive in an effort to
figure out if there was actually something wrong with said drive
(You're probably gasping, but it had a missing GPT table, and no
matter what I tried to recover it, the drive would just stop
responding to reads and writes).  So I attempted to assemble the array
with "/dev/sdk" missing as shown below, but I get this:

===================================================================================================================
# mdadm -Afv /dev/md127 /dev/sdb /dev/sdc /dev/sdd /dev/sde /dev/sdf
/dev/sdg /dev/sdh /dev/sdi /dev/sdj missing
mdadm: looking for devices for /dev/md127
mdadm: cannot open device missing: No such file or directory
mdadm: missing has no superblock - assembly aborted
===================================================================================================================

But I guess that doesn't matter, because almost all other drives are
almost sync'ed as specified in the events output of mdadm (once again,
keep in mind that "/dev/sdk" is blank, thus the "no md superblock"
error):

==============================================
# mdadm -E /dev/sd[b-k] | grep Events
         Events : 280026
         Events : 280026
         Events : 280026
         Events : 280026
         Events : 280026
         Events : 280026
         Events : 280026
mdadm: No md superblock detected on /dev/sdk.
         Events : 280026
         Events : 280011
==============================================

So I attempt to reassemble it, by leaving out "/dev/sdk" and it seems
to assemble it, with some warnings of course:

===========================================================================================================
# mdadm -Afv /dev/md127 /dev/sdb /dev/sdc /dev/sdd /dev/sde /dev/sdf
/dev/sdg /dev/sdh /dev/sdi /dev/sdj
mdadm: looking for devices for /dev/md127
mdadm: /dev/sdb is identified as a member of /dev/md127, slot 7.
mdadm: /dev/sdc is identified as a member of /dev/md127, slot 8.
mdadm: /dev/sdd is identified as a member of /dev/md127, slot 6.
mdadm: /dev/sde is identified as a member of /dev/md127, slot 9.
mdadm: /dev/sdf is identified as a member of /dev/md127, slot 4.
mdadm: /dev/sdg is identified as a member of /dev/md127, slot 1.
mdadm: /dev/sdh is identified as a member of /dev/md127, slot 2.
mdadm: /dev/sdi is identified as a member of /dev/md127, slot 0.
mdadm: /dev/sdj is identified as a member of /dev/md127, slot 3.
mdadm: :/dev/md127 has an active reshape - checking if critical
section needs to be restored
mdadm: No backup metadata on device-7
mdadm: No backup metadata on device-8
mdadm: No backup metadata on device-9
mdadm: added /dev/sdg to /dev/md127 as 1
mdadm: added /dev/sdh to /dev/md127 as 2
mdadm: added /dev/sdj to /dev/md127 as 3 (possibly out of date)
mdadm: added /dev/sdf to /dev/md127 as 4
mdadm: no uptodate device for slot 10 of /dev/md127
mdadm: added /dev/sdd to /dev/md127 as 6
mdadm: added /dev/sdb to /dev/md127 as 7
mdadm: added /dev/sdc to /dev/md127 as 8
mdadm: added /dev/sde to /dev/md127 as 9
mdadm: added /dev/sdi to /dev/md127 as 0
mdadm: /dev/md127 has been started with 8 drives (out of 10).
===========================================================================================================

But now the reshape goes from 80000K to 1000K and eventually 0K speed
shortly after hitting "enter" to reassemble:

===========================================================================================================
# cat /proc/mdstat
Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5]
[raid4] [raid10]
md127 : active raid6 sdi[0] sde[11] sdc[12] sdb[13] sdd[10] sdf[7] sdh[2] sdg[9]
      14650675200 blocks super 1.2 level 6, 512k chunk, algorithm 2
[10/8] [UUU_U_UUUU]
      [=======>.............]  reshape = 39.1% (1146348628/2930135040)
finish=51538126.9min speed=0K/sec
      bitmap: 0/22 pages [0KB], 65536KB chunk

unused devices: <none>
===========================================================================================================

So I did a little probing and it seems that my CPU is running at 100%
by "md127_raid6".  I should note that it has been this way for over a
week now, the time doesn't reflect it because I had to perform a
reboot.  So I'm at a loss, because even if I try to optimize reshape
speeds, the reshape still remains at 0K/sec.

=================================================================================
top - 22:28:53 up  1:56,  3 users,  load average: 3.05, 2.04, 0.92
Tasks: 317 total,   4 running, 313 sleeping,   0 stopped,   0 zombie
%Cpu(s):  4.4 us, 50.5 sy,  0.0 ni, 44.1 id,  1.0 wa,  0.0 hi,  0.0 si,  0.0 st
KiB Mem :  1521584 total,   220812 free,   774868 used,   525904 buff/cache
KiB Swap: 25153532 total, 25000764 free,   152768 used.   477708 avail Mem

  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+
COMMAND
  435 root      20   0       0      0      0 R  98.0  0.0   5:27.12
md127_raid6
28941 cisco     20   0  546436  34336  25080 R   2.9  2.3   0:18.32
gnome-disks
 3557 message+  20   0   44364   4632   3068 S   2.0  0.3   0:06.53
dbus-daemon
=================================================================================

Any ideas?  Your help would be greatly appreciated.

Thanks in advance

^ permalink raw reply

* RAID6 - CPU At 100% Usage After Reassembly
From: Francisco Parada @ 2016-09-04  4:04 UTC (permalink / raw)
  To: linux-raid

Hello everyone,

I know this gets a ton of visibility, so I'll keep it as concise as possible.

I'm running Ubuntu 16.04.1 and I have (read had) a 7 drive RAID6
array.  I attempted to grow the array by adding 3 additional drives
for a total of 10, but it seems that one of the brand new drives had
60+ bad blocks (according to "badblocks -vw").  I came to this
conclusion, because I had a power outage during the grow that lasted
longer than my 1500VA battery backup could withstand, so when I
attempted to continue the reshape, I noticed that the assemble
wouldn't start upon reboot.  All drives were marked as spares:

=================================================================================================================
# cat /proc/mdstat
Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5]
[raid4] [raid10]
md127 : inactive sdi[0](S) sdh[2](S) sdj[3](S) sdf[7](S) sdg[9](S)
sdd[10](S) sde[11](S) sdb[13](S) sdc[12](S)
      26371219608 blocks super 1.2
=================================================================================================================

Notice above, that there's only 9 drives instead of 10, which I was
supposed to have.  The drive that's missing is "sdk", but that's
because using "badblocks -vw" has wiped out the drive in an effort to
figure out if there was actually something wrong with said drive
(You're probably gasping, but it had a missing GPT table, and no
matter what I tried to recover it, the drive would just stop
responding to reads and writes).  So I attempted to assemble the array
with "/dev/sdk" missing as shown below, but I get this:

===================================================================================================================
# mdadm -Afv /dev/md127 /dev/sdb /dev/sdc /dev/sdd /dev/sde /dev/sdf
/dev/sdg /dev/sdh /dev/sdi /dev/sdj missing
mdadm: looking for devices for /dev/md127
mdadm: cannot open device missing: No such file or directory
mdadm: missing has no superblock - assembly aborted
===================================================================================================================

But I guess that doesn't matter, because almost all other drives are
almost sync'ed as specified in the events output of mdadm (once again,
keep in mind that "/dev/sdk" is blank, thus the "no md superblock"
error):

==============================================
# mdadm -E /dev/sd[b-k] | grep Events
         Events : 280026
         Events : 280026
         Events : 280026
         Events : 280026
         Events : 280026
         Events : 280026
         Events : 280026
mdadm: No md superblock detected on /dev/sdk.
         Events : 280026
         Events : 280011
==============================================

So I attempt to reassemble it, by leaving out "/dev/sdk" and it seems
to assemble it, with some warnings of course:

===========================================================================================================
# mdadm -Afv /dev/md127 /dev/sdb /dev/sdc /dev/sdd /dev/sde /dev/sdf
/dev/sdg /dev/sdh /dev/sdi /dev/sdj
mdadm: looking for devices for /dev/md127
mdadm: /dev/sdb is identified as a member of /dev/md127, slot 7.
mdadm: /dev/sdc is identified as a member of /dev/md127, slot 8.
mdadm: /dev/sdd is identified as a member of /dev/md127, slot 6.
mdadm: /dev/sde is identified as a member of /dev/md127, slot 9.
mdadm: /dev/sdf is identified as a member of /dev/md127, slot 4.
mdadm: /dev/sdg is identified as a member of /dev/md127, slot 1.
mdadm: /dev/sdh is identified as a member of /dev/md127, slot 2.
mdadm: /dev/sdi is identified as a member of /dev/md127, slot 0.
mdadm: /dev/sdj is identified as a member of /dev/md127, slot 3.
mdadm: :/dev/md127 has an active reshape - checking if critical
section needs to be restored
mdadm: No backup metadata on device-7
mdadm: No backup metadata on device-8
mdadm: No backup metadata on device-9
mdadm: added /dev/sdg to /dev/md127 as 1
mdadm: added /dev/sdh to /dev/md127 as 2
mdadm: added /dev/sdj to /dev/md127 as 3 (possibly out of date)
mdadm: added /dev/sdf to /dev/md127 as 4
mdadm: no uptodate device for slot 10 of /dev/md127
mdadm: added /dev/sdd to /dev/md127 as 6
mdadm: added /dev/sdb to /dev/md127 as 7
mdadm: added /dev/sdc to /dev/md127 as 8
mdadm: added /dev/sde to /dev/md127 as 9
mdadm: added /dev/sdi to /dev/md127 as 0
mdadm: /dev/md127 has been started with 8 drives (out of 10).
===========================================================================================================

But now the reshape goes from 80000K to 1000K and eventually 0K speed
shortly after hitting "enter" to reassemble:

===========================================================================================================
# cat /proc/mdstat
Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5]
[raid4] [raid10]
md127 : active raid6 sdi[0] sde[11] sdc[12] sdb[13] sdd[10] sdf[7] sdh[2] sdg[9]
      14650675200 blocks super 1.2 level 6, 512k chunk, algorithm 2
[10/8] [UUU_U_UUUU]
      [=======>.............]  reshape = 39.1% (1146348628/2930135040)
finish=51538126.9min speed=0K/sec
      bitmap: 0/22 pages [0KB], 65536KB chunk

unused devices: <none>
===========================================================================================================

So I did a little probing and it seems that my CPU is running at 100%
by "md127_raid6".  I should note that it has been this way for over a
week now, the time doesn't reflect it because I had to perform a
reboot.  So I'm at a loss, because even if I try to optimize reshape
speeds, the reshape still remains at 0K/sec.

=================================================================================
top - 22:28:53 up  1:56,  3 users,  load average: 3.05, 2.04, 0.92
Tasks: 317 total,   4 running, 313 sleeping,   0 stopped,   0 zombie
%Cpu(s):  4.4 us, 50.5 sy,  0.0 ni, 44.1 id,  1.0 wa,  0.0 hi,  0.0 si,  0.0 st
KiB Mem :  1521584 total,   220812 free,   774868 used,   525904 buff/cache
KiB Swap: 25153532 total, 25000764 free,   152768 used.   477708 avail Mem

  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+   COMMAND
  435 root      20   0       0      0      0 R  98.0  0.0   5:27.12
md127_raid6
28941 cisco     20   0  546436  34336  25080 R   2.9  2.3   0:18.32
gnome-disks
 3557 message+  20   0   44364   4632   3068 S   2.0  0.3   0:06.53
dbus-daemon
=================================================================================

Any ideas?  Your help would be greatly appreciated.

Thanks in advance

^ permalink raw reply

* Raid failure- Please help - Linux-Raid noob
From: Norman @ 2016-09-04  4:15 UTC (permalink / raw)
  To: linux-raid

Hi,

I'm familiar with basic RAID technology but I'm not that familiar with 
software RAID on Linux. My system admin is not available as the RAID 5 
failure happened early Friday evening. I'd like to recover the RAID 
since it seems the hard-disks are still operational, at least they seem 
to be.

The scenario: 1U server with (4) 2TB drives in RAID5 and RAID1 
configuration. The system boots to GRUB but root partition doesn't 
mount. I get a small shell called BusyBox. I can run mdadm from it.
Running mdadm --examine manually, it seems that sda1, sda3, sdb1, sdb3, 
sdc1, sdc3, sdd1, and sdd3 are not active and are missing the 
super-block. All other devices sdx2-sdx9 are active. Cat /proc/mdstat 
shows...

md5 :active (auto-read-only) raid1 sdb8[4] sdc8[5], 204788 blocks super 
1.2 [4/2] [_UU_]

md4 :inactive sdb7[4] sdc7[5], 419225 blocks super 1.2

md3 :active (auto-read-only) raid5 sda6[0] sdd6[3] sdc6[5], 6286848 
blocks super 1.2 level 5, 512k chunk, algorithm 2 [4/4] [UUUU]

md2 :inactive sdb5[4] sdc5[5], 2096128 blocks super 1.2

md1 :inactive sdb4[4] sdc4[5], 2096128 blocks super 1.2

md0 :active (auto-read-only) raid1 sda2[0] sdd2[3] sdc2[5] sdb2[4], 
102388 blocks super 1.2 [4/4] [UUUU]

unused devices: <none>

I hope there aren't any typos as I typed the above manually.

Lastly, one thing to note is that I get an Alert! - Alert! 
/dev/disk/by-uuid/c6e309d7-07ce-42e2-9c62-fb53be4b99cc does not exist. I 
don't see this device at all; I see 4 drives with different UUIDs.

I'm not sure what to do or what direction I should go in.

Many thanks for any help.

N

^ permalink raw reply

* Re: RAID6 - CPU At 100% Usage After Reassembly
From: Michael J. Shaver @ 2016-09-04 14:38 UTC (permalink / raw)
  To: mdraid
In-Reply-To: <CAOW94ut0uHS8+k0jutOT9bfT__WLbbOJmQC2CFa15HVVe9L4fg@mail.gmail.com>

Hello Francisco,

You are almost certainly hitting the same issue reported several times
both here and on other forums, although your case is the first one I
have seen for raid6:

http://www.spinics.net/lists/raid/msg53056.html
http://www.spinics.net/lists/raid/msg52235.html
https://bbs.archlinux.org/viewtopic.php?id=212108
https://forums.gentoo.org/viewtopic-t-1043706.html

At this time, there have been a couple suggestions on possible fixes
(disable transparent huge page support in the kernel)

Another gentleman, Bart Van Assche, had suggested a set of patches to
the kernel scheduler that may help with the problem:

https://lkml.org/lkml/2016/8/3/289

I am still trying to wrap my head around the patches themselves, and
haven't tried each of the patches individually. Disabling transparent
huge page support had no effect for me. At this time, my array is
still locked up with the exact same symptoms you report. I am slowly
learning about the spin lock
mechanism within the kernel to try to identify the underlying problem,
but this is admittedly out of my area of expertise.

To help correlate your problem with what others observed, would it be
possible for you to share the call stack for the following three
processes?

mdXXX_raid6
mdXXX_reshape
systemd-udevd

Or any other processes reporting deadlock while the reshape is trying to run.

Curious to see if you observe the same call stack.

I will definitely let you know if I have any major revelations. Thanks, Michael

On Sun, Sep 4, 2016 at 12:04 AM, Francisco Parada
<advanceandconquer@gmail.com> wrote:
> Hello everyone,
>
> I know this gets a ton of visibility, so I'll keep it as concise as possible.
>
> I'm running Ubuntu 16.04.1 and I have (read had) a 7 drive RAID6
> array.  I attempted to grow the array by adding 3 additional drives
> for a total of 10, but it seems that one of the brand new drives had
> 60+ bad blocks (according to "badblocks -vw").  I came to this
> conclusion, because I had a power outage during the grow that lasted
> longer than my 1500VA battery backup could withstand, so when I
> attempted to continue the reshape, I noticed that the assemble
> wouldn't start upon reboot.  All drives were marked as spares:
>
> =================================================================================================================
> # cat /proc/mdstat
> Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5]
> [raid4] [raid10]
> md127 : inactive sdi[0](S) sdh[2](S) sdj[3](S) sdf[7](S) sdg[9](S)
> sdd[10](S) sde[11](S) sdb[13](S) sdc[12](S)
>       26371219608 blocks super 1.2
> =================================================================================================================
>
>
> Notice above, that there's only 9 drives instead of 10, which I was
> supposed to have.  The drive that's missing is "sdk", but that's
> because using "badblocks -vw" has wiped out the drive in an effort to
> figure out if there was actually something wrong with said drive
> (You're probably gasping, but it had a missing GPT table, and no
> matter what I tried to recover it, the drive would just stop
> responding to reads and writes).  So I attempted to assemble the array
> with "/dev/sdk" missing as shown below, but I get this:
>
> ===================================================================================================================
> # mdadm -Afv /dev/md127 /dev/sdb /dev/sdc /dev/sdd /dev/sde /dev/sdf
> /dev/sdg /dev/sdh /dev/sdi /dev/sdj missing
> mdadm: looking for devices for /dev/md127
> mdadm: cannot open device missing: No such file or directory
> mdadm: missing has no superblock - assembly aborted
> ===================================================================================================================
>
>
> But I guess that doesn't matter, because almost all other drives are
> almost sync'ed as specified in the events output of mdadm (once again,
> keep in mind that "/dev/sdk" is blank, thus the "no md superblock"
> error):
>
> ==============================================
> # mdadm -E /dev/sd[b-k] | grep Events
>          Events : 280026
>          Events : 280026
>          Events : 280026
>          Events : 280026
>          Events : 280026
>          Events : 280026
>          Events : 280026
> mdadm: No md superblock detected on /dev/sdk.
>          Events : 280026
>          Events : 280011
> ==============================================
>
>
> So I attempt to reassemble it, by leaving out "/dev/sdk" and it seems
> to assemble it, with some warnings of course:
>
> ===========================================================================================================
> # mdadm -Afv /dev/md127 /dev/sdb /dev/sdc /dev/sdd /dev/sde /dev/sdf
> /dev/sdg /dev/sdh /dev/sdi /dev/sdj
> mdadm: looking for devices for /dev/md127
> mdadm: /dev/sdb is identified as a member of /dev/md127, slot 7.
> mdadm: /dev/sdc is identified as a member of /dev/md127, slot 8.
> mdadm: /dev/sdd is identified as a member of /dev/md127, slot 6.
> mdadm: /dev/sde is identified as a member of /dev/md127, slot 9.
> mdadm: /dev/sdf is identified as a member of /dev/md127, slot 4.
> mdadm: /dev/sdg is identified as a member of /dev/md127, slot 1.
> mdadm: /dev/sdh is identified as a member of /dev/md127, slot 2.
> mdadm: /dev/sdi is identified as a member of /dev/md127, slot 0.
> mdadm: /dev/sdj is identified as a member of /dev/md127, slot 3.
> mdadm: :/dev/md127 has an active reshape - checking if critical
> section needs to be restored
> mdadm: No backup metadata on device-7
> mdadm: No backup metadata on device-8
> mdadm: No backup metadata on device-9
> mdadm: added /dev/sdg to /dev/md127 as 1
> mdadm: added /dev/sdh to /dev/md127 as 2
> mdadm: added /dev/sdj to /dev/md127 as 3 (possibly out of date)
> mdadm: added /dev/sdf to /dev/md127 as 4
> mdadm: no uptodate device for slot 10 of /dev/md127
> mdadm: added /dev/sdd to /dev/md127 as 6
> mdadm: added /dev/sdb to /dev/md127 as 7
> mdadm: added /dev/sdc to /dev/md127 as 8
> mdadm: added /dev/sde to /dev/md127 as 9
> mdadm: added /dev/sdi to /dev/md127 as 0
> mdadm: /dev/md127 has been started with 8 drives (out of 10).
> ===========================================================================================================
>
>
> But now the reshape goes from 80000K to 1000K and eventually 0K speed
> shortly after hitting "enter" to reassemble:
>
> ===========================================================================================================
> # cat /proc/mdstat
> Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5]
> [raid4] [raid10]
> md127 : active raid6 sdi[0] sde[11] sdc[12] sdb[13] sdd[10] sdf[7] sdh[2] sdg[9]
>       14650675200 blocks super 1.2 level 6, 512k chunk, algorithm 2
> [10/8] [UUU_U_UUUU]
>       [=======>.............]  reshape = 39.1% (1146348628/2930135040)
> finish=51538126.9min speed=0K/sec
>       bitmap: 0/22 pages [0KB], 65536KB chunk
>
> unused devices: <none>
> ===========================================================================================================
>
>
> So I did a little probing and it seems that my CPU is running at 100%
> by "md127_raid6".  I should note that it has been this way for over a
> week now, the time doesn't reflect it because I had to perform a
> reboot.  So I'm at a loss, because even if I try to optimize reshape
> speeds, the reshape still remains at 0K/sec.
>
> =================================================================================
> top - 22:28:53 up  1:56,  3 users,  load average: 3.05, 2.04, 0.92
> Tasks: 317 total,   4 running, 313 sleeping,   0 stopped,   0 zombie
> %Cpu(s):  4.4 us, 50.5 sy,  0.0 ni, 44.1 id,  1.0 wa,  0.0 hi,  0.0 si,  0.0 st
> KiB Mem :  1521584 total,   220812 free,   774868 used,   525904 buff/cache
> KiB Swap: 25153532 total, 25000764 free,   152768 used.   477708 avail Mem
>
>   PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+   COMMAND
>   435 root      20   0       0      0      0 R  98.0  0.0   5:27.12
> md127_raid6
> 28941 cisco     20   0  546436  34336  25080 R   2.9  2.3   0:18.32
> gnome-disks
>  3557 message+  20   0   44364   4632   3068 S   2.0  0.3   0:06.53
> dbus-daemon
> =================================================================================
>
> Any ideas?  Your help would be greatly appreciated.
>
> Thanks in advance
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: RAID6 - CPU At 100% Usage After Reassembly
From: Francisco Parada @ 2016-09-04 22:48 UTC (permalink / raw)
  To: Michael J. Shaver; +Cc: mdraid
In-Reply-To: <CAOW94uv4zSGs+6be3zhcQaGZdiAgg-s4ZHZ=mszcURo6pqJyqA@mail.gmail.com>

> You are almost certainly hitting the same issue reported several times
> both here and on other forums, although your case is the first one I
> have seen for raid6:


What a bummer, was hoping I was just being dumb and missing something
(here's to hoping though). Luckily, my real important data is backed up.
Just trying to see if I can still recover everything else.



> At this time, there have been a couple suggestions on possible fixes
> (disable transparent huge page support in the kernel)


OK, I can try giving that a shot.



> Another gentleman, Bart Van Assche, had suggested a set of patches to
> the kernel scheduler that may help with the problem:
>
> https://lkml.org/lkml/2016/8/3/289


I'll read into this, thank you!



> At this time, my array is still locked up with the exact same symptoms you
> report.


Hopefully we can all work to figure this one out.


> To help correlate your problem with what others observed, would it be
> possible for you to share the call stack for the following three
> processes?
>
> mdXXX_raid6
> mdXXX_reshape
> systemd-udevd
>
> Or any other processes reporting deadlock while the reshape is trying to
> run.
>
> Curious to see if you observe the same call stack.


Would that be using "strace", "ptrace", or both? Pardon my ignorance, I've
never used them. I'm pretty sure it's pstack, but want to make completely
sure!


> I will definitely let you know if I have any major revelations. thanks
> Michael


Thank you kindly, Michael.  I appreciate your input.

On Sun, Sep 4, 2016 at 11:41 AM, Francisco Parada
<advanceandconquer@gmail.com> wrote:
>
>> You are almost certainly hitting the same issue reported several times
>> both here and on other forums, although your case is the first one I
>> have seen for raid6:
>
>
> What a bummer, was hoping I was just being dumb and missing something
> (here's to hoping though). Luckily, my real important data is backed up.
> Just trying to see if I can still recover everything else.
>
>
>>
>> At this time, there have been a couple suggestions on possible fixes
>> (disable transparent huge page support in the kernel)
>
>
> OK, I can try giving that a shot.
>
>
>>
>> Another gentleman, Bart Van Assche, had suggested a set of patches to
>> the kernel scheduler that may help with the problem:
>>
>> https://lkml.org/lkml/2016/8/3/289
>
>
> I'll read into this, thank you!
>
>
>>
>> At this time, my array is still locked up with the exact same symptoms you
>> report.
>
>
> Hopefully we can all work to figure this one out.
>
>
>> To help correlate your problem with what others observed, would it be
>> possible for you to share the call stack for the following three
>> processes?
>>
>> mdXXX_raid6
>> mdXXX_reshape
>> systemd-udevd
>>
>> Or any other processes reporting deadlock while the reshape is trying to
>> run.
>>
>> Curious to see if you observe the same call stack.
>
>
> Would that be using "strace", "ptrace", or both? Pardon my ignorance, I've
> never used them. I'm pretty sure it's pstack, but want to make completely
> sure!
>
>>
>> I will definitely let you know if I have any major revelations. thanks
>> Michael
>
>
> Thank you kindly, Michael.  I appreciate your input.
>
>>
>>
>> On Sun, Sep 4, 2016 at 12:04 AM, Francisco Parada
>> <advanceandconquer@gmail.com> wrote:
>> > Hello everyone,
>> >
>> > I know this gets a ton of visibility, so I'll keep it as concise as
>> > possible.
>> >
>> > I'm running Ubuntu 16.04.1 and I have (read had) a 7 drive RAID6
>> > array.  I attempted to grow the array by adding 3 additional drives
>> > for a total of 10, but it seems that one of the brand new drives had
>> > 60+ bad blocks (according to "badblocks -vw").  I came to this
>> > conclusion, because I had a power outage during the grow that lasted
>> > longer than my 1500VA battery backup could withstand, so when I
>> > attempted to continue the reshape, I noticed that the assemble
>> > wouldn't start upon reboot.  All drives were marked as spares:
>> >
>> >
>> > =================================================================================================================
>> > # cat /proc/mdstat
>> > Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5]
>> > [raid4] [raid10]
>> > md127 : inactive sdi[0](S) sdh[2](S) sdj[3](S) sdf[7](S) sdg[9](S)
>> > sdd[10](S) sde[11](S) sdb[13](S) sdc[12](S)
>> >       26371219608 blocks super 1.2
>> >
>> > =================================================================================================================
>> >
>> >
>> > Notice above, that there's only 9 drives instead of 10, which I was
>> > supposed to have.  The drive that's missing is "sdk", but that's
>> > because using "badblocks -vw" has wiped out the drive in an effort to
>> > figure out if there was actually something wrong with said drive
>> > (You're probably gasping, but it had a missing GPT table, and no
>> > matter what I tried to recover it, the drive would just stop
>> > responding to reads and writes).  So I attempted to assemble the array
>> > with "/dev/sdk" missing as shown below, but I get this:
>> >
>> >
>> > ===================================================================================================================
>> > # mdadm -Afv /dev/md127 /dev/sdb /dev/sdc /dev/sdd /dev/sde /dev/sdf
>> > /dev/sdg /dev/sdh /dev/sdi /dev/sdj missing
>> > mdadm: looking for devices for /dev/md127
>> > mdadm: cannot open device missing: No such file or directory
>> > mdadm: missing has no superblock - assembly aborted
>> >
>> > ===================================================================================================================
>> >
>> >
>> > But I guess that doesn't matter, because almost all other drives are
>> > almost sync'ed as specified in the events output of mdadm (once again,
>> > keep in mind that "/dev/sdk" is blank, thus the "no md superblock"
>> > error):
>> >
>> > ==============================================
>> > # mdadm -E /dev/sd[b-k] | grep Events
>> >          Events : 280026
>> >          Events : 280026
>> >          Events : 280026
>> >          Events : 280026
>> >          Events : 280026
>> >          Events : 280026
>> >          Events : 280026
>> > mdadm: No md superblock detected on /dev/sdk.
>> >          Events : 280026
>> >          Events : 280011
>> > ==============================================
>> >
>> >
>> > So I attempt to reassemble it, by leaving out "/dev/sdk" and it seems
>> > to assemble it, with some warnings of course:
>> >
>> >
>> > ===========================================================================================================
>> > # mdadm -Afv /dev/md127 /dev/sdb /dev/sdc /dev/sdd /dev/sde /dev/sdf
>> > /dev/sdg /dev/sdh /dev/sdi /dev/sdj
>> > mdadm: looking for devices for /dev/md127
>> > mdadm: /dev/sdb is identified as a member of /dev/md127, slot 7.
>> > mdadm: /dev/sdc is identified as a member of /dev/md127, slot 8.
>> > mdadm: /dev/sdd is identified as a member of /dev/md127, slot 6.
>> > mdadm: /dev/sde is identified as a member of /dev/md127, slot 9.
>> > mdadm: /dev/sdf is identified as a member of /dev/md127, slot 4.
>> > mdadm: /dev/sdg is identified as a member of /dev/md127, slot 1.
>> > mdadm: /dev/sdh is identified as a member of /dev/md127, slot 2.
>> > mdadm: /dev/sdi is identified as a member of /dev/md127, slot 0.
>> > mdadm: /dev/sdj is identified as a member of /dev/md127, slot 3.
>> > mdadm: :/dev/md127 has an active reshape - checking if critical
>> > section needs to be restored
>> > mdadm: No backup metadata on device-7
>> > mdadm: No backup metadata on device-8
>> > mdadm: No backup metadata on device-9
>> > mdadm: added /dev/sdg to /dev/md127 as 1
>> > mdadm: added /dev/sdh to /dev/md127 as 2
>> > mdadm: added /dev/sdj to /dev/md127 as 3 (possibly out of date)
>> > mdadm: added /dev/sdf to /dev/md127 as 4
>> > mdadm: no uptodate device for slot 10 of /dev/md127
>> > mdadm: added /dev/sdd to /dev/md127 as 6
>> > mdadm: added /dev/sdb to /dev/md127 as 7
>> > mdadm: added /dev/sdc to /dev/md127 as 8
>> > mdadm: added /dev/sde to /dev/md127 as 9
>> > mdadm: added /dev/sdi to /dev/md127 as 0
>> > mdadm: /dev/md127 has been started with 8 drives (out of 10).
>> >
>> > ===========================================================================================================
>> >
>> >
>> > But now the reshape goes from 80000K to 1000K and eventually 0K speed
>> > shortly after hitting "enter" to reassemble:
>> >
>> >
>> > ===========================================================================================================
>> > # cat /proc/mdstat
>> > Personalities : [linear] [multipath] [raid0] [raid1] [raid6] [raid5]
>> > [raid4] [raid10]
>> > md127 : active raid6 sdi[0] sde[11] sdc[12] sdb[13] sdd[10] sdf[7]
>> > sdh[2] sdg[9]
>> >       14650675200 blocks super 1.2 level 6, 512k chunk, algorithm 2
>> > [10/8] [UUU_U_UUUU]
>> >       [=======>.............]  reshape = 39.1% (1146348628/2930135040)
>> > finish=51538126.9min speed=0K/sec
>> >       bitmap: 0/22 pages [0KB], 65536KB chunk
>> >
>> > unused devices: <none>
>> >
>> > ===========================================================================================================
>> >
>> >
>> > So I did a little probing and it seems that my CPU is running at 100%
>> > by "md127_raid6".  I should note that it has been this way for over a
>> > week now, the time doesn't reflect it because I had to perform a
>> > reboot.  So I'm at a loss, because even if I try to optimize reshape
>> > speeds, the reshape still remains at 0K/sec.
>> >
>> >
>> > =================================================================================
>> > top - 22:28:53 up  1:56,  3 users,  load average: 3.05, 2.04, 0.92
>> > Tasks: 317 total,   4 running, 313 sleeping,   0 stopped,   0 zombie
>> > %Cpu(s):  4.4 us, 50.5 sy,  0.0 ni, 44.1 id,  1.0 wa,  0.0 hi,  0.0 si,
>> > 0.0 st
>> > KiB Mem :  1521584 total,   220812 free,   774868 used,   525904
>> > buff/cache
>> > KiB Swap: 25153532 total, 25000764 free,   152768 used.   477708 avail
>> > Mem
>> >
>> >   PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+
>> > COMMAND
>> >   435 root      20   0       0      0      0 R  98.0  0.0   5:27.12
>> > md127_raid6
>> > 28941 cisco     20   0  546436  34336  25080 R   2.9  2.3   0:18.32
>> > gnome-disks
>> >  3557 message+  20   0   44364   4632   3068 S   2.0  0.3   0:06.53
>> > dbus-daemon
>> >
>> > =================================================================================
>> >
>> > Any ideas?  Your help would be greatly appreciated.
>> >
>> > Thanks in advance
>> > --
>> > To unsubscribe from this list: send the line "unsubscribe linux-raid" in
>> > the body of a message to majordomo@vger.kernel.org
>> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH V2] md-cluster: make md-cluster also can work when compiled into kernel
From: NeilBrown @ 2016-09-04 23:24 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, Guoqing Jiang
In-Reply-To: <1472911410-15469-1-git-send-email-gqjiang@suse.com>

[-- Attachment #1: Type: text/plain, Size: 2193 bytes --]

On Sun, Sep 04 2016, Guoqing Jiang wrote:

> The md-cluster is compiled as module by default,
> if it is compiled by built-in way, then we can't
> make md-cluster works.
>
> [64782.630008] md/raid1:md127: active with 2 out of 2 mirrors
> [64782.630528] md-cluster module not found.
> [64782.630530] md127: Could not setup cluster service (-2)
>
> Fixes: edb39c9 ("Introduce md_cluster_operations to handle cluster functions")
> Cc: stable@vger.kernel.org # v4.1+

The above results in you sending email to
   stable@vger.kernel.org#v4.1+
which is not a valid address.

The correct form for comments in email address is to use parentheses.
e.g.
   Cc: stable@vger.kernel.org (v4.1+)

> Cc: NeilBrown <neilb@suse.com>
> Reported-by: Marc Smith <marc.smith@mcc.edu>
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> ---
> Changes:
> 1. call try_module_get if md_cluster_ops is already set,
> otherwise try_module_get/module_put are unbalanced.
>
>  drivers/md/md.c | 14 +++++---------
>  1 file changed, 5 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 67642ba..6ac5abe 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -7610,16 +7610,12 @@ EXPORT_SYMBOL(unregister_md_cluster_operations);
>  
>  int md_setup_cluster(struct mddev *mddev, int nodes)
>  {
> -	int err;
> -
> -	err = request_module("md-cluster");
> -	if (err) {
> -		pr_err("md-cluster module not found.\n");
> -		return -ENOENT;
> -	}
> -
> +	if (!md_cluster_ops)
> +		request_module("md-cluster");
>  	spin_lock(&pers_lock);
> -	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
> +	/* ensure module won't be unloaded */
> +	if (!try_module_get(md_cluster_mod)) {

Why did you drop the "!md_cluster_ops" test?

What happens if the md-cluster module cannot be loaded?
md_cluster_ops will be NULL and md_cluster_mod will be NULL.
try_module_get(NULL) succeeds, so with this patch md_setup_cluster()
will succeed if the module is needed but fails to load.

NeilBrown



> +		pr_err("can't get md-cluster module reference.\n");
>  		spin_unlock(&pers_lock);
>  		return -ENOENT;
>  	}
> -- 
> 2.6.6

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]

^ permalink raw reply

* Re: [PATCH V2] md-cluster: make md-cluster also can work when compiled into kernel
From: Guoqing Jiang @ 2016-09-05  1:46 UTC (permalink / raw)
  To: NeilBrown, linux-raid; +Cc: shli
In-Reply-To: <87zinn9rkc.fsf@notabene.neil.brown.name>



On 09/04/2016 07:24 PM, NeilBrown wrote:
> On Sun, Sep 04 2016, Guoqing Jiang wrote:
>
>> The md-cluster is compiled as module by default,
>> if it is compiled by built-in way, then we can't
>> make md-cluster works.
>>
>> [64782.630008] md/raid1:md127: active with 2 out of 2 mirrors
>> [64782.630528] md-cluster module not found.
>> [64782.630530] md127: Could not setup cluster service (-2)
>>
>> Fixes: edb39c9 ("Introduce md_cluster_operations to handle cluster functions")
>> Cc: stable@vger.kernel.org # v4.1+
> The above results in you sending email to
>     stable@vger.kernel.org#v4.1+
> which is not a valid address.
>
> The correct form for comments in email address is to use parentheses.
> e.g.
>     Cc: stable@vger.kernel.org (v4.1+)

Thanks for correcting it!

>> Cc: NeilBrown <neilb@suse.com>
>> Reported-by: Marc Smith <marc.smith@mcc.edu>
>> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
>> ---
>> Changes:
>> 1. call try_module_get if md_cluster_ops is already set,
>> otherwise try_module_get/module_put are unbalanced.
>>
>>   drivers/md/md.c | 14 +++++---------
>>   1 file changed, 5 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index 67642ba..6ac5abe 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -7610,16 +7610,12 @@ EXPORT_SYMBOL(unregister_md_cluster_operations);
>>   
>>   int md_setup_cluster(struct mddev *mddev, int nodes)
>>   {
>> -	int err;
>> -
>> -	err = request_module("md-cluster");
>> -	if (err) {
>> -		pr_err("md-cluster module not found.\n");
>> -		return -ENOENT;
>> -	}
>> -
>> +	if (!md_cluster_ops)
>> +		request_module("md-cluster");
>>   	spin_lock(&pers_lock);
>> -	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
>> +	/* ensure module won't be unloaded */
>> +	if (!try_module_get(md_cluster_mod)) {
> Why did you drop the "!md_cluster_ops" test?
>
> What happens if the md-cluster module cannot be loaded?
> md_cluster_ops will be NULL and md_cluster_mod will be NULL.
> try_module_get(NULL) succeeds, so with this patch md_setup_cluster()
> will succeed if the module is needed but fails to load.

Yes, I just found that try_module_get(NULL) can return true. That does not
seem correct for the return value; maybe it needs to be changed to:

             return module ? ret : false;

Anyway, I will send v3 with the "!md_cluster_ops" test added back.

Thanks,
Guoqing


^ permalink raw reply

* Re: [PATCH V2] md-cluster: make md-cluster also can work when compiled into kernel
From: NeilBrown @ 2016-09-05  1:56 UTC (permalink / raw)
  To: Guoqing Jiang, linux-raid; +Cc: shli
In-Reply-To: <57CCCE88.4010507@suse.com>

[-- Attachment #1: Type: text/plain, Size: 2783 bytes --]

On Mon, Sep 05 2016, Guoqing Jiang wrote:

> On 09/04/2016 07:24 PM, NeilBrown wrote:
>> On Sun, Sep 04 2016, Guoqing Jiang wrote:
>>
>>> The md-cluster is compiled as module by default,
>>> if it is compiled by built-in way, then we can't
>>> make md-cluster works.
>>>
>>> [64782.630008] md/raid1:md127: active with 2 out of 2 mirrors
>>> [64782.630528] md-cluster module not found.
>>> [64782.630530] md127: Could not setup cluster service (-2)
>>>
>>> Fixes: edb39c9 ("Introduce md_cluster_operations to handle cluster functions")
>>> Cc: stable@vger.kernel.org # v4.1+
>> The above results in you sending email to
>>     stable@vger.kernel.org#v4.1+
>> which is not a valid address.
>>
>> The correct form for comments in email address is to use parentheses.
>> e.g.
>>     Cc: stable@vger.kernel.org (v4.1+)
>
> Thanks for correct it!
>
>>> Cc: NeilBrown <neilb@suse.com>
>>> Reported-by: Marc Smith <marc.smith@mcc.edu>
>>> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
>>> ---
>>> Changes:
>>> 1. call try_module_get if md_cluster_ops is already set,
>>> otherwise try_module_get/module_put are unbalanced.
>>>
>>>   drivers/md/md.c | 14 +++++---------
>>>   1 file changed, 5 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>>> index 67642ba..6ac5abe 100644
>>> --- a/drivers/md/md.c
>>> +++ b/drivers/md/md.c
>>> @@ -7610,16 +7610,12 @@ EXPORT_SYMBOL(unregister_md_cluster_operations);
>>>   
>>>   int md_setup_cluster(struct mddev *mddev, int nodes)
>>>   {
>>> -	int err;
>>> -
>>> -	err = request_module("md-cluster");
>>> -	if (err) {
>>> -		pr_err("md-cluster module not found.\n");
>>> -		return -ENOENT;
>>> -	}
>>> -
>>> +	if (!md_cluster_ops)
>>> +		request_module("md-cluster");
>>>   	spin_lock(&pers_lock);
>>> -	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
>>> +	/* ensure module won't be unloaded */
>>> +	if (!try_module_get(md_cluster_mod)) {
>> Why did you drop the "!md_cluster_ops" test?
>>
>> What happens if the md-cluster module cannot be loaded?
>> md_cluster_ops will be NULL and md_cluster_mod will be NULL.
>> try_module_get(NULL) succeeds, so with this patch md_setup_cluster()
>> will succeed if the module is needed but fails to load.
>
> Yes, I just find try_module_get(NULL) could return true, it seems not
> correct for the return value, maybe it need to be changed to:
>
>              return module ? ret : false;

If a module is built into the kernel, then the module pointer that code
will use will be NULL, but you still want try_module_get() and
module_put() to work.  So I think returning 'true' for NULL is correct.

>
> Anyway, I will send v3 with add "!md_cluster_ops" test back.

Thanks,
NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]

^ permalink raw reply

* [PATCH V3] md-cluster: make md-cluster also can work when compiled into kernel
From: Guoqing Jiang @ 2016-09-05  2:17 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, Guoqing Jiang, v4.1+, NeilBrown
In-Reply-To: <1472813500-4921-1-git-send-email-gqjiang@suse.com>

md-cluster is compiled as a module by default;
if it is built into the kernel instead, then we can't
make md-cluster work.

[64782.630008] md/raid1:md127: active with 2 out of 2 mirrors
[64782.630528] md-cluster module not found.
[64782.630530] md127: Could not setup cluster service (-2)

Fixes: edb39c9 ("Introduce md_cluster_operations to handle cluster functions")
Cc: stable@vger.kernel.org (v4.1+)
Cc: NeilBrown <neilb@suse.com>
Reported-by: Marc Smith <marc.smith@mcc.edu>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
---
V3 changes:
1. add the "!md_cluster_ops" test back
2. fix wrong mail info of stable kernel

V2 changes:
1. call try_module_get if md_cluster_ops is already set,
   otherwise try_module_get/module_put are unbalanced.

 drivers/md/md.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 67642ba..915e84d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7610,16 +7610,12 @@ EXPORT_SYMBOL(unregister_md_cluster_operations);
 
 int md_setup_cluster(struct mddev *mddev, int nodes)
 {
-	int err;
-
-	err = request_module("md-cluster");
-	if (err) {
-		pr_err("md-cluster module not found.\n");
-		return -ENOENT;
-	}
-
+	if (!md_cluster_ops)
+		request_module("md-cluster");
 	spin_lock(&pers_lock);
+	/* ensure module won't be unloaded */
 	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
+		pr_err("can't find md-cluster module or get it's reference.\n");
 		spin_unlock(&pers_lock);
 		return -ENOENT;
 	}
-- 
2.6.6

^ permalink raw reply related

* Re: [PATCH V3] md-cluster: make md-cluster also can work when compiled into kernel
From: NeilBrown @ 2016-09-05  3:10 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, Guoqing Jiang, v4.1+
In-Reply-To: <1473041848-28009-1-git-send-email-gqjiang@suse.com>

[-- Attachment #1: Type: text/plain, Size: 1790 bytes --]

On Mon, Sep 05 2016, Guoqing Jiang wrote:

> md-cluster is compiled as a module by default.
> If it is instead built into the kernel, then
> md-cluster cannot be made to work:
>
> [64782.630008] md/raid1:md127: active with 2 out of 2 mirrors
> [64782.630528] md-cluster module not found.
> [64782.630530] md127: Could not setup cluster service (-2)
>
> Fixes: edb39c9 ("Introduce md_cluster_operations to handle cluster functions")
> Cc: stable@vger.kernel.org (v4.1+)
> Cc: NeilBrown <neilb@suse.com>
> Reported-by: Marc Smith <marc.smith@mcc.edu>
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> ---
> V3 changes:
> 1. add the "!md_cluster_ops" test back
> 2. fix wrong mail info of stable kernel
>
> V2 changes:
> 1. call try_module_get if md_cluster_ops is already set,
>    otherwise try_module_get/module_put are unbalanced.
>
>  drivers/md/md.c | 12 ++++--------
>  1 file changed, 4 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 67642ba..915e84d 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -7610,16 +7610,12 @@ EXPORT_SYMBOL(unregister_md_cluster_operations);
>  
>  int md_setup_cluster(struct mddev *mddev, int nodes)
>  {
> -	int err;
> -
> -	err = request_module("md-cluster");
> -	if (err) {
> -		pr_err("md-cluster module not found.\n");
> -		return -ENOENT;
> -	}
> -
> +	if (!md_cluster_ops)
> +		request_module("md-cluster");
>  	spin_lock(&pers_lock);
> +	/* ensure module won't be unloaded */
>  	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
> +		pr_err("can't find md-cluster module or get it's reference.\n");
>  		spin_unlock(&pers_lock);
>  		return -ENOENT;
>  	}
> -- 
> 2.6.6

Reviewed-by: NeilBrown <neilb@suse.com>

Thanks,
NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox