Linux RAID subsystem development
 help / color / mirror / Atom feed
* [md PATCH 01/15] md/raid5: use md_write_start to count stripes, not bios
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148954692173.18641.1294690639716682540.stgit@noble>

We use md_write_start() to increase the count of pending writes, and
md_write_end() to decrement the count.  We currently count bios
submitted to md/raid5.  Change it count stripe_heads that a WRITE bio
has been attached to.

So now, raid5_make_request() calls md_write_start() and then
md_write_end() to keep the count elevated during the setup of the
request.

add_stripe_bio() calls md_write_start() for each stripe_head, and the
completion routines always call md_write_end(), instead of only
calling it when raid5_dec_bi_active_stripes() returns 0.
make_discard_request also calls md_write_start/end().

The parallel between md_write_{start,end} and use of bi_phys_segments
can be seen in that:
 Whenever we set bi_phys_segments to 1, we now call md_write_start.
 Whenever we increment it on non-read requests with
   raid5_inc_bi_active_stripes(), we now call md_write_start().
 Whenever we decrement bi_phys_segments on non-read requsts with
    raid5_dec_bi_active_stripes(), we now call md_write_end().

This reduces our dependence on keeping a per-bio count of active
stripes in bi_phys_segments.

md_write_inc() is added which parallels md_write_start(), but requires
that a write has already been started, and is certain never to sleep.
This can be used inside a spinlocked region when adding to a write
request.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c          |   17 +++++++++++++++++
 drivers/md/md.h          |    1 +
 drivers/md/raid5-cache.c |    2 +-
 drivers/md/raid5.c       |   27 +++++++++++++--------------
 4 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index af9118711228..bad5771bced4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7916,6 +7916,23 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
 }
 EXPORT_SYMBOL(md_write_start);
 
+/* md_write_inc can only be called when md_write_start() has
+ * already been called at least once of the current request.
+ * It increments the counter and is useful when a single request
+ * is split into several parts.  Each part causes an increment and
+ * so needs a matching md_write_end().
+ * Unlike md_write_start(), it is safe to call md_write_inc() inside
+ * a spinlocked region.
+ */
+void md_write_inc(struct mddev *mddev, struct bio *bi)
+{
+	if (bio_data_dir(bi) != WRITE)
+		return;
+	WARN_ON_ONCE(mddev->in_sync || mddev->ro);
+	atomic_inc(&mddev->writes_pending);
+}
+EXPORT_SYMBOL(md_write_inc);
+
 void md_write_end(struct mddev *mddev)
 {
 	if (atomic_dec_and_test(&mddev->writes_pending)) {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index e0940064c3ec..0cd12721a536 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -648,6 +648,7 @@ extern void md_wakeup_thread(struct md_thread *thread);
 extern void md_check_recovery(struct mddev *mddev);
 extern void md_reap_sync_thread(struct mddev *mddev);
 extern void md_write_start(struct mddev *mddev, struct bio *bi);
+extern void md_write_inc(struct mddev *mddev, struct bio *bi);
 extern void md_write_end(struct mddev *mddev);
 extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
 extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 64493132470b..f5034ecb4e94 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -318,8 +318,8 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
 	while (wbi && wbi->bi_iter.bi_sector <
 	       dev->sector + STRIPE_SECTORS) {
 		wbi2 = r5_next_bio(wbi, dev->sector);
+		md_write_end(conf->mddev);
 		if (!raid5_dec_bi_active_stripes(wbi)) {
-			md_write_end(conf->mddev);
 			bio_list_add(return_bi, wbi);
 		}
 		wbi = wbi2;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1c554a811d20..cc2d039b4aae 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3273,6 +3273,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 		bi->bi_next = *bip;
 	*bip = bi;
 	raid5_inc_bi_active_stripes(bi);
+	md_write_inc(conf->mddev, bi);
 
 	if (forwrite) {
 		/* check if page is covered */
@@ -3396,10 +3397,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
 
 			bi->bi_error = -EIO;
-			if (!raid5_dec_bi_active_stripes(bi)) {
-				md_write_end(conf->mddev);
+			md_write_end(conf->mddev);
+			if (!raid5_dec_bi_active_stripes(bi))
 				bio_list_add(return_bi, bi);
-			}
 			bi = nextbi;
 		}
 		if (bitmap_end)
@@ -3420,10 +3420,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
 
 			bi->bi_error = -EIO;
-			if (!raid5_dec_bi_active_stripes(bi)) {
-				md_write_end(conf->mddev);
+			md_write_end(conf->mddev);
+			if (!raid5_dec_bi_active_stripes(bi))
 				bio_list_add(return_bi, bi);
-			}
 			bi = bi2;
 		}
 
@@ -3780,10 +3779,9 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 				while (wbi && wbi->bi_iter.bi_sector <
 					dev->sector + STRIPE_SECTORS) {
 					wbi2 = r5_next_bio(wbi, dev->sector);
-					if (!raid5_dec_bi_active_stripes(wbi)) {
-						md_write_end(conf->mddev);
+					md_write_end(conf->mddev);
+					if (!raid5_dec_bi_active_stripes(wbi))
 						bio_list_add(return_bi, wbi);
-					}
 					wbi = wbi2;
 				}
 				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
@@ -5486,6 +5484,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+	md_write_start(mddev, bi);
 
 	stripe_sectors = conf->chunk_sectors *
 		(conf->raid_disks - conf->max_degraded);
@@ -5532,6 +5531,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 			sh->dev[d].towrite = bi;
 			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
 			raid5_inc_bi_active_stripes(bi);
+			md_write_inc(mddev, bi);
 			sh->overwrite_disks++;
 		}
 		spin_unlock_irq(&sh->stripe_lock);
@@ -5554,9 +5554,9 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 		release_stripe_plug(mddev, sh);
 	}
 
+	md_write_end(mddev);
 	remaining = raid5_dec_bi_active_stripes(bi);
 	if (remaining == 0) {
-		md_write_end(mddev);
 		bio_endio(bi);
 	}
 }
@@ -5591,8 +5591,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 		do_flush = bi->bi_opf & REQ_PREFLUSH;
 	}
 
-	md_write_start(mddev, bi);
-
 	/*
 	 * If array is degraded, better not do chunk aligned read because
 	 * later we might have to read it again in order to reconstruct
@@ -5614,6 +5612,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	last_sector = bio_end_sector(bi);
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
+	md_write_start(mddev, bi);
 
 	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
@@ -5748,11 +5747,11 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	}
 	finish_wait(&conf->wait_for_overlap, &w);
 
+	if (rw == WRITE)
+		md_write_end(mddev);
 	remaining = raid5_dec_bi_active_stripes(bi);
 	if (remaining == 0) {
 
-		if ( rw == WRITE )
-			md_write_end(mddev);
 
 		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
 					 bi, 0);



^ permalink raw reply related

* [md PATCH 02/15] md/raid5: simplfy delaying of writes while metadata is updated.
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148954692173.18641.1294690639716682540.stgit@noble>

If a device fails during a write, we must ensure the failure is
recorded in the metadata before the completion of the write is
acknowleged.

Commit c3cce6cda162 ("md/raid5: ensure device failure recorded before
write request returns.")  added code for this, but it was
unnecessarily complicated.  We already had similar functionality for
handling updates to the bad-block-list, thanks to Commit de393cdea66c
("md: make it easier to wait for bad blocks to be acknowledged.")

So revert most of the former commit, and instead avoid collecting
completed writes if MD_CHANGE_PENDING is set.  raid5d() will then flush
the metadata and retry the stripe_head.
As this change can leave a stripe_head ready for handling immediately
after handle_active_stripes() returns, we change raid5_do_work() to
pause when MD_CHANGE_PENDING is set, so that it doesn't spin.

We check MD_CHANGE_PENDING *after* analyse_stripe() as it could be set
asynchronously.  After analyse_stripe(), we have collected stable data
about the state of devices, which will be used to make decisions.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5.c |   31 ++++++++-----------------------
 drivers/md/raid5.h |    3 ---
 2 files changed, 8 insertions(+), 26 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cc2d039b4aae..f990f74901d2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4690,7 +4690,8 @@ static void handle_stripe(struct stripe_head *sh)
 	if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
 		goto finish;
 
-	if (s.handle_bad_blocks) {
+	if (s.handle_bad_blocks ||
+	    test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
 		set_bit(STRIPE_HANDLE, &sh->state);
 		goto finish;
 	}
@@ -5020,15 +5021,8 @@ static void handle_stripe(struct stripe_head *sh)
 			md_wakeup_thread(conf->mddev->thread);
 	}
 
-	if (!bio_list_empty(&s.return_bi)) {
-		if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
-			spin_lock_irq(&conf->device_lock);
-			bio_list_merge(&conf->return_bi, &s.return_bi);
-			spin_unlock_irq(&conf->device_lock);
-			md_wakeup_thread(conf->mddev->thread);
-		} else
-			return_io(&s.return_bi);
-	}
+	if (!bio_list_empty(&s.return_bi))
+		return_io(&s.return_bi);
 
 	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
 }
@@ -6225,6 +6219,7 @@ static void raid5_do_work(struct work_struct *work)
 	struct r5worker *worker = container_of(work, struct r5worker, work);
 	struct r5worker_group *group = worker->group;
 	struct r5conf *conf = group->conf;
+	struct mddev *mddev = conf->mddev;
 	int group_id = group - conf->worker_groups;
 	int handled;
 	struct blk_plug plug;
@@ -6245,6 +6240,9 @@ static void raid5_do_work(struct work_struct *work)
 		if (!batch_size && !released)
 			break;
 		handled += batch_size;
+		wait_event_lock_irq(mddev->sb_wait,
+				    !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags),
+				    conf->device_lock);
 	}
 	pr_debug("%d stripes handled\n", handled);
 
@@ -6272,18 +6270,6 @@ static void raid5d(struct md_thread *thread)
 
 	md_check_recovery(mddev);
 
-	if (!bio_list_empty(&conf->return_bi) &&
-	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-		struct bio_list tmp = BIO_EMPTY_LIST;
-		spin_lock_irq(&conf->device_lock);
-		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-			bio_list_merge(&tmp, &conf->return_bi);
-			bio_list_init(&conf->return_bi);
-		}
-		spin_unlock_irq(&conf->device_lock);
-		return_io(&tmp);
-	}
-
 	blk_start_plug(&plug);
 	handled = 0;
 	spin_lock_irq(&conf->device_lock);
@@ -6935,7 +6921,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	INIT_LIST_HEAD(&conf->hold_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
 	INIT_LIST_HEAD(&conf->bitmap_list);
-	bio_list_init(&conf->return_bi);
 	init_llist_head(&conf->released_stripes);
 	atomic_set(&conf->active_stripes, 0);
 	atomic_set(&conf->preread_active_stripes, 0);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ba5b7a3790af..13800dc9dd88 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -638,9 +638,6 @@ struct r5conf {
 	int			skip_copy; /* Don't copy data from bio to stripe cache */
 	struct list_head	*last_hold; /* detect hold_list promotions */
 
-	/* bios to have bi_end_io called after metadata is synced */
-	struct bio_list		return_bi;
-
 	atomic_t		reshape_stripes; /* stripes with pending writes for reshape */
 	/* unfortunately we need two cache names as we temporarily have
 	 * two caches.



^ permalink raw reply related

* [md PATCH 03/15] md/raid5: call bio_endio() directly rather than queueing for later.
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148954692173.18641.1294690639716682540.stgit@noble>

We currently gather bios that need to be returned into a bio_list
and call bio_endio() on them all together.
The original reason for this was to avoid making the calls while
holding a spinlock.
Locking has changed a lot since then, and that reason is no longer
valid.

So discard return_io() and various return_bi lists, and just call
bio_endio() directly as needed.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5-cache.c |   13 +++++--------
 drivers/md/raid5-log.h   |    2 +-
 drivers/md/raid5.c       |   38 ++++++++++----------------------------
 drivers/md/raid5.h       |    1 -
 4 files changed, 16 insertions(+), 38 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index f5034ecb4e94..5be8dbc5d91b 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -308,8 +308,7 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
 }
 
 static void
-r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
-			      struct bio_list *return_bi)
+r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
 {
 	struct bio *wbi, *wbi2;
 
@@ -319,23 +318,21 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
 	       dev->sector + STRIPE_SECTORS) {
 		wbi2 = r5_next_bio(wbi, dev->sector);
 		md_write_end(conf->mddev);
-		if (!raid5_dec_bi_active_stripes(wbi)) {
-			bio_list_add(return_bi, wbi);
-		}
+		if (!raid5_dec_bi_active_stripes(wbi))
+			bio_endio(wbi);
 		wbi = wbi2;
 	}
 }
 
 void r5c_handle_cached_data_endio(struct r5conf *conf,
-	  struct stripe_head *sh, int disks, struct bio_list *return_bi)
+				  struct stripe_head *sh, int disks)
 {
 	int i;
 
 	for (i = sh->disks; i--; ) {
 		if (sh->dev[i].written) {
 			set_bit(R5_UPTODATE, &sh->dev[i].flags);
-			r5c_return_dev_pending_writes(conf, &sh->dev[i],
-						      return_bi);
+			r5c_return_dev_pending_writes(conf, &sh->dev[i]);
 			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
 					STRIPE_SECTORS,
 					!test_bit(STRIPE_DEGRADED, &sh->state),
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 4f5a0f4e0b1f..738930ff5d17 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -21,7 +21,7 @@ extern void r5c_release_extra_page(struct stripe_head *sh);
 extern void r5c_use_extra_page(struct stripe_head *sh);
 extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
 extern void r5c_handle_cached_data_endio(struct r5conf *conf,
-	struct stripe_head *sh, int disks, struct bio_list *return_bi);
+	struct stripe_head *sh, int disks);
 extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh);
 extern void r5c_make_stripe_write_out(struct stripe_head *sh);
 extern void r5c_flush_cache(struct r5conf *conf, int num);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f990f74901d2..f07cd105b9f9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -158,17 +158,6 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
 	return slot;
 }
 
-static void return_io(struct bio_list *return_bi)
-{
-	struct bio *bi;
-	while ((bi = bio_list_pop(return_bi)) != NULL) {
-		bi->bi_iter.bi_size = 0;
-		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
-					 bi, 0);
-		bio_endio(bi);
-	}
-}
-
 static void print_raid5_conf (struct r5conf *conf);
 
 static int stripe_operations_active(struct stripe_head *sh)
@@ -1310,7 +1299,6 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
 static void ops_complete_biofill(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
-	struct bio_list return_bi = BIO_EMPTY_LIST;
 	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
@@ -1335,15 +1323,13 @@ static void ops_complete_biofill(void *stripe_head_ref)
 				dev->sector + STRIPE_SECTORS) {
 				rbi2 = r5_next_bio(rbi, dev->sector);
 				if (!raid5_dec_bi_active_stripes(rbi))
-					bio_list_add(&return_bi, rbi);
+					bio_endio(rbi);
 				rbi = rbi2;
 			}
 		}
 	}
 	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
 
-	return_io(&return_bi);
-
 	set_bit(STRIPE_HANDLE, &sh->state);
 	raid5_release_stripe(sh);
 }
@@ -3350,8 +3336,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 
 static void
 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
-				struct stripe_head_state *s, int disks,
-				struct bio_list *return_bi)
+		     struct stripe_head_state *s, int disks)
 {
 	int i;
 	BUG_ON(sh->batch_head);
@@ -3399,7 +3384,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			bi->bi_error = -EIO;
 			md_write_end(conf->mddev);
 			if (!raid5_dec_bi_active_stripes(bi))
-				bio_list_add(return_bi, bi);
+				bio_endio(bi);
 			bi = nextbi;
 		}
 		if (bitmap_end)
@@ -3422,7 +3407,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			bi->bi_error = -EIO;
 			md_write_end(conf->mddev);
 			if (!raid5_dec_bi_active_stripes(bi))
-				bio_list_add(return_bi, bi);
+				bio_endio(bi);
 			bi = bi2;
 		}
 
@@ -3448,7 +3433,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 
 				bi->bi_error = -EIO;
 				if (!raid5_dec_bi_active_stripes(bi))
-					bio_list_add(return_bi, bi);
+					bio_endio(bi);
 				bi = nextbi;
 			}
 		}
@@ -3747,7 +3732,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
  * never LOCKED, so we don't need to test 'failed' directly.
  */
 static void handle_stripe_clean_event(struct r5conf *conf,
-	struct stripe_head *sh, int disks, struct bio_list *return_bi)
+	struct stripe_head *sh, int disks)
 {
 	int i;
 	struct r5dev *dev;
@@ -3781,7 +3766,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 					wbi2 = r5_next_bio(wbi, dev->sector);
 					md_write_end(conf->mddev);
 					if (!raid5_dec_bi_active_stripes(wbi))
-						bio_list_add(return_bi, wbi);
+						bio_endio(wbi);
 					wbi = wbi2;
 				}
 				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
@@ -4724,7 +4709,7 @@ static void handle_stripe(struct stripe_head *sh)
 		sh->reconstruct_state = 0;
 		break_stripe_batch_list(sh, 0);
 		if (s.to_read+s.to_write+s.written)
-			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
+			handle_failed_stripe(conf, sh, &s, disks);
 		if (s.syncing + s.replacing)
 			handle_failed_sync(conf, sh, &s);
 	}
@@ -4790,10 +4775,10 @@ static void handle_stripe(struct stripe_head *sh)
 			     && !test_bit(R5_LOCKED, &qdev->flags)
 			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
 				 test_bit(R5_Discard, &qdev->flags))))))
-		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+		handle_stripe_clean_event(conf, sh, disks);
 
 	if (s.just_cached)
-		r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
+		r5c_handle_cached_data_endio(conf, sh, disks);
 	log_stripe_write_finished(sh);
 
 	/* Now we might consider reading some blocks, either to check/generate
@@ -5021,9 +5006,6 @@ static void handle_stripe(struct stripe_head *sh)
 			md_wakeup_thread(conf->mddev->thread);
 	}
 
-	if (!bio_list_empty(&s.return_bi))
-		return_io(&s.return_bi);
-
 	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
 }
 
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 13800dc9dd88..fd5c21cde77f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -278,7 +278,6 @@ struct stripe_head_state {
 	int dec_preread_active;
 	unsigned long ops_request;
 
-	struct bio_list return_bi;
 	struct md_rdev *blocked_rdev;
 	int handle_bad_blocks;
 	int log_failed;



^ permalink raw reply related

* [md PATCH 04/15] block: trace completion of all bios.
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148954692173.18641.1294690639716682540.stgit@noble>

Currently only dm and raid5 bios trigger trace_block_bio_complete.
Rather than sprinkling trace calls around all drivers that might
want them, add the trace call to bio_endio() so that all
drivers benefit.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 block/bio.c        |    3 +++
 drivers/md/dm.c    |    1 -
 drivers/md/raid5.c |    8 --------
 3 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 5eec5e08417f..c89d83b3ca32 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1838,6 +1838,9 @@ void bio_endio(struct bio *bio)
 		goto again;
 	}
 
+	if (bio->bi_bdev)
+		trace_block_bio_complete(bdev_get_queue(bio->bi_bdev),
+					 bio, bio->bi_error);
 	if (bio->bi_end_io)
 		bio->bi_end_io(bio);
 }
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f4ffd1eb8f44..f5f09ace690a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -810,7 +810,6 @@ static void dec_pending(struct dm_io *io, int error)
 			queue_io(md, bio);
 		} else {
 			/* done with normal IO or empty flush */
-			trace_block_bio_complete(md->queue, bio, io_error);
 			bio->bi_error = io_error;
 			bio_endio(bio);
 		}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f07cd105b9f9..7a45045ab358 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5141,8 +5141,6 @@ static void raid5_align_endio(struct bio *bi)
 	rdev_dec_pending(rdev, conf->mddev);
 
 	if (!error) {
-		trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
-					 raid_bi, 0);
 		bio_endio(raid_bi);
 		if (atomic_dec_and_test(&conf->active_aligned_reads))
 			wake_up(&conf->wait_for_quiescent);
@@ -5727,10 +5725,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 		md_write_end(mddev);
 	remaining = raid5_dec_bi_active_stripes(bi);
 	if (remaining == 0) {
-
-
-		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
-					 bi, 0);
 		bio_endio(bi);
 	}
 }
@@ -6138,8 +6132,6 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 	}
 	remaining = raid5_dec_bi_active_stripes(raid_bio);
 	if (remaining == 0) {
-		trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
-					 raid_bio, 0);
 		bio_endio(raid_bio);
 	}
 	if (atomic_dec_and_test(&conf->active_aligned_reads))



^ permalink raw reply related

* [md PATCH 05/15] md/raid5: use bio_inc_remaining() instead of repurposing bi_phys_segments as a counter
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148954692173.18641.1294690639716682540.stgit@noble>

md/raid5 needs to keep track of how many stripe_heads are processing a
bio so that it can delay calling bio_endio() until all stripe_heads
have completed.  It currently uses 16 bits of ->bi_phys_segments for
this purpose.

16 bits is only enough for 256M requests, and it is possible for a
single bio to be larger than this, which causes problems.  Also, the
bio struct contains a larger counter, __bi_remaining, which has a
purpose very similar to the purpose of our counter.  So stop using
->bi_phys_segments, and instead use __bi_remaining.

This means we don't need to initialize the counter, as our caller
initializes it to '1'.  It also means we can call bio_endio() directly
as it tests this counter internally.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5-cache.c |    3 +--
 drivers/md/raid5.c       |   50 +++++++++++-----------------------------------
 drivers/md/raid5.h       |   17 +---------------
 3 files changed, 14 insertions(+), 56 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 5be8dbc5d91b..25eb048298fe 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -318,8 +318,7 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
 	       dev->sector + STRIPE_SECTORS) {
 		wbi2 = r5_next_bio(wbi, dev->sector);
 		md_write_end(conf->mddev);
-		if (!raid5_dec_bi_active_stripes(wbi))
-			bio_endio(wbi);
+		bio_endio(wbi);
 		wbi = wbi2;
 	}
 }
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7a45045ab358..7824c2509905 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1322,8 +1322,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
 			while (rbi && rbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				rbi2 = r5_next_bio(rbi, dev->sector);
-				if (!raid5_dec_bi_active_stripes(rbi))
-					bio_endio(rbi);
+				bio_endio(rbi);
 				rbi = rbi2;
 			}
 		}
@@ -3195,14 +3194,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 		(unsigned long long)bi->bi_iter.bi_sector,
 		(unsigned long long)sh->sector);
 
-	/*
-	 * If several bio share a stripe. The bio bi_phys_segments acts as a
-	 * reference count to avoid race. The reference count should already be
-	 * increased before this function is called (for example, in
-	 * raid5_make_request()), so other bio sharing this stripe will not free the
-	 * stripe. If a stripe is owned by one stripe, the stripe lock will
-	 * protect it.
-	 */
 	spin_lock_irq(&sh->stripe_lock);
 	/* Don't allow new IO added to stripes in batch list */
 	if (sh->batch_head)
@@ -3258,7 +3249,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 	if (*bip)
 		bi->bi_next = *bip;
 	*bip = bi;
-	raid5_inc_bi_active_stripes(bi);
+	bio_inc_remaining(bi);
 	md_write_inc(conf->mddev, bi);
 
 	if (forwrite) {
@@ -3383,8 +3374,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 
 			bi->bi_error = -EIO;
 			md_write_end(conf->mddev);
-			if (!raid5_dec_bi_active_stripes(bi))
-				bio_endio(bi);
+			bio_endio(bi);
 			bi = nextbi;
 		}
 		if (bitmap_end)
@@ -3406,8 +3396,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 
 			bi->bi_error = -EIO;
 			md_write_end(conf->mddev);
-			if (!raid5_dec_bi_active_stripes(bi))
-				bio_endio(bi);
+			bio_endio(bi);
 			bi = bi2;
 		}
 
@@ -3432,8 +3421,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 					r5_next_bio(bi, sh->dev[i].sector);
 
 				bi->bi_error = -EIO;
-				if (!raid5_dec_bi_active_stripes(bi))
-					bio_endio(bi);
+				bio_endio(bi);
 				bi = nextbi;
 			}
 		}
@@ -3765,8 +3753,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 					dev->sector + STRIPE_SECTORS) {
 					wbi2 = r5_next_bio(wbi, dev->sector);
 					md_write_end(conf->mddev);
-					if (!raid5_dec_bi_active_stripes(wbi))
-						bio_endio(wbi);
+					bio_endio(wbi);
 					wbi = wbi2;
 				}
 				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
@@ -5111,7 +5098,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
 		 * this sets the active strip count to 1 and the processed
 		 * strip count to zero (upper 8 bits)
 		 */
-		raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
+		raid5_set_bi_processed_stripes(bi, 0);
 	}
 
 	return bi;
@@ -5446,7 +5433,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 	struct r5conf *conf = mddev->private;
 	sector_t logical_sector, last_sector;
 	struct stripe_head *sh;
-	int remaining;
 	int stripe_sectors;
 
 	if (mddev->reshape_position != MaxSector)
@@ -5457,7 +5443,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 	last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
 
 	bi->bi_next = NULL;
-	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
 	md_write_start(mddev, bi);
 
 	stripe_sectors = conf->chunk_sectors *
@@ -5504,7 +5489,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 				continue;
 			sh->dev[d].towrite = bi;
 			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
-			raid5_inc_bi_active_stripes(bi);
+			bio_inc_remaining(bi);
 			md_write_inc(mddev, bi);
 			sh->overwrite_disks++;
 		}
@@ -5529,10 +5514,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 	}
 
 	md_write_end(mddev);
-	remaining = raid5_dec_bi_active_stripes(bi);
-	if (remaining == 0) {
-		bio_endio(bi);
-	}
+	bio_endio(bi);
 }
 
 static void raid5_make_request(struct mddev *mddev, struct bio * bi)
@@ -5543,7 +5525,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	sector_t logical_sector, last_sector;
 	struct stripe_head *sh;
 	const int rw = bio_data_dir(bi);
-	int remaining;
 	DEFINE_WAIT(w);
 	bool do_prepare;
 	bool do_flush = false;
@@ -5585,7 +5566,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	last_sector = bio_end_sector(bi);
 	bi->bi_next = NULL;
-	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
 	md_write_start(mddev, bi);
 
 	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
@@ -5723,10 +5703,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 
 	if (rw == WRITE)
 		md_write_end(mddev);
-	remaining = raid5_dec_bi_active_stripes(bi);
-	if (remaining == 0) {
-		bio_endio(bi);
-	}
+	bio_endio(bi);
 }
 
 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
@@ -6091,7 +6068,6 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 	int dd_idx;
 	sector_t sector, logical_sector, last_sector;
 	int scnt = 0;
-	int remaining;
 	int handled = 0;
 
 	logical_sector = raid_bio->bi_iter.bi_sector &
@@ -6130,10 +6106,8 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 		raid5_release_stripe(sh);
 		handled++;
 	}
-	remaining = raid5_dec_bi_active_stripes(raid_bio);
-	if (remaining == 0) {
-		bio_endio(raid_bio);
-	}
+	bio_endio(raid_bio);
+
 	if (atomic_dec_and_test(&conf->active_aligned_reads))
 		wake_up(&conf->wait_for_quiescent);
 	return handled;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index fd5c21cde77f..7d74fb3f2ec6 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -488,8 +488,7 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
 }
 
 /*
- * We maintain a biased count of active stripes in the bottom 16 bits of
- * bi_phys_segments, and a count of processed stripes in the upper 16 bits
+ * We maintain a count of processed stripes in the upper 16 bits
  */
 static inline int raid5_bi_processed_stripes(struct bio *bio)
 {
@@ -498,20 +497,6 @@ static inline int raid5_bi_processed_stripes(struct bio *bio)
 	return (atomic_read(segments) >> 16) & 0xffff;
 }
 
-static inline int raid5_dec_bi_active_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
-	return atomic_sub_return(1, segments) & 0xffff;
-}
-
-static inline void raid5_inc_bi_active_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
-	atomic_inc(segments);
-}
-
 static inline void raid5_set_bi_processed_stripes(struct bio *bio,
 	unsigned int cnt)
 {



^ permalink raw reply related

* [md PATCH 06/15] md/raid5: remove over-loading of ->bi_phys_segments.
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148954692173.18641.1294690639716682540.stgit@noble>

When a read request, which bypassed the cache, fails, we need to retry
it through the cache.
This involves attaching it to a sequence of stripe_heads, and it may not
be possible to get all the stripe_heads we need at once.
We do what we can, and record how far we got in ->bi_phys_segments so
we can pick up again later.

There is only ever one bio which may have a non-zero offset stored in
->bi_phys_segments, the one that is either active in the single thread
which calls retry_aligned_read(), or is in conf->retry_read_aligned
waiting for retry_aligned_read() to be called again.

So we only need to store one offset value.  This can be in a local
variable passed between remove_bio_from_retry() and
retry_aligned_read(), or in the r5conf structure next to the
->retry_read_aligned pointer.

Storing it there allows the last usage of ->bi_phys_segments to be
removed from md/raid5.c.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5.c |   24 ++++++++++++------------
 drivers/md/raid5.h |   30 +-----------------------------
 2 files changed, 13 insertions(+), 41 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7824c2509905..c3a8838e4b17 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5081,12 +5081,14 @@ static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
 	md_wakeup_thread(conf->mddev->thread);
 }
 
-static struct bio *remove_bio_from_retry(struct r5conf *conf)
+static struct bio *remove_bio_from_retry(struct r5conf *conf,
+					 unsigned int *offset)
 {
 	struct bio *bi;
 
 	bi = conf->retry_read_aligned;
 	if (bi) {
+		*offset = conf->retry_read_offset;
 		conf->retry_read_aligned = NULL;
 		return bi;
 	}
@@ -5094,11 +5096,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
 	if(bi) {
 		conf->retry_read_aligned_list = bi->bi_next;
 		bi->bi_next = NULL;
-		/*
-		 * this sets the active strip count to 1 and the processed
-		 * strip count to zero (upper 8 bits)
-		 */
-		raid5_set_bi_processed_stripes(bi, 0);
+		*offset = 0;
 	}
 
 	return bi;
@@ -6052,7 +6050,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
 	return STRIPE_SECTORS;
 }
 
-static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
+static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
+			       unsigned int offset)
 {
 	/* We may not be able to submit a whole bio at once as there
 	 * may not be enough stripe_heads available.
@@ -6081,7 +6080,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 		     sector += STRIPE_SECTORS,
 		     scnt++) {
 
-		if (scnt < raid5_bi_processed_stripes(raid_bio))
+		if (scnt < offset)
 			/* already done this stripe */
 			continue;
 
@@ -6089,15 +6088,15 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 
 		if (!sh) {
 			/* failed to get a stripe - must wait */
-			raid5_set_bi_processed_stripes(raid_bio, scnt);
 			conf->retry_read_aligned = raid_bio;
+			conf->retry_read_offset = scnt;
 			return handled;
 		}
 
 		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
 			raid5_release_stripe(sh);
-			raid5_set_bi_processed_stripes(raid_bio, scnt);
 			conf->retry_read_aligned = raid_bio;
+			conf->retry_read_offset = scnt;
 			return handled;
 		}
 
@@ -6224,6 +6223,7 @@ static void raid5d(struct md_thread *thread)
 	while (1) {
 		struct bio *bio;
 		int batch_size, released;
+		unsigned int offset;
 
 		released = release_stripe_list(conf, conf->temp_inactive_list);
 		if (released)
@@ -6241,10 +6241,10 @@ static void raid5d(struct md_thread *thread)
 		}
 		raid5_activate_delayed(conf);
 
-		while ((bio = remove_bio_from_retry(conf))) {
+		while ((bio = remove_bio_from_retry(conf, &offset))) {
 			int ok;
 			spin_unlock_irq(&conf->device_lock);
-			ok = retry_aligned_read(conf, bio);
+			ok = retry_aligned_read(conf, bio, offset);
 			spin_lock_irq(&conf->device_lock);
 			if (!ok)
 				break;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 7d74fb3f2ec6..cdc7f92e1806 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -487,35 +487,6 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
 		return NULL;
 }
 
-/*
- * We maintain a count of processed stripes in the upper 16 bits
- */
-static inline int raid5_bi_processed_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
-	return (atomic_read(segments) >> 16) & 0xffff;
-}
-
-static inline void raid5_set_bi_processed_stripes(struct bio *bio,
-	unsigned int cnt)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	int old, new;
-
-	do {
-		old = atomic_read(segments);
-		new = (old & 0xffff) | (cnt << 16);
-	} while (atomic_cmpxchg(segments, old, new) != old);
-}
-
-static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
-	atomic_set(segments, cnt);
-}
-
 /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
  * This is because we sometimes take all the spinlocks
  * and creating that much locking depth can cause
@@ -613,6 +584,7 @@ struct r5conf {
 	struct list_head	delayed_list; /* stripes that have plugged requests */
 	struct list_head	bitmap_list; /* stripes delaying awaiting bitmap update */
 	struct bio		*retry_read_aligned; /* currently retrying aligned bios   */
+	unsigned int		retry_read_offset; /* sector offset into retry_read_aligned */
 	struct bio		*retry_read_aligned_list; /* aligned bios retry list  */
 	atomic_t		preread_active_stripes; /* stripes with scheduled io */
 	atomic_t		active_aligned_reads;



^ permalink raw reply related

* [md PATCH 07/15] Revert "md/raid5: limit request size according to implementation limits"
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148954692173.18641.1294690639716682540.stgit@noble>

This reverts commit e8d7c33232e5fdfa761c3416539bc5b4acd12db5.

Now that raid5 doesn't abuse bi_phys_segments any more, we no longer
need to impose these limits.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5.c |    9 ---------
 1 file changed, 9 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c3a8838e4b17..4e906e5903e8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7357,15 +7357,6 @@ static int raid5_run(struct mddev *mddev)
 			stripe = (stripe | (stripe-1)) + 1;
 		mddev->queue->limits.discard_alignment = stripe;
 		mddev->queue->limits.discard_granularity = stripe;
-
-		/*
-		 * We use 16-bit counter of active stripes in bi_phys_segments
-		 * (minus one for over-loaded initialization)
-		 */
-		blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
-		blk_queue_max_discard_sectors(mddev->queue,
-					      0xfffe * STRIPE_SECTORS);
-
 		/*
 		 * unaligned part of discard request will be ignored, so can't
 		 * guarantee discard_zeroes_data



^ permalink raw reply related

* [md PATCH 08/15] md/raid1, raid10: move rXbio accounting closer to allocation.
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148954692173.18641.1294690639716682540.stgit@noble>

When raid1 or raid10 find they will need to allocate a new
r1bio/r10bio, in order to work around a known bad block, they
account for the allocation well before the allocation is
made.  This separation makes the correctness less obvious
and requires comments.

The accounting needs to be a little before: before the first
rXbio is submitted, but that is all.

So move the accounting down to where it makes more sense.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid1.c  |   24 +++++++++++-------------
 drivers/md/raid10.c |   22 +++++++++-------------
 2 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d3da6b36e670..7e509a894f15 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1436,18 +1436,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 		goto retry_write;
 	}
 
-	if (max_sectors < r1_bio->sectors) {
-		/* We are splitting this write into multiple parts, so
-		 * we need to prepare for allocating another r1_bio.
-		 */
+	if (max_sectors < r1_bio->sectors)
 		r1_bio->sectors = max_sectors;
-		spin_lock_irq(&conf->device_lock);
-		if (bio->bi_phys_segments == 0)
-			bio->bi_phys_segments = 2;
-		else
-			bio->bi_phys_segments++;
-		spin_unlock_irq(&conf->device_lock);
-	}
+
 	sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector;
 
 	atomic_set(&r1_bio->remaining, 1);
@@ -1553,10 +1544,17 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 	 * as it could result in the bio being freed.
 	 */
 	if (sectors_handled < bio_sectors(bio)) {
-		r1_bio_write_done(r1_bio);
-		/* We need another r1_bio.  It has already been counted
+		/* We need another r1_bio, which must be accounted
 		 * in bio->bi_phys_segments
 		 */
+		spin_lock_irq(&conf->device_lock);
+		if (bio->bi_phys_segments == 0)
+			bio->bi_phys_segments = 2;
+		else
+			bio->bi_phys_segments++;
+		spin_unlock_irq(&conf->device_lock);
+
+		r1_bio_write_done(r1_bio);
 		r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
 		goto retry_write;
 	}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b1b1f982a722..20029345f8b6 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1383,18 +1383,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 		goto retry_write;
 	}
 
-	if (max_sectors < r10_bio->sectors) {
-		/* We are splitting this into multiple parts, so
-		 * we need to prepare for allocating another r10_bio.
-		 */
+	if (max_sectors < r10_bio->sectors)
 		r10_bio->sectors = max_sectors;
-		spin_lock_irq(&conf->device_lock);
-		if (bio->bi_phys_segments == 0)
-			bio->bi_phys_segments = 2;
-		else
-			bio->bi_phys_segments++;
-		spin_unlock_irq(&conf->device_lock);
-	}
 	sectors_handled = r10_bio->sector + max_sectors -
 		bio->bi_iter.bi_sector;
 
@@ -1504,10 +1494,16 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	 */
 
 	if (sectors_handled < bio_sectors(bio)) {
-		one_write_done(r10_bio);
-		/* We need another r10_bio.  It has already been counted
+		/* We need another r10_bio and it needs to be counted
 		 * in bio->bi_phys_segments.
 		 */
+		spin_lock_irq(&conf->device_lock);
+		if (bio->bi_phys_segments == 0)
+			bio->bi_phys_segments = 2;
+		else
+			bio->bi_phys_segments++;
+		spin_unlock_irq(&conf->device_lock);
+		one_write_done(r10_bio);
 		r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
 		r10_bio->master_bio = bio;



^ permalink raw reply related

* [md PATCH 09/15] md/raid10: stop using bi_phys_segments
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148954692173.18641.1294690639716682540.stgit@noble>

raid10 currently repurposes bi_phys_segments on each
incoming bio to count how many r10bio was used to encode the
request.

We need to know when the number of attached r10bio reaches
zero to:
1/ call bio_endio() when all IO on the bio is finished
2/ decrement ->nr_pending so that resync IO can proceed.

Now that the bio has its own __bi_remaining counter, that
can be used instead. We can call bio_inc_remaining to
increment the counter and call bio_endio() every time an
r10bio completes, rather than only when bi_phys_segments
reaches zero.

This addresses point 1, but not point 2.  bio_endio()
doesn't (and cannot) report when the last r10bio has
finished, so a different approach is needed.

So: instead of counting bios in ->nr_pending, count r10bios.
i.e. every time we attach a bio, increment nr_pending.
Every time an r10bio completes, decrement nr_pending.

Normally we only increment nr_pending after first checking
that ->barrier is zero, or some other non-trivial tests and
possible waiting.  When attaching multiple r10bios to a bio,
we only need the tests and the waiting once.  After the
first increment, subsequent increments can happen
unconditionally as they are really all part of the one
request.

So introduce inc_pending() which can be used when we know
that nr_pending is already elevated.

Note that this fixes a bug.  freeze_array() contains the line
	atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
which implies that the units for ->nr_pending, ->nr_queued and extra
are the same.
->nr_queue and extra count r10_bios, but prior to this patch,
->nr_pending counted bios.  If a bio ever resulted in multiple
r10_bios (due to bad blocks), freeze_array() would not work correctly.
Now it does.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid10.c |   76 +++++++++++++++++----------------------------------
 1 file changed, 25 insertions(+), 51 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 20029345f8b6..288e39abae11 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -301,27 +301,18 @@ static void reschedule_retry(struct r10bio *r10_bio)
 static void raid_end_bio_io(struct r10bio *r10_bio)
 {
 	struct bio *bio = r10_bio->master_bio;
-	int done;
 	struct r10conf *conf = r10_bio->mddev->private;
 
-	if (bio->bi_phys_segments) {
-		unsigned long flags;
-		spin_lock_irqsave(&conf->device_lock, flags);
-		bio->bi_phys_segments--;
-		done = (bio->bi_phys_segments == 0);
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-	} else
-		done = 1;
 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
 		bio->bi_error = -EIO;
-	if (done) {
-		bio_endio(bio);
-		/*
-		 * Wake up any possible resync thread that waits for the device
-		 * to go idle.
-		 */
-		allow_barrier(conf);
-	}
+
+	bio_endio(bio);
+	/*
+	 * Wake up any possible resync thread that waits for the device
+	 * to go idle.
+	 */
+	allow_barrier(conf);
+
 	free_r10bio(r10_bio);
 }
 
@@ -984,6 +975,15 @@ static void wait_barrier(struct r10conf *conf)
 	spin_unlock_irq(&conf->resync_lock);
 }
 
+static void inc_pending(struct r10conf *conf)
+{
+	/* The current request requires multiple r10_bio, so
+	 * we need to increment the pending count.
+	 */
+	WARN_ON(!atomic_read(&conf->nr_pending));
+	atomic_inc(&conf->nr_pending);
+}
+
 static void allow_barrier(struct r10conf *conf)
 {
 	if ((atomic_dec_and_test(&conf->nr_pending)) ||
@@ -1161,12 +1161,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 		sectors_handled = (r10_bio->sector + max_sectors
 				   - bio->bi_iter.bi_sector);
 		r10_bio->sectors = max_sectors;
-		spin_lock_irq(&conf->device_lock);
-		if (bio->bi_phys_segments == 0)
-			bio->bi_phys_segments = 2;
-		else
-			bio->bi_phys_segments++;
-		spin_unlock_irq(&conf->device_lock);
+		inc_pending(conf);
+		bio_inc_remaining(bio);
 		/*
 		 * Cannot call generic_make_request directly as that will be
 		 * queued in __generic_make_request and subsequent
@@ -1261,9 +1257,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	 * on which we have seen a write error, we want to avoid
 	 * writing to those blocks.  This potentially requires several
 	 * writes to write around the bad blocks.  Each set of writes
-	 * gets its own r10_bio with a set of bios attached.  The number
-	 * of r10_bios is recored in bio->bi_phys_segments just as with
-	 * the read case.
+	 * gets its own r10_bio with a set of bios attached.
 	 */
 
 	r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
@@ -1494,15 +1488,9 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	 */
 
 	if (sectors_handled < bio_sectors(bio)) {
-		/* We need another r10_bio and it needs to be counted
-		 * in bio->bi_phys_segments.
-		 */
-		spin_lock_irq(&conf->device_lock);
-		if (bio->bi_phys_segments == 0)
-			bio->bi_phys_segments = 2;
-		else
-			bio->bi_phys_segments++;
-		spin_unlock_irq(&conf->device_lock);
+		/* We need another r10_bio and it needs to be counted */
+		inc_pending(conf);
+		bio_inc_remaining(bio);
 		one_write_done(r10_bio);
 		r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
@@ -1531,16 +1519,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 	r10_bio->sector = bio->bi_iter.bi_sector;
 	r10_bio->state = 0;
 
-	/*
-	 * We might need to issue multiple reads to different devices if there
-	 * are bad blocks around, so we keep track of the number of reads in
-	 * bio->bi_phys_segments.  If this is 0, there is only one r10_bio and
-	 * no locking will be needed when the request completes.  If it is
-	 * non-zero, then it is the number of not-completed requests.
-	 */
-	bio->bi_phys_segments = 0;
-	bio_clear_flag(bio, BIO_SEG_VALID);
-
 	if (bio_data_dir(bio) == READ)
 		raid10_read_request(mddev, bio, r10_bio);
 	else
@@ -2692,12 +2670,8 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 			r10_bio->sector + max_sectors
 			- mbio->bi_iter.bi_sector;
 		r10_bio->sectors = max_sectors;
-		spin_lock_irq(&conf->device_lock);
-		if (mbio->bi_phys_segments == 0)
-			mbio->bi_phys_segments = 2;
-		else
-			mbio->bi_phys_segments++;
-		spin_unlock_irq(&conf->device_lock);
+		bio_inc_remaining(mbio);
+		inc_pending(conf);
 		generic_make_request(bio);
 
 		r10_bio = mempool_alloc(conf->r10bio_pool,



^ permalink raw reply related

* [md PATCH 10/15] md/raid1: stop using bi_phys_segment
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148954692173.18641.1294690639716682540.stgit@noble>

Change to use bio->__bi_remaining to count number of r1bio attached
to a bio.
See precious raid10 patch for more details.

Like the raid10.c patch, this fixes a bug as nr_queued and nr_pending
used to measure different things, but were being compared.

This patch fixes another bug in that nr_pending previously did not
could write-behind requests, so behind writes could continue while
resync was happening.  How that nr_pending counts all r1_bio,
the resync cannot commence until the behind writes have completed.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid1.c |   87 +++++++++++++---------------------------------------
 1 file changed, 22 insertions(+), 65 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7e509a894f15..e566407b196f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -246,35 +246,18 @@ static void reschedule_retry(struct r1bio *r1_bio)
 static void call_bio_endio(struct r1bio *r1_bio)
 {
 	struct bio *bio = r1_bio->master_bio;
-	int done;
 	struct r1conf *conf = r1_bio->mddev->private;
 	sector_t bi_sector = bio->bi_iter.bi_sector;
 
-	if (bio->bi_phys_segments) {
-		unsigned long flags;
-		spin_lock_irqsave(&conf->device_lock, flags);
-		bio->bi_phys_segments--;
-		done = (bio->bi_phys_segments == 0);
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		/*
-		 * make_request() might be waiting for
-		 * bi_phys_segments to decrease
-		 */
-		wake_up(&conf->wait_barrier);
-	} else
-		done = 1;
-
 	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
 		bio->bi_error = -EIO;
 
-	if (done) {
-		bio_endio(bio);
-		/*
-		 * Wake up any possible resync thread that waits for the device
-		 * to go idle.
-		 */
-		allow_barrier(conf, bi_sector);
-	}
+	bio_endio(bio);
+	/*
+	 * Wake up any possible resync thread that waits for the device
+	 * to go idle.
+	 */
+	allow_barrier(conf, bi_sector);
 }
 
 static void raid_end_bio_io(struct r1bio *r1_bio)
@@ -977,6 +960,16 @@ static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
 	spin_unlock_irq(&conf->resync_lock);
 }
 
+static void inc_pending(struct r1conf *conf, sector_t bi_sector)
+{
+	/* The current request requires multiple r1_bio, so
+	 * we need to increment the pending count, and the corresponding
+	 * window count.
+	 */
+	int idx = sector_to_idx(bi_sector);
+	atomic_inc(&conf->nr_pending[idx]);
+}
+
 static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
 {
 	int idx = sector_to_idx(sector_nr);
@@ -1192,17 +1185,6 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio)
 	r1_bio = alloc_r1bio(mddev, bio, 0);
 
 	/*
-	 * We might need to issue multiple reads to different
-	 * devices if there are bad blocks around, so we keep
-	 * track of the number of reads in bio->bi_phys_segments.
-	 * If this is 0, there is only one r1_bio and no locking
-	 * will be needed when requests complete.  If it is
-	 * non-zero, then it is the number of not-completed requests.
-	 */
-	bio->bi_phys_segments = 0;
-	bio_clear_flag(bio, BIO_SEG_VALID);
-
-	/*
 	 * make_request() can abort the operation when read-ahead is being
 	 * used and no empty request is available.
 	 */
@@ -1257,12 +1239,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio)
 		sectors_handled = (r1_bio->sector + max_sectors
 				   - bio->bi_iter.bi_sector);
 		r1_bio->sectors = max_sectors;
-		spin_lock_irq(&conf->device_lock);
-		if (bio->bi_phys_segments == 0)
-			bio->bi_phys_segments = 2;
-		else
-			bio->bi_phys_segments++;
-		spin_unlock_irq(&conf->device_lock);
+		bio_inc_remaining(bio);
 
 		/*
 		 * Cannot call generic_make_request directly as that will be
@@ -1329,16 +1306,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 
 	r1_bio = alloc_r1bio(mddev, bio, 0);
 
-	/* We might need to issue multiple writes to different
-	 * devices if there are bad blocks around, so we keep
-	 * track of the number of writes in bio->bi_phys_segments.
-	 * If this is 0, there is only one r1_bio and no locking
-	 * will be needed when requests complete.  If it is
-	 * non-zero, then it is the number of not-completed requests.
-	 */
-	bio->bi_phys_segments = 0;
-	bio_clear_flag(bio, BIO_SEG_VALID);
-
 	if (conf->pending_count >= max_queued_requests) {
 		md_wakeup_thread(mddev->thread);
 		raid1_log(mddev, "wait queued");
@@ -1544,16 +1511,11 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio)
 	 * as it could result in the bio being freed.
 	 */
 	if (sectors_handled < bio_sectors(bio)) {
-		/* We need another r1_bio, which must be accounted
-		 * in bio->bi_phys_segments
-		 */
-		spin_lock_irq(&conf->device_lock);
-		if (bio->bi_phys_segments == 0)
-			bio->bi_phys_segments = 2;
-		else
-			bio->bi_phys_segments++;
-		spin_unlock_irq(&conf->device_lock);
+		/* We need another r1_bio, which must be counted */
+		sector_t sect = bio->bi_iter.bi_sector + sectors_handled;
 
+		inc_pending(conf, sect);
+		bio_inc_remaining(bio);
 		r1_bio_write_done(r1_bio);
 		r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
 		goto retry_write;
@@ -2573,12 +2535,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 			int sectors_handled = (r1_bio->sector + max_sectors
 					       - mbio->bi_iter.bi_sector);
 			r1_bio->sectors = max_sectors;
-			spin_lock_irq(&conf->device_lock);
-			if (mbio->bi_phys_segments == 0)
-				mbio->bi_phys_segments = 2;
-			else
-				mbio->bi_phys_segments++;
-			spin_unlock_irq(&conf->device_lock);
+			bio_inc_remaining(mbio);
 			trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
 					      bio, bio_dev, bio_sector);
 			generic_make_request(bio);



^ permalink raw reply related

* [md PATCH 11/15] md/raid5: don't test ->writes_pending in raid5_remove_disk
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148954692173.18641.1294690639716682540.stgit@noble>

This test on ->writes_pending cannot be safe as the counter
can be incremented at any moment and cannot be locked against.

Change it to test conf->active_stripes, which at least
can be locked against.  More changes are still needed.

A future patch will change ->writes_pending, and testing it here will
be very inconvenient.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5.c |    5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4e906e5903e8..5eb3764bafd3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7528,9 +7528,12 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 		/*
 		 * we can't wait pending write here, as this is called in
 		 * raid5d, wait will deadlock.
+		 * neilb: there is no locking about new writes here,
+		 * so this cannot be safe.
 		 */
-		if (atomic_read(&mddev->writes_pending))
+		if (atomic_read(&conf->active_stripes)) {
 			return -EBUSY;
+		}
 		log_exit(conf);
 		return 0;
 	}



^ permalink raw reply related

* [md PATCH 12/15] md: factor out set_in_sync()
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148954692173.18641.1294690639716682540.stgit@noble>

Three separate places in md.c check if the number of active
writes is zero and, if so, sets mddev->in_sync.

There are a few differences, but there shouldn't be:
- it is always appropriate to notify the change in
  sysfs_state, and there is no need to do this outside a
  spin-locked region.
- we never need to check ->recovery_cp.  The state of resync
  is not relevant for whether there are any pending writes
  or not (which is what ->in_sync reports).

So create set_in_sync() which does the correct tests and
makes the correct changes, and call this in all three
places.

Any behaviour changes here a minor and cosmetic.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c |   54 ++++++++++++++++++++----------------------------------
 1 file changed, 20 insertions(+), 34 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index bad5771bced4..2fa8048894e6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2252,6 +2252,21 @@ static void export_array(struct mddev *mddev)
 	mddev->major_version = 0;
 }
 
+static bool set_in_sync(struct mddev *mddev)
+{
+	WARN_ON_ONCE(!spin_is_locked(&mddev->lock));
+	if (atomic_read(&mddev->writes_pending) == 0) {
+		if (mddev->in_sync == 0) {
+			mddev->in_sync = 1;
+			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+			sysfs_notify_dirent_safe(mddev->sysfs_state);
+		}
+	}
+	if (mddev->safemode == 1)
+		mddev->safemode = 0;
+	return mddev->in_sync;
+}
+
 static void sync_sbs(struct mddev *mddev, int nospares)
 {
 	/* Update each superblock (in-memory image), but
@@ -4024,7 +4039,7 @@ static int restart_array(struct mddev *mddev);
 static ssize_t
 array_state_store(struct mddev *mddev, const char *buf, size_t len)
 {
-	int err;
+	int err = 0;
 	enum array_state st = match_word(buf, array_states);
 
 	if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
@@ -4037,18 +4052,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
 			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
 			md_wakeup_thread(mddev->thread);
 			wake_up(&mddev->sb_wait);
-			err = 0;
 		} else /* st == clean */ {
 			restart_array(mddev);
-			if (atomic_read(&mddev->writes_pending) == 0) {
-				if (mddev->in_sync == 0) {
-					mddev->in_sync = 1;
-					if (mddev->safemode == 1)
-						mddev->safemode = 0;
-					set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
-				}
-				err = 0;
-			} else
+			if (!set_in_sync(mddev))
 				err = -EBUSY;
 		}
 		if (!err)
@@ -4106,15 +4112,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
 			if (err)
 				break;
 			spin_lock(&mddev->lock);
-			if (atomic_read(&mddev->writes_pending) == 0) {
-				if (mddev->in_sync == 0) {
-					mddev->in_sync = 1;
-					if (mddev->safemode == 1)
-						mddev->safemode = 0;
-					set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
-				}
-				err = 0;
-			} else
+			if (!set_in_sync(mddev))
 				err = -EBUSY;
 			spin_unlock(&mddev->lock);
 		} else
@@ -8591,22 +8589,10 @@ void md_check_recovery(struct mddev *mddev)
 			}
 		}
 
-		if (!mddev->external) {
-			int did_change = 0;
+		if (!mddev->external && !mddev->in_sync) {
 			spin_lock(&mddev->lock);
-			if (mddev->safemode &&
-			    !atomic_read(&mddev->writes_pending) &&
-			    !mddev->in_sync &&
-			    mddev->recovery_cp == MaxSector) {
-				mddev->in_sync = 1;
-				did_change = 1;
-				set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
-			}
-			if (mddev->safemode == 1)
-				mddev->safemode = 0;
+			set_in_sync(mddev);
 			spin_unlock(&mddev->lock);
-			if (did_change)
-				sysfs_notify_dirent_safe(mddev->sysfs_state);
 		}
 
 		if (mddev->sb_flags)



^ permalink raw reply related

* [md PATCH 13/15] md: close a race with setting mddev->in_sync
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148954692173.18641.1294690639716682540.stgit@noble>

If ->in_sync is being set just as md_write_start() is being called,
it is possible that set_in_sync() won't see the elevated
->writes_pending, and md_write_start() won't see the set ->in_sync.

To close this race, re-test ->writes_pending after setting ->in_sync,
and add memory barriers to ensure the increment of ->writes_pending
will be seen by the time of this second test, or the new ->in_sync
will be seen by md_write_start().

Add a spinlock to array_state_show() to ensure this temporary
instability is never visible from userspace.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c |    7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2fa8048894e6..c33ec97b23d4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2258,6 +2258,10 @@ static bool set_in_sync(struct mddev *mddev)
 	if (atomic_read(&mddev->writes_pending) == 0) {
 		if (mddev->in_sync == 0) {
 			mddev->in_sync = 1;
+			smp_mb();
+			if (atomic_read(&mddev->writes_pending))
+				/* lost a race with md_write_start() */
+				mddev->in_sync = 0;
 			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
 			sysfs_notify_dirent_safe(mddev->sysfs_state);
 		}
@@ -4011,6 +4015,7 @@ array_state_show(struct mddev *mddev, char *page)
 			st = read_auto;
 			break;
 		case 0:
+			spin_lock(&mddev->lock);
 			if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
 				st = write_pending;
 			else if (mddev->in_sync)
@@ -4019,6 +4024,7 @@ array_state_show(struct mddev *mddev, char *page)
 				st = active_idle;
 			else
 				st = active;
+			spin_unlock(&mddev->lock);
 		}
 	else {
 		if (list_empty(&mddev->disks) &&
@@ -7894,6 +7900,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
 		did_change = 1;
 	}
 	atomic_inc(&mddev->writes_pending);
+	smp_mb(); /* Match smp_mb in set_in_sync() */
 	if (mddev->safemode == 1)
 		mddev->safemode = 0;
 	if (mddev->in_sync) {



^ permalink raw reply related

* [md PATCH 14/15] percpu-refcount: support synchronous switch to atomic mode.
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148954692173.18641.1294690639716682540.stgit@noble>

percpu_ref_switch_to_atomic_sync() schedules the switch
to atomic mode, then waits for it to complete.

Also export percpu_ref_switch_to_* so they can be used from modules.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/percpu-refcount.h |    1 +
 lib/percpu-refcount.c           |   18 ++++++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 3a481a49546e..c13dceb87b60 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -99,6 +99,7 @@ int __must_check percpu_ref_init(struct percpu_ref *ref,
 void percpu_ref_exit(struct percpu_ref *ref);
 void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
 				 percpu_ref_func_t *confirm_switch);
+void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref);
 void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
 void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
 				 percpu_ref_func_t *confirm_kill);
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 9ac959ef4cae..d133ed43a375 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -260,6 +260,23 @@ void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
 
 	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
 }
+EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic);
+
+/**
+ * percpu_ref_switch_to_atomic_sync - switch a percpu_ref to atomic mode
+ * @ref: percpu_ref to switch to atomic mode
+ *
+ * Schedule switching the ref to atomic mode, and wait for the
+ * switch to complete.  Caller must ensure that no other thread
+ * will switch back to percpu mode.
+ *
+ */
+void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref)
+{
+	percpu_ref_switch_to_atomic(ref, NULL);
+	wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
+}
+EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync);
 
 /**
  * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode
@@ -290,6 +307,7 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
 
 	spin_unlock_irqrestore(&percpu_ref_switch_lock, flags);
 }
+EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu);
 
 /**
  * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation



^ permalink raw reply related

* [md PATCH 15/15] MD: use per-cpu counter for writes_pending
From: NeilBrown @ 2017-03-15  3:05 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, hch
In-Reply-To: <148954692173.18641.1294690639716682540.stgit@noble>

The 'writes_pending' counter is used to determine when the
array is stable so that it can be marked in the superblock
as "Clean".  Consequently it needs to be updated frequently
but only checked for zero occasionally.  Recent changes to
raid5 cause the count to be updated even more often - once
per 4K rather than once per bio.  This provided
justification for making the updates more efficient.

So we replace the atomic counter a percpu-refcount.
This can be incremented and decremented cheaply most of the
time, and can be switched to "atomic" mode when more
precise counting is needed.  As it is possible for multiple
threads to want a precise count, we introduce a
"sync_checker" counter to count the number of threads
in "set_in_sync()", and only switch the refcount back
to percpu mode when that is zero.

We need to be careful about races between set_in_sync()
setting ->in_sync to 1, and md_write_start() setting it
to zero.  md_write_start() holds the rcu_read_lock()
while checking if the refcount is in percpu mode.  If
it is, then we know a switch to 'atomic' will not happen until
after we call rcu_read_unlock(), in which case set_in_sync()
will see the elevated count, and not set in_sync to 1.
If it is not in percpu mode, we take the mddev->lock to
ensure proper synchronization.

It is no longer possible to quickly check if the count is zero, which
we previously did to update a timer or to schedule the md_thread.
So now we do these every time we decrement that counter, but make
sure they are fast.

mod_timer() already optimizes the case where the timeout value doesn't
actually change.  We leverage that further by always rounding off the
jiffies to the timeout value.  This may delay the marking of 'clean'
slightly, but ensure we only perform atomic operation here when absolutely
needed.

md_wakeup_thread() current always calls wake_up(), even if
THREAD_WAKEUP is already set.  That too can be optimised to avoid
calls to wake_up().

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c |   70 +++++++++++++++++++++++++++++++++++++------------------
 drivers/md/md.h |    3 ++
 2 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index c33ec97b23d4..adf2b5bdfd67 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -65,6 +65,8 @@
 #include <linux/raid/md_p.h>
 #include <linux/raid/md_u.h>
 #include <linux/slab.h>
+#include <linux/percpu-refcount.h>
+
 #include <trace/events/block.h>
 #include "md.h"
 #include "bitmap.h"
@@ -2255,16 +2257,24 @@ static void export_array(struct mddev *mddev)
 static bool set_in_sync(struct mddev *mddev)
 {
 	WARN_ON_ONCE(!spin_is_locked(&mddev->lock));
-	if (atomic_read(&mddev->writes_pending) == 0) {
-		if (mddev->in_sync == 0) {
+	if (!mddev->in_sync) {
+		mddev->sync_checkers++;
+		spin_unlock(&mddev->lock);
+		percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
+		spin_lock(&mddev->lock);
+		if (!mddev->in_sync &&
+		    percpu_ref_is_zero(&mddev->writes_pending)) {
 			mddev->in_sync = 1;
+			/*
+			 * Ensure in_sync is visible before switch back
+			 * to percpu
+			 */
 			smp_mb();
-			if (atomic_read(&mddev->writes_pending))
-				/* lost a race with md_write_start() */
-				mddev->in_sync = 0;
 			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
 			sysfs_notify_dirent_safe(mddev->sysfs_state);
 		}
+		if (--mddev->sync_checkers == 0)
+			percpu_ref_switch_to_percpu(&mddev->writes_pending);
 	}
 	if (mddev->safemode == 1)
 		mddev->safemode = 0;
@@ -5120,6 +5130,7 @@ static void md_free(struct kobject *ko)
 		del_gendisk(mddev->gendisk);
 		put_disk(mddev->gendisk);
 	}
+	percpu_ref_exit(&mddev->writes_pending);
 
 	kfree(mddev);
 }
@@ -5145,6 +5156,8 @@ static void mddev_delayed_delete(struct work_struct *ws)
 	kobject_put(&mddev->kobj);
 }
 
+static void no_op(struct percpu_ref *r) {}
+
 static int md_alloc(dev_t dev, char *name)
 {
 	static DEFINE_MUTEX(disks_mutex);
@@ -5196,6 +5209,10 @@ static int md_alloc(dev_t dev, char *name)
 	blk_queue_make_request(mddev->queue, md_make_request);
 	blk_set_stacking_limits(&mddev->queue->limits);
 
+	if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0)
+		goto abort;
+	/* We want to start with the refcount at zero */
+	percpu_ref_put(&mddev->writes_pending);
 	disk = alloc_disk(1 << shift);
 	if (!disk) {
 		blk_cleanup_queue(mddev->queue);
@@ -5279,11 +5296,10 @@ static void md_safemode_timeout(unsigned long data)
 {
 	struct mddev *mddev = (struct mddev *) data;
 
-	if (!atomic_read(&mddev->writes_pending)) {
-		mddev->safemode = 1;
-		if (mddev->external)
-			sysfs_notify_dirent_safe(mddev->sysfs_state);
-	}
+	mddev->safemode = 1;
+	if (mddev->external)
+		sysfs_notify_dirent_safe(mddev->sysfs_state);
+
 	md_wakeup_thread(mddev->thread);
 }
 
@@ -5488,7 +5504,6 @@ int md_run(struct mddev *mddev)
 	} else if (mddev->ro == 2) /* auto-readonly not meaningful */
 		mddev->ro = 0;
 
-	atomic_set(&mddev->writes_pending,0);
 	atomic_set(&mddev->max_corr_read_errors,
 		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
 	mddev->safemode = 0;
@@ -7351,8 +7366,8 @@ void md_wakeup_thread(struct md_thread *thread)
 {
 	if (thread) {
 		pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
-		set_bit(THREAD_WAKEUP, &thread->flags);
-		wake_up(&thread->wqueue);
+		if (!test_and_set_bit(THREAD_WAKEUP, &thread->flags))
+			wake_up(&thread->wqueue);
 	}
 }
 EXPORT_SYMBOL(md_wakeup_thread);
@@ -7886,6 +7901,7 @@ EXPORT_SYMBOL(md_done_sync);
  */
 void md_write_start(struct mddev *mddev, struct bio *bi)
 {
+	unsigned long __percpu *notused;
 	int did_change = 0;
 	if (bio_data_dir(bi) != WRITE)
 		return;
@@ -7899,11 +7915,12 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
 		md_wakeup_thread(mddev->sync_thread);
 		did_change = 1;
 	}
-	atomic_inc(&mddev->writes_pending);
+	rcu_read_lock();
+	percpu_ref_get(&mddev->writes_pending);
 	smp_mb(); /* Match smp_mb in set_in_sync() */
 	if (mddev->safemode == 1)
 		mddev->safemode = 0;
-	if (mddev->in_sync) {
+	if (mddev->in_sync || !__ref_is_percpu(&mddev->writes_pending, &notused)) {
 		spin_lock(&mddev->lock);
 		if (mddev->in_sync) {
 			mddev->in_sync = 0;
@@ -7914,6 +7931,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
 		}
 		spin_unlock(&mddev->lock);
 	}
+	rcu_read_unlock();
 	if (did_change)
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
 	wait_event(mddev->sb_wait,
@@ -7934,19 +7952,25 @@ void md_write_inc(struct mddev *mddev, struct bio *bi)
 	if (bio_data_dir(bi) != WRITE)
 		return;
 	WARN_ON_ONCE(mddev->in_sync || mddev->ro);
-	atomic_inc(&mddev->writes_pending);
+	percpu_ref_get(&mddev->writes_pending);
 }
 EXPORT_SYMBOL(md_write_inc);
 
 void md_write_end(struct mddev *mddev)
 {
-	if (atomic_dec_and_test(&mddev->writes_pending)) {
-		if (mddev->safemode == 2)
-			md_wakeup_thread(mddev->thread);
-		else if (mddev->safemode_delay)
-			mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
-	}
+	percpu_ref_put(&mddev->writes_pending);
+
+	if (mddev->safemode == 2)
+		md_wakeup_thread(mddev->thread);
+	else if (mddev->safemode_delay)
+		/* The roundup() ensures this only performs locking once
+		 * every ->safemode_delay jiffies
+		 */
+		mod_timer(&mddev->safemode_timer,
+			  roundup(jiffies, mddev->safemode_delay) +
+			  mddev->safemode_delay);
 }
+
 EXPORT_SYMBOL(md_write_end);
 
 /* md_allow_write(mddev)
@@ -8547,7 +8571,7 @@ void md_check_recovery(struct mddev *mddev)
 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
 		(mddev->external == 0 && mddev->safemode == 1) ||
-		(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
+		(mddev->safemode == 2
 		 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
 		))
 		return;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 0cd12721a536..7a7847d1cc39 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -409,7 +409,8 @@ struct mddev {
 							 */
 	unsigned int			safemode_delay;
 	struct timer_list		safemode_timer;
-	atomic_t			writes_pending;
+	struct percpu_ref		writes_pending;
+	int				sync_checkers;	/* # of threads checking writes_pending */
 	struct request_queue		*queue;	/* for plugging ... */
 
 	struct bitmap			*bitmap; /* the bitmap for the device */



^ permalink raw reply related

* Re: [PATCH v6 0/4] Broadcom SBA RAID support
From: Anup Patel @ 2017-03-15  8:02 UTC (permalink / raw)
  To: Dan Williams
  Cc: Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, BCM Kernel Feedback, dmaengine@vger.kernel.org,
	Device Tree, linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-crypto, linux-raid,
	Shaohua Li
In-Reply-To: <CAPcyv4hGGbhF-g8B__9=rsP+QwR6RBi=WqALLgABNb1M4D3rBw@mail.gmail.com>

On Tue, Mar 14, 2017 at 10:26 PM, Dan Williams <dan.j.williams@intel.com> wrote:
> On Mon, Mar 6, 2017 at 1:43 AM, Anup Patel <anup.patel@broadcom.com> wrote:
>> The Broadcom SBA RAID is a stream-based device which provides
>> RAID5/6 offload.
>>
>> It requires a SoC specific ring manager (such as Broadcom FlexRM
>> ring manager) to provide ring-based programming interface. Due to
>> this, the Broadcom SBA RAID driver (mailbox client) implements
>> DMA device having one DMA channel using a set of mailbox channels
>> provided by Broadcom SoC specific ring manager driver (mailbox
>> controller).
>>
>> The Broadcom SBA RAID hardware requires PQ disk position instead
>> of PQ disk coefficient. To address this, we have added raid_gflog
>> table which will help driver to convert PQ disk coefficient to PQ
>> disk position.
>>
>> This patchset is based on Linux-4.11-rc1 and depends on patchset
>> "[PATCH v5 0/2] Broadcom FlexRM ring manager support"
>>
>> It is also available at sba-raid-v6 branch of
>> https://github.com/Broadcom/arm64-linux.git
>>
> [..]
>>
>> Anup Patel (4):
>>   lib/raid6: Add log-of-2 table for RAID6 HW requiring disk position
>>   async_tx: Fix DMA_PREP_FENCE usage in do_async_gen_syndrome()
>>   dmaengine: Add Broadcom SBA RAID driver
>>   dt-bindings: Add DT bindings document for Broadcom SBA RAID driver
>
> For the dmaengine and async_tx changes:
>
> Acked-by: Dan Williams <dan.j.williams@intel.com>
>

Thanks Dan ...

Regards,
Anup

^ permalink raw reply

* Re: [PATCH v6 0/4] Broadcom SBA RAID support
From: Anup Patel @ 2017-03-15  8:02 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Dan Williams, Vinod Koul, Rob Herring, Mark Rutland, Herbert Xu,
	David S . Miller, Jassi Brar, Ray Jui, Scott Branden, Jon Mason,
	Rob Rice, BCM Kernel Feedback, dmaengine@vger.kernel.org,
	Device Tree, linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, linux-crypto
In-Reply-To: <20170314184806.o2oykhztje3obwot@kernel.org>

On Wed, Mar 15, 2017 at 12:18 AM, Shaohua Li <shli@kernel.org> wrote:
> On Tue, Mar 14, 2017 at 09:56:35AM -0700, Dan Williams wrote:
>> On Mon, Mar 6, 2017 at 1:43 AM, Anup Patel <anup.patel@broadcom.com> wrote:
>> > The Broadcom SBA RAID is a stream-based device which provides
>> > RAID5/6 offload.
>> >
>> > It requires a SoC specific ring manager (such as Broadcom FlexRM
>> > ring manager) to provide ring-based programming interface. Due to
>> > this, the Broadcom SBA RAID driver (mailbox client) implements
>> > DMA device having one DMA channel using a set of mailbox channels
>> > provided by Broadcom SoC specific ring manager driver (mailbox
>> > controller).
>> >
>> > The Broadcom SBA RAID hardware requires PQ disk position instead
>> > of PQ disk coefficient. To address this, we have added raid_gflog
>> > table which will help driver to convert PQ disk coefficient to PQ
>> > disk position.
>> >
>> > This patchset is based on Linux-4.11-rc1 and depends on patchset
>> > "[PATCH v5 0/2] Broadcom FlexRM ring manager support"
>> >
>> > It is also available at sba-raid-v6 branch of
>> > https://github.com/Broadcom/arm64-linux.git
>> >
>> [..]
>> >
>> > Anup Patel (4):
>> >   lib/raid6: Add log-of-2 table for RAID6 HW requiring disk position
>> >   async_tx: Fix DMA_PREP_FENCE usage in do_async_gen_syndrome()
>> >   dmaengine: Add Broadcom SBA RAID driver
>> >   dt-bindings: Add DT bindings document for Broadcom SBA RAID driver
>>
>> For the dmaengine and async_tx changes:
>>
>> Acked-by: Dan Williams <dan.j.williams@intel.com>
>>
>> The raid change should get an ack from Shaohua.
>
> For the raid6 part:
>
> Acked-by: Shaohua Li <shli@fb.com>

Thanks Shaohua ...

Regards,
Anup

^ permalink raw reply

* [PATCH] md:fix a trivial typo in comments
From: Zhilong Liu @ 2017-03-15  8:14 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Zhilong Liu

bitmap.c: fix a trivial typo in comments of bitmap_read_sb().

Signed-off-by: Zhilong Liu <zlliu@suse.com>
---
 drivers/md/bitmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 9fb2cca..af9e0f0 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -696,7 +696,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 
 out:
 	kunmap_atomic(sb);
-	/* Assiging chunksize is required for "re_read" */
+	/* Assigning chunksize is required for "re_read" */
 	bitmap->mddev->bitmap_info.chunksize = chunksize;
 	if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
 		err = md_setup_cluster(bitmap->mddev, nodes);
-- 
2.6.6


^ permalink raw reply related

* [PATCH 1/2] md/r5cache: improve add-journal
From: Song Liu @ 2017-03-15 18:28 UTC (permalink / raw)
  To: linux-raid
  Cc: shli, neilb, kernel-team, dan.j.williams, hch, jes.sorensen,
	Song Liu

1. suspend the array before adding journal, so that we can add journal
   when the array is not read-only;
2. allow recreate journal when existing journal is Faulty. So that we can
   add-journal before removing failed journal.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/md.c    | 5 +++--
 drivers/md/raid5.c | 6 ++----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 42e68b2..ac3bd15 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6230,9 +6230,10 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 			struct md_rdev *rdev2;
 			bool has_journal = false;
 
-			/* make sure no existing journal disk */
+			/* make sure no active journal disk */
 			rdev_for_each(rdev2, mddev) {
-				if (test_bit(Journal, &rdev2->flags)) {
+				if (test_bit(Journal, &rdev2->flags) &&
+				    !test_bit(Faulty, &rdev2->flags)) {
 					has_journal = true;
 					break;
 				}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 447d9dd..ee8648b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7758,11 +7758,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 			return -EBUSY;
 
 		rdev->raid_disk = 0;
-		/*
-		 * The array is in readonly mode if journal is missing, so no
-		 * write requests running. We should be safe
-		 */
+		mddev_suspend(mddev);
 		log_init(conf, rdev);
+		mddev_resume(mddev);
 		return 0;
 	}
 	if (mddev->recovery_disabled == conf->recovery_disabled)
-- 
2.9.3


^ permalink raw reply related

* [PATCH 2/2] md/r5cache: journal remove support
From: Song Liu @ 2017-03-15 18:28 UTC (permalink / raw)
  To: linux-raid
  Cc: shli, neilb, kernel-team, dan.j.williams, hch, jes.sorensen,
	Song Liu
In-Reply-To: <20170315182816.3627123-1-songliubraving@fb.com>

When journal device of an array fails, the array is forced into read-only
mode. To make the array normal without adding another journal device, we
need to remove journal _feature_ from the array.

This patch allows remove journal _feature_ from an array, For journal
remove to work, existing journal should be either missing or faulty.

Two flags are added to GET_ARRAY_INFO for mdadm.
  1. MD_SB_HAS_JOURNAL: meaning the array have journal feature;
  2. MD_SB_JOURNAL_REMOVABLE: meaning the journal is faulty or missing

When both flags are set, mdadm can clear MD_SB_HAS_JOURNAL to remove
journal _feature_.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/md.c                | 42 ++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/raid/md_p.h | 11 ++++++++---
 2 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index ac3bd15..32ee994 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5981,6 +5981,25 @@ static void autorun_devices(int part)
 }
 #endif /* !MODULE */
 
+/*
+ * the journal _feature_ is removable when:
+ * the array has journal support &&
+ *    (journal is missing || journal is faulty)
+ */
+static bool journal_removable(struct mddev *mddev)
+{
+	struct md_rdev *rdev;
+
+	if (!test_bit(MD_HAS_JOURNAL, &mddev->flags))
+		return false;
+
+	rdev_for_each_rcu(rdev, mddev)
+		if (test_bit(Journal, &rdev->flags) &&
+		    !test_bit(Faulty, &rdev->flags))
+		    return false;
+	return true;
+}
+
 static int get_version(void __user *arg)
 {
 	mdu_version_t ver;
@@ -6041,6 +6060,10 @@ static int get_array_info(struct mddev *mddev, void __user *arg)
 		info.state |= (1<<MD_SB_BITMAP_PRESENT);
 	if (mddev_is_clustered(mddev))
 		info.state |= (1<<MD_SB_CLUSTERED);
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
+		info.state |= (1<<MD_SB_HAS_JOURNAL);
+	if (journal_removable(mddev))
+		info.state |= (1<<MD_SB_JOURNAL_REMOVABLE);
 	info.active_disks  = insync;
 	info.working_disks = working;
 	info.failed_disks  = failed;
@@ -6721,6 +6744,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 	/* calculate expected state,ignoring low bits */
 	if (mddev->bitmap && mddev->bitmap_info.offset)
 		state |= (1 << MD_SB_BITMAP_PRESENT);
+	if (journal_removable(mddev))
+		state |= (1 << MD_SB_JOURNAL_REMOVABLE);
 
 	if (mddev->major_version != info->major_version ||
 	    mddev->minor_version != info->minor_version ||
@@ -6730,8 +6755,11 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 /*	    mddev->layout        != info->layout        || */
 	    mddev->persistent	 != !info->not_persistent ||
 	    mddev->chunk_sectors != info->chunk_size >> 9 ||
-	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
-	    ((state^info->state) & 0xfffffe00)
+	    /*
+	     * ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT
+	     * and SB_HAS_JOURNAL to change
+	     */
+	    ((state^info->state) & 0xfffffc00)
 		)
 		return -EINVAL;
 	/* Check there is only one change */
@@ -6743,6 +6771,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 		cnt++;
 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
 		cnt++;
+	if ((state ^ info->state) & (1<<MD_SB_HAS_JOURNAL))
+		cnt++;
 	if (cnt == 0)
 		return 0;
 	if (cnt > 1)
@@ -6831,6 +6861,14 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 			mddev->bitmap_info.offset = 0;
 		}
 	}
+	if ((state ^ info->state) & (1<<MD_SB_HAS_JOURNAL)) {
+		if (!journal_removable(mddev)) {
+			rv = -EINVAL;
+			goto err;
+		}
+		clear_bit(MD_HAS_JOURNAL,  &mddev->flags);
+	}
+
 	md_update_sb(mddev, 1);
 	return rv;
 err:
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index d9a1ead..b1f2b63 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -1,15 +1,15 @@
 /*
    md_p.h : physical layout of Linux RAID devices
           Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
-	  
+
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2, or (at your option)
    any later version.
-   
+
    You should have received a copy of the GNU General Public License
    (for example /usr/src/linux/COPYING); if not, write to the Free
-   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
+   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
 #ifndef _MD_P_H
@@ -119,6 +119,11 @@ typedef struct mdp_device_descriptor_s {
 
 #define	MD_SB_CLUSTERED		5 /* MD is clustered */
 #define	MD_SB_BITMAP_PRESENT	8 /* bitmap may be present nearby */
+#define	MD_SB_HAS_JOURNAL	9
+#define	MD_SB_JOURNAL_REMOVABLE	10 /* journal _feature_ can be removed,
+				    * which means the journal is either
+				    * missing or Faulty
+				    */
 
 /*
  * Notes:
-- 
2.9.3


^ permalink raw reply related

* [PATCH 1/2] mdadm/r5cache: allow adding journal to array without journal
From: Song Liu @ 2017-03-15 18:28 UTC (permalink / raw)
  To: linux-raid
  Cc: shli, neilb, kernel-team, dan.j.williams, hch, jes.sorensen,
	Song Liu

We use to apply constraints to --add-journal:
  1. We only support recreate journal for arrays with missing journal;
  2. The array need to be readonly when the journal is added.

As the kernel code getting more mature, these constraints are no longer
necessary. This patch removes this constraints from mdadm.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 Manage.c   | 29 ++++-------------------------
 md_p.h     |  5 +++++
 mdadm.8.in |  7 ++-----
 3 files changed, 11 insertions(+), 30 deletions(-)

diff --git a/Manage.c b/Manage.c
index 5c3d2b9..289b7ce 100644
--- a/Manage.c
+++ b/Manage.c
@@ -946,28 +946,13 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
 
 	/* only add journal to array that supports journaling */
 	if (dv->disposition == 'j') {
-		struct mdinfo mdi;
-		struct mdinfo *mdp;
-
-		mdp = sysfs_read(fd, NULL, GET_ARRAY_STATE);
-		if (!mdp) {
-			pr_err("%s unable to read array state.\n", devname);
-			return -1;
-		}
-
-		if (strncmp(mdp->sysfs_array_state, "readonly", 8) != 0) {
-			sysfs_free(mdp);
-			pr_err("%s is not readonly, cannot add journal.\n", devname);
+		if ((array->state & (1<<MD_SB_HAS_JOURNAL)) &&
+		    !(array->state & (1<<MD_SB_JOURNAL_REMOVABLE))) {
+			pr_err("%s has a working journal. Hot spare journal device is not supported.\n",
+			       devname);
 			return -1;
 		}
 
-		sysfs_free(mdp);
-
-		tst->ss->getinfo_super(tst, &mdi, NULL);
-		if (mdi.journal_device_required == 0) {
-			pr_err("%s does not support journal device.\n", devname);
-			return -1;
-		}
 		disc.raid_disk = 0;
 	}
 
@@ -1097,12 +1082,6 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
 				       dv->devname, j, strerror(errno));
 			return -1;
 		}
-		if (dv->disposition == 'j') {
-			pr_err("Journal added successfully, making %s read-write\n", devname);
-			if (Manage_ro(devname, fd, -1))
-				pr_err("Failed to make %s read-write\n", devname);
-		}
-
 	}
 	if (verbose >= 0)
 		pr_err("added %s\n", dv->devname);
diff --git a/md_p.h b/md_p.h
index dc9fec1..7081bd9 100644
--- a/md_p.h
+++ b/md_p.h
@@ -121,6 +121,11 @@ typedef struct mdp_device_descriptor_s {
 				   * in container can be activated */
 #define MD_SB_CLUSTERED		5 /* MD is clustered  */
 #define	MD_SB_BITMAP_PRESENT	8 /* bitmap may be present nearby */
+#define	MD_SB_HAS_JOURNAL	9
+#define	MD_SB_JOURNAL_REMOVABLE	10 /* journal _feature_ can be removed,
+				    * which means the journal is either
+				    * missing or Faulty
+				    */
 
 typedef struct mdp_superblock_s {
 	/*
diff --git a/mdadm.8.in b/mdadm.8.in
index df1d460..087f679 100644
--- a/mdadm.8.in
+++ b/mdadm.8.in
@@ -1474,11 +1474,8 @@ the device is found or <slot>:missing in case the device is not found.
 
 .TP
 .BR \-\-add-journal
-Recreate journal for RAID-4/5/6 array that lost a journal device. In the
-current implementation, this command cannot add a journal to an array
-that had a failed journal. To avoid interrupting on-going write opertions,
-.B \-\-add-journal
-only works for array in Read-Only state.
+Add journal to an existing array, or recreate journal for RAID-4/5/6 array
+that lost a journal device.
 
 .TP
 .BR \-\-failfast
-- 
2.9.3


^ permalink raw reply related

* [PATCH 2/2] mdadm: remove journal with "remove-journal"
From: Song Liu @ 2017-03-15 18:28 UTC (permalink / raw)
  To: linux-raid
  Cc: shli, neilb, kernel-team, dan.j.williams, hch, jes.sorensen,
	Song Liu
In-Reply-To: <20170315182850.3628010-1-songliubraving@fb.com>

When journal device of an array fails, the array is forced into read-only
mode. To make the array normal without adding another journal device, we
need to remove journal _feature_ from the array.

This patch is the mdadm part of removing journal feature for an array with
failed or missing journal.

Two flags are added to GET_ARRAY_INFO:
  1. MD_SB_HAS_JOURNAL: meaning the array have journal feature;
  2. MD_SB_JOURNAL_REMOVABLE: meaning the journal is faulty or missing

When both flags are set, mdadm will use ioctl SET_ARRAY_INFO to clear
MD_SB_HAS_JOURNAL, and thus remove journal feature from the array.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 Manage.c | 26 ++++++++++++++++++++++++++
 ReadMe.c |  1 +
 mdadm.c  |  9 ++++++++-
 mdadm.h  |  3 +++
 4 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/Manage.c b/Manage.c
index 289b7ce..26c7dca 100644
--- a/Manage.c
+++ b/Manage.c
@@ -189,6 +189,32 @@ int Manage_run(char *devname, int fd, struct context *c)
 	return IncrementalScan(c, nm);
 }
 
+int Manage_remove_journal(char *devname, int fd, int verbose)
+{
+	mdu_array_info_t array;
+
+	if (ioctl(fd, GET_ARRAY_INFO, &array) != 0)
+		return 1;
+
+	if (!(array.state & (1<<MD_SB_HAS_JOURNAL))) {
+		pr_err("%s does not have journal. No need to remove.\n",
+		       devname);
+		return 1;
+	} else if (!(array.state & (1<<MD_SB_JOURNAL_REMOVABLE))) {
+		pr_err("Cannot remove active journal. Please fail it and try again.\n");
+		return 1;
+	}
+
+	array.state &= ~(1<<MD_SB_HAS_JOURNAL);
+
+	if (ioctl(fd, SET_ARRAY_INFO, &array)) {
+		pr_err("Failed to add remove journal feature.\n");
+		return 1;
+	}
+
+	return 0;
+}
+
 int Manage_stop(char *devname, int fd, int verbose, int will_retry)
 {
 	/* Stop the array.  Array must already be configured
diff --git a/ReadMe.c b/ReadMe.c
index 50d3807..acc4180 100644
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -164,6 +164,7 @@ struct option long_options[] = {
     {"add-spare", 0, 0, AddSpare},
     {"add-journal", 0, 0, AddJournal},
     {"remove",    0, 0, Remove},
+    {"remove-journal",    0, 0, RemoveJournal},
     {"fail",      0, 0, Fail},
     {"set-faulty",0, 0, Fail},
     {"replace",   0, 0, Replace},
diff --git a/mdadm.c b/mdadm.c
index d6ad8dc..3ad413f 100644
--- a/mdadm.c
+++ b/mdadm.c
@@ -194,6 +194,7 @@ int main(int argc, char *argv[])
 		case AddJournal:
 		case 'r':
 		case Remove:
+		case RemoveJournal:
 		case Replace:
 		case With:
 		case 'f':
@@ -994,6 +995,9 @@ int main(int argc, char *argv[])
 			}
 			c.runstop = -1;
 			continue;
+		case O(MANAGE,RemoveJournal):
+			c.remove_journal = 1;
+			continue;
 		case O(MANAGE,'t'):
 			c.test = 1;
 			continue;
@@ -1387,6 +1391,9 @@ int main(int argc, char *argv[])
 			rv = Manage_run(devlist->devname, mdfd, &c);
 		if (!rv && c.runstop < 0)
 			rv = Manage_stop(devlist->devname, mdfd, c.verbose, 0);
+		if (!rv && c.remove_journal > 0)
+			rv = Manage_remove_journal(devlist->devname, mdfd,
+						   c.verbose);
 		break;
 	case ASSEMBLE:
 		if (devs_found == 1 && ident.uuid_set == 0 &&
@@ -1906,7 +1913,7 @@ static int misc_list(struct mddev_dev *devlist,
 				mdfd = open_dev(dv->devname);
 				if (mdfd >= 0) break;
 			case 1:
-				mdfd = open_mddev(dv->devname, 1);  
+				mdfd = open_mddev(dv->devname, 1);
 		}
 		if (mdfd>=0) {
 			switch(dv->disposition) {
diff --git a/mdadm.h b/mdadm.h
index 71b8afb..e5acc49 100644
--- a/mdadm.h
+++ b/mdadm.h
@@ -390,6 +390,7 @@ enum special_options {
 	AddSpare,
 	AddJournal,
 	Remove,
+	RemoveJournal,
 	Fail,
 	Replace,
 	With,
@@ -512,6 +513,7 @@ struct context {
 	char	*action;
 	int	nodes;
 	char	*homecluster;
+	int	remove_journal;
 };
 
 struct shape {
@@ -1293,6 +1295,7 @@ extern int Manage_ro(char *devname, int fd, int readonly);
 extern int Manage_run(char *devname, int fd, struct context *c);
 extern int Manage_stop(char *devname, int fd, int quiet,
 		       int will_retry);
+extern int Manage_remove_journal(char *devname, int fd, int verbose);
 extern int Manage_subdevs(char *devname, int fd,
 			  struct mddev_dev *devlist, int verbose, int test,
 			  char *update, int force);
-- 
2.9.3


^ permalink raw reply related

* [PATCH v2] md/r5cache: journal remove support
From: Song Liu @ 2017-03-15 21:15 UTC (permalink / raw)
  To: linux-raid
  Cc: shli, neilb, kernel-team, dan.j.williams, hch, jes.sorensen,
	Song Liu

When journal device of an array fails, the array is forced into read-only
mode. To make the array normal without adding another journal device, we
need to remove journal _feature_ from the array.

This patch allows remove journal _feature_ from an array, For journal
remove to work, existing journal should be either missing or faulty.

Two flags are added to GET_ARRAY_INFO for mdadm.
  1. MD_SB_HAS_JOURNAL: meaning the array have journal feature;
  2. MD_SB_JOURNAL_REMOVABLE: meaning the journal is faulty or missing

When both flags are set, mdadm can clear MD_SB_HAS_JOURNAL to remove
journal _feature_.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/md.c                | 44 ++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/raid/md_p.h | 11 ++++++++---
 2 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index ac3bd15..14bf4c3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5981,6 +5981,25 @@ static void autorun_devices(int part)
 }
 #endif /* !MODULE */
 
+/*
+ * the journal _feature_ is removable when:
+ * the array has journal support &&
+ *    (journal is missing || journal is faulty)
+ */
+static bool journal_removable(struct mddev *mddev)
+{
+	struct md_rdev *rdev;
+
+	if (!test_bit(MD_HAS_JOURNAL, &mddev->flags))
+		return false;
+
+	rdev_for_each_rcu(rdev, mddev)
+		if (test_bit(Journal, &rdev->flags) &&
+		    !test_bit(Faulty, &rdev->flags))
+		    return false;
+	return true;
+}
+
 static int get_version(void __user *arg)
 {
 	mdu_version_t ver;
@@ -6041,6 +6060,10 @@ static int get_array_info(struct mddev *mddev, void __user *arg)
 		info.state |= (1<<MD_SB_BITMAP_PRESENT);
 	if (mddev_is_clustered(mddev))
 		info.state |= (1<<MD_SB_CLUSTERED);
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
+		info.state |= (1<<MD_SB_HAS_JOURNAL);
+	if (journal_removable(mddev))
+		info.state |= (1<<MD_SB_JOURNAL_REMOVABLE);
 	info.active_disks  = insync;
 	info.working_disks = working;
 	info.failed_disks  = failed;
@@ -6721,6 +6744,10 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 	/* calculate expected state,ignoring low bits */
 	if (mddev->bitmap && mddev->bitmap_info.offset)
 		state |= (1 << MD_SB_BITMAP_PRESENT);
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
+		state |= (1 << MD_SB_HAS_JOURNAL);
+	if (journal_removable(mddev))
+		state |= (1 << MD_SB_JOURNAL_REMOVABLE);
 
 	if (mddev->major_version != info->major_version ||
 	    mddev->minor_version != info->minor_version ||
@@ -6730,8 +6757,11 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 /*	    mddev->layout        != info->layout        || */
 	    mddev->persistent	 != !info->not_persistent ||
 	    mddev->chunk_sectors != info->chunk_size >> 9 ||
-	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
-	    ((state^info->state) & 0xfffffe00)
+	    /*
+	     * ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT
+	     * and SB_HAS_JOURNAL to change
+	     */
+	    ((state^info->state) & 0xfffffc00)
 		)
 		return -EINVAL;
 	/* Check there is only one change */
@@ -6743,6 +6773,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 		cnt++;
 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
 		cnt++;
+	if ((state ^ info->state) & (1<<MD_SB_HAS_JOURNAL))
+		cnt++;
 	if (cnt == 0)
 		return 0;
 	if (cnt > 1)
@@ -6831,6 +6863,14 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 			mddev->bitmap_info.offset = 0;
 		}
 	}
+	if ((state ^ info->state) & (1<<MD_SB_HAS_JOURNAL)) {
+		if (!journal_removable(mddev)) {
+			rv = -EINVAL;
+			goto err;
+		}
+		clear_bit(MD_HAS_JOURNAL,  &mddev->flags);
+	}
+
 	md_update_sb(mddev, 1);
 	return rv;
 err:
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index d9a1ead..b1f2b63 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -1,15 +1,15 @@
 /*
    md_p.h : physical layout of Linux RAID devices
           Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
-	  
+
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2, or (at your option)
    any later version.
-   
+
    You should have received a copy of the GNU General Public License
    (for example /usr/src/linux/COPYING); if not, write to the Free
-   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  
+   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
 
 #ifndef _MD_P_H
@@ -119,6 +119,11 @@ typedef struct mdp_device_descriptor_s {
 
 #define	MD_SB_CLUSTERED		5 /* MD is clustered */
 #define	MD_SB_BITMAP_PRESENT	8 /* bitmap may be present nearby */
+#define	MD_SB_HAS_JOURNAL	9
+#define	MD_SB_JOURNAL_REMOVABLE	10 /* journal _feature_ can be removed,
+				    * which means the journal is either
+				    * missing or Faulty
+				    */
 
 /*
  * Notes:
-- 
2.9.3


^ permalink raw reply related

* Re: [RFC PATCH] md/raid10: refactor some codes from raid10_write_request
From: Shaohua Li @ 2017-03-15 22:46 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid, shli, neilb
In-Reply-To: <58C8AED4.5020309@suse.com>

On Wed, Mar 15, 2017 at 11:02:44AM +0800, Guoqing Jiang wrote:
> 
> 
> On 03/15/2017 12:48 AM, Shaohua Li wrote:
> > On Mon, Mar 13, 2017 at 05:23:59PM +0800, Guoqing Jiang wrote:
> > > Previously, we clone both bio and repl_bio in raid10_write_request,
> > > then add the cloned bio to plug->pending or conf->pending_bio_list
> > > based on plug or not, and most of the logics are same for the two
> > > conditions.
> > > 
> > > So introduce handle_clonebio (a better name is welcome) for it, and
> > > use replacement parameter to distinguish the difference. No functional
> > > changes in the patch.
> > > 
> > > Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> > > ---
> > > Another reason for it is to improve the readability of code, but
> > > I didn't touch raid10 before so this is labeled as RFC.
> > > 
> > >   drivers/md/raid10.c | 172 ++++++++++++++++++++++------------------------------
> > >   1 file changed, 72 insertions(+), 100 deletions(-)
> > > 
> > > diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> > > index b1b1f982a722..02d8eff8d26e 100644
> > > --- a/drivers/md/raid10.c
> > > +++ b/drivers/md/raid10.c
> > > @@ -1188,18 +1188,81 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
> > >   	return;
> > >   }
> > > -static void raid10_write_request(struct mddev *mddev, struct bio *bio,
> > > -				 struct r10bio *r10_bio)
> > > +static void handle_clonebio(struct mddev *mddev, struct r10bio *r10_bio,
> > > +			    struct bio *bio, int i, int replacement,
> > > +			    int max_sectors)
> > Maybe raid10_write_one_disk?
> 
> Thanks for review, I will use it unless a better name is found.
> 
> >   Please replace 'i' with a meaningful name and
> > change to 'boo' for replacement.
> 
> Ok, I would replace 'i' with 'n_copy', what is the meaning of 'boo'?
> IMO, replacement fits better here, thanks.

sorry, I mean 'bool'
 
> +		if (r10_bio->devs[i].bio)
> +			handle_clonebio(mddev, r10_bio, bio, i, 0, max_sectors);
> +		if (r10_bio->devs[i].repl_bio)
> +			handle_clonebio(mddev, r10_bio, bio, i, 1, max_sectors);
> 
> 
> 
> Regards,
> Guoqing
> 
> 

^ permalink raw reply

* Re: [PATCH] md/r5cache: flush data in memory during journal device failure
From: Shaohua Li @ 2017-03-15 22:48 UTC (permalink / raw)
  To: Song Liu
  Cc: linux-raid, Shaohua Li, NeilBrown, Kernel Team,
	dan.j.williams@intel.com, hch@infradead.org
In-Reply-To: <5FFE3F62-D87A-46C5-B9D7-7A7501A32B90@fb.com>

On Tue, Mar 14, 2017 at 10:40:14PM +0000, Song Liu wrote:
> 
> > On Mar 14, 2017, at 10:50 AM, Shaohua Li <shli@kernel.org> wrote:
> > 
> > On Mon, Mar 13, 2017 at 04:36:26PM -0700, Song Liu wrote:
> >> For the raid456 with writeback cache, when journal device failed during
> >> normal operation, it is still possible to persist all data, as all
> >> pending data is still in stripe cache. However, the stripe will be
> >> marked as fail with s.log_failed. Thus, the write out from stripe cache
> >> cannot make progress.
> >> 
> >> To unblock the write out in journal failures, this patch allows stripes
> >> with data injournal to make progress.
> > 
> > what about the parity part? if log failed, we should skip journaling the parity.
> > 
> > Thanks,
> > Shaohua
> > 
> 
> For stripes with data in journal (not flushed yet), the state machine 
> can flush them out. The behavior is just like when there are no journal 
> at all. 

can you explain this more? I didn't find any place we check the failure bit and
so skip journaling the parity. Also include the description in the changelog.
 
> On the other hand, other writes will be gated by the log_failed flags, 
> so the array appears to be read-only to upper layers. 
> 
> Thanks,
> Song
> 
> >> The array should be read-only in journal failures. Therefore, pending
> >> writes (in dev->towrite) are excluded in this write (in delay_towrite).
> >> 
> >> Signed-off-by: Song Liu <songliubraving@fb.com>
> >> ---
> >> drivers/md/raid5.c | 10 +++++++++-
> >> 1 file changed, 9 insertions(+), 1 deletion(-)
> >> 
> >> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> >> index 3233975..447d9dd 100644
> >> --- a/drivers/md/raid5.c
> >> +++ b/drivers/md/raid5.c
> >> @@ -3069,6 +3069,10 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
> >>  *      When LOG_CRITICAL, stripes with injournal == 0 will be sent to
> >>  *      no_space_stripes list.
> >>  *
> >> + *   3. during journal failure
> >> + *      In journal failure, we try to flush all cached data to raid disks
> >> + *      based on data in stripe cache. The array is read-only to upper
> >> + *      layers, so we would skip all pending writes.
> >>  */
> >> static inline bool delay_towrite(struct r5conf *conf,
> >> 				 struct r5dev *dev,
> >> @@ -3082,6 +3086,9 @@ static inline bool delay_towrite(struct r5conf *conf,
> >> 	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
> >> 	    s->injournal > 0)
> >> 		return true;
> >> +	/* case 3 above */
> >> +	if (s->log_failed && s->injournal)
> >> +		return true;
> >> 	return false;
> >> }
> >> 
> >> @@ -4721,7 +4728,8 @@ static void handle_stripe(struct stripe_head *sh)
> >> 	/* check if the array has lost more than max_degraded devices and,
> >> 	 * if so, some requests might need to be failed.
> >> 	 */
> >> -	if (s.failed > conf->max_degraded || s.log_failed) {
> >> +	if (s.failed > conf->max_degraded ||
> >> +	    (s.log_failed && s.injournal == 0)) {
> >> 		sh->check_state = 0;
> >> 		sh->reconstruct_state = 0;
> >> 		break_stripe_batch_list(sh, 0);
> >> -- 
> >> 2.9.3
> >> 
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox