* [PATCH 2/4] md/r5cache: read data into orig_page for prexor of cached data
From: Song Liu @ 2017-01-13 1:22 UTC
To: linux-raid
Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuan,
liuyun01, Song Liu, Jes.Sorensen
With the write-back cache, we use orig_page to do prexor. This patch
makes sure we read data into orig_page for it.

The flag R5_OrigPageUPTDODATE is added to show whether orig_page
has the latest data from the raid disk.

We introduce a helper function uptodate_for_rmw() to simplify
a couple of conditions in handle_stripe_dirtying().
Signed-off-by: Song Liu <songliubraving@fb.com>
---
drivers/md/raid5-cache.c | 2 ++
drivers/md/raid5.c | 44 +++++++++++++++++++++++++++++++++++---------
drivers/md/raid5.h | 5 +++++
3 files changed, 42 insertions(+), 9 deletions(-)
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 2bbc38b..248fede 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2459,6 +2459,8 @@ void r5c_release_extra_page(struct stripe_head *sh)
struct page *p = sh->dev[i].orig_page;
sh->dev[i].orig_page = sh->dev[i].page;
+ clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
+
if (!using_disk_info_extra_page)
put_page(p);
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0d2082d..3e75289 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1056,7 +1056,17 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
- sh->dev[i].vec.bv_page = sh->dev[i].page;
+
+ if (!op_is_write(op) &&
+ test_bit(R5_InJournal, &sh->dev[i].flags))
+ /*
+ * issuing read for a page in journal, this
+ * must be preparing for prexor in rmw; read
+ * the data into orig_page
+ */
+ sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
+ else
+ sh->dev[i].vec.bv_page = sh->dev[i].page;
bi->bi_vcnt = 1;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
bi->bi_io_vec[0].bv_offset = 0;
@@ -2421,6 +2431,13 @@ static void raid5_end_read_request(struct bio * bi)
} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
+ if (test_bit(R5_InJournal, &sh->dev[i].flags))
+ /*
+ * end read for a page in journal, this
+ * must be preparing for prexor in rmw
+ */
+ set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);
+
if (atomic_read(&rdev->read_errors))
atomic_set(&rdev->read_errors, 0);
} else {
@@ -3635,6 +3652,21 @@ static void handle_stripe_clean_event(struct r5conf *conf,
break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
}
+/*
+ * For RMW in write back cache, we need extra page in prexor to store the
+ * old data. This page is stored in dev->orig_page.
+ *
+ * This function checks whether we have data for prexor. The exact logic
+ * is:
+ * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
+ */
+static inline bool uptodate_for_rmw(struct r5dev *dev)
+{
+ return (test_bit(R5_UPTODATE, &dev->flags)) &&
+ (!test_bit(R5_InJournal, &dev->flags) ||
+ test_bit(R5_OrigPageUPTDODATE, &dev->flags));
+}
+
static int handle_stripe_dirtying(struct r5conf *conf,
struct stripe_head *sh,
struct stripe_head_state *s,
@@ -3666,9 +3698,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
test_bit(R5_InJournal, &dev->flags)) &&
!test_bit(R5_LOCKED, &dev->flags) &&
- !((test_bit(R5_UPTODATE, &dev->flags) &&
- (!test_bit(R5_InJournal, &dev->flags) ||
- dev->page != dev->orig_page)) ||
+ !(uptodate_for_rmw(dev) ||
test_bit(R5_Wantcompute, &dev->flags))) {
if (test_bit(R5_Insync, &dev->flags))
rmw++;
@@ -3680,7 +3710,6 @@ static int handle_stripe_dirtying(struct r5conf *conf,
i != sh->pd_idx && i != sh->qd_idx &&
!test_bit(R5_LOCKED, &dev->flags) &&
!(test_bit(R5_UPTODATE, &dev->flags) ||
- test_bit(R5_InJournal, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags))) {
if (test_bit(R5_Insync, &dev->flags))
rcw++;
@@ -3734,9 +3763,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
i == sh->pd_idx || i == sh->qd_idx ||
test_bit(R5_InJournal, &dev->flags)) &&
!test_bit(R5_LOCKED, &dev->flags) &&
- !((test_bit(R5_UPTODATE, &dev->flags) &&
- (!test_bit(R5_InJournal, &dev->flags) ||
- dev->page != dev->orig_page)) ||
+ !(uptodate_for_rmw(dev) ||
test_bit(R5_Wantcompute, &dev->flags)) &&
test_bit(R5_Insync, &dev->flags)) {
if (test_bit(STRIPE_PREREAD_ACTIVE,
@@ -3763,7 +3790,6 @@ static int handle_stripe_dirtying(struct r5conf *conf,
i != sh->pd_idx && i != sh->qd_idx &&
!test_bit(R5_LOCKED, &dev->flags) &&
!(test_bit(R5_UPTODATE, &dev->flags) ||
- test_bit(R5_InJournal, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags))) {
rcw++;
if (test_bit(R5_Insync, &dev->flags) &&
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index c582086..50855ad 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -322,6 +322,11 @@ enum r5dev_flags {
* data and parity being written are in the journal
* device
*/
+ R5_OrigPageUPTDODATE, /* with write back cache, we read old data into
+ * dev->orig_page for prexor. When this flag is
+ * set, orig_page contains latest data in the
+ * raid disk.
+ */
};
/*
--
2.9.3
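To make the data flow of this patch concrete, here is a minimal userspace
sketch of the RMW parity update for a device whose latest data is cached in
the journal. It is an illustrative model under simplifying assumptions, not
the kernel implementation: struct r5dev_model, xor_block() and
rmw_update_parity() are hypothetical, and the boolean fields stand in for
the R5_UPTODATE, R5_InJournal and R5_OrigPageUPTDODATE flag bits.

#include <stdbool.h>
#include <stddef.h>

struct r5dev_model {
	unsigned char *page;      /* new data, cached via the journal      */
	unsigned char *orig_page; /* old data read back from the raid disk */
	bool uptodate;            /* models R5_UPTODATE                    */
	bool in_journal;          /* models R5_InJournal                   */
	bool orig_uptodate;       /* models R5_OrigPageUPTDODATE           */
};

static void xor_block(unsigned char *dst, const unsigned char *src, size_t n)
{
	for (size_t i = 0; i < n; i++)
		dst[i] ^= src[i];
}

/* mirrors uptodate_for_rmw() from this patch: prexor may only use
 * orig_page after the read from the raid disk has completed
 */
static bool uptodate_for_rmw_model(const struct r5dev_model *dev)
{
	return dev->uptodate &&
	       (!dev->in_journal || dev->orig_uptodate);
}

/* RMW for one in-journal device:
 *   new_parity = old_parity ^ old_data ^ new_data
 */
static bool rmw_update_parity(unsigned char *parity,
			      const struct r5dev_model *dev, size_t n)
{
	if (!uptodate_for_rmw_model(dev))
		return false;                 /* must read into orig_page first */
	xor_block(parity, dev->orig_page, n); /* prexor with the old data */
	xor_block(parity, dev->page, n);      /* xor with the new data    */
	return true;
}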
* Re: [PATCH 2/4] md/r5cache: read data into orig_page for prexor of cached data
From: Shaohua Li @ 2017-01-21 18:04 UTC
To: Song Liu
Cc: linux-raid, neilb, shli, kernel-team, dan.j.williams, hch,
liuzhengyuan, liuyun01, Jes.Sorensen
On Thu, Jan 12, 2017 at 05:22:41PM -0800, Song Liu wrote:
> With the write-back cache, we use orig_page to do prexor. This patch
> makes sure we read data into orig_page for it.
>
> The flag R5_OrigPageUPTDODATE is added to show whether orig_page
> has the latest data from the raid disk.
>
> We introduce a helper function uptodate_for_rmw() to simplify
> a couple of conditions in handle_stripe_dirtying().
applied patch 2 & 3
Thanks,
Shaohua
* [PATCH 3/4] md/raid5: move comment of fetch_block to right location
From: Song Liu @ 2017-01-13 1:22 UTC
To: linux-raid
Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuan,
liuyun01, Song Liu, Jes.Sorensen
Signed-off-by: Song Liu <songliubraving@fb.com>
---
drivers/md/raid5.c | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3e75289..d07a319 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3353,13 +3353,6 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
return rv;
}
-/* fetch_block - checks the given member device to see if its data needs
- * to be read or computed to satisfy a request.
- *
- * Returns 1 when no more member devices need to be checked, otherwise returns
- * 0 to tell the loop in handle_stripe_fill to continue
- */
-
static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
int disk_idx, int disks)
{
@@ -3450,6 +3443,12 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
return 0;
}
+/* fetch_block - checks the given member device to see if its data needs
+ * to be read or computed to satisfy a request.
+ *
+ * Returns 1 when no more member devices need to be checked, otherwise returns
+ * 0 to tell the loop in handle_stripe_fill to continue
+ */
static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
int disk_idx, int disks)
{
--
2.9.3
* [PATCH 4/4] md/r5cache: shift complex rmw from read path to write path
From: Song Liu @ 2017-01-13 1:22 UTC
To: linux-raid
Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuan,
liuyun01, Song Liu, Jes.Sorensen
The write-back cache requires a complex RMW mechanism, where old data
is read into dev->orig_page for prexor, and then xor is done with
dev->page. This logic is already implemented in the write path.

However, the current read path is not aware of this requirement. When
the array is optimal, the RMW is not required, as the data is read
from the raid disks. However, when the target stripe is degraded, the
complex RMW is required to generate the right data.

To keep the read path as clean as possible, we handle it by flushing
degraded, in-journal stripes before processing reads to the missing
dev.

Specifically, when there are read requests to a degraded stripe with
data in the journal, handle_stripe_fill() calls
r5c_make_stripe_write_out() and exits. Then handle_stripe_dirtying()
will do the complex RMW and flush the stripe to the RAID disks. After
that, the read requests are handled.

There is one more corner case: a non-overwrite bio for the missing
(or out-of-sync) dev. handle_stripe_dirtying() will not be able to
process non-overwrite bios without the data being constructed in
handle_stripe_fill(). This is fixed by delaying non-overwrite bios in
handle_stripe_dirtying(), so that handle_stripe_fill() works on these
bios after the stripe is flushed to the raid disks.
Signed-off-by: Song Liu <songliubraving@fb.com>
---
drivers/md/raid5.c | 48 ++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 44 insertions(+), 4 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d07a319..193acd3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2938,6 +2938,30 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
return r_sector;
}
+/*
+ * There are cases where we want handle_stripe_dirtying() and
+ * schedule_reconstruction() to delay towrite to some dev of a stripe.
+ *
+ * This function checks whether we want to delay the towrite. Specifically,
+ * we delay the towrite when:
+ * 1. degraded stripe has a non-overwrite to the missing dev, AND this
+ * stripe has data in journal (for other devices).
+ *
+ * In this case, when reading data for the non-overwrite dev, it is
+ * necessary to handle complex rmw of write back cache (prexor with
+ * orig_page, and xor with page). To keep read path simple, we would
+ * like to flush data in journal to RAID disks first, so complex rmw
+ * is handled in the write path (handle_stripe_dirtying).
+ *
+ * 2. to be added
+ */
+static inline bool delay_towrite(struct r5dev *dev,
+ struct stripe_head_state *s)
+{
+ return dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
+ !test_bit(R5_Insync, &dev->flags) && s->injournal;
+}
+
static void
schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
int rcw, int expand)
@@ -2958,7 +2982,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if (dev->towrite) {
+ if (dev->towrite && !delay_towrite(dev, s)) {
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantdrain, &dev->flags);
if (!expand)
@@ -3535,10 +3559,25 @@ static void handle_stripe_fill(struct stripe_head *sh,
* midst of changing due to a write
*/
if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
- !sh->reconstruct_state)
+ !sh->reconstruct_state) {
+
+ /* for degraded stripe with data in journal, do not handle
+ * read requests yet, instead, flush the stripe to raid
+ * disks first, this avoids handling complex rmw of write
+ * back cache (prexor with orig_page, and then xor with
+ * page) in the read path
+ */
+ if (s->injournal && s->failed) {
+ if (test_bit(STRIPE_R5C_CACHING, &sh->state))
+ r5c_make_stripe_write_out(sh);
+ goto out;
+ }
+
for (i = disks; i--; )
if (fetch_block(sh, s, i, disks))
break;
+ }
+out:
set_bit(STRIPE_HANDLE, &sh->state);
}
@@ -3694,7 +3733,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
} else for (i = disks; i--; ) {
/* would I have to read this buffer for read_modify_write */
struct r5dev *dev = &sh->dev[i];
- if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
+ if (((dev->towrite && !delay_towrite(dev, s)) ||
+ i == sh->pd_idx || i == sh->qd_idx ||
test_bit(R5_InJournal, &dev->flags)) &&
!test_bit(R5_LOCKED, &dev->flags) &&
!(uptodate_for_rmw(dev) ||
@@ -3758,7 +3798,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
- if ((dev->towrite ||
+ if (((dev->towrite && !delay_towrite(dev, s)) ||
i == sh->pd_idx || i == sh->qd_idx ||
test_bit(R5_InJournal, &dev->flags)) &&
!test_bit(R5_LOCKED, &dev->flags) &&
--
2.9.3
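As a companion to the diff, the snippet below restates the delay_towrite()
condition as a standalone userspace model; struct dev_model and
delay_towrite_model() are hypothetical names, with plain booleans standing
in for the R5_OVERWRITE and R5_Insync flag bits and for s->injournal.

#include <stdbool.h>

struct dev_model {
	bool towrite;    /* a write bio is queued for this device         */
	bool overwrite;  /* the bio covers the whole chunk (R5_OVERWRITE) */
	bool insync;     /* device present and in sync (R5_Insync)        */
};

/* Park a partial write to a missing or out-of-sync device while the
 * stripe still has data in the journal: the stripe must be flushed to
 * the raid disks first, so that handle_stripe_fill() can construct
 * the data the partial write depends on.
 */
static bool delay_towrite_model(const struct dev_model *dev, int injournal)
{
	return dev->towrite && !dev->overwrite &&
	       !dev->insync && injournal > 0;
}

The design choice here is to pay the extra cost once on the write path
(flush, then drain the delayed bio) rather than teach the read path the
prexor-with-orig_page machinery.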
* Re: [PATCH 4/4] md/r5cache: shift complex rmw from read path to write path
From: Shaohua Li @ 2017-01-21 18:06 UTC
To: Song Liu
Cc: linux-raid, neilb, shli, kernel-team, dan.j.williams, hch,
liuzhengyuan, liuyun01, Jes.Sorensen
On Thu, Jan 12, 2017 at 05:22:43PM -0800, Song Liu wrote:
> The write-back cache requires a complex RMW mechanism, where old data
> is read into dev->orig_page for prexor, and then xor is done with
> dev->page. This logic is already implemented in the write path.
>
> However, the current read path is not aware of this requirement. When
> the array is optimal, the RMW is not required, as the data is read
> from the raid disks. However, when the target stripe is degraded, the
> complex RMW is required to generate the right data.
>
> To keep the read path as clean as possible, we handle it by flushing
> degraded, in-journal stripes before processing reads to the missing
> dev.
>
> Specifically, when there are read requests to a degraded stripe with
> data in the journal, handle_stripe_fill() calls
> r5c_make_stripe_write_out() and exits. Then handle_stripe_dirtying()
> will do the complex RMW and flush the stripe to the RAID disks. After
> that, the read requests are handled.
>
> There is one more corner case: a non-overwrite bio for the missing
> (or out-of-sync) dev. handle_stripe_dirtying() will not be able to
> process non-overwrite bios without the data being constructed in
> handle_stripe_fill(). This is fixed by delaying non-overwrite bios in
> handle_stripe_dirtying(), so that handle_stripe_fill() works on these
> bios after the stripe is flushed to the raid disks.
This patch looks good and I think it should be applied to 4.10. Some minor
issues.
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
> drivers/md/raid5.c | 48 ++++++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 44 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index d07a319..193acd3 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -2938,6 +2938,30 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
> return r_sector;
> }
>
> +/*
> + * There are cases where we want handle_stripe_dirtying() and
> + * schedule_reconstruction() to delay towrite to some dev of a stripe.
> + *
> + * This function checks whether we want to delay the towrite. Specifically,
> + * we delay the towrite when:
> + * 1. degraded stripe has a non-overwrite to the missing dev, AND this
> + * stripe has data in journal (for other devices).
> + *
> + * In this case, when reading data for the non-overwrite dev, it is
> + * necessary to handle complex rmw of write back cache (prexor with
> + * orig_page, and xor with page). To keep read path simple, we would
> + * like to flush data in journal to RAID disks first, so complex rmw
> + * is handled in the write path (handle_stripe_dirtying).
> + *
> + * 2. to be added
what does this mean?
> + */
> +static inline bool delay_towrite(struct r5dev *dev,
> + struct stripe_head_state *s)
> +{
> + return dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
> + !test_bit(R5_Insync, &dev->flags) && s->injournal;
> +}
this is always called with dev->towrite true, so please remove it.
Thanks,
Shaohua
* Re: [PATCH 1/4] md/r5cache: flush data only stripes in r5l_recovery_log()
From: Shaohua Li @ 2017-01-21 18:03 UTC
To: Song Liu
Cc: linux-raid, neilb, shli, kernel-team, dan.j.williams, hch,
liuzhengyuan, liuyun01, Jes.Sorensen
On Thu, Jan 12, 2017 at 05:22:40PM -0800, Song Liu wrote:
> When there are data-only stripes in the journal, we flush them out in
> r5l_recovery_log(). This logic is implemented in a new function:
> r5c_recovery_flush_data_only_stripes():
>
> 1. enable write back cache
> 2. set flag R5C_PRE_INIT_FLUSH in conf->cache_state
> 3. flush all stripes
> 4. wake up conf->mddev->thread
> 5. wait for all stripes get flushed (reuse wait_for_quiescent)
> 6. clear R5C_PRE_INIT_FLUSH
> 7. disable write back cache
Please explain why we need this and whether it applies to write-through
too. Also please explain why it's safe to wake up mddev->thread in
r5l_recovery_log() when the mddev isn't fully initialized yet.
> do_release_stripe() will wake up the waiter when conf->active_stripes
> drops to 0.
>
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
> drivers/md/raid5-cache.c | 58 +++++++++++++++++++++++++++++++++++-------------
> drivers/md/raid5.c | 4 ++++
> drivers/md/raid5.h | 3 +++
> 3 files changed, 49 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 0c88648..2bbc38b 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -2102,7 +2102,7 @@ static int
> r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
> struct r5l_recovery_ctx *ctx)
> {
> - struct stripe_head *sh, *next;
> + struct stripe_head *sh;
> struct mddev *mddev = log->rdev->mddev;
> struct page *page;
> sector_t next_checkpoint = MaxSector;
> @@ -2116,7 +2116,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
>
> WARN_ON(list_empty(&ctx->cached_list));
>
> - list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
> + list_for_each_entry(sh, &ctx->cached_list, lru) {
> struct r5l_meta_block *mb;
> int i;
> int offset;
> @@ -2166,14 +2166,41 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
> ctx->pos = write_pos;
> ctx->seq += 1;
> next_checkpoint = sh->log_start;
> - list_del_init(&sh->lru);
> - raid5_release_stripe(sh);
> }
> log->next_checkpoint = next_checkpoint;
> __free_page(page);
> return 0;
> }
>
> +static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
> + struct r5l_recovery_ctx *ctx)
> +{
> + struct mddev *mddev = log->rdev->mddev;
> + struct r5conf *conf = mddev->private;
> + struct stripe_head *sh, *next;
> +
> + if (ctx->data_only_stripes == 0)
> + return;
> +
> + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
> + set_bit(R5C_PRE_INIT_FLUSH, &conf->cache_state);
> +
> + list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
> + r5c_make_stripe_write_out(sh);
> + set_bit(STRIPE_HANDLE, &sh->state);
> + list_del_init(&sh->lru);
> + raid5_release_stripe(sh);
> + }
> +
> + md_wakeup_thread(conf->mddev->thread);
> + /* reuse conf->wait_for_quiescent in recovery */
> + wait_event(conf->wait_for_quiescent,
> + atomic_read(&conf->active_stripes) == 0);
> +
> + clear_bit(R5C_PRE_INIT_FLUSH, &conf->cache_state);
> + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
> +}
> +
> static int r5l_recovery_log(struct r5l_log *log)
> {
> struct mddev *mddev = log->rdev->mddev;
> @@ -2200,32 +2227,31 @@ static int r5l_recovery_log(struct r5l_log *log)
> pos = ctx.pos;
> ctx.seq += 10000;
>
> - if (ctx.data_only_stripes == 0) {
> - log->next_checkpoint = ctx.pos;
> - r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
> - ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
> - }
>
> if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
> pr_debug("md/raid:%s: starting from clean shutdown\n",
> mdname(mddev));
> - else {
> + else
> pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
> mdname(mddev), ctx.data_only_stripes,
> ctx.data_parity_stripes);
>
> - if (ctx.data_only_stripes > 0)
> - if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
> - pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
> - mdname(mddev));
> - return -EIO;
> - }
> + if (ctx.data_only_stripes == 0) {
> + log->next_checkpoint = ctx.pos;
> + r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
> + ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
> + } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
> + pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
> + mdname(mddev));
> + return -EIO;
> }
>
> log->log_start = ctx.pos;
> log->seq = ctx.seq;
> log->last_checkpoint = pos;
> r5l_write_super(log, pos);
> +
> + r5c_recovery_flush_data_only_stripes(log, &ctx);
> return 0;
> }
>
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index b8e45cc..0d2082d 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -266,6 +266,10 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
> < IO_THRESHOLD)
> md_wakeup_thread(conf->mddev->thread);
> atomic_dec(&conf->active_stripes);
> + if (test_bit(R5C_PRE_INIT_FLUSH, &conf->cache_state) &&
> + atomic_read(&conf->active_stripes) == 0)
> + wake_up(&sh->raid_conf->wait_for_quiescent);
I don't understand why R5C_PRE_INIT_FLUSH must be introduced. Shouldn't we
always wake up wait_for_quiescent once active_stripes reaches 0? If not,
this is a bug in the existing code.
Thanks,
Shaohua
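The flush-and-wait handshake under discussion can be pictured with a
userspace analogue. The sketch below assumes a pthread condition variable
in place of the kernel wait queue; struct flush_state and both function
names are hypothetical stand-ins for the relevant r5conf fields and for the
code in r5c_recovery_flush_data_only_stripes() and do_release_stripe().

#include <pthread.h>

struct flush_state {
	pthread_mutex_t lock;
	pthread_cond_t  drained; /* plays the role of wait_for_quiescent    */
	int             active;  /* plays the role of conf->active_stripes  */
};

/* worker side: called each time a stripe has been written out,
 * mirroring the wake-up added to do_release_stripe()
 */
static void release_stripe_model(struct flush_state *st)
{
	pthread_mutex_lock(&st->lock);
	if (--st->active == 0)
		pthread_cond_broadcast(&st->drained);
	pthread_mutex_unlock(&st->lock);
}

/* recovery side: after queueing all data-only stripes for write-out,
 * block until every one of them has drained
 */
static void wait_for_flush_model(struct flush_state *st)
{
	pthread_mutex_lock(&st->lock);
	while (st->active > 0)
		pthread_cond_wait(&st->drained, &st->lock);
	pthread_mutex_unlock(&st->lock);
}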