Linux RAID subsystem development
 help / color / mirror / Atom feed
* [PATCH 03/10] md/dm-crypt: Rename a jump label in crypt_message()
From: SF Markus Elfring @ 2016-09-28 15:40 UTC (permalink / raw)
  To: dm-devel, linux-raid, Alasdair Kergon, Mike Snitzer, Shaohua Li
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <7c232017-e291-fd2d-5516-26e5150d90df@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Wed, 28 Sep 2016 14:54:39 +0200

Adjust a jump label according to the current Linux coding style convention.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/dm-crypt.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 88a3b62..08e3de2 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -2016,7 +2016,7 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
 	int ret = -EINVAL;
 
 	if (argc < 2)
-		goto error;
+		goto show_warning;
 
 	if (!strcasecmp(argv[0], "key")) {
 		if (!test_bit(DM_CRYPT_SUSPENDED, &cc->flags)) {
@@ -2040,8 +2040,7 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
 			return crypt_wipe_key(cc);
 		}
 	}
-
-error:
+show_warning:
 	DMWARN("unrecognised message received.");
 	return -EINVAL;
 }
-- 
2.10.0


^ permalink raw reply related

* [PATCH 02/10] md/dm-crypt: Reduce the scope for a variable in crypt_alloc_tfms()
From: SF Markus Elfring @ 2016-09-28 15:38 UTC (permalink / raw)
  To: dm-devel, linux-raid, Alasdair Kergon, Mike Snitzer, Shaohua Li
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <7c232017-e291-fd2d-5516-26e5150d90df@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Wed, 28 Sep 2016 14:05:25 +0200

Move the definition for the variable "err" (including its declaration)
into an if branch (so that the corresponding setting will only be performed
if a call of the function "crypto_alloc_skcipher" failed as before
this refactoring).

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/dm-crypt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 68971a2..88a3b62 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1452,7 +1452,6 @@ static void crypt_free_tfms(struct crypt_config *cc)
 static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
 {
 	unsigned i;
-	int err;
 
 	cc->tfms = kcalloc(cc->tfms_count, sizeof(*cc->tfms), GFP_KERNEL);
 	if (!cc->tfms)
@@ -1461,7 +1460,8 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
 	for (i = 0; i < cc->tfms_count; i++) {
 		cc->tfms[i] = crypto_alloc_skcipher(ciphermode, 0, 0);
 		if (IS_ERR(cc->tfms[i])) {
-			err = PTR_ERR(cc->tfms[i]);
+			int err = PTR_ERR(cc->tfms[i]);
+
 			crypt_free_tfms(cc);
 			return err;
 		}
-- 
2.10.0


^ permalink raw reply related

* Re: [PATCH 00/10] md/dm-crypt: Fine-tuning for five function implementations
From: Mike Snitzer @ 2016-09-28 15:38 UTC (permalink / raw)
  To: SF Markus Elfring
  Cc: dm-devel, linux-raid, Alasdair Kergon, Shaohua Li, LKML,
	kernel-janitors, Julia Lawall
In-Reply-To: <7c232017-e291-fd2d-5516-26e5150d90df@users.sourceforge.net>

On Wed, Sep 28 2016 at 11:34am -0400,
SF Markus Elfring <elfring@users.sourceforge.net> wrote:

> From: Markus Elfring <elfring@users.sourceforge.net>
> Date: Wed, 28 Sep 2016 17:25:17 +0200
> 
> Some update suggestions were taken into account
> from static source code analysis.
> 
> Markus Elfring (10):
>   Use kcalloc() in crypt_alloc_tfms()
>   Reduce the scope for a variable in crypt_alloc_tfms()
>   Rename a jump label in crypt_message()
>   Delete an unnecessary variable initialisation in crypt_message()
>   Rename a jump label in crypt_set_key()
>   Delete an unnecessary variable initialisation in crypt_set_key()
>   Rename a jump label in crypt_iv_tcw_whitening()
>   Return directly after a failed crypto_alloc_ahash() in crypt_iv_essiv_ctr()
>   Two checks and one function call less in crypt_iv_essiv_ctr() after error detection
>   Delete unnecessary variable initialisations in crypt_iv_essiv_ctr()
> 
>  drivers/md/dm-crypt.c | 51 ++++++++++++++++++++++-----------------------------
>  1 file changed, 22 insertions(+), 29 deletions(-)

These are _not_ the kind of changes I want to be seeing.  Churn in the
name of "fine-tuning".

Nack.

^ permalink raw reply

* [PATCH 01/10] md/dm-crypt: Use kcalloc() in crypt_alloc_tfms()
From: SF Markus Elfring @ 2016-09-28 15:36 UTC (permalink / raw)
  To: dm-devel, linux-raid, Alasdair Kergon, Mike Snitzer, Shaohua Li
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <7c232017-e291-fd2d-5516-26e5150d90df@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Wed, 28 Sep 2016 13:26:22 +0200

* A multiplication for the size determination of a memory allocation
  indicated that an array data structure should be processed.
  Thus reuse the corresponding function "kcalloc".

  This issue was detected by using the Coccinelle software.

* Replace the specification of a data type by a pointer dereference
  to make the corresponding size determination a bit safer according to
  the Linux coding style convention.

Fixes: 5d0be84ec0cacfc7a6d6ea548afdd07d481324cd ("dm crypt: fix free of bad values after tfm allocation failure")
Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/dm-crypt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index a276883..68971a2 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1454,8 +1454,7 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
 	unsigned i;
 	int err;
 
-	cc->tfms = kzalloc(cc->tfms_count * sizeof(struct crypto_skcipher *),
-			   GFP_KERNEL);
+	cc->tfms = kcalloc(cc->tfms_count, sizeof(*cc->tfms), GFP_KERNEL);
 	if (!cc->tfms)
 		return -ENOMEM;
 
-- 
2.10.0


^ permalink raw reply related

* [PATCH 00/10] md/dm-crypt: Fine-tuning for five function implementations
From: SF Markus Elfring @ 2016-09-28 15:34 UTC (permalink / raw)
  To: dm-devel, linux-raid, Alasdair Kergon, Mike Snitzer, Shaohua Li
  Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <566ABCD9.1060404@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Wed, 28 Sep 2016 17:25:17 +0200

Some update suggestions were taken into account
from static source code analysis.

Markus Elfring (10):
  Use kcalloc() in crypt_alloc_tfms()
  Reduce the scope for a variable in crypt_alloc_tfms()
  Rename a jump label in crypt_message()
  Delete an unnecessary variable initialisation in crypt_message()
  Rename a jump label in crypt_set_key()
  Delete an unnecessary variable initialisation in crypt_set_key()
  Rename a jump label in crypt_iv_tcw_whitening()
  Return directly after a failed crypto_alloc_ahash() in crypt_iv_essiv_ctr()
  Two checks and one function call less in crypt_iv_essiv_ctr() after error detection
  Delete unnecessary variable initialisations in crypt_iv_essiv_ctr()

 drivers/md/dm-crypt.c | 51 ++++++++++++++++++++++-----------------------------
 1 file changed, 22 insertions(+), 29 deletions(-)

-- 
2.10.0

^ permalink raw reply

* Re: [PATCH 12/16] md/bitmap: One check less in read_page() at the end
From: Dan Carpenter @ 2016-09-28  7:33 UTC (permalink / raw)
  To: SF Markus Elfring
  Cc: linux-raid, Shaohua Li, LKML, kernel-janitors, Julia Lawall
In-Reply-To: <18d73bcc-795b-8680-c82c-9a06047a8ef3@users.sourceforge.net>

This makes the code ugly.

regards,
dan carpenter


^ permalink raw reply

* Re: [PATCH v2 5/6] r5cache: handle SYNC and FUA
From: Shaohua Li @ 2016-09-28  1:32 UTC (permalink / raw)
  To: Song Liu
  Cc: linux-raid, neilb, shli, kernel-team, dan.j.williams, hch,
	liuzhengyuang521, liuzhengyuan
In-Reply-To: <20160926233050.3351081-6-songliubraving@fb.com>

On Mon, Sep 26, 2016 at 04:30:49PM -0700, Song Liu wrote:
> With raid5 cache, we committing data from journal device. When
> there is flush request, we need to flush journal device's cache.
> This was not needed in raid5 journal, because we will flush the
> journal before committing data to raid disks.
> 
> This is similar to FUA, except that we also need flush journal for
> FUA. Otherwise, corruptions in earlier meta data will stop recovery
> from reaching FUA data.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
> @@ -554,12 +650,22 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
>  
>  	io = log->current_io;
>  
> +	if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
> +		io->has_flush = 1;
> +
>  	for (i = 0; i < sh->disks; i++) {
>  		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) &&
>  		    !test_bit(R5_Wantcache, &sh->dev[i].flags))
>  			continue;
>  		if (i == sh->pd_idx || i == sh->qd_idx)
>  			continue;
> +		if (test_bit(R5_WantFUA, &sh->dev[i].flags)) {
> +			io->has_fua = 1;
> +			/* we need to flush journal to make sure recovery can
> +			 * reach the data with fua flag
> +			 */
comments format.

> +			io->has_flush = 1;
> +		}
>  		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
>  					raid5_compute_blocknr(sh, i, 0),
>  					sh->dev[i].log_checksum, 0, false);
> @@ -716,10 +822,16 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
>  	 * don't need to flush again
>  	 */
>  	if (bio->bi_iter.bi_size == 0) {
> -		bio_endio(bio);
> +		mutex_lock(&log->io_mutex);
> +		r5l_get_meta(log, 0);
> +		bio_list_add(&log->current_io->flush_barriers, bio);
> +		log->current_io->has_flush = 1;
> +		log->current_io->has_null_flush = 1;
> +		atomic_inc(&log->current_io->pending_stripe);
> +		r5l_submit_current_io(log);
> +		mutex_unlock(&log->io_mutex);
>  		return 0;
>  	}
please not change the behavior of writethrough mode.

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH v2 4/6] r5cache: r5c recovery
From: Shaohua Li @ 2016-09-28  1:08 UTC (permalink / raw)
  To: Song Liu
  Cc: linux-raid, neilb, shli, kernel-team, dan.j.williams, hch,
	liuzhengyuang521, liuzhengyuan
In-Reply-To: <20160926233050.3351081-5-songliubraving@fb.com>

On Mon, Sep 26, 2016 at 04:30:48PM -0700, Song Liu wrote:
> For Data-Only strips, we need to finish complete calculate parity and
> finish the full reconstruct write or RMW write. For simplicity, in
> the recovery, we load the stripe to stripe cache. Once the array is
> started, the stripe cache state machine will handle these stripes
> through normal write path.

please make sure not change the behavior of writethrough mode. In writethrough,
we discard data-only stripes.

Is it safe to run the state machine in recovery stage? For exmaple, md
personablity ->run is called before bitmap is initialized.

> r5c_recovery_flush_log contains the main procedure of recovery. The
> recovery code first scans through the journal and loads data to
> stripe cache. The code keeps tracks of all these stripes in a list
> (use sh->lru and ctx->cached_list), stripes in the list are
> organized in the order of its first appearance on the journal.
> During the scan, the recovery code assesses each stripe as
> Data-Parity or Data-Only.
> 
> During scan, the array may run out of stripe cache. In these cases,
> the recovery code tries to release some stripe head by replaying
> existing Data-Parity stripes. Once these replays are done, these
> stripes can be released. When releasing Data-Parity stripes is not
> enough, the recovery code will also call raid5_set_cache_size to
> increase stripe cache size.
> 
> At the end of scan, the recovery code replays all Data-Parity
> stripes, and sets proper states for Data-Only stripes. The recovery
> code also increases seq number by 10 and rewrites all Data-Only
> stripes to journal. This is to avoid confusion after repeated
> crashes. More details is explained in raid5-cache.c before
> r5c_recovery_rewrite_data_only_stripes().
...
> +r5c_recovery_analyze_meta_block(struct r5l_log *log,
> +				struct r5l_recovery_ctx *ctx,
> +				struct list_head *cached_stripe_list)
> +{
> +	struct mddev *mddev = log->rdev->mddev;
> +	struct r5conf *conf = mddev->private;
>  	struct r5l_meta_block *mb;
> -	int offset;
> +	struct r5l_payload_data_parity *payload;
> +	int mb_offset;
>  	sector_t log_offset;
> -	sector_t stripe_sector;
> +	sector_t stripe_sect;
> +	struct stripe_head *sh;
> +	int ret;
> +
> +	/* for mismatch in data blocks, we will drop all data in this mb, but
> +	 * we will still read next mb for other data with FLUSH flag, as
> +	 * io_unit could finish out of order.
> +	 */
please correct the format

> +	ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
> +	if (ret == -EINVAL)
> +		return -EAGAIN;
> +	else if (ret)
> +		return ret;
>  
>  	mb = page_address(ctx->meta_page);
> -	offset = sizeof(struct r5l_meta_block);
> +	mb_offset = sizeof(struct r5l_meta_block);
>  	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
>  
> -	while (offset < le32_to_cpu(mb->meta_size)) {
> +	while (mb_offset < le32_to_cpu(mb->meta_size)) {
>  		int dd;
>  
> -		payload = (void *)mb + offset;
> -		stripe_sector = raid5_compute_sector(conf,
> -						     le64_to_cpu(payload->location), 0, &dd, NULL);
> -		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
> -						  &offset, &log_offset))
> +		payload = (void *)mb + mb_offset;
> +		stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
> +			raid5_compute_sector(
> +				conf, le64_to_cpu(payload->location), 0, &dd,
> +				NULL)
> +			: le64_to_cpu(payload->location);
> +
> +		sh = r5c_recovery_lookup_stripe(cached_stripe_list,
> +						stripe_sect);
> +
> +		if (!sh) {
> +			sh = r5c_recovery_alloc_stripe(conf, cached_stripe_list,
> +						       stripe_sect, ctx->pos);
> +			/* cannot get stripe from raid5_get_active_stripe
> +			 * try replay some stripes
> +			 */
ditto

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH v2 3/6] r5cache: reclaim support
From: Shaohua Li @ 2016-09-28  0:34 UTC (permalink / raw)
  To: Song Liu
  Cc: linux-raid, neilb, shli, kernel-team, dan.j.williams, hch,
	liuzhengyuang521, liuzhengyuan
In-Reply-To: <20160926233050.3351081-4-songliubraving@fb.com>

On Mon, Sep 26, 2016 at 04:30:47PM -0700, Song Liu wrote:
> There are two limited resources, stripe cache and journal disk space.
> For better performance, we priotize reclaim of full stripe writes.
> To free up more journal space, we free earliest data on the journal.
> 
> In current implementation, reclaim happens when:
> 1. every R5C_RECLAIM_WAKEUP_INTERVAL (5 seconds)
> 2. when there are R5C_FULL_STRIPE_FLUSH_BATCH (8) cached full stripes
>    (r5c_check_cached_full_stripe)
> 3. when raid5_get_active_stripe sees pressure in stripe cache space
>    (r5c_check_stripe_cache_usage)
> 4. when there is pressure in journal space.
> 
> 1-3 above are straightforward.
> 
> For 4, we added 2 flags to r5conf->cache_state: R5C_LOG_TIGHT and
> R5C_LOG_CRITICAL. R5C_LOG_TIGHT is set when 2x max_free_space of
> journal space is in-use; while R5C_LOG_CRITICAL is set when 3x
> max_free_space of journal space is in-use. Where max_free_space
> = min(1/4 journal space, 10GB).
> 
> r5c_cache keeps all data in cache (not fully committed to RAID) in
> a list (stripe_in_cache). These stripes are in the order of their
> first appearance on the journal. So the log tail (last_checkpoint)
> should point to the journal_start of the first item in the list.
> 
> When R5C_LOG_TIGHT is set, r5l_reclaim_thread starts freezing
> stripes at the head of stripe_in_cache. When R5C_LOG_CRITICAL is
> set, the state machine only processes stripes at the head of
> stripe_in_cache (other stripes are added to no_space_stripes in
> r5c_cache_data and r5l_write_stripe).
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>  drivers/md/raid5-cache.c | 313 +++++++++++++++++++++++++++++++++++++++--------
>  drivers/md/raid5.c       |  31 +++--
>  drivers/md/raid5.h       |  37 ++++--
>  3 files changed, 313 insertions(+), 68 deletions(-)
> 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 0a0b16a..75b70d8 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -28,8 +28,7 @@
>  #define BLOCK_SECTORS (8)
>  
>  /*
> - * reclaim runs every 1/4 disk size or 10G reclaimable space. This can prevent
> - * recovery scans a very long log
> + * log->max_free_space is min(1/4 disk size, 10G reclaimable space)
>   */
>  #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
>  #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
> @@ -116,6 +115,9 @@ struct r5l_log {
>  
>  	/* for r5c_cache */
>  	enum r5c_state r5c_state;
> +	struct list_head stripe_in_cache; /* all stripes in the cache, with
> +					   * sh->log_start in order */
> +	spinlock_t stripe_in_cache_lock;  /* lock for stripe_in_cache */
>  };
>  
>  /*
> @@ -180,6 +182,16 @@ static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
>  	return log->device_size > used_size + size;
>  }
>  
> +static sector_t r5l_used_space(struct r5l_log *log)
> +{
> +	sector_t ret;
> +
> +	WARN_ON(!mutex_is_locked(&log->io_mutex));
> +	ret = r5l_ring_distance(log, log->last_checkpoint,
> +				log->log_start);
> +	return ret;
> +}
> +
>  static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
>  				    enum r5l_io_unit_state state)
>  {
> @@ -188,6 +200,56 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
>  	io->state = state;
>  }
>  
> +static inline int r5c_total_cached_stripes(struct r5conf *conf)
> +{
> +	return atomic_read(&conf->r5c_cached_partial_stripes) +
> +		atomic_read(&conf->r5c_cached_full_stripes);
> +}
> +
> +/*
> + * check whether we should flush some stripes to free up stripe cache
> + */
> +void r5c_check_stripe_cache_usage(struct r5conf *conf)
> +{
> +	if (!conf->log)
> +		return;
> +	spin_lock(&conf->device_lock);
> +	if (r5c_total_cached_stripes(conf) > conf->max_nr_stripes * 3 / 4 ||
> +	    atomic_read(&conf->empty_inactive_list_nr) > 0)
> +		r5c_flush_cache(conf, R5C_RECLAIM_STRIPE_GROUP);

I still worry about the max_nr_stripes usage. It can be changed at runtime. If
there are no enough stripes, should we just allocate more stripes or reclaim
stripe cache? If memory system tries to shrink stripes (eg, decrease
max_nr_stripes), will it cause deadlock for r5cache?

> +	else if (r5c_total_cached_stripes(conf) >
> +		 conf->max_nr_stripes * 1 / 2)
> +		r5c_flush_cache(conf, 1);

This one is a defensive reclaim. It should always reclaim stripes with full
data. If there are no enough such stripes, do nothing. Flushing 1 stripe would
always be wrong unless we are in critical stripe space shortage, as reclaim
involves disk cache flush and is slow, we should do aggretation as much as
possible.

> +	spin_unlock(&conf->device_lock);
> +}
> +
> +void r5c_check_cached_full_stripe(struct r5conf *conf)
> +{
> +	if (!conf->log)
> +		return;
> +	if (atomic_read(&conf->r5c_cached_full_stripes) >=
> +	    R5C_FULL_STRIPE_FLUSH_BATCH)
> +		r5l_wake_reclaim(conf->log, 0);
> +}
> +
> +static void r5c_update_log_state(struct r5l_log *log)
> +{
> +	struct r5conf *conf = log->rdev->mddev->private;
> +	sector_t used_space = r5l_used_space(log);
> +
> +	if (used_space > 3 * log->max_free_space) {
> +		set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
> +		set_bit(R5C_LOG_TIGHT, &conf->cache_state);
> +	} else if (used_space > 2 * log->max_free_space) {
> +		clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
> +		set_bit(R5C_LOG_TIGHT, &conf->cache_state);
> +	} else if (used_space < log->max_free_space) {
> +		clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
> +		clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
> +	} else  /* max_free_space < used_space < 2 * max_free_space */
> +		clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
> +}
> +
>  /*
>   * Freeze the stripe, thus send the stripe into reclaim path.
>   *
> @@ -198,10 +260,9 @@ void r5c_freeze_stripe_for_reclaim(struct stripe_head *sh)
>  {
>  	struct r5conf *conf = sh->raid_conf;
>  
> -	if (!conf->log)
> +	if (!conf->log || test_bit(STRIPE_R5C_FROZEN, &sh->state))
>  		return;
>  
> -	WARN_ON(test_bit(STRIPE_R5C_FROZEN, &sh->state));
>  	set_bit(STRIPE_R5C_FROZEN, &sh->state);

This is confusing. The WARN_ON suggests the STRIPE_R5C_FROZEN isn't set for sh,
but the change suggests it's possible the bit is set. Which one is correct?

>  
>  	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
> @@ -356,8 +417,11 @@ static struct bio *r5l_bio_alloc(struct r5l_log *log)
>  
>  static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
>  {
> +	WARN_ON(!mutex_is_locked(&log->io_mutex));
> +	WARN_ON(!r5l_has_free_space(log, BLOCK_SECTORS));
>  	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
>  
> +	r5c_update_log_state(log);
>  	/*
>  	 * If we filled up the log device start from the beginning again,
>  	 * which will require a new bio.
> @@ -475,6 +539,7 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
>  	int meta_size;
>  	int ret;
>  	struct r5l_io_unit *io;
> +	unsigned long flags;
>  
>  	meta_size =
>  		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
> @@ -518,6 +583,14 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
>  	atomic_inc(&io->pending_stripe);
>  	sh->log_io = io;
>  
> +	if (sh->log_start == MaxSector) {
> +		BUG_ON(!list_empty(&sh->r5c));
> +		sh->log_start = io->log_start;
> +		spin_lock_irqsave(&log->stripe_in_cache_lock, flags);
> +		list_add_tail(&sh->r5c,
> +			      &log->stripe_in_cache);
> +		spin_unlock_irqrestore(&log->stripe_in_cache_lock, flags);
> +	}
what if it's in writethrogh mode?

>  	return 0;
>  }
>  
> @@ -527,6 +600,7 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
>   */
>  int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
>  {
> +	struct r5conf *conf = sh->raid_conf;
>  	int write_disks = 0;
>  	int data_pages, parity_pages;
>  	int meta_size;
> @@ -590,19 +664,31 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
>  	mutex_lock(&log->io_mutex);
>  	/* meta + data */
>  	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
> -	if (!r5l_has_free_space(log, reserve)) {
> -		spin_lock(&log->no_space_stripes_lock);
> -		list_add_tail(&sh->log_list, &log->no_space_stripes);
> -		spin_unlock(&log->no_space_stripes_lock);
>  
> -		r5l_wake_reclaim(log, reserve);
> -	} else {
> -		ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
> -		if (ret) {
> -			spin_lock_irq(&log->io_list_lock);
> -			list_add_tail(&sh->log_list, &log->no_mem_stripes);
> -			spin_unlock_irq(&log->io_list_lock);
> +	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) {
> +		sector_t last_checkpoint;
> +
> +		spin_lock(&log->stripe_in_cache_lock);
> +		last_checkpoint = (list_first_entry(&log->stripe_in_cache,
> +						    struct stripe_head, r5c))->log_start;
> +		spin_unlock(&log->stripe_in_cache_lock);
> +		if (sh->log_start != last_checkpoint) {
> +			spin_lock(&log->no_space_stripes_lock);
> +			list_add_tail(&sh->log_list, &log->no_space_stripes);
> +			spin_unlock(&log->no_space_stripes_lock);
> +			mutex_unlock(&log->io_mutex);
> +			return -ENOSPC;

So if a stripe is in cache, we try to reclaim it. We should have some mechanism
to guarantee there are enough space for reclaim (eg for parity). Otherwise
there could be a deadlock because the space allocation in reclaim path is to
free space. Could you please explain how this is done?

> +		} else 	if (!r5l_has_free_space(log, reserve)) {
> +			WARN(1, "%s: run out of journal space\n", __func__);
> +			BUG();
that's scaring, why it happens?

>  		}
> +		pr_debug("%s: write sh %lu to free log space\n", __func__, sh->sector);
> +	}
> +	ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
> +	if (ret) {
> +		spin_lock_irq(&log->io_list_lock);
> +		list_add_tail(&sh->log_list, &log->no_mem_stripes);
> +		spin_unlock_irq(&log->io_list_lock);
>  	}
>  
>  	mutex_unlock(&log->io_mutex);
> @@ -639,12 +725,17 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
>  /* This will run after log space is reclaimed */
>  static void r5l_run_no_space_stripes(struct r5l_log *log)
>  {
> -	struct stripe_head *sh;
> +	struct r5conf *conf = log->rdev->mddev->private;
> +	struct stripe_head *sh, *next;
> +	sector_t last_checkpoint;
>  
>  	spin_lock(&log->no_space_stripes_lock);
> -	while (!list_empty(&log->no_space_stripes)) {
> -		sh = list_first_entry(&log->no_space_stripes,
> -				      struct stripe_head, log_list);
> +	last_checkpoint = (list_first_entry(&log->stripe_in_cache,
> +					    struct stripe_head, r5c))->log_start;
> +	list_for_each_entry_safe(sh, next, &log->no_space_stripes, log_list) {
> +		if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
> +		    sh->log_start != last_checkpoint)
> +			continue;
what's this check for?

>  		list_del_init(&sh->log_list);
>  		set_bit(STRIPE_HANDLE, &sh->state);
>  		raid5_release_stripe(sh);
> @@ -652,10 +743,32 @@ static void r5l_run_no_space_stripes(struct r5l_log *log)
>  	spin_unlock(&log->no_space_stripes_lock);
>  }
>  
> +static sector_t r5c_calculate_last_cp(struct r5conf *conf)
> +{
> +	struct stripe_head *sh;
> +	struct r5l_log *log = conf->log;
> +	sector_t end = MaxSector;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&log->stripe_in_cache_lock, flags);
> +	if (list_empty(&conf->log->stripe_in_cache)) {
> +		/* all stripes flushed */
> +		spin_unlock_irqrestore(&log->stripe_in_cache_lock, flags);
> +		return log->next_checkpoint;
> +	}
> +	sh = list_first_entry(&conf->log->stripe_in_cache,
> +			      struct stripe_head, r5c);
> +	end = sh->log_start;
> +	spin_unlock_irqrestore(&log->stripe_in_cache_lock, flags);
> +	return end;
> +}
> +
>  static sector_t r5l_reclaimable_space(struct r5l_log *log)
>  {
> +	struct r5conf *conf = log->rdev->mddev->private;
> +
>  	return r5l_ring_distance(log, log->last_checkpoint,
> -				 log->next_checkpoint);
> +				 r5c_calculate_last_cp(conf));
will this work for writethrouth?

>  }
>  
>  static void r5l_run_no_mem_stripe(struct r5l_log *log)
> @@ -830,14 +943,21 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
>  		blkdev_issue_discard(bdev, log->rdev->data_offset, end,
>  				GFP_NOIO, 0);
>  	}
> +	mutex_lock(&log->io_mutex);
> +	log->last_checkpoint = end;
> +	r5c_update_log_state(log);
> +	pr_debug("%s: set last_checkpoint = %lu\n", __func__, end);
> +
> +	log->last_cp_seq = log->next_cp_seq;
> +	mutex_unlock(&log->io_mutex);
>  }
>  
>  static void r5l_do_reclaim(struct r5l_log *log)
>  {
> +	struct r5conf *conf = log->rdev->mddev->private;
>  	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
>  	sector_t reclaimable;
>  	sector_t next_checkpoint;
> -	u64 next_cp_seq;
>  
>  	spin_lock_irq(&log->io_list_lock);
>  	/*
> @@ -860,14 +980,12 @@ static void r5l_do_reclaim(struct r5l_log *log)
>  				    log->io_list_lock);
>  	}
>  
> -	next_checkpoint = log->next_checkpoint;
> -	next_cp_seq = log->next_cp_seq;
> +	next_checkpoint = r5c_calculate_last_cp(conf);
>  	spin_unlock_irq(&log->io_list_lock);
>  
>  	BUG_ON(reclaimable < 0);
>  	if (reclaimable == 0)
>  		return;
> -
>  	/*
>  	 * write_super will flush cache of each raid disk. We must write super
>  	 * here, because the log area might be reused soon and we don't want to
> @@ -877,10 +995,7 @@ static void r5l_do_reclaim(struct r5l_log *log)
>  
>  	mutex_lock(&log->io_mutex);
>  	log->last_checkpoint = next_checkpoint;
> -	log->last_cp_seq = next_cp_seq;
why not update last_cp_seq?

>  	mutex_unlock(&log->io_mutex);
> -
> -	r5l_run_no_space_stripes(log);

I don't understand why move r5l_run_no_space_stripes to r5c_flush_cache. It's
natural we run this after some spaces are reclaimed.

>  }
>  
>  static void r5l_reclaim_thread(struct md_thread *thread)
> @@ -891,7 +1006,9 @@ static void r5l_reclaim_thread(struct md_thread *thread)
>  
>  	if (!log)
>  		return;
> +	r5c_do_reclaim(conf);
>  	r5l_do_reclaim(log);
> +	md_wakeup_thread(mddev->thread);

this wakeup is a bit strange. After we reclaim some spaces, we will rerun
pending stripes, which will wakeup mddev->thread. Do miss some wakeup in other
reclaim places?

>  }
>  
>  void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
> @@ -899,6 +1016,8 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
>  	unsigned long target;
>  	unsigned long new = (unsigned long)space; /* overflow in theory */
>  
> +	if (!log)
> +		return;
>  	do {
>  		target = log->reclaim_target;
>  		if (new < target)
> @@ -926,7 +1045,7 @@ void r5l_quiesce(struct r5l_log *log, int state)
>  		/* make sure r5l_write_super_and_discard_space exits */
>  		mddev = log->rdev->mddev;
>  		wake_up(&mddev->sb_wait);
> -		r5l_wake_reclaim(log, -1L);
> +		r5l_wake_reclaim(log, MaxSector);
>  		md_unregister_thread(&log->reclaim_thread);
>  		r5l_do_reclaim(log);
>  	}
> @@ -1207,14 +1326,39 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp)
>  	set_bit(MD_CHANGE_DEVS, &mddev->flags);
>  }
>  
> -static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
> +/*
> + * r5c_flush_cache will move stripe from cached list to handle_list or
> + * r5c_priority_list

What's the r5c_priority_list for? If you want to make sure reclaim makes
progress, I think it's the wrong way. If there are no spaces, handling other
normal stripes will mean moving them to no_space list and do nothing else. Then
the reclaim stripes will get the turn to run. There is no extra list required.

> + *
> + * return 1 if the stripe is moved, and 0 if the stripe is not moved
> + * must hold conf->device_lock
> + */
> +static int r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh,
> +			    bool priority)
>  {
> -	list_del_init(&sh->lru);
> +	BUG_ON(list_empty(&sh->lru));
> +
> +	BUG_ON(test_bit(STRIPE_R5C_PRIORITY, &sh->state) &&
> +	       !test_bit(STRIPE_HANDLE, &sh->state));
> +
> +	if (test_bit(STRIPE_R5C_PRIORITY, &sh->state))
> +		return 0;
> +	if (test_bit(STRIPE_HANDLE, &sh->state) && !priority)
> +		return 0;
> +
>  	r5c_freeze_stripe_for_reclaim(sh);
> -	atomic_inc(&conf->active_stripes);
> +	if (!test_and_set_bit(STRIPE_HANDLE, &sh->state)) {
> +		atomic_inc(&conf->active_stripes);
> +	}

shouldn't the stripe is always accounted to active before it's reclaimed? Do we
decrease the count before the stripe is reclaimed? sounds like a bug.

> +	clear_bit(STRIPE_DELAYED, &sh->state);
> +	clear_bit(STRIPE_BIT_DELAY, &sh->state);
> +	if (priority)
> +		set_bit(STRIPE_R5C_PRIORITY, &sh->state);
> +
> +	list_del_init(&sh->lru);
>  	atomic_inc(&sh->count);
> -	set_bit(STRIPE_HANDLE, &sh->state);
>  	raid5_release_stripe(sh);
> +	return 1;
>  }
>  
>  /* if num <= 0, flush all stripes
> @@ -1228,20 +1372,28 @@ int r5c_flush_cache(struct r5conf *conf, int num)
>  	assert_spin_locked(&conf->device_lock);
>  	if (!conf->log)
>  		return 0;
> +
>  	list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
> -		r5c_flush_stripe(conf, sh);
> -		count++;
> +		count += r5c_flush_stripe(conf, sh, false);
>  		if (num > 0 && count >= num && count >=
>  		    R5C_FULL_STRIPE_FLUSH_BATCH)
>  			return count;
>  	}
>  
>  	list_for_each_entry_safe(sh, next, &conf->r5c_partial_stripe_list, lru) {
> -		r5c_flush_stripe(conf, sh);
> -		count++;
> +		count += r5c_flush_stripe(conf, sh, false);
>  		if (num > 0 && count == num)
>  			return count;
>  	}
> +
> +	if (num <= 0) {
> +		list_for_each_entry_safe(sh, next, &conf->delayed_list, lru) {
> +			if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) ||
> +			    test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
> +				r5c_flush_stripe(conf, sh, false);
> +		}
> +		r5l_run_no_space_stripes(conf->log);
> +	}
>  	return count;
>  }
>  
> @@ -1349,6 +1501,7 @@ void r5c_handle_stripe_written(struct r5conf *conf,
>  			       struct stripe_head *sh) {
>  	int i;
>  	int do_wakeup = 0;
> +	unsigned long flags;
>  
>  	if (test_and_clear_bit(STRIPE_R5C_WRITTEN, &sh->state)) {
>  		WARN_ON(!test_bit(STRIPE_R5C_FROZEN, &sh->state));
> @@ -1361,6 +1514,11 @@ void r5c_handle_stripe_written(struct r5conf *conf,
>  			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
>  				do_wakeup = 1;
>  		}
> +		spin_lock_irqsave(&conf->log->stripe_in_cache_lock, flags);
> +		list_del_init(&sh->r5c);
> +		spin_unlock_irqrestore(&conf->log->stripe_in_cache_lock, flags);
> +		sh->log_start = MaxSector;
> +		clear_bit(STRIPE_R5C_PRIORITY, &sh->state);
>  	}
>  
>  	if (do_wakeup)
> @@ -1371,6 +1529,7 @@ int
>  r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
>  	       struct stripe_head_state *s)
>  {
> +	struct r5conf *conf = sh->raid_conf;
>  	int pages;
>  	int meta_size;
>  	int reserve;
> @@ -1413,19 +1572,33 @@ r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
>  	mutex_lock(&log->io_mutex);
>  	/* meta + data */
>  	reserve = (1 + pages) << (PAGE_SHIFT - 9);
> -	if (!r5l_has_free_space(log, reserve)) {
> -		spin_lock(&log->no_space_stripes_lock);
> -		list_add_tail(&sh->log_list, &log->no_space_stripes);
> -		spin_unlock(&log->no_space_stripes_lock);
>  
> -		r5l_wake_reclaim(log, reserve);
> -	} else {
> -		ret = r5l_log_stripe(log, sh, pages, 0);
> -		if (ret) {
> -			spin_lock_irq(&log->io_list_lock);
> -			list_add_tail(&sh->log_list, &log->no_mem_stripes);
> -			spin_unlock_irq(&log->io_list_lock);
> +	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) {
> +		sector_t last_checkpoint;
> +
> +		spin_lock(&log->stripe_in_cache_lock);
> +		last_checkpoint = (list_first_entry(&log->stripe_in_cache,
> +						    struct stripe_head, r5c))->log_start;
> +		spin_unlock(&log->stripe_in_cache_lock);
> +		if (sh->log_start != last_checkpoint) {
> +			spin_lock(&log->no_space_stripes_lock);
> +			list_add_tail(&sh->log_list, &log->no_space_stripes);
> +			spin_unlock(&log->no_space_stripes_lock);
> +
> +			mutex_unlock(&log->io_mutex);
> +			return -ENOSPC;
>  		}
> +		pr_debug("%s: write sh %lu to free log space\n", __func__, sh->sector);
> +	}
> +	if (!r5l_has_free_space(log, reserve)) {
> +		pr_err("%s: cannot reserve space %d\n", __func__, reserve);
> +		BUG();

same here. we should put the stripe into no_space list. If we can't allocate
space eventually, it indicates reclaim has bug.

> +	}
> +	ret = r5l_log_stripe(log, sh, pages, 0);
> +	if (ret) {
> +		spin_lock_irq(&log->io_list_lock);
> +		list_add_tail(&sh->log_list, &log->no_mem_stripes);
> +		spin_unlock_irq(&log->io_list_lock);
>  	}
>  
>  	mutex_unlock(&log->io_mutex);
> @@ -1435,12 +1608,45 @@ r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
>  void r5c_do_reclaim(struct r5conf *conf)
>  {
>  	struct r5l_log *log = conf->log;
> -
> -	assert_spin_locked(&conf->device_lock);
> +	struct stripe_head *sh, *next;
> +	int count = 0;
> +	unsigned long flags;
> +	sector_t last_checkpoint;
>  
>  	if (!log)
>  		return;
> -	r5c_flush_cache(conf, 0);
> +
> +	if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state)) {
> +		/* flush all full stripes */
> +		spin_lock_irqsave(&conf->device_lock, flags);
> +		list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru)
> +			r5c_flush_stripe(conf, sh, false);
> +		spin_unlock_irqrestore(&conf->device_lock, flags);
> +	}
> +
> +	if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
> +		spin_lock_irqsave(&log->stripe_in_cache_lock, flags);
> +		spin_lock(&conf->device_lock);
> +		last_checkpoint = (list_first_entry(&log->stripe_in_cache,
> +						    struct stripe_head, r5c))->log_start;
> +		list_for_each_entry(sh, &log->stripe_in_cache, r5c) {
> +			if (sh->log_start == last_checkpoint) {
> +				if (!list_empty(&sh->lru))
> +					r5c_flush_stripe(conf, sh, true);
> +			} else
> +				break;
> +		}
> +		spin_unlock(&conf->device_lock);
> +		spin_unlock_irqrestore(&log->stripe_in_cache_lock, flags);
> +		pr_debug("%s: flushed %d stripes for log space\n", __func__, count);
> +	} else if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
> +		spin_lock_irqsave(&conf->device_lock, flags);
> +		r5c_flush_cache(conf, R5C_RECLAIM_STRIPE_GROUP);
> +		spin_unlock_irqrestore(&conf->device_lock, flags);
> +	}
> +	wake_up(&conf->wait_for_stripe);
> +	md_wakeup_thread(conf->mddev->thread);
> +	r5l_run_no_space_stripes(log);
>  }
>  
>  static int r5l_load_log(struct r5l_log *log)
> @@ -1500,6 +1706,9 @@ create:
>  	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
>  		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
>  	log->last_checkpoint = cp;
> +	mutex_lock(&log->io_mutex);
> +	r5c_update_log_state(log);
> +	mutex_unlock(&log->io_mutex);
>  
>  	__free_page(page);
>  
> @@ -1555,6 +1764,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
>  						 log->rdev->mddev, "reclaim");
>  	if (!log->reclaim_thread)
>  		goto reclaim_thread;
> +	log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
> +
>  	init_waitqueue_head(&log->iounit_wait);
>  
>  	INIT_LIST_HEAD(&log->no_mem_stripes);
> @@ -1564,6 +1775,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
>  
>  	/* flush full stripe */
>  	log->r5c_state = R5C_STATE_WRITE_BACK;
> +	INIT_LIST_HEAD(&log->stripe_in_cache);
> +	spin_lock_init(&log->stripe_in_cache_lock);
>  
>  	if (r5l_load_log(log))
>  		goto error;
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index cc4ac1d..a3d26ec 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -301,7 +301,9 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
>  		else {
>  			clear_bit(STRIPE_DELAYED, &sh->state);
>  			clear_bit(STRIPE_BIT_DELAY, &sh->state);
> -			if (conf->worker_cnt_per_group == 0) {
> +			if (test_bit(STRIPE_R5C_PRIORITY, &sh->state))
> +				list_add_tail(&sh->lru, &conf->r5c_priority_list);
> +			else if (conf->worker_cnt_per_group == 0) {
>  				list_add_tail(&sh->lru, &conf->handle_list);
>  			} else {
>  				raid5_wakeup_stripe_thread(sh);
> @@ -327,6 +329,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
>  				if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
>  					atomic_dec(&conf->r5c_cached_partial_stripes);
>  				list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
> +				r5c_check_cached_full_stripe(conf);
>  			} else {
>  				/* not full stripe */
>  				if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
> @@ -697,9 +700,14 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
>  			}
>  			if (noblock && sh == NULL)
>  				break;
> +
> +			r5c_check_stripe_cache_usage(conf);
>  			if (!sh) {
> +				unsigned long before_jiffies;
>  				set_bit(R5_INACTIVE_BLOCKED,
>  					&conf->cache_state);
> +				r5l_wake_reclaim(conf->log, 0);
> +				before_jiffies = jiffies;
>  				wait_event_lock_irq(
>  					conf->wait_for_stripe,
>  					!list_empty(conf->inactive_list + hash) &&
> @@ -708,6 +716,9 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
>  					 || !test_bit(R5_INACTIVE_BLOCKED,
>  						      &conf->cache_state)),
>  					*(conf->hash_locks + hash));
> +				before_jiffies = jiffies - before_jiffies;
> +				if (before_jiffies > 20)
> +					pr_debug("%s: wait for sh takes %lu jiffies\n", __func__, before_jiffies);
please remove the debug code.

Thanks,
Shaohua

^ permalink raw reply

* Re: moving spares into group and checking spares
From: scar @ 2016-09-28  0:19 UTC (permalink / raw)
  To: linux-raid-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <CAJCQCtRW9REafzmyk+W6aVxqxMUWCdXNkrST1e7udrH-zp26Uw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>

Chris Murphy wrote on 09/14/2016 08:51 PM:
> On Wed, Sep 14, 2016 at 9:42 PM, scar <scar-47zfDnpWZoPQT0dZR+AlfA@public.gmane.org> wrote:
>
>> 00 80 00
>> Sep 12 08:05:34 hind kernel: [1350051.023847] end_request: critical medium
>> error, dev sdk, sector 598457896
>> Sep 12 08:05:35 hind kernel: [1350051.385810] md/raid:md0: read error
>> corrected (8 sectors at 598455848 on sdk1)
>> Sep 12 08:05:35 hind kernel: [1350051.385826] md/raid:md0: read error
>> corrected (8 sectors at 598455856 on sdk1)
>> Sep 12 08:05:35 hind kernel: [1350051.385834] md/raid:md0: read error
>> corrected (8 sectors at 598455864 on sdk1)
>> Sep 12 08:05:35 hind kernel: [1350051.385840] md/raid:md0: read error
>> corrected (8 sectors at 598455872 on sdk1)
>> Sep 12 08:05:35 hind kernel: [1350051.385847] md/raid:md0: read error
>> corrected (8 sectors at 598455880 on sdk1)
>> Sep 12 08:05:35 hind kernel: [1350051.385853] md/raid:md0: read error
>> corrected (8 sectors at 598455888 on sdk1)
>> Sep 12 08:05:35 hind kernel: [1350051.385859] md/raid:md0: read error
>> corrected (8 sectors at 598455896 on sdk1)
>> Sep 12 08:05:35 hind kernel: [1350051.385865] md/raid:md0: read error
>> corrected (8 sectors at 598455904 on sdk1)
>> Sep 12 08:05:35 hind kernel: [1350051.385873] md/raid:md0: read error
>> corrected (8 sectors at 598455912 on sdk1)
>> Sep 12 08:05:35 hind kernel: [1350051.385880] md/raid:md0: read error
>> corrected (8 sectors at 598455920 on sdk1)
>> Sep 12 13:39:43 hind kernel: [1370087.022160] md: md0: data-check done.
>>
>>
>> so it seems to be working?  although the sector reported by libata is
>> different than what md corrected
>
> Looks like it replaced the entire chunk that includes the bad sector.
> Is the chunk size 32KiB?
>

No it is 512K, and the sector reported by libata is almost 2000 sectors 
different than what mdadm reported


--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: WARNING: mismatch_cnt is not 0 on <array device>
From: Adam Goryachev @ 2016-09-27 23:09 UTC (permalink / raw)
  To: Benjammin2068, Linux-RAID
In-Reply-To: <5427ce35-f222-8a2c-486e-441c4c6ec9a6@gmail.com>

On 28/09/16 03:30, Benjammin2068 wrote:
> On 09/27/2016 11:45 AM, Wols Lists wrote:
>> I'm thinking about all this. The second section is all about recovering
>> a failing/ed array, and is new. The first section is the original,
>> that's being updated. It just feels totally wrong to me now, as it's
>> becoming a jumbled mess of old and new.
>>
>> What I'm probably going to do, is create a new first section about
>> setting up a raid system. That means that a section on monitoring will
>> actually make sense and fit between setting it up, and fixing problems.
>>
>> (And all the old stuff will end up in the "software archaeology"
>> section, so people who are still running ancient systems can find it :-)
>>
> That would be awesome.
>
>   There was a shell script out there already for MUNIN, but I modified it a little to add thresholds that throw up flags. I might change some more to handle different thresholds for different devices or the ability to monitor only RAIDs that matter.
>
> I have smartctl running for all my drives -- but that doesn't help me at the mdadm level.
>
> While you're in the docs adding stuff about mismatch_cnt, is there anything that can help someone backtrace which block cause the count to go up? This would help us mere mortals maybe go back to inspect a block or a file or something to make sure it's not corrupted.
>
Just out of interest, but I'm not sure how useful your munin monitoring 
will be... AFAIK, the mismatch_cnt value is only updated when you run a 
check, which would probably take some number of hours to complete. I 
would guess that you are unlikely to run more than one check a week or 
month.... and as soon as there is any change (unless you know the 
explanation) then you should be looking to resolve that.

Unless of course I'm wrong about when the count is updated?

Regards,
Adam
-- 
Adam Goryachev Website Managers www.websitemanagers.com.au

^ permalink raw reply

* Re: [PATCH v2 2/6] r5cache: sysfs entry r5c_state
From: Shaohua Li @ 2016-09-27 22:58 UTC (permalink / raw)
  To: Song Liu
  Cc: linux-raid, neilb, shli, kernel-team, dan.j.williams, hch,
	liuzhengyuang521, liuzhengyuan
In-Reply-To: <20160926233050.3351081-3-songliubraving@fb.com>

On Mon, Sep 26, 2016 at 04:30:46PM -0700, Song Liu wrote:
> r5c_state have 4 states:
> * no-cache;
> * write-through (write journal only);
> * write-back (w/ write cache);
> * cache-broken (journal missing or Faulty)
> 
> When there is functional write cache, r5c_state is a knob to
> switch between write-back and write-through.
> 
> When the journal device is broken, the raid array is forced
> in readonly mode. In this case, r5c_state can be used to
> remove "journal feature", and thus make the array read-write
> without journal. By writing into r5c_cache_mode, the array
> can transit from cache-broken to no-cache, which removes
> journal feature for the array.

How will user use the new interface? I suppose
-raid array is forced readonly mode (by kernel)
-user uses the new interface to remove journal
-user forces array read-write
-kernel will update superblock and array can run read/write

please describe
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>  drivers/md/raid5-cache.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/md/raid5.c       |  1 +
>  drivers/md/raid5.h       |  2 ++
>  3 files changed, 60 insertions(+)
> 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 6b28461..0a0b16a 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -54,6 +54,9 @@ enum r5c_state {
>  	R5C_STATE_CACHE_BROKEN = 3,
>  };
>  
> +static char *r5c_state_str[] = {"no-cache", "write-through",
> +				"write-back", "cache-broken"};
> +
>  struct r5l_log {
>  	struct md_rdev *rdev;
>  
> @@ -1242,6 +1245,60 @@ int r5c_flush_cache(struct r5conf *conf, int num)
>  	return count;
>  }
>  
> +ssize_t r5c_state_show(struct mddev *mddev, char *page)
> +{
> +	struct r5conf *conf = mddev->private;
> +	int val = 0;
> +	int ret = 0;
> +
> +	if (conf->log)
> +		val = conf->log->r5c_state;
> +	else if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
> +		val = R5C_STATE_CACHE_BROKEN;
> +	ret += snprintf(page, PAGE_SIZE - ret, "%d: %s\n",
> +			val, r5c_state_str[val]);
> +	return ret;
> +}
> +
> +ssize_t r5c_state_store(struct mddev *mddev, const char *page, size_t len)
> +{
> +	struct r5conf *conf = mddev->private;
> +	struct r5l_log *log = conf->log;
> +	int val;
> +
> +	if (kstrtoint(page, 10, &val))
> +		return -EINVAL;
> +	if (!log && val != R5C_STATE_NO_CACHE)
> +		return -EINVAL;
> +
> +	if (val < R5C_STATE_NO_CACHE || val > R5C_STATE_WRITE_BACK)
> +		return -EINVAL;
> +	if (val == R5C_STATE_NO_CACHE) {
> +		if (conf->log &&
> +		    !test_bit(Faulty, &log->rdev->flags)) {
> +			pr_err("md/raid:%s: journal device is in use, cannot remove it\n",
> +			       mdname(mddev));
> +			return -EINVAL;
> +		}
> +	}
> +
> +	spin_lock_irq(&conf->device_lock);
> +	if (log)
> +		conf->log->r5c_state = val;
> +	if (val == R5C_STATE_NO_CACHE) {
> +		clear_bit(MD_HAS_JOURNAL, &mddev->flags);
> +		set_bit(MD_UPDATE_SB_FLAGS, &mddev->flags);
> +	}
> +	spin_unlock_irq(&conf->device_lock);
> +	pr_info("md/raid:%s: setting r5c cache mode to %d: %s\n",
> +		mdname(mddev), val, r5c_state_str[val]);
> +	return len;
> +}

Is this safe? Eg, switching from writethrough to writeback or vice versa in
runtime with IO running? I think you'd better call mddev_suspend/mddev_resume.

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH v2 1/6] r5cache: write part of r5cache
From: Shaohua Li @ 2016-09-27 22:51 UTC (permalink / raw)
  To: Song Liu
  Cc: linux-raid, neilb, shli, kernel-team, dan.j.williams, hch,
	liuzhengyuang521, liuzhengyuan
In-Reply-To: <20160926233050.3351081-2-songliubraving@fb.com>

On Mon, Sep 26, 2016 at 04:30:45PM -0700, Song Liu wrote:

Please add more comments directly in the code. The state machine is already
very complex and please help make the code more readable. Maybe adding some
descriptions from the changelog into the code is a good start.

The patch includes both write path and reclaim path. It would be easier to
review if you can split them into 2 patches.

> With r5cache, write operation does not wait for parity calculation
> and write out, so the write latency is lower (1 write to journal
> device vs. read and then write to raid disks). Also, r5cache will
> reduce RAID overhead (multipile IO due to read-modify-write of
> parity) and provide more opportunities of full stripe writes.
> 
> r5cache adds 4 flags to stripe_head.state:
>  - STRIPE_R5C_PARTIAL_STRIPE,
>  - STRIPE_R5C_FULL_STRIPE,
>  - STRIPE_R5C_FROZEN and
>  - STRIPE_R5C_WRITTEN.
please describe why STRIPE_R5C_WRITTEN is required.

> There are some known limitations of the cache implementation:
> 
> 1. Write cache only covers full page writes (R5_OVERWRITE). Writes
>    of smaller granularity are write through.
> 2. Only one log io (sh->log_io) for each stripe at anytime. Later
>    writes for the same stripe have to wait. This can be improved by
>    moving log_io to r5dev.
There are other limitations in the implementation.
- read path must enter state machine, which is a signatificant bottleneck

- there is no checkpoint in the log, which r5l_payload_flush is designed for.
  So A recovery must replay the whole log. This is a big availatility
  shortcoming.

please explain when a stripe is in cache or is flushing to raid disks and new
write into the stripe comes, there isn't a problem (data corruption,
deadlocking).

Please make sure the code doesn't change behavior for array without a log or
just using log in writethrough mode, which includes both performance and
correctness. If log is used, the default mode should be writethrough, as the
cache is still premature and cache could be dangerous for some users (single
point failure). I'll add inline comments in places I found this problem, but
please double check. This is very important.

> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 1b1ab4a..6b28461 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -34,12 +34,26 @@
>  #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
>  #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
>  
> +/* wake up reclaim thread periodically */
> +#define R5C_RECLAIM_WAKEUP_INTERVAL (5 * HZ)
> +/* start flush with these full stripes */
> +#define R5C_FULL_STRIPE_FLUSH_BATCH 8
> +/* reclaim stripes in groups */
> +#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
> +
>  /*
>   * We only need 2 bios per I/O unit to make progress, but ensure we
>   * have a few more available to not get too tight.
>   */
>  #define R5L_POOL_SIZE	4
>  
> +enum r5c_state {
> +	R5C_STATE_NO_CACHE = 0,
> +	R5C_STATE_WRITE_THROUGH = 1,
> +	R5C_STATE_WRITE_BACK = 2,
> +	R5C_STATE_CACHE_BROKEN = 3,
> +};
> +
>  struct r5l_log {
>  	struct md_rdev *rdev;
>  
> @@ -96,6 +110,9 @@ struct r5l_log {
>  	spinlock_t no_space_stripes_lock;
>  
>  	bool need_cache_flush;
> +
> +	/* for r5c_cache */
> +	enum r5c_state r5c_state;
>  };
>  
>  /*
> @@ -168,12 +185,79 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
>  	io->state = state;
>  }
>  
> +/*
> + * Freeze the stripe, thus send the stripe into reclaim path.
> + *
> + * This function should only be called from raid5d that handling this stripe,
> + * or when holds conf->device_lock
> + */

Do you mean if this called in raid5d, the lock isn't required? Please note we
could have several threads (like raid5d) handle stripes. Is there any problem
here?

> +void r5c_freeze_stripe_for_reclaim(struct stripe_head *sh)
> +{
> +	struct r5conf *conf = sh->raid_conf;
> +
> +	if (!conf->log)
> +		return;
> +
> +	WARN_ON(test_bit(STRIPE_R5C_FROZEN, &sh->state));
> +	set_bit(STRIPE_R5C_FROZEN, &sh->state);
> +
> +	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
> +		atomic_inc(&conf->preread_active_stripes);
> +
> +	if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
> +		BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
> +		atomic_dec(&conf->r5c_cached_partial_stripes);
> +	}
> +
> +	if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
> +		BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
> +		atomic_dec(&conf->r5c_cached_full_stripes);
> +	}
> +}
> +
> +static void r5c_handle_data_cached(struct stripe_head *sh)
> +{
> +	int i;
> +
> +	for (i = sh->disks; i--; )
> +		if (test_and_clear_bit(R5_Wantcache, &sh->dev[i].flags)) {
> +			set_bit(R5_InCache, &sh->dev[i].flags);
> +			clear_bit(R5_LOCKED, &sh->dev[i].flags);
> +			atomic_inc(&sh->dev_in_cache);
> +		}
> +}
> +
> +/*
> + * this journal write must contain full parity,
> + * it may also contain some data pages
> + */
> +static void r5c_handle_parity_cached(struct stripe_head *sh)
> +{
> +	int i;
> +
> +	for (i = sh->disks; i--; )
> +		if (test_bit(R5_InCache, &sh->dev[i].flags))
> +			set_bit(R5_Wantwrite, &sh->dev[i].flags);
> +	set_bit(STRIPE_R5C_WRITTEN, &sh->state);
> +}
> +
> +static void r5c_finish_cache_stripe(struct stripe_head *sh)
> +{

I'm hoping this one has a
	if (not in writeback mode)
		return
so it's clearly this is only for writeback mode.

> +	if (test_bit(STRIPE_R5C_FROZEN, &sh->state))
> +		r5c_handle_parity_cached(sh);
> +	else
> +		r5c_handle_data_cached(sh);
> +}
> +
>  static void r5l_io_run_stripes(struct r5l_io_unit *io)
>  {
>  	struct stripe_head *sh, *next;
>  
>  	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
>  		list_del_init(&sh->log_list);
> +
> +		r5c_finish_cache_stripe(sh);
> +
>  		set_bit(STRIPE_HANDLE, &sh->state);
>  		raid5_release_stripe(sh);
>  	}
> @@ -402,7 +486,8 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
>  	io = log->current_io;
>  
>  	for (i = 0; i < sh->disks; i++) {
> -		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
> +		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) &&
> +		    !test_bit(R5_Wantcache, &sh->dev[i].flags))
>  			continue;
>  		if (i == sh->pd_idx || i == sh->qd_idx)
>  			continue;
> @@ -412,18 +497,19 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
>  		r5l_append_payload_page(log, sh->dev[i].page);
>  	}
>  
> -	if (sh->qd_idx >= 0) {
> +	if (parity_pages == 2) {
>  		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
>  					sh->sector, sh->dev[sh->pd_idx].log_checksum,
>  					sh->dev[sh->qd_idx].log_checksum, true);
>  		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
>  		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
> -	} else {
> +	} else if (parity_pages == 1) {
>  		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
>  					sh->sector, sh->dev[sh->pd_idx].log_checksum,
>  					0, false);
>  		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
> -	}
> +	} else
> +		BUG_ON(parity_pages != 0);
>  
>  	list_add_tail(&sh->log_list, &io->stripe_list);
>  	atomic_inc(&io->pending_stripe);
> @@ -432,7 +518,6 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
>  	return 0;
>  }
>  
> -static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
>  /*
>   * running in raid5d, where reclaim could wait for raid5d too (when it flushes
>   * data from log to raid disks), so we shouldn't wait for reclaim here
> @@ -456,11 +541,17 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
>  		return -EAGAIN;
>  	}
>  
> +	WARN_ON(!test_bit(STRIPE_R5C_FROZEN, &sh->state));

is this set even for writethrough?

>  	for (i = 0; i < sh->disks; i++) {
>  		void *addr;
>  
>  		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
>  			continue;
> +
> +		if (test_bit(R5_InCache, &sh->dev[i].flags))
> +			continue;
> +
>  		write_disks++;
>  		/* checksum is already calculated in last run */
>  		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
> @@ -473,6 +564,9 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
>  	parity_pages = 1 + !!(sh->qd_idx >= 0);
>  	data_pages = write_disks - parity_pages;
>  
> +	pr_debug("%s: write %d data_pages and %d parity_pages\n",
> +		 __func__, data_pages, parity_pages);
> +
>  	meta_size =
>  		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
>  		 * data_pages) +
> @@ -735,7 +829,6 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
>  	}
>  }
>  
> -
>  static void r5l_do_reclaim(struct r5l_log *log)
>  {
>  	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
> @@ -798,7 +891,7 @@ static void r5l_reclaim_thread(struct md_thread *thread)
>  	r5l_do_reclaim(log);
>  }
>  
> -static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
> +void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
>  {
>  	unsigned long target;
>  	unsigned long new = (unsigned long)space; /* overflow in theory */
> @@ -1111,6 +1204,188 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp)
>  	set_bit(MD_CHANGE_DEVS, &mddev->flags);
>  }
>  
> +static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
> +{
> +	list_del_init(&sh->lru);
> +	r5c_freeze_stripe_for_reclaim(sh);
> +	atomic_inc(&conf->active_stripes);
> +	atomic_inc(&sh->count);
> +	set_bit(STRIPE_HANDLE, &sh->state);
> +	raid5_release_stripe(sh);
> +}
> +
> +/* if num <= 0, flush all stripes
> + * if num > 0, flush at most num stripes
> + */
comments should be
/*
 * xxx
 */

> +int r5c_flush_cache(struct r5conf *conf, int num)
> +{
> +	int count = 0;
> +	struct stripe_head *sh, *next;
> +
> +	assert_spin_locked(&conf->device_lock);
> +	if (!conf->log)
> +		return 0;
> +	list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
> +		r5c_flush_stripe(conf, sh);
> +		count++;
> +		if (num > 0 && count >= num && count >=
> +		    R5C_FULL_STRIPE_FLUSH_BATCH)
> +			return count;
> +	}
> +
> +	list_for_each_entry_safe(sh, next, &conf->r5c_partial_stripe_list, lru) {
> +		r5c_flush_stripe(conf, sh);
> +		count++;
> +		if (num > 0 && count == num)
> +			return count;
> +	}
> +	return count;
> +}
> +
> +int r5c_handle_stripe_dirtying(struct r5conf *conf,
> +			       struct stripe_head *sh,
> +			       struct stripe_head_state *s,
> +			       int disks) {
> +	struct r5l_log *log = conf->log;
> +	int i;
> +	struct r5dev *dev;
> +
> +	if (!log || test_bit(STRIPE_R5C_FROZEN, &sh->state))
> +		return -EAGAIN;
> +
> +	if (conf->log->r5c_state == R5C_STATE_WRITE_THROUGH ||
> +	    conf->quiesce != 0 || conf->mddev->degraded != 0) {
> +		/* write through mode */
> +		r5c_freeze_stripe_for_reclaim(sh);
> +		return -EAGAIN;

will this change behavior of R5C_STATE_WRITE_THROUGH?
r5c_freeze_stripe_for_reclaim does change something not related to writethrough
mode.

The quiesce check sounds not necessary. The patch flush all caches in quiesce,
so no IO is running in quiesce state.

> +	}
> +
> +	s->to_cache = 0;
> +
> +	for (i = disks; i--; ) {
> +		dev = &sh->dev[i];
> +		/* if none-overwrite, use the reclaim path (write through) */
> +		if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
> +		    !test_bit(R5_InCache, &dev->flags)) {
> +			r5c_freeze_stripe_for_reclaim(sh);
> +			return -EAGAIN;
> +		}
> +	}
> +
> +	for (i = disks; i--; ) {
> +		dev = &sh->dev[i];
> +		if (dev->towrite) {
> +			set_bit(R5_Wantcache, &dev->flags);
> +			set_bit(R5_Wantdrain, &dev->flags);
> +			set_bit(R5_LOCKED, &dev->flags);
> +			s->to_cache++;
> +		}
> +	}
> +
> +	if (s->to_cache)
> +		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
> +
> +	return 0;
> +}
> +
> +void r5c_handle_stripe_written(struct r5conf *conf,
> +			       struct stripe_head *sh) {
> +	int i;
> +	int do_wakeup = 0;
> +
> +	if (test_and_clear_bit(STRIPE_R5C_WRITTEN, &sh->state)) {
> +		WARN_ON(!test_bit(STRIPE_R5C_FROZEN, &sh->state));
> +		clear_bit(STRIPE_R5C_FROZEN, &sh->state);
> +
> +		for (i = sh->disks; i--; ) {
> +			if (test_and_clear_bit(R5_InCache, &sh->dev[i].flags))
> +				atomic_dec(&sh->dev_in_cache);
> +			clear_bit(R5_UPTODATE, &sh->dev[i].flags);
> +			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
> +				do_wakeup = 1;
> +		}
> +	}
> +
> +	if (do_wakeup)
> +		wake_up(&conf->wait_for_overlap);
> +}
> +
> +int
> +r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
> +	       struct stripe_head_state *s)
> +{
> +	int pages;
> +	int meta_size;
> +	int reserve;
> +	int i;
> +	int ret = 0;
> +	int page_count = 0;
> +
> +	BUG_ON(!log);
> +	BUG_ON(s->to_cache == 0);
> +
> +	for (i = 0; i < sh->disks; i++) {
> +		void *addr;
> +
> +		if (!test_bit(R5_Wantcache, &sh->dev[i].flags))
> +			continue;
> +		addr = kmap_atomic(sh->dev[i].page);
> +		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
> +						    addr, PAGE_SIZE);
> +		kunmap_atomic(addr);
> +		page_count++;
> +	}
> +	WARN_ON(page_count != s->to_cache);
> +
> +	pages = s->to_cache;
> +
> +	meta_size =
> +		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
> +		 * pages);
> +	/* Doesn't work with very big raid array */
> +	if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
> +		return -EINVAL;
> +
> +	/*
> +	 * The stripe must enter state machine again to call endio, so
> +	 * don't delay.
> +	 */
> +	clear_bit(STRIPE_DELAYED, &sh->state);
> +	atomic_inc(&sh->count);
> +
> +	mutex_lock(&log->io_mutex);
> +	/* meta + data */
> +	reserve = (1 + pages) << (PAGE_SHIFT - 9);
> +	if (!r5l_has_free_space(log, reserve)) {
> +		spin_lock(&log->no_space_stripes_lock);
> +		list_add_tail(&sh->log_list, &log->no_space_stripes);
> +		spin_unlock(&log->no_space_stripes_lock);
> +
> +		r5l_wake_reclaim(log, reserve);
> +	} else {
> +		ret = r5l_log_stripe(log, sh, pages, 0);
> +		if (ret) {
> +			spin_lock_irq(&log->io_list_lock);
> +			list_add_tail(&sh->log_list, &log->no_mem_stripes);
> +			spin_unlock_irq(&log->io_list_lock);
> +		}
> +	}
> +
> +	mutex_unlock(&log->io_mutex);
> +	return 0;
> +}
> +
> +void r5c_do_reclaim(struct r5conf *conf)
> +{
> +	struct r5l_log *log = conf->log;
> +
> +	assert_spin_locked(&conf->device_lock);
> +
> +	if (!log)
> +		return;

skip for writethrough

> +	r5c_flush_cache(conf, 0);
> +}
> +
>  static int r5l_load_log(struct r5l_log *log)
>  {
>  	struct md_rdev *rdev = log->rdev;
> @@ -1230,6 +1505,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
>  	INIT_LIST_HEAD(&log->no_space_stripes);
>  	spin_lock_init(&log->no_space_stripes_lock);
>  
> +	/* flush full stripe */
> +	log->r5c_state = R5C_STATE_WRITE_BACK;

Should be R5C_STATE_WRITE_THROUGH
> +
>  	if (r5l_load_log(log))
>  		goto error;
>  
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index f94472d..25b411d 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -316,8 +316,25 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
>  			    < IO_THRESHOLD)
>  				md_wakeup_thread(conf->mddev->thread);
>  		atomic_dec(&conf->active_stripes);
> -		if (!test_bit(STRIPE_EXPANDING, &sh->state))
> -			list_add_tail(&sh->lru, temp_inactive_list);
> +		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
> +			if (atomic_read(&sh->dev_in_cache) == 0) {
> +				list_add_tail(&sh->lru, temp_inactive_list);
> +			} else if (atomic_read(&sh->dev_in_cache) ==
> +				   conf->raid_disks - conf->max_degraded) {
> +				/* full stripe */
> +				if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
> +					atomic_inc(&conf->r5c_cached_full_stripes);
> +				if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
> +					atomic_dec(&conf->r5c_cached_partial_stripes);
> +				list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
> +			} else {
> +				/* not full stripe */
> +				if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
> +						      &sh->state))
> +					atomic_inc(&conf->r5c_cached_partial_stripes);
> +				list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
> +			}
> +		}
>  	}
>  }
>  
> @@ -901,6 +918,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
>  
>  	might_sleep();
>  
> +	if (s->to_cache) {
> +		if (r5c_cache_data(conf->log, sh, s) == 0)
> +			return;

At this stage we fallback to no cache. But I don't see R5_Wantcache is cleared,
is it a problem?

> +		/* array is too big that meta data size > PAGE_SIZE  */
> +		r5c_freeze_stripe_for_reclaim(sh);
> +	}
> +
>  	if (r5l_write_stripe(conf->log, sh) == 0)
>  		return;
>  	for (i = disks; i--; ) {
> @@ -1029,6 +1053,7 @@ again:
>  
>  			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
>  				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
> +
>  			sh->dev[i].vec.bv_page = sh->dev[i].page;
>  			bi->bi_vcnt = 1;
>  			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
> @@ -1115,7 +1140,7 @@ again:
>  static struct dma_async_tx_descriptor *
>  async_copy_data(int frombio, struct bio *bio, struct page **page,
>  	sector_t sector, struct dma_async_tx_descriptor *tx,
> -	struct stripe_head *sh)
> +	struct stripe_head *sh, int no_skipcopy)
>  {
>  	struct bio_vec bvl;
>  	struct bvec_iter iter;
> @@ -1155,7 +1180,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
>  			if (frombio) {
>  				if (sh->raid_conf->skip_copy &&
>  				    b_offset == 0 && page_offset == 0 &&
> -				    clen == STRIPE_SIZE)
> +				    clen == STRIPE_SIZE &&
> +				    !no_skipcopy)
>  					*page = bio_page;
>  				else
>  					tx = async_memcpy(*page, bio_page, page_offset,
> @@ -1237,7 +1263,7 @@ static void ops_run_biofill(struct stripe_head *sh)
>  			while (rbi && rbi->bi_iter.bi_sector <
>  				dev->sector + STRIPE_SECTORS) {
>  				tx = async_copy_data(0, rbi, &dev->page,
> -					dev->sector, tx, sh);
> +						     dev->sector, tx, sh, 0);
>  				rbi = r5_next_bio(rbi, dev->sector);
>  			}
>  		}
> @@ -1364,7 +1390,8 @@ static int set_syndrome_sources(struct page **srcs,
>  		if (i == sh->qd_idx || i == sh->pd_idx ||
>  		    (srctype == SYNDROME_SRC_ALL) ||
>  		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
> -		     test_bit(R5_Wantdrain, &dev->flags)) ||
> +		     (test_bit(R5_Wantdrain, &dev->flags) ||
> +		      test_bit(R5_InCache, &dev->flags))) ||
>  		    (srctype == SYNDROME_SRC_WRITTEN &&
>  		     dev->written))
>  			srcs[slot] = sh->dev[i].page;
> @@ -1543,9 +1570,18 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
>  static void ops_complete_prexor(void *stripe_head_ref)
>  {
>  	struct stripe_head *sh = stripe_head_ref;
> +	int i;
>  
>  	pr_debug("%s: stripe %llu\n", __func__,
>  		(unsigned long long)sh->sector);
> +
> +	for (i = sh->disks; i--; )
> +		if (sh->dev[i].page != sh->dev[i].orig_page) {
> +			struct page *p = sh->dev[i].page;
> +
> +			sh->dev[i].page = sh->dev[i].orig_page;
> +			put_page(p);
What if array hasn't log but skipcopy is enabled?

> +		}
>  }
>  
>  static struct dma_async_tx_descriptor *
> @@ -1567,7 +1603,8 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
>  	for (i = disks; i--; ) {
>  		struct r5dev *dev = &sh->dev[i];
>  		/* Only process blocks that are known to be uptodate */
> -		if (test_bit(R5_Wantdrain, &dev->flags))
> +		if (test_bit(R5_Wantdrain, &dev->flags) ||
> +		    test_bit(R5_InCache, &dev->flags))
>  			xor_srcs[count++] = dev->page;
>  	}
>  
> @@ -1618,6 +1655,10 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
>  
>  again:
>  			dev = &sh->dev[i];
> +			if (test_and_clear_bit(R5_InCache, &dev->flags)) {
> +				BUG_ON(atomic_read(&sh->dev_in_cache) == 0);
> +				atomic_dec(&sh->dev_in_cache);
> +			}
>  			spin_lock_irq(&sh->stripe_lock);
>  			chosen = dev->towrite;
>  			dev->towrite = NULL;
> @@ -1625,7 +1666,8 @@ again:
>  			BUG_ON(dev->written);
>  			wbi = dev->written = chosen;
>  			spin_unlock_irq(&sh->stripe_lock);
> -			WARN_ON(dev->page != dev->orig_page);
> +			if (!test_bit(R5_Wantcache, &dev->flags))
> +				WARN_ON(dev->page != dev->orig_page);
>  
>  			while (wbi && wbi->bi_iter.bi_sector <
>  				dev->sector + STRIPE_SECTORS) {
> @@ -1637,8 +1679,10 @@ again:
>  					set_bit(R5_Discard, &dev->flags);
>  				else {
>  					tx = async_copy_data(1, wbi, &dev->page,
> -						dev->sector, tx, sh);
> -					if (dev->page != dev->orig_page) {
> +							     dev->sector, tx, sh,
> +							     test_bit(R5_Wantcache, &dev->flags));
> +					if (dev->page != dev->orig_page &&
> +					    !test_bit(R5_Wantcache, &dev->flags)) {
>  						set_bit(R5_SkipCopy, &dev->flags);
>  						clear_bit(R5_UPTODATE, &dev->flags);
>  						clear_bit(R5_OVERWRITE, &dev->flags);
> @@ -1746,7 +1790,8 @@ again:
>  		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
>  		for (i = disks; i--; ) {
>  			struct r5dev *dev = &sh->dev[i];
> -			if (head_sh->dev[i].written)
> +			if (head_sh->dev[i].written ||
> +			    test_bit(R5_InCache, &head_sh->dev[i].flags))
>  				xor_srcs[count++] = dev->page;
>  		}
>  	} else {
> @@ -2001,6 +2046,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
>  		INIT_LIST_HEAD(&sh->batch_list);
>  		INIT_LIST_HEAD(&sh->lru);
>  		atomic_set(&sh->count, 1);
> +		atomic_set(&sh->dev_in_cache, 0);
>  		for (i = 0; i < disks; i++) {
>  			struct r5dev *dev = &sh->dev[i];
>  
> @@ -2887,6 +2933,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
>  				if (!expand)
>  					clear_bit(R5_UPTODATE, &dev->flags);
>  				s->locked++;
> +			} else if (test_bit(R5_InCache, &dev->flags)) {
> +				set_bit(R5_LOCKED, &dev->flags);
> +				s->locked++;
>  			}
>  		}
>  		/* if we are not expanding this is a proper write request, and
> @@ -2926,6 +2975,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
>  				set_bit(R5_LOCKED, &dev->flags);
>  				clear_bit(R5_UPTODATE, &dev->flags);
>  				s->locked++;
> +			} else if (test_bit(R5_InCache, &dev->flags)) {
> +				set_bit(R5_LOCKED, &dev->flags);
> +				s->locked++;
>  			}
>  		}
>  		if (!s->locked)
> @@ -3577,6 +3629,9 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>  	int rmw = 0, rcw = 0, i;
>  	sector_t recovery_cp = conf->mddev->recovery_cp;
>  
> +	if (r5c_handle_stripe_dirtying(conf, sh, s, disks) == 0)
> +		return;
> +
>  	/* Check whether resync is now happening or should start.
>  	 * If yes, then the array is dirty (after unclean shutdown or
>  	 * initial creation), so parity in some stripes might be inconsistent.
> @@ -3597,9 +3652,12 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>  	} else for (i = disks; i--; ) {
>  		/* would I have to read this buffer for read_modify_write */
>  		struct r5dev *dev = &sh->dev[i];
> -		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
> +		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
> +		     test_bit(R5_InCache, &dev->flags)) &&
>  		    !test_bit(R5_LOCKED, &dev->flags) &&
> -		    !(test_bit(R5_UPTODATE, &dev->flags) ||
> +		    !((test_bit(R5_UPTODATE, &dev->flags) &&
> +		       (!test_bit(R5_InCache, &dev->flags) ||
> +			dev->page != dev->orig_page)) ||
>  		      test_bit(R5_Wantcompute, &dev->flags))) {
>  			if (test_bit(R5_Insync, &dev->flags))
>  				rmw++;
> @@ -3611,13 +3669,15 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>  		    i != sh->pd_idx && i != sh->qd_idx &&
>  		    !test_bit(R5_LOCKED, &dev->flags) &&
>  		    !(test_bit(R5_UPTODATE, &dev->flags) ||
> -		    test_bit(R5_Wantcompute, &dev->flags))) {
> +		      test_bit(R5_InCache, &dev->flags) ||
> +		      test_bit(R5_Wantcompute, &dev->flags))) {
>  			if (test_bit(R5_Insync, &dev->flags))
>  				rcw++;
>  			else
>  				rcw += 2*disks;
>  		}
>  	}
> +
>  	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
>  		(unsigned long long)sh->sector, rmw, rcw);
>  	set_bit(STRIPE_HANDLE, &sh->state);
> @@ -3629,10 +3689,18 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>  					  (unsigned long long)sh->sector, rmw);
>  		for (i = disks; i--; ) {
>  			struct r5dev *dev = &sh->dev[i];
> -			if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
> +			if (test_bit(R5_InCache, &dev->flags) &&
> +			    dev->page == dev->orig_page)
> +				dev->page = alloc_page(GFP_NOIO);  /* prexor */
> +
> +			if ((dev->towrite ||
> +			     i == sh->pd_idx || i == sh->qd_idx ||
> +			     test_bit(R5_InCache, &dev->flags)) &&
>  			    !test_bit(R5_LOCKED, &dev->flags) &&
> -			    !(test_bit(R5_UPTODATE, &dev->flags) ||
> -			    test_bit(R5_Wantcompute, &dev->flags)) &&
> +			    !((test_bit(R5_UPTODATE, &dev->flags) &&
> +			       (!test_bit(R5_InCache, &dev->flags) ||
> +				dev->page != dev->orig_page)) ||
> +			      test_bit(R5_Wantcompute, &dev->flags)) &&
>  			    test_bit(R5_Insync, &dev->flags)) {
>  				if (test_bit(STRIPE_PREREAD_ACTIVE,
>  					     &sh->state)) {
> @@ -3658,6 +3726,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>  			    i != sh->pd_idx && i != sh->qd_idx &&
>  			    !test_bit(R5_LOCKED, &dev->flags) &&
>  			    !(test_bit(R5_UPTODATE, &dev->flags) ||
> +			      test_bit(R5_InCache, &dev->flags) ||
>  			      test_bit(R5_Wantcompute, &dev->flags))) {
>  				rcw++;
>  				if (test_bit(R5_Insync, &dev->flags) &&
> @@ -3697,7 +3766,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
>  	 */
>  	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
>  	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
> -	    !test_bit(STRIPE_BIT_DELAY, &sh->state)))
> +	     !test_bit(STRIPE_BIT_DELAY, &sh->state)))
>  		schedule_reconstruction(sh, s, rcw == 0, 0);
>  }
>  
> @@ -4010,6 +4079,45 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
>  	async_tx_quiesce(&tx);
>  }

please consider move r5c_return_dev_pending_writes/r5c_handle_cached_data_endio
to raid5-cache.c, I'd like the log related code self-contained if possible.
 
> +static void
> +r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
> +			      struct bio_list *return_bi)
> +{
> +	struct bio *wbi, *wbi2;
> +
> +	wbi = dev->written;
> +	dev->written = NULL;
> +	while (wbi && wbi->bi_iter.bi_sector <
> +	       dev->sector + STRIPE_SECTORS) {
> +		wbi2 = r5_next_bio(wbi, dev->sector);
> +		if (!raid5_dec_bi_active_stripes(wbi)) {
> +			md_write_end(conf->mddev);
> +			bio_list_add(return_bi, wbi);
> +		}
> +		wbi = wbi2;
> +	}
> +}
> +
> +static void r5c_handle_cached_data_endio(struct r5conf *conf,
> +	  struct stripe_head *sh, int disks, struct bio_list *return_bi)
> +{
> +	int i;
> +
> +	for (i = sh->disks; i--; ) {
> +		if (test_bit(R5_InCache, &sh->dev[i].flags) &&
> +		    sh->dev[i].written) {
> +			set_bit(R5_UPTODATE, &sh->dev[i].flags);
> +			r5c_return_dev_pending_writes(conf, &sh->dev[i],
> +						      return_bi);
> +			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
> +					STRIPE_SECTORS,
> +					!test_bit(STRIPE_DEGRADED, &sh->state),
> +					0);
> +		}
> +	}
> +	r5l_stripe_write_finished(sh);
> +}
> +
>  /*
>   * handle_stripe - do things to a stripe.
>   *
> @@ -4188,6 +4296,10 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
>  			if (rdev && !test_bit(Faulty, &rdev->flags))
>  				do_recovery = 1;
>  		}
> +		if (test_bit(R5_InCache, &dev->flags) && dev->written)
> +			s->just_cached++;
> +		if (test_bit(R5_Wantcache, &dev->flags) && dev->written)
> +			s->want_cache++;
>  	}
>  	if (test_bit(STRIPE_SYNCING, &sh->state)) {
>  		/* If there is a failed device being replaced,
> @@ -4353,6 +4465,16 @@ static void handle_stripe(struct stripe_head *sh)
>  
>  	analyse_stripe(sh, &s);
>  
> +	if (s.want_cache) {
> +		/* we have finished r5c_handle_stripe_dirtying and
> +		 * ops_run_biodrain, but r5c_cache_data didn't finish because
> +		 * the journal device didn't have enough space. This time we
> +		 * should skip handle_stripe_dirtying and ops_run_biodrain
> +		 */
> +		s.to_cache = s.want_cache;
> +		goto finish;
> +	}
> +
>  	if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
>  		goto finish;
>  
> @@ -4416,7 +4538,7 @@ static void handle_stripe(struct stripe_head *sh)
>  			struct r5dev *dev = &sh->dev[i];
>  			if (test_bit(R5_LOCKED, &dev->flags) &&
>  				(i == sh->pd_idx || i == sh->qd_idx ||
> -				 dev->written)) {
> +				 dev->written || test_bit(R5_InCache, &dev->flags))) {
>  				pr_debug("Writing block %d\n", i);
>  				set_bit(R5_Wantwrite, &dev->flags);
>  				if (prexor)
> @@ -4456,6 +4578,12 @@ static void handle_stripe(struct stripe_head *sh)
>  				 test_bit(R5_Discard, &qdev->flags))))))
>  		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
>  
> +	if (s.just_cached)
> +		r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
> +
> +	if (test_bit(STRIPE_R5C_FROZEN, &sh->state))
> +		r5l_stripe_write_finished(sh);
what's this for?

> +
>  	/* Now we might consider reading some blocks, either to check/generate
>  	 * parity, or to satisfy requests
>  	 * or to load a block that is being partially written.
> @@ -4467,13 +4595,17 @@ static void handle_stripe(struct stripe_head *sh)
>  	    || s.expanding)
>  		handle_stripe_fill(sh, &s, disks);
>  
> -	/* Now to consider new write requests and what else, if anything
> -	 * should be read.  We do not handle new writes when:
> +	r5c_handle_stripe_written(conf, sh);

please explain why this is required?

> +
> +	/* Now to consider new write requests, cache write back and what else,
> +	 * if anything should be read.  We do not handle new writes when:
>  	 * 1/ A 'write' operation (copy+xor) is already in flight.
>  	 * 2/ A 'check' operation is in flight, as it may clobber the parity
>  	 *    block.
> +	 * 3/ A r5c cache log write is in flight.
>  	 */
Chnage comments format

> -	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
> +	if ((s.to_write || test_bit(STRIPE_R5C_FROZEN, &sh->state)) &&
> +	     !sh->reconstruct_state && !sh->check_state && !sh->log_io)
>  		handle_stripe_dirtying(conf, sh, &s, disks);
>  
>  	/* maybe we need to check and possibly fix the parity for this stripe
> @@ -5192,7 +5324,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
>  	 * later we might have to read it again in order to reconstruct
>  	 * data on failed drives.
>  	 */
> -	if (rw == READ && mddev->degraded == 0 &&
> +	if (rw == READ && mddev->degraded == 0 && conf->log == NULL &&
>  	    mddev->reshape_position == MaxSector) {
>  		bi = chunk_aligned_read(mddev, bi);
>  		if (!bi)
This should be only for R5C_STATE_WRITE_BACK.

> @@ -5917,6 +6049,7 @@ static void raid5d(struct md_thread *thread)
>  			md_check_recovery(mddev);
>  			spin_lock_irq(&conf->device_lock);
>  		}
> +		r5c_do_reclaim(conf);
>  	}
>  	pr_debug("%d stripes handled\n", handled);
>  
> @@ -6583,6 +6716,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
>  	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
>  		INIT_LIST_HEAD(conf->temp_inactive_list + i);
>  
> +	atomic_set(&conf->r5c_cached_full_stripes, 0);
> +	INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
> +	atomic_set(&conf->r5c_cached_partial_stripes, 0);
> +	INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
> +
>  	conf->level = mddev->new_level;
>  	conf->chunk_sectors = mddev->new_chunk_sectors;
>  	if (raid5_alloc_percpu(conf) != 0)
> @@ -7662,8 +7800,11 @@ static void raid5_quiesce(struct mddev *mddev, int state)
>  		/* '2' tells resync/reshape to pause so that all
>  		 * active stripes can drain
>  		 */
> +		r5c_flush_cache(conf, 0);
>  		conf->quiesce = 2;
>  		wait_event_cmd(conf->wait_for_quiescent,
> +				    atomic_read(&conf->r5c_cached_partial_stripes) == 0 &&
> +				    atomic_read(&conf->r5c_cached_full_stripes) == 0 &&
I don't see a wakeup for this after the check condition is met.

Thanks,
Shaohua

^ permalink raw reply

* Re: WARNING: mismatch_cnt is not 0 on <array device>
From: Wols Lists @ 2016-09-27 19:35 UTC (permalink / raw)
  To: Benjammin2068, Linux-RAID
In-Reply-To: <5427ce35-f222-8a2c-486e-441c4c6ec9a6@gmail.com>

On 27/09/16 18:30, Benjammin2068 wrote:
> I have smartctl running for all my drives -- but that doesn't help me at the mdadm level.
> 
> While you're in the docs adding stuff about mismatch_cnt, is there anything that can help someone backtrace which block cause the count to go up? This would help us mere mortals maybe go back to inspect a block or a file or something to make sure it's not corrupted.

I'm planning to go back through the archives as best I can (I keep a
local archive :-), and I'm starring everything of interest. So if
anybody can chime in, I'll make a note! :-)

All this should eventually get there ... and I'm planning to add a
kernel programming section too - not least because I want to do some!

Cheers,
Wol

^ permalink raw reply

* Re: WARNING: mismatch_cnt is not 0 on <array device>
From: Benjammin2068 @ 2016-09-27 17:30 UTC (permalink / raw)
  To: Linux-RAID
In-Reply-To: <57EAA214.8030603@youngman.org.uk>

On 09/27/2016 11:45 AM, Wols Lists wrote:
>
> I'm thinking about all this. The second section is all about recovering
> a failing/ed array, and is new. The first section is the original,
> that's being updated. It just feels totally wrong to me now, as it's
> becoming a jumbled mess of old and new.
>
> What I'm probably going to do, is create a new first section about
> setting up a raid system. That means that a section on monitoring will
> actually make sense and fit between setting it up, and fixing problems.
>
> (And all the old stuff will end up in the "software archaeology"
> section, so people who are still running ancient systems can find it :-)
>

That would be awesome.

 There was a shell script out there already for MUNIN, but I modified it a little to add thresholds that throw up flags. I might change some more to handle different thresholds for different devices or the ability to monitor only RAIDs that matter.

I have smartctl running for all my drives -- but that doesn't help me at the mdadm level.

While you're in the docs adding stuff about mismatch_cnt, is there anything that can help someone backtrace which block cause the count to go up? This would help us mere mortals maybe go back to inspect a block or a file or something to make sure it's not corrupted.

 -Ben

^ permalink raw reply

* Re: WARNING: mismatch_cnt is not 0 on <array device>
From: Benjammin2068 @ 2016-09-27 17:24 UTC (permalink / raw)
  To: Linux-RAID
In-Reply-To: <20160927213624.739f80ba@natsu>

On 09/27/2016 11:36 AM, Roman Mamedov wrote:
> On Tue, 27 Sep 2016 11:27:13 -0500
> Benjammin2068 <benjammin2068@gmail.com> wrote:
>
>> I think I did find the problem. The card was running hot due to airflow.
>> That's been remedied (I hope) -- the temp sensor on the heat-sink for the
>> PCIe controller now sits around 45'C which is fine. Before it was >=
>> 60'C . :O
> I wouldn't trust such controller anyway. 15 degrees difference and it
> (allegedly) gives you silent data corruption? What if you have a particularly
> hot day, and/or the AC is out for a few hours.
> There is a lot of better failure modes than this (honestly reported read or
> CRC errors for a start, or heck, even complete lock-up of the controller would
> be more preferrable).
>

I think it was running way hotter than that. I could only get to it to measure in a certain timespan and it had already cooled off. (that's what heatsinks do)

by the time I got a temp sensor on it -- it was too late.

  -Ben

^ permalink raw reply

* [PATCH 16/16] md/bitmap: Delete an unwanted space in read_sb_page()
From: SF Markus Elfring @ 2016-09-27 17:04 UTC (permalink / raw)
  To: linux-raid, Shaohua Li; +Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <30938c84-20a7-0f13-bdda-a2d2109a6dac@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 27 Sep 2016 18:18:16 +0200

The script "checkpatch.pl" can point information out like the following.

ERROR: space prohibited after that '!' (ctx:BxW)

Thus fix the affected source code place.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/bitmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index d029576..c6a6d59 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -154,7 +154,7 @@ static int read_sb_page(struct mddev *mddev, loff_t offset,
 	sector_t target;
 
 	rdev_for_each(rdev, mddev) {
-		if (! test_bit(In_sync, &rdev->flags)
+		if (!test_bit(In_sync, &rdev->flags)
 		    || test_bit(Faulty, &rdev->flags))
 			continue;
 
-- 
2.10.0


^ permalink raw reply related

* [PATCH 15/16] md/bitmap: Add spaces around three comparison operators
From: SF Markus Elfring @ 2016-09-27 17:03 UTC (permalink / raw)
  To: linux-raid, Shaohua Li; +Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <30938c84-20a7-0f13-bdda-a2d2109a6dac@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 27 Sep 2016 18:08:04 +0200

The script "checkpatch.pl" can point information out like the following.

ERROR: spaces required around that '==' (ctx:VxV)

Thus fix the affected source code places.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/bitmap.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 9d77f16..d029576 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -303,7 +303,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
 
 		if (wait)
 			wait_event(bitmap->write_wait,
-				   atomic_read(&bitmap->pending_writes)==0);
+				   atomic_read(&bitmap->pending_writes) == 0);
 	}
 	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
 		bitmap_file_kick(bitmap);
@@ -400,7 +400,7 @@ static int read_page(struct file *file, unsigned long index,
 	page->index = index;
 
 	wait_event(bitmap->write_wait,
-		   atomic_read(&bitmap->pending_writes)==0);
+		   atomic_read(&bitmap->pending_writes) == 0);
 	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) {
 		ret = -EIO;
 report_failure:
@@ -1003,7 +1003,7 @@ void bitmap_unplug(struct bitmap *bitmap)
 	}
 	if (bitmap->storage.file)
 		wait_event(bitmap->write_wait,
-			   atomic_read(&bitmap->pending_writes)==0);
+			   atomic_read(&bitmap->pending_writes) == 0);
 	else
 		md_super_wait(bitmap->mddev);
 
-- 
2.10.0


^ permalink raw reply related

* [PATCH 14/16] md/bitmap: Delete unnecessary braces in bitmap_resize()
From: SF Markus Elfring @ 2016-09-27 17:02 UTC (permalink / raw)
  To: linux-raid, Shaohua Li; +Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <30938c84-20a7-0f13-bdda-a2d2109a6dac@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 27 Sep 2016 17:53:07 +0200

Do not use curly brackets at one source code place
where a single statement should be sufficient.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/bitmap.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index f8900cc..9d77f16 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -2075,9 +2075,8 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 				unsigned long k;
 
 				/* deallocate the page memory */
-				for (k = 0; k < page; k++) {
+				for (k = 0; k < page; k++)
 					kfree(new_bp[k].map);
-				}
 
 				/* restore some fields from old_counts */
 				bitmap->counts.bp = old_counts.bp;
-- 
2.10.0


^ permalink raw reply related

* [PATCH 13/16] md/bitmap: Adjust checks for null pointers in 11 functions
From: SF Markus Elfring @ 2016-09-27 17:00 UTC (permalink / raw)
  To: linux-raid, Shaohua Li; +Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <30938c84-20a7-0f13-bdda-a2d2109a6dac@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 27 Sep 2016 17:40:12 +0200
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The script "checkpatch.pl" can point information out like the following.

Comparison to NULL could be written !…

Thus fix the affected source code places.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/bitmap.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index e7a7fc8..f8900cc 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -88,7 +88,7 @@ __acquires(bitmap->lock)
 	mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
 	spin_lock_irq(&bitmap->lock);
 
-	if (mappage == NULL) {
+	if (!mappage) {
 		pr_debug("md/bitmap: map page allocation failed, hijacking\n");
 		/* We don't support hijack for cluster raid */
 		if (no_hijack)
@@ -186,7 +186,7 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
 	 * list_for_each_entry_continue_rcu() to find the first entry.
 	 */
 	rcu_read_lock();
-	if (rdev == NULL)
+	if (!rdev)
 		/* start at the beginning */
 		rdev = list_entry(&mddev->disks, struct md_rdev, same_set);
 	else {
@@ -284,7 +284,7 @@ static void write_page(struct bitmap *bitmap, struct page *page, int wait)
 {
 	struct buffer_head *bh;
 
-	if (bitmap->storage.file == NULL) {
+	if (!bitmap->storage.file) {
 		switch (write_sb_page(bitmap, page, wait)) {
 		case -EINVAL:
 			set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
@@ -493,7 +493,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
 	unsigned long chunksize, daemon_sleep, write_behind;
 
 	bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-	if (bitmap->storage.sb_page == NULL)
+	if (!bitmap->storage.sb_page)
 		return -ENOMEM;
 	bitmap->storage.sb_page->index = 0;
 
@@ -767,7 +767,7 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
 
 	if (with_super && !store->sb_page) {
 		store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
-		if (store->sb_page == NULL)
+		if (!store->sb_page)
 			return -ENOMEM;
 	}
 
@@ -1209,7 +1209,7 @@ void bitmap_daemon_work(struct mddev *mddev)
 	 */
 	mutex_lock(&mddev->bitmap_info.mutex);
 	bitmap = mddev->bitmap;
-	if (bitmap == NULL) {
+	if (!bitmap) {
 		mutex_unlock(&mddev->bitmap_info.mutex);
 		return;
 	}
@@ -1336,7 +1336,7 @@ __acquires(bitmap->lock)
 	err = bitmap_checkpage(bitmap, page, create, 0);
 
 	if (bitmap->bp[page].hijacked ||
-	    bitmap->bp[page].map == NULL)
+	    !bitmap->bp[page].map)
 		csize = ((sector_t)1) << (bitmap->chunkshift +
 					  PAGE_COUNTER_SHIFT - 1);
 	else
@@ -1481,7 +1481,7 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t
 {
 	bitmap_counter_t *bmc;
 	int rv;
-	if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
+	if (!bitmap) {/* FIXME or bitmap set as 'failed' */
 		*blocks = 1024;
 		return 1; /* always resync if no bitmap */
 	}
@@ -1533,13 +1533,13 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
 	bitmap_counter_t *bmc;
 	unsigned long flags;
 
-	if (bitmap == NULL) {
+	if (!bitmap) {
 		*blocks = 1024;
 		return;
 	}
 	spin_lock_irqsave(&bitmap->counts.lock, flags);
 	bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
-	if (bmc == NULL)
+	if (!bmc)
 		goto unlock;
 	/* locked */
 	if (RESYNC(*bmc)) {
@@ -2455,7 +2455,7 @@ static ssize_t can_clear_show(struct mddev *mddev, char *page)
 
 static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len)
 {
-	if (mddev->bitmap == NULL)
+	if (!mddev->bitmap)
 		return -ENOENT;
 	if (strncmp(buf, "false", 5) == 0)
 		mddev->bitmap->need_sync = 1;
@@ -2476,7 +2476,7 @@ behind_writes_used_show(struct mddev *mddev, char *page)
 {
 	ssize_t ret;
 	spin_lock(&mddev->lock);
-	if (mddev->bitmap == NULL)
+	if (!mddev->bitmap)
 		ret = sprintf(page, "0\n");
 	else
 		ret = sprintf(page, "%lu\n",
-- 
2.10.0


^ permalink raw reply related

* [PATCH 12/16] md/bitmap: One check less in read_page() at the end
From: SF Markus Elfring @ 2016-09-27 16:59 UTC (permalink / raw)
  To: linux-raid, Shaohua Li; +Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <30938c84-20a7-0f13-bdda-a2d2109a6dac@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 27 Sep 2016 16:30:25 +0200

* Adjust a jump target.

* Delete a repeated check which became unnecessary with this refactoring.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/bitmap.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index c186e5d..e7a7fc8 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -367,7 +367,7 @@ static int read_page(struct file *file, unsigned long index,
 	bh = alloc_page_buffers(page, 1<<inode->i_blkbits, 0);
 	if (!bh) {
 		ret = -ENOMEM;
-		goto out;
+		goto report_failure;
 	}
 	attach_page_buffers(page, bh);
 	block = index << (PAGE_SHIFT - inode->i_blkbits);
@@ -379,7 +379,7 @@ static int read_page(struct file *file, unsigned long index,
 			if (bh->b_blocknr == 0) {
 				/* Cannot use this file! */
 				ret = -EINVAL;
-				goto out;
+				goto report_failure;
 			}
 			bh->b_bdev = inode->i_sb->s_bdev;
 			if (count < (1<<inode->i_blkbits))
@@ -401,14 +401,14 @@ static int read_page(struct file *file, unsigned long index,
 
 	wait_event(bitmap->write_wait,
 		   atomic_read(&bitmap->pending_writes)==0);
-	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
+	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) {
 		ret = -EIO;
-out:
-	if (ret)
+report_failure:
 		printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n",
 			(int)PAGE_SIZE,
 			(unsigned long long)index << PAGE_SHIFT,
 			ret);
+	}
 	return ret;
 }
 
-- 
2.10.0


^ permalink raw reply related

* [PATCH 11/16] md/bitmap: Rename a jump label in bitmap_init_from_disk()
From: SF Markus Elfring @ 2016-09-27 16:58 UTC (permalink / raw)
  To: linux-raid, Shaohua Li; +Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <30938c84-20a7-0f13-bdda-a2d2109a6dac@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 27 Sep 2016 16:12:47 +0200

Adjust jump labels according to the current Linux coding style convention.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/bitmap.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 1f7f1e1..c186e5d 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1064,7 +1064,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		       bmname(bitmap),
 		       (unsigned long) i_size_read(file->f_mapping->host),
 		       store->bytes);
-		goto err;
+		goto report_failure;
 	}
 
 	oldindex = ~0L;
@@ -1098,7 +1098,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 					index + node_offset, count);
 
 			if (ret)
-				goto err;
+				goto report_failure;
 
 			oldindex = index;
 
@@ -1116,7 +1116,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 				ret = -EIO;
 				if (test_bit(BITMAP_WRITE_ERROR,
 					     &bitmap->flags))
-					goto err;
+					goto report_failure;
 			}
 		}
 		paddr = kmap_atomic(page);
@@ -1143,8 +1143,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	       bit_cnt, chunks);
 
 	return 0;
-
- err:
+report_failure:
 	printk(KERN_INFO "%s: bitmap initialisation failed: %d\n",
 	       bmname(bitmap), ret);
 	return ret;
-- 
2.10.0


^ permalink raw reply related

* [PATCH 10/16] md/bitmap: Rename a jump label in bitmap_create()
From: SF Markus Elfring @ 2016-09-27 16:57 UTC (permalink / raw)
  To: linux-raid, Shaohua Li; +Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <30938c84-20a7-0f13-bdda-a2d2109a6dac@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 27 Sep 2016 16:06:35 +0200

Adjust jump labels according to the current Linux coding style convention.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/bitmap.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 5125186..1f7f1e1 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1818,22 +1818,22 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot)
 			err = -EINVAL;
 	}
 	if (err)
-		goto error;
+		goto free_bitmap;
 
 	bitmap->daemon_lastrun = jiffies;
 	err = bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1);
 	if (err)
-		goto error;
+		goto free_bitmap;
 
 	printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
 	       bitmap->counts.pages, bmname(bitmap));
 
 	err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
 	if (err)
-		goto error;
+		goto free_bitmap;
 
 	return bitmap;
- error:
+free_bitmap:
 	bitmap_free(bitmap);
 	return ERR_PTR(err);
 }
-- 
2.10.0


^ permalink raw reply related

* [PATCH 09/16] md/bitmap: Rename a jump label in bitmap_copy_from_slot()
From: SF Markus Elfring @ 2016-09-27 16:56 UTC (permalink / raw)
  To: linux-raid, Shaohua Li; +Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <30938c84-20a7-0f13-bdda-a2d2109a6dac@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 27 Sep 2016 15:55:01 +0200

Adjust a jump label according to the current Linux coding style convention.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/bitmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 22fa09a..5125186 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1910,7 +1910,7 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
 
 	rv = bitmap_init_from_disk(bitmap, 0);
 	if (rv)
-		goto err;
+		goto free_bitmap;
 
 	counts = &bitmap->counts;
 	for (j = 0; j < counts->chunks; j++) {
@@ -1937,7 +1937,7 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
 	bitmap_unplug(mddev->bitmap);
 	*low = lo;
 	*high = hi;
-err:
+free_bitmap:
 	bitmap_free(bitmap);
 	return rv;
 }
-- 
2.10.0


^ permalink raw reply related

* [PATCH 08/16] md/bitmap: Rename a jump label in location_store()
From: SF Markus Elfring @ 2016-09-27 16:55 UTC (permalink / raw)
  To: linux-raid, Shaohua Li; +Cc: LKML, kernel-janitors, Julia Lawall
In-Reply-To: <30938c84-20a7-0f13-bdda-a2d2109a6dac@users.sourceforge.net>

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 27 Sep 2016 15:46:22 +0200

Adjust jump labels according to the current Linux coding style convention.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---
 drivers/md/bitmap.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 41d99fd..22fa09a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -2187,11 +2187,11 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
 	if (mddev->pers) {
 		if (!mddev->pers->quiesce) {
 			rv = -EBUSY;
-			goto out;
+			goto unlock;
 		}
 		if (mddev->recovery || mddev->sync_thread) {
 			rv = -EBUSY;
-			goto out;
+			goto unlock;
 		}
 	}
 
@@ -2200,7 +2200,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
 		/* bitmap already configured.  Only option is to clear it */
 		if (strncmp(buf, "none", 4) != 0) {
 			rv = -EBUSY;
-			goto out;
+			goto unlock;
 		}
 		if (mddev->pers) {
 			mddev->pers->quiesce(mddev, 1);
@@ -2221,23 +2221,23 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
 		else if (strncmp(buf, "file:", 5) == 0) {
 			/* Not supported yet */
 			rv = -EINVAL;
-			goto out;
+			goto unlock;
 		} else {
 			if (buf[0] == '+')
 				rv = kstrtoll(buf+1, 10, &offset);
 			else
 				rv = kstrtoll(buf, 10, &offset);
 			if (rv)
-				goto out;
+				goto unlock;
 			if (offset == 0) {
 				rv = -EINVAL;
-				goto out;
+				goto unlock;
 			}
 			if (mddev->bitmap_info.external == 0 &&
 			    mddev->major_version == 0 &&
 			    offset != mddev->bitmap_info.default_offset) {
 				rv = -EINVAL;
-				goto out;
+				goto unlock;
 			}
 			mddev->bitmap_info.offset = offset;
 			if (mddev->pers) {
@@ -2255,7 +2255,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
 				mddev->pers->quiesce(mddev, 0);
 				if (rv) {
 					bitmap_destroy(mddev);
-					goto out;
+					goto unlock;
 				}
 			}
 		}
@@ -2268,7 +2268,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
 		md_wakeup_thread(mddev->thread);
 	}
 	rv = 0;
-out:
+unlock:
 	mddev_unlock(mddev);
 	if (rv)
 		return rv;
-- 
2.10.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox