Linux RAID subsystem development
 help / color / mirror / Atom feed
* Re: [PATCH v2 05/12] raid5-ppl: Partial Parity Log implementation
From: NeilBrown @ 2016-12-07  1:17 UTC (permalink / raw)
  To: Artur Paszkiewicz, shli; +Cc: linux-raid
In-Reply-To: <20161205153113.7268-6-artur.paszkiewicz@intel.com>

[-- Attachment #1: Type: text/plain, Size: 6455 bytes --]

On Tue, Dec 06 2016, Artur Paszkiewicz wrote:

> This implements the write logging functionality, using the policy logic
> introduced in previous patches.
>
> PPL is a distributed log - data is stored on all RAID member drives in
> the metadata area. PPL is written to the parity disk of a particular
> stripe. Distributed log is implemented by using one r5l_log instance per
> each array member. They are grouped in child_logs array in struct
> ppl_conf, which is assigned to a common parent log. This parent log
> serves as a proxy and is used in raid5 personality code - it is assigned
> as _the_ log in r5conf->log. The child logs are where all the real work
> is done.
>
> The PPL consists of a 4KB header (struct ppl_header), and at least 128KB
> for partial parity data. It is stored right after the array data (for
> IMSM) or in the bitmap area (super 1.1 and 1.2) and can be overwritten
> even at each array write request.
>
> Attach a page for holding the partial parity data to each stripe_head.
> Allocate it only if mddev has the MD_HAS_PPL flag set.
>
> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
> ---
>  drivers/md/raid5-cache.c |  12 +-
>  drivers/md/raid5-cache.h |   6 +
>  drivers/md/raid5-ppl.c   | 594 ++++++++++++++++++++++++++++++++++++++++++++++-
>  drivers/md/raid5.c       |  15 +-
>  drivers/md/raid5.h       |   1 +
>  5 files changed, 620 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index fa82b9a..be534d8 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -119,8 +119,8 @@ static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
>  	return log->device_size > used_size + size;
>  }
>  
> -static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
> -				    enum r5l_io_unit_state state)
> +void __r5l_set_io_unit_state(struct r5l_io_unit *io,
> +			     enum r5l_io_unit_state state)
>  {
>  	if (WARN_ON(io->state >= state))
>  		return;
> @@ -340,7 +340,7 @@ static void r5c_finish_cache_stripe(struct stripe_head *sh)
>  	}
>  }
>  
> -static void r5l_io_run_stripes(struct r5l_io_unit *io)
> +void r5l_io_run_stripes(struct r5l_io_unit *io)
>  {
>  	struct stripe_head *sh, *next;
>  
> @@ -935,7 +935,7 @@ static sector_t r5l_reclaimable_space(struct r5l_log *log)
>  				 r5c_calculate_new_cp(conf));
>  }
>  
> -static void r5l_run_no_mem_stripe(struct r5l_log *log)
> +void r5l_run_no_mem_stripe(struct r5l_log *log)
>  {
>  	struct stripe_head *sh;
>  
> @@ -1039,7 +1039,7 @@ static void r5l_log_flush_endio(struct bio *bio)
>   * only write stripes of an io_unit to raid disks till the io_unit is the first
>   * one whose data/parity is in log.
>   */
> -static void __r5l_flush_stripe_to_raid(struct r5l_log *log)
> +void __r5l_flush_stripe_to_raid(struct r5l_log *log)
>  {
>  	bool do_flush;
>  
> @@ -1359,7 +1359,7 @@ bool r5l_log_disk_error(struct r5conf *conf)
>  	if (!log)
>  		ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
>  	else
> -		ret = test_bit(Faulty, &log->rdev->flags);
> +		ret = log->rdev && test_bit(Faulty, &log->rdev->flags);
>  	rcu_read_unlock();
>  	return ret;
>  }
> diff --git a/drivers/md/raid5-cache.h b/drivers/md/raid5-cache.h
> index 4ba11d3..0446100 100644
> --- a/drivers/md/raid5-cache.h
> +++ b/drivers/md/raid5-cache.h
> @@ -157,4 +157,10 @@ extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
>  extern void r5l_quiesce(struct r5l_log *log, int state);
>  extern bool r5l_log_disk_error(struct r5conf *conf);
>  
> +extern void __r5l_set_io_unit_state(struct r5l_io_unit *io,
> +				    enum r5l_io_unit_state state);
> +extern void r5l_io_run_stripes(struct r5l_io_unit *io);
> +extern void r5l_run_no_mem_stripe(struct r5l_log *log);
> +extern void __r5l_flush_stripe_to_raid(struct r5l_log *log);
> +
>  #endif /* _RAID5_CACHE_H */
> diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
> index 263fad7..2d4c90f 100644
> --- a/drivers/md/raid5-ppl.c
> +++ b/drivers/md/raid5-ppl.c
> @@ -14,7 +14,599 @@
>  
>  #include <linux/kernel.h>
>  #include <linux/blkdev.h>
> +#include <linux/slab.h>
> +#include <linux/crc32c.h>
> +#include <linux/module.h>
> +#include <linux/raid/md_p.h>
> +#include "md.h"
>  #include "raid5.h"
>  #include "raid5-cache.h"
>  
> -struct r5l_policy r5l_ppl;
> +static bool ppl_debug;
> +module_param(ppl_debug, bool, 0644);
> +MODULE_PARM_DESC(ppl_debug, "Debug mode for md raid5 PPL");
> +
> +#define dbg(format, args...)						\
> +do {									\
> +	if (ppl_debug)							\
> +		printk(KERN_DEBUG"[%d] %s() "format,			\
> +			current->pid, __func__, ##args);		\
> +} while (0)

Please don't do this.  Just use pr_debug(), and use
 /sys/kernel/debug/dynamic_debug/control
to turn them on and off.

> +
> +struct ppl_conf {
> +	int count;
> +	struct r5l_log **child_logs;
> +};
> +
> +struct ppl_header_entry {
> +	__le64 data_sector;	/* Raid sector of the new data */
> +	__le32 pp_size;		/* Length of partial parity */
> +	__le32 data_size;	/* Length of data */
> +	__u8 parity_disk;	/* Member disk containing parity */
> +	__le32 checksum;	/* Checksum of this entry */
> +} __packed;

Really?  "checksum" is 32bits but not aligned?
I *think* you should be using get_unaligned_le32() to access this
and put_unaligned_le32() to set it.

> +
> +#define PPL_HEADER_SIZE PAGE_SIZE
> +#define PPL_HDR_RESERVED 512
> +#define PPL_HDR_ENTRY_SPACE \
> +	(PPL_HEADER_SIZE - PPL_HDR_RESERVED - 3 * sizeof(u32) - sizeof(u64))
> +#define PPL_HDR_MAX_ENTRIES \
> +	(PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry))
> +#define PPL_ENTRY_SPACE_IMSM (128 * 1024)
> +
> +struct ppl_header {
> +	__u8 reserved[PPL_HDR_RESERVED];/* Reserved space */
> +	__le32 signature;		/* Signature (family number of volume) */
> +	__le64 generation;		/* Generation number of PP Header */

This probably needs to use the 'unaligned' macros too.

> +	__le32 entries_count;		/* Number of entries in entry array */
> +	__le32 checksum;		/* Checksum of PP Header */
> +	struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES];
> +} __packed;

ppl_header_entry doesn't seem to be a multiple of 4 bytes long.
This means all the fields in it could be unaligned...

Maybe we should make this code refuse to compile except on x86 ???


NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* Re: [PATCH v2 03/12] raid5-cache: add a new policy
From: NeilBrown @ 2016-12-07  0:46 UTC (permalink / raw)
  To: Artur Paszkiewicz, shli; +Cc: linux-raid
In-Reply-To: <20161205153113.7268-4-artur.paszkiewicz@intel.com>

[-- Attachment #1: Type: text/plain, Size: 1279 bytes --]

On Tue, Dec 06 2016, Artur Paszkiewicz wrote:

> Add a source file for the new policy implementation and allow selecting
> the policy based on the policy_type parameter in r5l_init_log().
>
> Introduce a new flag for rdev state flags to allow enabling the new
> policy from userspace.

This seems odd.  Why is this a per-device flag?
It makes sense for "journal" to be a per-device flag, because only one
device is the journal device and it is obviously different from the
others.

But with the ppl, all devices serve as journal devices.  So we would
need to set journal_ppl on all devices?  What happens if you set it on
some, but not others?  I see you get an error.

I think some sort of array-wide setting would make more sense, would it
not?

And what is an RWH???  A Really Weird Handle ??

I guess it is probably a Raid5 Write Hole ?  At the very least there
should be a comment explaining this when you define the enum. (remember,
you are trying to make it easier for reviewers).

It might almost make sense for bitmap/metadata to be used here.
It can currently be "external" "internal" "clustered".
Allow also "journalled" or "partial-partiy-log" ???

Maybe not ... but I'd definitely prefer a global setting, and one that
didn't use an obscure abbreviation.

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* Re: [PATCH v2 00/12] Partial Parity Log for MD RAID 5
From: NeilBrown @ 2016-12-07  0:32 UTC (permalink / raw)
  To: Artur Paszkiewicz, shli; +Cc: linux-raid
In-Reply-To: <20161205153113.7268-1-artur.paszkiewicz@intel.com>

[-- Attachment #1: Type: text/plain, Size: 1768 bytes --]

On Tue, Dec 06 2016, Artur Paszkiewicz wrote:

> This series of patches implements the Partial Parity Log for RAID5 arrays. The
> purpose of this feature is closing the RAID Write Hole. It is a solution
> alternative to the existing raid5-cache, but the implementation is based on it
> and reuses some of the code by introducing support for interchangeable
> policies. This allows decoupling policy from mechanism and not adding more
> boilerplate code in raid5.c.
>
> The issue addressed by PPL is, that on a dirty shutdown, parity for a
> particular stripe may be inconsistent with data on other member disks. In
> degraded state, there is no way to recalculate parity, because one of the disks
> is missing. PPL addresses this issue and allows recalculating the parity. It
> stores only enough data needed for recovering from RWH and is not a true
> journal, like the raid5-cache implementation. It does not protect from losing
> in-flight data.
>
> PPL is a distributed log - data is stored on all RAID member drives in the
> metadata area. It does not need a dedicated journaling drive. Performance is
> reduced by up to 30%-40% but it scales with the number of drives in the array
> and the journaling drive does not become a bottleneck.

I would expect to see as description of what a PPL actually is and how
it works here... but there is none.

The change-log for patch 06 has a tiny bit more information which is
just enough to be able to start trying to understand the code, but it
isn't much.
And none of this description gets into the code, or into the
Documentation/.  This makes it hard to review and hard to maintain.

Remember: if you want people to review you code, it is in your interest
to make it easy.  That means give lots of details.

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* Re: [PATCH v2 6/7] Allow changing the RWH policy for a running array
From: Jes Sorensen @ 2016-12-06 15:38 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: linux-raid
In-Reply-To: <c21e1560-ca0d-67c8-83ca-04d275698d8b@intel.com>

Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
> On 12/06/2016 04:08 PM, Jes Sorensen wrote:
>> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>>> Sure, I can change that :) If you agree with the rest I'll just resend
>>> those two patches.
>> 
>> Yes, that would be perfect.
>
> OK, but just in case - if you are going to apply them maybe wait until
> Shaohua comments on the kernel patches. These mdadm patches won't be any
> good if the kernel part doesn't get accepted.

I can cope with that :) I was trying to get in sync as I felt I was
falling behind the last couple of weeks. If I drop the ball or miss it,
please feel free to throw heavy objects at me.

Cheers,
Jes

^ permalink raw reply

* Re: [PATCH v2 6/7] Allow changing the RWH policy for a running array
From: Artur Paszkiewicz @ 2016-12-06 15:30 UTC (permalink / raw)
  To: Jes Sorensen; +Cc: linux-raid
In-Reply-To: <wrfja8c96qo5.fsf@redhat.com>

On 12/06/2016 04:08 PM, Jes Sorensen wrote:
> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>> Sure, I can change that :) If you agree with the rest I'll just resend
>> those two patches.
> 
> Yes, that would be perfect.

OK, but just in case - if you are going to apply them maybe wait until
Shaohua comments on the kernel patches. These mdadm patches won't be any
good if the kernel part doesn't get accepted.

Thanks,
Artur

^ permalink raw reply

* Re: [PATCH v2 6/7] Allow changing the RWH policy for a running array
From: Jes Sorensen @ 2016-12-06 15:08 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: linux-raid
In-Reply-To: <e90fade8-b904-7834-880d-61e1634e5fed@intel.com>

Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
> On 12/06/2016 03:30 PM, Jes Sorensen wrote:
>> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>>> On 12/05/2016 10:50 PM, Jes Sorensen wrote:
>>>> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>>>>> This extends the --rwh-policy parameter to work also in Misc mode. Using
>>>>> it changes the currently active RWH policy in the kernel driver and
>>>>> updates the metadata to make this change permanent. Updating metadata is
>>>>> not yet implemented for super1, so this is limited to IMSM for now.
>>>>>
>>>>> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
>>>>
>>>> Hi Artur,
>>>>
>>>> It looked good all the way up until 6/7, but there is a nit here:
>>>>
>>>>> ---
>>>>>  Manage.c | 79
>>>>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>  mdadm.c       |  9 +++++++
>>>>>  mdadm.h       |  1 +
>>>>>  super-intel.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++-
>>>>>  4 files changed, 153 insertions(+), 1 deletion(-)
>>>> [snip]
>>>>> diff --git a/super-intel.c b/super-intel.c
>>>>> index e524ef0..3b40429 100644
>>>>> --- a/super-intel.c
>>>>> +++ b/super-intel.c
>>>>> @@ -448,6 +448,7 @@ enum imsm_update_type {
>>>>>  	update_general_migration_checkpoint,
>>>>>  	update_size_change,
>>>>>  	update_prealloc_badblocks_mem,
>>>>> +	update_rwh_policy,
>>>>>  };
>>>>>  
>>>>>  struct imsm_update_activate_spare {
>>>>> @@ -540,6 +541,12 @@ struct imsm_update_prealloc_bb_mem {
>>>>>  	enum imsm_update_type type;
>>>>>  };
>>>>>  
>>>>> +struct imsm_update_rwh_policy {
>>>>> +	enum imsm_update_type type;
>>>>> +	int new_policy;
>>>>> +	int dev_idx;
>>>>> +};
>>>>> +
>>>>>  static const char *_sys_dev_type[] = {
>>>>>  	[SYS_DEV_UNKNOWN] = "Unknown",
>>>>>  	[SYS_DEV_SAS] = "SAS",
>>>>> @@ -3175,7 +3182,6 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>>>>>  	info->custom_array_size   <<= 32;
>>>>>  	info->custom_array_size   |= __le32_to_cpu(dev->size_low);
>>>>>  	info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
>>>>> -	info->journal_clean = dev->rwh_policy;
>>>>>  
>>>>>  	if (is_gen_migration(dev)) {
>>>>>  		info->reshape_active = 1;
>>>>> @@ -3347,6 +3353,8 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>>>>>  			info->rwh_policy = RWH_POLICY_PPL;
>>>>>  		else
>>>>>  			info->rwh_policy = RWH_POLICY_UNKNOWN;
>>>>> +
>>>>> +		info->journal_clean = info->rwh_policy == RWH_POLICY_PPL;
>>>>>  	}
>>>>>  }
>>>>
>>>> This part doesn't make sense, first you set info->rwh_policy based on
>>>> sb->feature_map to RWH_POLICY_PPL or RWH_POLICY_UNKNOWN and then right
>>>> after you hard set it to RWH_POLICY_PPL.
>>>>
>>>> In general I really would prefer not to see any of those double
>>>> assignments if it can be avoided.
>>>
>>> This isn't a double assignment, there is a '==' there. I'm setting
>>> info->journal_clean to true only if the policy is PPL. I'm not sure how
>>> this change ended up in this patch, it was supposed to go to 5/7. I must
>>> have overlooked it.
>> 
>> Argh you're right, code obfuscation at it's finest - if this is meant to
>> be in 5/7 do you want to respin the two?
>> 
>> In addition why not put the info->journal_clean assignments up together
>> with the info->rhw_policy assignments? Would make it a lot easier to
>> read without making my mistake :)
>
> Sure, I can change that :) If you agree with the rest I'll just resend
> those two patches.

Yes, that would be perfect.

Thanks,
Jes

^ permalink raw reply

* Re: [PATCH v2 6/7] Allow changing the RWH policy for a running array
From: Artur Paszkiewicz @ 2016-12-06 14:50 UTC (permalink / raw)
  To: Jes Sorensen; +Cc: linux-raid
In-Reply-To: <wrfj1sxl870h.fsf@redhat.com>

On 12/06/2016 03:30 PM, Jes Sorensen wrote:
> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>> On 12/05/2016 10:50 PM, Jes Sorensen wrote:
>>> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>>>> This extends the --rwh-policy parameter to work also in Misc mode. Using
>>>> it changes the currently active RWH policy in the kernel driver and
>>>> updates the metadata to make this change permanent. Updating metadata is
>>>> not yet implemented for super1, so this is limited to IMSM for now.
>>>>
>>>> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
>>>
>>> Hi Artur,
>>>
>>> It looked good all the way up until 6/7, but there is a nit here:
>>>
>>>> ---
>>>>  Manage.c | 79
>>>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  mdadm.c       |  9 +++++++
>>>>  mdadm.h       |  1 +
>>>>  super-intel.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++-
>>>>  4 files changed, 153 insertions(+), 1 deletion(-)
>>> [snip]
>>>> diff --git a/super-intel.c b/super-intel.c
>>>> index e524ef0..3b40429 100644
>>>> --- a/super-intel.c
>>>> +++ b/super-intel.c
>>>> @@ -448,6 +448,7 @@ enum imsm_update_type {
>>>>  	update_general_migration_checkpoint,
>>>>  	update_size_change,
>>>>  	update_prealloc_badblocks_mem,
>>>> +	update_rwh_policy,
>>>>  };
>>>>  
>>>>  struct imsm_update_activate_spare {
>>>> @@ -540,6 +541,12 @@ struct imsm_update_prealloc_bb_mem {
>>>>  	enum imsm_update_type type;
>>>>  };
>>>>  
>>>> +struct imsm_update_rwh_policy {
>>>> +	enum imsm_update_type type;
>>>> +	int new_policy;
>>>> +	int dev_idx;
>>>> +};
>>>> +
>>>>  static const char *_sys_dev_type[] = {
>>>>  	[SYS_DEV_UNKNOWN] = "Unknown",
>>>>  	[SYS_DEV_SAS] = "SAS",
>>>> @@ -3175,7 +3182,6 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>>>>  	info->custom_array_size   <<= 32;
>>>>  	info->custom_array_size   |= __le32_to_cpu(dev->size_low);
>>>>  	info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
>>>> -	info->journal_clean = dev->rwh_policy;
>>>>  
>>>>  	if (is_gen_migration(dev)) {
>>>>  		info->reshape_active = 1;
>>>> @@ -3347,6 +3353,8 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>>>>  			info->rwh_policy = RWH_POLICY_PPL;
>>>>  		else
>>>>  			info->rwh_policy = RWH_POLICY_UNKNOWN;
>>>> +
>>>> +		info->journal_clean = info->rwh_policy == RWH_POLICY_PPL;
>>>>  	}
>>>>  }
>>>
>>> This part doesn't make sense, first you set info->rwh_policy based on
>>> sb->feature_map to RWH_POLICY_PPL or RWH_POLICY_UNKNOWN and then right
>>> after you hard set it to RWH_POLICY_PPL.
>>>
>>> In general I really would prefer not to see any of those double
>>> assignments if it can be avoided.
>>
>> This isn't a double assignment, there is a '==' there. I'm setting
>> info->journal_clean to true only if the policy is PPL. I'm not sure how
>> this change ended up in this patch, it was supposed to go to 5/7. I must
>> have overlooked it.
> 
> Argh you're right, code obfuscation at it's finest - if this is meant to
> be in 5/7 do you want to respin the two?
> 
> In addition why not put the info->journal_clean assignments up together
> with the info->rhw_policy assignments? Would make it a lot easier to
> read without making my mistake :)

Sure, I can change that :) If you agree with the rest I'll just resend
those two patches.

Artur

^ permalink raw reply

* Re: [PATCH v2 6/7] Allow changing the RWH policy for a running array
From: Jes Sorensen @ 2016-12-06 14:30 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: linux-raid
In-Reply-To: <5c74d40c-36c4-97ae-9a67-be533d6abfb1@intel.com>

Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
> On 12/05/2016 10:50 PM, Jes Sorensen wrote:
>> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>>> This extends the --rwh-policy parameter to work also in Misc mode. Using
>>> it changes the currently active RWH policy in the kernel driver and
>>> updates the metadata to make this change permanent. Updating metadata is
>>> not yet implemented for super1, so this is limited to IMSM for now.
>>>
>>> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
>> 
>> Hi Artur,
>> 
>> It looked good all the way up until 6/7, but there is a nit here:
>> 
>>> ---
>>>  Manage.c | 79
>>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  mdadm.c       |  9 +++++++
>>>  mdadm.h       |  1 +
>>>  super-intel.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++-
>>>  4 files changed, 153 insertions(+), 1 deletion(-)
>> [snip]
>>> diff --git a/super-intel.c b/super-intel.c
>>> index e524ef0..3b40429 100644
>>> --- a/super-intel.c
>>> +++ b/super-intel.c
>>> @@ -448,6 +448,7 @@ enum imsm_update_type {
>>>  	update_general_migration_checkpoint,
>>>  	update_size_change,
>>>  	update_prealloc_badblocks_mem,
>>> +	update_rwh_policy,
>>>  };
>>>  
>>>  struct imsm_update_activate_spare {
>>> @@ -540,6 +541,12 @@ struct imsm_update_prealloc_bb_mem {
>>>  	enum imsm_update_type type;
>>>  };
>>>  
>>> +struct imsm_update_rwh_policy {
>>> +	enum imsm_update_type type;
>>> +	int new_policy;
>>> +	int dev_idx;
>>> +};
>>> +
>>>  static const char *_sys_dev_type[] = {
>>>  	[SYS_DEV_UNKNOWN] = "Unknown",
>>>  	[SYS_DEV_SAS] = "SAS",
>>> @@ -3175,7 +3182,6 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>>>  	info->custom_array_size   <<= 32;
>>>  	info->custom_array_size   |= __le32_to_cpu(dev->size_low);
>>>  	info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
>>> -	info->journal_clean = dev->rwh_policy;
>>>  
>>>  	if (is_gen_migration(dev)) {
>>>  		info->reshape_active = 1;
>>> @@ -3347,6 +3353,8 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>>>  			info->rwh_policy = RWH_POLICY_PPL;
>>>  		else
>>>  			info->rwh_policy = RWH_POLICY_UNKNOWN;
>>> +
>>> +		info->journal_clean = info->rwh_policy == RWH_POLICY_PPL;
>>>  	}
>>>  }
>> 
>> This part doesn't make sense, first you set info->rwh_policy based on
>> sb->feature_map to RWH_POLICY_PPL or RWH_POLICY_UNKNOWN and then right
>> after you hard set it to RWH_POLICY_PPL.
>> 
>> In general I really would prefer not to see any of those double
>> assignments if it can be avoided.
>
> This isn't a double assignment, there is a '==' there. I'm setting
> info->journal_clean to true only if the policy is PPL. I'm not sure how
> this change ended up in this patch, it was supposed to go to 5/7. I must
> have overlooked it.

Argh you're right, code obfuscation at it's finest - if this is meant to
be in 5/7 do you want to respin the two?

In addition why not put the info->journal_clean assignments up together
with the info->rhw_policy assignments? Would make it a lot easier to
read without making my mistake :)

cheers,
Jes

^ permalink raw reply

* Re: [PATCH v2 6/7] Allow changing the RWH policy for a running array
From: Artur Paszkiewicz @ 2016-12-06  7:43 UTC (permalink / raw)
  To: Jes Sorensen; +Cc: linux-raid
In-Reply-To: <wrfjd1h69hbn.fsf@redhat.com>

On 12/05/2016 10:50 PM, Jes Sorensen wrote:
> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>> This extends the --rwh-policy parameter to work also in Misc mode. Using
>> it changes the currently active RWH policy in the kernel driver and
>> updates the metadata to make this change permanent. Updating metadata is
>> not yet implemented for super1, so this is limited to IMSM for now.
>>
>> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
> 
> Hi Artur,
> 
> It looked good all the way up until 6/7, but there is a nit here:
> 
>> ---
>>  Manage.c      | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  mdadm.c       |  9 +++++++
>>  mdadm.h       |  1 +
>>  super-intel.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++-
>>  4 files changed, 153 insertions(+), 1 deletion(-)
> [snip]
>> diff --git a/super-intel.c b/super-intel.c
>> index e524ef0..3b40429 100644
>> --- a/super-intel.c
>> +++ b/super-intel.c
>> @@ -448,6 +448,7 @@ enum imsm_update_type {
>>  	update_general_migration_checkpoint,
>>  	update_size_change,
>>  	update_prealloc_badblocks_mem,
>> +	update_rwh_policy,
>>  };
>>  
>>  struct imsm_update_activate_spare {
>> @@ -540,6 +541,12 @@ struct imsm_update_prealloc_bb_mem {
>>  	enum imsm_update_type type;
>>  };
>>  
>> +struct imsm_update_rwh_policy {
>> +	enum imsm_update_type type;
>> +	int new_policy;
>> +	int dev_idx;
>> +};
>> +
>>  static const char *_sys_dev_type[] = {
>>  	[SYS_DEV_UNKNOWN] = "Unknown",
>>  	[SYS_DEV_SAS] = "SAS",
>> @@ -3175,7 +3182,6 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>>  	info->custom_array_size   <<= 32;
>>  	info->custom_array_size   |= __le32_to_cpu(dev->size_low);
>>  	info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
>> -	info->journal_clean = dev->rwh_policy;
>>  
>>  	if (is_gen_migration(dev)) {
>>  		info->reshape_active = 1;
>> @@ -3347,6 +3353,8 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>>  			info->rwh_policy = RWH_POLICY_PPL;
>>  		else
>>  			info->rwh_policy = RWH_POLICY_UNKNOWN;
>> +
>> +		info->journal_clean = info->rwh_policy == RWH_POLICY_PPL;
>>  	}
>>  }
> 
> This part doesn't make sense, first you set info->rwh_policy based on
> sb->feature_map to RWH_POLICY_PPL or RWH_POLICY_UNKNOWN and then right
> after you hard set it to RWH_POLICY_PPL.
> 
> In general I really would prefer not to see any of those double
> assignments if it can be avoided.

This isn't a double assignment, there is a '==' there. I'm setting
info->journal_clean to true only if the policy is PPL. I'm not sure how
this change ended up in this patch, it was supposed to go to 5/7. I must
have overlooked it.

Artur

^ permalink raw reply

* [PATCH] md/raid5-cache: no recovery is required when create super-block
From: JackieLiu @ 2016-12-06  6:13 UTC (permalink / raw)
  To: songliubraving; +Cc: liuzhengyuan, shli, linux-raid, JackieLiu

When create the super-block information, We do not need to do this
recovery stage, only need to initialize some variables.
---
 drivers/md/raid5-cache.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index c3b3124..96f25df 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2545,7 +2545,13 @@ static int r5l_load_log(struct r5l_log *log)
 
 	__free_page(page);
 
-	ret = r5l_recovery_log(log);
+	if (create_super) {
+		log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS);
+		log->seq = log->last_cp_seq + 1;
+		log->next_checkpoint = cp;
+	} else
+		ret = r5l_recovery_log(log);
+
 	r5c_update_log_state(log);
 	return ret;
 ioerr:
-- 
2.10.2




^ permalink raw reply related

* Re: [PATCH 1/2] md/r5cache: sh->log_start in recovery
From: JackieLiu @ 2016-12-06  4:10 UTC (permalink / raw)
  To: Song Liu
  Cc: linux-raid, neilb, shli, kernel-team, dan.j.williams, hch,
	刘正元
In-Reply-To: <20161206014657.3592902-1-songliubraving@fb.com>


> 在 2016年12月6日,09:46,Song Liu <songliubraving@fb.com> 写道:
> 
> We only need to update sh->log_start at the end of recovery,
> which is r5c_recovery_rewrite_data_only_stripes().
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
> drivers/md/raid5-cache.c | 17 +++++++----------
> 1 file changed, 7 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index c3b3124..93f3310 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -1681,8 +1681,7 @@ r5l_recovery_replay_one_stripe(struct r5conf *conf,
> 
> static struct stripe_head *
> r5c_recovery_alloc_stripe(struct r5conf *conf,
> -			  sector_t stripe_sect,
> -			  sector_t log_start)
> +			  sector_t stripe_sect)
> {
> 	struct stripe_head *sh;
> 
> @@ -1691,7 +1690,6 @@ r5c_recovery_alloc_stripe(struct r5conf *conf,
> 		return NULL;  /* no more stripe available */
> 
> 	r5l_recovery_reset_stripe(sh);
> -	sh->log_start = log_start;

Hi Song, 
the sh->log_start is not only used in r5c_recovery_rewrite_data_only_stripes function, in my new patch, 
https://git.kernel.org/cgit/linux/kernel/git/shli/md.git/tree/drivers/md/raid5-cache.c?h=for-next&id=43b9674832cc41ad0ad7b7e2ec397e47dcd5f6c3#n2167
Also be used. 

Thanks 
Jackie

> 
> 	return sh;
> }
> @@ -1861,7 +1859,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
> 						stripe_sect);
> 
> 		if (!sh) {
> -			sh = r5c_recovery_alloc_stripe(conf, stripe_sect, ctx->pos);
> +			sh = r5c_recovery_alloc_stripe(conf, stripe_sect);
> 			/*
> 			 * cannot get stripe from raid5_get_active_stripe
> 			 * try replay some stripes
> @@ -1870,7 +1868,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
> 				r5c_recovery_replay_stripes(
> 					cached_stripe_list, ctx);
> 				sh = r5c_recovery_alloc_stripe(
> -					conf, stripe_sect, ctx->pos);
> +					conf, stripe_sect);
> 			}
> 			if (!sh) {
> 				pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",
> @@ -1878,8 +1876,8 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
> 					conf->min_nr_stripes * 2);
> 				raid5_set_cache_size(mddev,
> 						     conf->min_nr_stripes * 2);
> -				sh = r5c_recovery_alloc_stripe(
> -					conf, stripe_sect, ctx->pos);
> +				sh = r5c_recovery_alloc_stripe(conf,
> +							       stripe_sect);
> 			}
> 			if (!sh) {
> 				pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
> @@ -1893,7 +1891,6 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
> 			if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
> 			    test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
> 				r5l_recovery_replay_one_stripe(conf, sh, ctx);
> -				sh->log_start = ctx->pos;
> 				list_move_tail(&sh->lru, cached_stripe_list);
> 			}
> 			r5l_recovery_load_data(log, sh, ctx, payload,
> @@ -1932,8 +1929,6 @@ static void r5c_recovery_load_one_stripe(struct r5l_log *log,
> 			set_bit(R5_UPTODATE, &dev->flags);
> 		}
> 	}
> -	list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
> -	atomic_inc(&log->stripe_in_journal_count);
> }
> 
> /*
> @@ -2121,6 +2116,8 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
> 		sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
> 			     REQ_OP_WRITE, WRITE_FUA, false);
> 		sh->log_start = ctx->pos;
> +		list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
> +		atomic_inc(&log->stripe_in_journal_count);
> 		ctx->pos = write_pos;
> 		ctx->seq += 1;
> 
> -- 
> 2.9.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html




^ permalink raw reply

* [PATCH 2/2] md/r5cache: flush data only stripes in r5l_recovery_log()
From: Song Liu @ 2016-12-06  1:46 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuan,
	Song Liu
In-Reply-To: <20161206014657.3592902-1-songliubraving@fb.com>

When there is data only stripes in the journal, we flush them out in
r5l_recovery_log().

We need conf->log in r5l_load_log(), so we need to set it before calling
r5l_load_log(). If r5l_load_log() fails, we set conf->log back to NULL.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 22 ++++++++++++++++++++--
 drivers/md/raid5.c       |  8 +++++++-
 drivers/md/raid5.h       |  4 ++++
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 93f3310..519a680 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2131,10 +2131,12 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
 static int r5l_recovery_log(struct r5l_log *log)
 {
 	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
 	struct r5l_recovery_ctx ctx;
 	int ret;
 	sector_t pos;
 	struct stripe_head *sh;
+	unsigned long flags;
 
 	ctx.pos = log->last_checkpoint;
 	ctx.seq = log->last_cp_seq;
@@ -2172,12 +2174,26 @@ static int r5l_recovery_log(struct r5l_log *log)
 			 mdname(mddev), ctx.data_only_stripes,
 			 ctx.data_parity_stripes);
 
-		if (ctx.data_only_stripes > 0)
+		if (ctx.data_only_stripes > 0) {
+			log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
 			if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
 				pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
 				       mdname(mddev));
 				return -EIO;
 			}
+
+			set_bit(R5C_PRE_INIT_FLUSH, &conf->cache_state);
+			spin_lock_irqsave(&conf->device_lock, flags);
+			r5c_flush_cache(conf, INT_MAX);
+			spin_unlock_irqrestore(&conf->device_lock, flags);
+			md_wakeup_thread(conf->mddev->thread);
+			wait_event(conf->wait_for_r5c_pre_init_flush,
+				   atomic_read(&conf->active_stripes) == 0 &&
+				   atomic_read(&conf->r5c_cached_full_stripes) == 0 &&
+				   atomic_read(&conf->r5c_cached_partial_stripes) == 0);
+			clear_bit(R5C_PRE_INIT_FLUSH, &conf->cache_state);
+			log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
+		}
 	}
 
 	log->log_start = ctx.pos;
@@ -2628,14 +2644,16 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	spin_lock_init(&log->stripe_in_journal_lock);
 	atomic_set(&log->stripe_in_journal_count, 0);
 
+	rcu_assign_pointer(conf->log, log);
+
 	if (r5l_load_log(log))
 		goto error;
 
-	rcu_assign_pointer(conf->log, log);
 	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
 	return 0;
 
 error:
+	rcu_assign_pointer(conf->log, NULL);
 	md_unregister_thread(&log->reclaim_thread);
 reclaim_thread:
 	mempool_destroy(log->meta_pool);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6bf3c26..279f213 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -232,7 +232,9 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 	 * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
 	 * data in journal, so they are not released to cached lists
 	 */
-	if (conf->quiesce && r5c_is_writeback(conf->log) &&
+	if ((conf->quiesce ||
+	     test_bit(R5C_PRE_INIT_FLUSH, &conf->cache_state)) &&
+	    r5c_is_writeback(conf->log) &&
 	    !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
 		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
 			r5c_make_stripe_write_out(sh);
@@ -264,6 +266,9 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 			    < IO_THRESHOLD)
 				md_wakeup_thread(conf->mddev->thread);
 		atomic_dec(&conf->active_stripes);
+		if (test_bit(R5C_PRE_INIT_FLUSH, &conf->cache_state))
+		    	wake_up(&sh->raid_conf->wait_for_r5c_pre_init_flush);
+
 		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
 			if (!r5c_is_writeback(conf->log))
 				list_add_tail(&sh->lru, temp_inactive_list);
@@ -6638,6 +6643,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	init_waitqueue_head(&conf->wait_for_quiescent);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
+	init_waitqueue_head(&conf->wait_for_r5c_pre_init_flush);
 	INIT_LIST_HEAD(&conf->handle_list);
 	INIT_LIST_HEAD(&conf->hold_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ed8e136..b39fe46 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -564,6 +564,9 @@ enum r5_cache_state {
 	R5C_EXTRA_PAGE_IN_USE,	/* a stripe is using disk_info.extra_page
 				 * for prexor
 				 */
+	R5C_PRE_INIT_FLUSH,	/* flushing data only stripes recovered from
+				 * the journal
+				 */
 };
 
 struct r5conf {
@@ -679,6 +682,7 @@ struct r5conf {
 	int			group_cnt;
 	int			worker_cnt_per_group;
 	struct r5l_log		*log;
+	wait_queue_head_t	wait_for_r5c_pre_init_flush;
 };
 
 
-- 
2.9.3


^ permalink raw reply related

* [PATCH 1/2] md/r5cache: sh->log_start in recovery
From: Song Liu @ 2016-12-06  1:46 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuan,
	Song Liu

We only need to update sh->log_start at the end of recovery,
which is r5c_recovery_rewrite_data_only_stripes().

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index c3b3124..93f3310 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1681,8 +1681,7 @@ r5l_recovery_replay_one_stripe(struct r5conf *conf,
 
 static struct stripe_head *
 r5c_recovery_alloc_stripe(struct r5conf *conf,
-			  sector_t stripe_sect,
-			  sector_t log_start)
+			  sector_t stripe_sect)
 {
 	struct stripe_head *sh;
 
@@ -1691,7 +1690,6 @@ r5c_recovery_alloc_stripe(struct r5conf *conf,
 		return NULL;  /* no more stripe available */
 
 	r5l_recovery_reset_stripe(sh);
-	sh->log_start = log_start;
 
 	return sh;
 }
@@ -1861,7 +1859,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
 						stripe_sect);
 
 		if (!sh) {
-			sh = r5c_recovery_alloc_stripe(conf, stripe_sect, ctx->pos);
+			sh = r5c_recovery_alloc_stripe(conf, stripe_sect);
 			/*
 			 * cannot get stripe from raid5_get_active_stripe
 			 * try replay some stripes
@@ -1870,7 +1868,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
 				r5c_recovery_replay_stripes(
 					cached_stripe_list, ctx);
 				sh = r5c_recovery_alloc_stripe(
-					conf, stripe_sect, ctx->pos);
+					conf, stripe_sect);
 			}
 			if (!sh) {
 				pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",
@@ -1878,8 +1876,8 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
 					conf->min_nr_stripes * 2);
 				raid5_set_cache_size(mddev,
 						     conf->min_nr_stripes * 2);
-				sh = r5c_recovery_alloc_stripe(
-					conf, stripe_sect, ctx->pos);
+				sh = r5c_recovery_alloc_stripe(conf,
+							       stripe_sect);
 			}
 			if (!sh) {
 				pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
@@ -1893,7 +1891,6 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
 			if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
 			    test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
 				r5l_recovery_replay_one_stripe(conf, sh, ctx);
-				sh->log_start = ctx->pos;
 				list_move_tail(&sh->lru, cached_stripe_list);
 			}
 			r5l_recovery_load_data(log, sh, ctx, payload,
@@ -1932,8 +1929,6 @@ static void r5c_recovery_load_one_stripe(struct r5l_log *log,
 			set_bit(R5_UPTODATE, &dev->flags);
 		}
 	}
-	list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
-	atomic_inc(&log->stripe_in_journal_count);
 }
 
 /*
@@ -2121,6 +2116,8 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
 		sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
 			     REQ_OP_WRITE, WRITE_FUA, false);
 		sh->log_start = ctx->pos;
+		list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
+		atomic_inc(&log->stripe_in_journal_count);
 		ctx->pos = write_pos;
 		ctx->seq += 1;
 
-- 
2.9.3


^ permalink raw reply related

* Re: [PATCH 1/2] md/r5cache: do r5c_update_log_state after log recovery
From: Shaohua Li @ 2016-12-06  1:10 UTC (permalink / raw)
  To: Zhengyuan Liu; +Cc: linux-raid
In-Reply-To: <1480841385-21180-1-git-send-email-liuzhengyuan@kylinos.cn>

On Sun, Dec 04, 2016 at 04:49:44PM +0800, Zhengyuan Liu wrote:
> We should update log state after we did a log recovery, current completion
> may get wrong log state since log->log_start wasn't initalized until we
> called r5l_recovery_log.
> 
> At log recovery stage, no lock needed as there is no race conditon.
> next_checkpoint field will be initialized in r5l_recovery_log too.

applied, thanks!
 
> Signed-off-by: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
> ---
>  drivers/md/raid5-cache.c | 8 +++-----
>  1 file changed, 3 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index fa3319c..07bce0e 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -2522,14 +2522,12 @@ static int r5l_load_log(struct r5l_log *log)
>  	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
>  		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
>  	log->last_checkpoint = cp;
> -	log->next_checkpoint = cp;
> -	mutex_lock(&log->io_mutex);
> -	r5c_update_log_state(log);
> -	mutex_unlock(&log->io_mutex);
>  
>  	__free_page(page);
>  
> -	return r5l_recovery_log(log);
> +	ret = r5l_recovery_log(log);
> +	r5c_update_log_state(log);
> +	return ret;
>  ioerr:
>  	__free_page(page);
>  	return ret;
> -- 
> 2.7.4
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v2 05/12] raid5-ppl: Partial Parity Log implementation
From: kbuild test robot @ 2016-12-06  1:06 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: kbuild-all, shli, linux-raid
In-Reply-To: <20161205153113.7268-6-artur.paszkiewicz@intel.com>

[-- Attachment #1: Type: text/plain, Size: 1574 bytes --]

Hi Artur,

[auto build test ERROR on next-20161205]
[cannot apply to v4.9-rc8 v4.9-rc7 v4.9-rc6 v4.9-rc8]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Artur-Paszkiewicz/Partial-Parity-Log-for-MD-RAID-5/20161206-081540
config: i386-randconfig-r0-201649 (attached as .config)
compiler: gcc-5 (Debian 5.4.1-2) 5.4.1 20160904
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   drivers/md/raid5-ppl.c: In function 'ppl_init_log_child':
>> drivers/md/raid5-ppl.c:459:2: error: too few arguments to function 'bio_init'
     bio_init(&log->flush_bio);
     ^
   In file included from include/linux/blkdev.h:19:0,
                    from drivers/md/raid5-ppl.c:16:
   include/linux/bio.h:426:13: note: declared here
    extern void bio_init(struct bio *bio, struct bio_vec *table,
                ^

vim +/bio_init +459 drivers/md/raid5-ppl.c

   453		spin_lock_init(&log->io_list_lock);
   454		INIT_LIST_HEAD(&log->running_ios);
   455		INIT_LIST_HEAD(&log->io_end_ios);
   456		INIT_LIST_HEAD(&log->flushing_ios);
   457		INIT_LIST_HEAD(&log->finished_ios);
   458		INIT_LIST_HEAD(&log->no_mem_stripes);
 > 459		bio_init(&log->flush_bio);
   460	
   461		log->io_kc = log_parent->io_kc;
   462		log->io_pool = log_parent->io_pool;

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 31114 bytes --]

^ permalink raw reply

* Re: [PATCH 4/4] md/raid5-cache: adjust the write position of the empty block and mark it as a checkpoint
From: Shaohua Li @ 2016-12-06  1:00 UTC (permalink / raw)
  To: JackieLiu; +Cc: songliubraving, 刘正元, linux-raid
In-Reply-To: <D667B2F7-64DE-434D-B793-F4F514BB2714@kylinos.cn>

On Mon, Dec 05, 2016 at 11:58:53AM +0800, JackieLiu wrote:
> 
> > 在 2016年12月3日,04:10,Shaohua Li <shli@kernel.org> 写道:
> > 
> > confusing reading the code. Don't think a 'if () write_empty_block' makes the
> 
> Hi ShaoHua. 
> 
> I rewrote this part of the code, now without data_only_stripes, we write an empty block here,
> with data_only_stripes, mark the first cache block as last_checkpoint.

Next time please send formal patch and fix your mail client (it skews up tabs).
I manually applied this patch this time. Thanks!

Best Regards,
Shaohua

 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 9e72180..5697724 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -2072,7 +2072,6 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
>                 return -ENOMEM;
>         }
> 
> -       ctx->seq += 10;
>         list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
>                 struct r5l_meta_block *mb;
>                 int i;
> @@ -2132,6 +2131,8 @@ static int r5l_recovery_log(struct r5l_log *log)
>         struct mddev *mddev = log->rdev->mddev;
>         struct r5l_recovery_ctx ctx;
>         int ret;
> +       sector_t pos;
> +       struct stripe_head *sh;
> 
>         ctx.pos = log->last_checkpoint;
>         ctx.seq = log->last_cp_seq;
> @@ -2149,6 +2150,18 @@ static int r5l_recovery_log(struct r5l_log *log)
>         if (ret)
>                 return ret;
> 
> +       pos = ctx.pos;
> +       ctx.seq += 10;
> +
> +       if (ctx.data_only_stripes == 0) {
> +               log->next_checkpoint = ctx.pos;
> +               r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
> +               ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
> +       } else {
> +               sh = list_last_entry(&ctx.cached_list, struct stripe_head, lru);
> +               log->next_checkpoint = sh->log_start;
> +       }
> +
>         if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
>                 pr_debug("md/raid:%s: starting from clean shutdown\n",
>                          mdname(mddev));
> @@ -2166,10 +2179,9 @@ static int r5l_recovery_log(struct r5l_log *log)
>         }
> 
>         log->log_start = ctx.pos;
> -       log->next_checkpoint = ctx.pos;
>         log->seq = ctx.seq;
> -       r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq);
> -       r5l_write_super(log, ctx.pos);
> +       log->last_checkpoint = pos;
> +       r5l_write_super(log, pos);
>         return 0;
>  }
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] md: fix refcount problem on mddev when stopping array.
From: Shaohua Li @ 2016-12-06  0:39 UTC (permalink / raw)
  To: NeilBrown; +Cc: Guoqing Jiang, linux-raid, Marc Smith
In-Reply-To: <87h96j53x9.fsf@notabene.neil.brown.name>

On Mon, Dec 05, 2016 at 04:40:50PM +1100, Neil Brown wrote:
> 
> md_open() gets a counted reference on an mddev using mddev_find().
> If it ends up returning an error, it must drop this reference.
> 
> There are two error paths where the reference is not dropped.
> One only happens if the process is signalled and an awkward time,
> which is quite unlikely.
> The other was introduced recently in commit af8d8e6f0.
> 
> Change the code to ensure the drop the reference when returning an error,
> and make it harded to re-introduce this sort of bug in the future.
> 
> Reported-by: Marc Smith <marc.smith@mcc.edu>
> Fixes: af8d8e6f0315 ("md: changes for MD_STILL_CLOSED flag")
> Signed-off-by: NeilBrown <neilb@suse.com>
> ---
> 
> Hi Shaohua,
>  as this bug was introduced in v4.9-rc1, it would be great if this
>  patch could get to Linus before v4.9-final comes out (on Sunday?).

Applied to the for-next tree. This sounds not significant enough, so I'll push
it to 4.10.

Thanks,
Shaohua
 
> Thanks,
> NeilBrown
> 
> 
>  drivers/md/md.c | 5 ++++-
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 2089d46b0eb8..d1a291ac2a75 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -7092,7 +7092,8 @@ static int md_open(struct block_device *bdev, fmode_t mode)
>  
>  	if (test_bit(MD_CLOSING, &mddev->flags)) {
>  		mutex_unlock(&mddev->open_mutex);
> -		return -ENODEV;
> +		err = -ENODEV;
> +		goto out;
>  	}
>  
>  	err = 0;
> @@ -7101,6 +7102,8 @@ static int md_open(struct block_device *bdev, fmode_t mode)
>  
>  	check_disk_change(bdev);
>   out:
> +	if (err)
> +		mddev_put(mddev);
>  	return err;
>  }
>  
> -- 
> 2.10.2
> 



^ permalink raw reply

* Re: [PATCH v2 6/7] Allow changing the RWH policy for a running array
From: Jes Sorensen @ 2016-12-05 21:50 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: linux-raid
In-Reply-To: <20161205153204.7343-7-artur.paszkiewicz@intel.com>

Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
> This extends the --rwh-policy parameter to work also in Misc mode. Using
> it changes the currently active RWH policy in the kernel driver and
> updates the metadata to make this change permanent. Updating metadata is
> not yet implemented for super1, so this is limited to IMSM for now.
>
> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>

Hi Artur,

It looked good all the way up until 6/7, but there is a nit here:

> ---
>  Manage.c      | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  mdadm.c       |  9 +++++++
>  mdadm.h       |  1 +
>  super-intel.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 153 insertions(+), 1 deletion(-)
[snip]
> diff --git a/super-intel.c b/super-intel.c
> index e524ef0..3b40429 100644
> --- a/super-intel.c
> +++ b/super-intel.c
> @@ -448,6 +448,7 @@ enum imsm_update_type {
>  	update_general_migration_checkpoint,
>  	update_size_change,
>  	update_prealloc_badblocks_mem,
> +	update_rwh_policy,
>  };
>  
>  struct imsm_update_activate_spare {
> @@ -540,6 +541,12 @@ struct imsm_update_prealloc_bb_mem {
>  	enum imsm_update_type type;
>  };
>  
> +struct imsm_update_rwh_policy {
> +	enum imsm_update_type type;
> +	int new_policy;
> +	int dev_idx;
> +};
> +
>  static const char *_sys_dev_type[] = {
>  	[SYS_DEV_UNKNOWN] = "Unknown",
>  	[SYS_DEV_SAS] = "SAS",
> @@ -3175,7 +3182,6 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>  	info->custom_array_size   <<= 32;
>  	info->custom_array_size   |= __le32_to_cpu(dev->size_low);
>  	info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
> -	info->journal_clean = dev->rwh_policy;
>  
>  	if (is_gen_migration(dev)) {
>  		info->reshape_active = 1;
> @@ -3347,6 +3353,8 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>  			info->rwh_policy = RWH_POLICY_PPL;
>  		else
>  			info->rwh_policy = RWH_POLICY_UNKNOWN;
> +
> +		info->journal_clean = info->rwh_policy == RWH_POLICY_PPL;
>  	}
>  }

This part doesn't make sense, first you set info->rwh_policy based on
sb->feature_map to RWH_POLICY_PPL or RWH_POLICY_UNKNOWN and then right
after you hard set it to RWH_POLICY_PPL.

In general I really would prefer not to see any of those double
assignments if it can be avoided.

Cheers,
Jes

^ permalink raw reply

* Re: MD Remnants After --stop
From: Marc Smith @ 2016-12-05 21:37 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid
In-Reply-To: <87r35n5hsn.fsf@notabene.neil.brown.name>

On Sun, Dec 4, 2016 at 7:41 PM, NeilBrown <neilb@suse.com> wrote:
> On Sat, Dec 03 2016, Marc Smith wrote:
>
>> Finally, I got it! Why is it when I want it to break, it doesn't. =)
>
> welcome to my world :-)
>
>
>>
>> I will say, using the modified mdadm that prevents the synthesized
>> CHANGE event, it seems to not induce the problem as regularly.
>>
>> Below are the kernel logs after stopping an array:
>
> Thank you so much for persisting with this.
> The logs you provide make it clear that two separate processes (494 and
> 31178) increment the ->active count by opening the device, but never
> decrement that count by closing the device.
> It seems too unlikely that either process would be holding the
> file descriptor open indefinitely, so something must be going wrong
> either as part of 'open', or as part of 'close'.
>
> Now that I know where to look, the bug is obvious.  Why didn't I see
> that before?
>
> The open request is failing, almost certainly because MD_CLOSING is set,
> but the ->active count isn't being decremented on failure.
> This patch should fix it.
>
> Please test and report results.
>
> Thanks,
> NeilBrown
>
> Fixes: af8d8e6f0315 ("md: changes for MD_STILL_CLOSED flag" v4.9-rc1)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 2089d46b0eb8..a8e07eb2ca5f 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -7087,11 +7087,14 @@ static int md_open(struct block_device *bdev, fmode_t mode)
>         }
>         BUG_ON(mddev != bdev->bd_disk->private_data);
>
> -       if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
> +       if ((err = mutex_lock_interruptible(&mddev->open_mutex))) {
> +               mddev_put(mddev);
>                 goto out;
> +       }
>
>         if (test_bit(MD_CLOSING, &mddev->flags)) {
>                 mutex_unlock(&mddev->open_mutex);
> +               mddev_put(mddev);
>                 return -ENODEV;
>         }
>

That did the trick! I ran 'mdadm --stop' eight different times on two
nodes, and every time it was removed completely from /dev and
/sys/block just like expected. =)

Thanks for your effort and time on this. I think I read in the other
thread RE: this patch, it may make it into 4.9?


--Marc

^ permalink raw reply

* Re: [PATCH v3 0/2] Reorganize raid*_make_request to clean up code
From: Robert LeBlanc @ 2016-12-05 20:04 UTC (permalink / raw)
  To: linux-raid; +Cc: Robert LeBlanc
In-Reply-To: <20161205200258.6653-1-robert@leblancnet.us>

And also rebased against for-next.
----------------
Robert LeBlanc
PGP Fingerprint 79A2 9CA4 6CC4 45DD A904  C70E E654 3BB2 FA62 B9F1


On Mon, Dec 5, 2016 at 1:02 PM, Robert LeBlanc <robert@leblancnet.us> wrote:
> In response to Christoph, I've broken the read and writes into their own
> functions to make the code even cleaner. Since it is such a big change, I broke
> up the commits into this series instead of creating a v2 of the previous patch.
>
> Changes since v2:
>  Shaohua Li
>  * Make md_write_start before wait_barrier
>  * Move I/O in recovery stripe test to write path
>  * Changed to if/then instead of return in __make_request
>
> Changes since v1:
>
>  John Stoffel
>  * Changed to if/then instead of return in raid1_make_request
>
>  Neil Brown
>  * Moved wait_barrier into raid1_{read,write}_request so that it could be after
>    ->suspend_{hi,lo}. This prevents a write blocking a resync until the suspend
>    region is moved.
>
> Robert LeBlanc (2):
>   md/raid1: Refactor raid1_make_request
>   md/raid10: Refactor raid10_make_request
>
>  drivers/md/raid1.c  | 259 +++++++++++++++++++++++++++-------------------------
>  drivers/md/raid10.c | 215 ++++++++++++++++++++++---------------------
>  2 files changed, 245 insertions(+), 229 deletions(-)
>
> --
> 2.10.2
>

^ permalink raw reply

* [PATCH v3 2/2] md/raid10: Refactor raid10_make_request
From: Robert LeBlanc @ 2016-12-05 20:02 UTC (permalink / raw)
  To: linux-raid; +Cc: Robert LeBlanc
In-Reply-To: <20161205200258.6653-1-robert@leblancnet.us>

Refactor raid10_make_request into seperate read and write functions to
clean up the code.

Signed-off-by: Robert LeBlanc <robert@leblancnet.us>
---
 drivers/md/raid10.c | 215 +++++++++++++++++++++++++++-------------------------
 1 file changed, 111 insertions(+), 104 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 525ca99..8698e00 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1087,23 +1087,97 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	kfree(plug);
 }
 
-static void __make_request(struct mddev *mddev, struct bio *bio)
+static void raid10_read_request(struct mddev *mddev, struct bio *bio,
+				struct r10bio *r10_bio)
 {
 	struct r10conf *conf = mddev->private;
-	struct r10bio *r10_bio;
 	struct bio *read_bio;
+	const int op = bio_op(bio);
+	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
+	int sectors_handled;
+	int max_sectors;
+	struct md_rdev *rdev;
+	int slot;
+
+	wait_barrier(conf);
+
+read_again:
+	rdev = read_balance(conf, r10_bio, &max_sectors);
+	if (!rdev) {
+		raid_end_bio_io(r10_bio);
+		return;
+	}
+	slot = r10_bio->read_slot;
+
+	read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+	bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
+		 max_sectors);
+
+	r10_bio->devs[slot].bio = read_bio;
+	r10_bio->devs[slot].rdev = rdev;
+
+	read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
+		choose_data_offset(r10_bio, rdev);
+	read_bio->bi_bdev = rdev->bdev;
+	read_bio->bi_end_io = raid10_end_read_request;
+	bio_set_op_attrs(read_bio, op, do_sync);
+	if (test_bit(FailFast, &rdev->flags) &&
+	    test_bit(R10BIO_FailFast, &r10_bio->state))
+	        read_bio->bi_opf |= MD_FAILFAST;
+	read_bio->bi_private = r10_bio;
+
+	if (mddev->gendisk)
+	        trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
+	                              read_bio, disk_devt(mddev->gendisk),
+	                              r10_bio->sector);
+	if (max_sectors < r10_bio->sectors) {
+		/* Could not read all from this device, so we will
+		 * need another r10_bio.
+		 */
+		sectors_handled = (r10_bio->sector + max_sectors
+				   - bio->bi_iter.bi_sector);
+		r10_bio->sectors = max_sectors;
+		spin_lock_irq(&conf->device_lock);
+		if (bio->bi_phys_segments == 0)
+			bio->bi_phys_segments = 2;
+		else
+			bio->bi_phys_segments++;
+		spin_unlock_irq(&conf->device_lock);
+		/* Cannot call generic_make_request directly
+		 * as that will be queued in __generic_make_request
+		 * and subsequent mempool_alloc might block
+		 * waiting for it.  so hand bio over to raid10d.
+		 */
+		reschedule_retry(r10_bio);
+
+		r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+		r10_bio->master_bio = bio;
+		r10_bio->sectors = bio_sectors(bio) - sectors_handled;
+		r10_bio->state = 0;
+		r10_bio->mddev = mddev;
+		r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+		goto read_again;
+	} else
+		generic_make_request(read_bio);
+	return;
+}
+
+static void raid10_write_request(struct mddev *mddev, struct bio *bio,
+				 struct r10bio *r10_bio)
+{
+	struct r10conf *conf = mddev->private;
 	int i;
 	const int op = bio_op(bio);
-	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
 	const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
 	unsigned long flags;
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
 	struct raid10_plug_cb *plug = NULL;
+	sector_t sectors;
 	int sectors_handled;
 	int max_sectors;
-	int sectors;
 
 	md_write_start(mddev, bio);
 
@@ -1130,7 +1204,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 		wait_barrier(conf);
 	}
 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-	    bio_data_dir(bio) == WRITE &&
 	    (mddev->reshape_backwards
 	     ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
 		bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
@@ -1147,99 +1220,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 
 		conf->reshape_safe = mddev->reshape_position;
 	}
-
-	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
-	r10_bio->master_bio = bio;
-	r10_bio->sectors = sectors;
-
-	r10_bio->mddev = mddev;
-	r10_bio->sector = bio->bi_iter.bi_sector;
-	r10_bio->state = 0;
-
-	/* We might need to issue multiple reads to different
-	 * devices if there are bad blocks around, so we keep
-	 * track of the number of reads in bio->bi_phys_segments.
-	 * If this is 0, there is only one r10_bio and no locking
-	 * will be needed when the request completes.  If it is
-	 * non-zero, then it is the number of not-completed requests.
-	 */
-	bio->bi_phys_segments = 0;
-	bio_clear_flag(bio, BIO_SEG_VALID);
-
-	if (rw == READ) {
-		/*
-		 * read balancing logic:
-		 */
-		struct md_rdev *rdev;
-		int slot;
-
-read_again:
-		rdev = read_balance(conf, r10_bio, &max_sectors);
-		if (!rdev) {
-			raid_end_bio_io(r10_bio);
-			return;
-		}
-		slot = r10_bio->read_slot;
-
-		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
-			 max_sectors);
-
-		r10_bio->devs[slot].bio = read_bio;
-		r10_bio->devs[slot].rdev = rdev;
-
-		read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
-			choose_data_offset(r10_bio, rdev);
-		read_bio->bi_bdev = rdev->bdev;
-		read_bio->bi_end_io = raid10_end_read_request;
-		bio_set_op_attrs(read_bio, op, do_sync);
-		if (test_bit(FailFast, &rdev->flags) &&
-		    test_bit(R10BIO_FailFast, &r10_bio->state))
-			read_bio->bi_opf |= MD_FAILFAST;
-		read_bio->bi_private = r10_bio;
-
-		if (mddev->gendisk)
-			trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
-					      read_bio, disk_devt(mddev->gendisk),
-					      r10_bio->sector);
-		if (max_sectors < r10_bio->sectors) {
-			/* Could not read all from this device, so we will
-			 * need another r10_bio.
-			 */
-			sectors_handled = (r10_bio->sector + max_sectors
-					   - bio->bi_iter.bi_sector);
-			r10_bio->sectors = max_sectors;
-			spin_lock_irq(&conf->device_lock);
-			if (bio->bi_phys_segments == 0)
-				bio->bi_phys_segments = 2;
-			else
-				bio->bi_phys_segments++;
-			spin_unlock_irq(&conf->device_lock);
-			/* Cannot call generic_make_request directly
-			 * as that will be queued in __generic_make_request
-			 * and subsequent mempool_alloc might block
-			 * waiting for it.  so hand bio over to raid10d.
-			 */
-			reschedule_retry(r10_bio);
-
-			r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
-			r10_bio->master_bio = bio;
-			r10_bio->sectors = bio_sectors(bio) - sectors_handled;
-			r10_bio->state = 0;
-			r10_bio->mddev = mddev;
-			r10_bio->sector = bio->bi_iter.bi_sector +
-				sectors_handled;
-			goto read_again;
-		} else
-			generic_make_request(read_bio);
-		return;
-	}
-
-	/*
-	 * WRITE:
-	 */
 	if (conf->pending_count >= max_queued_requests) {
 		md_wakeup_thread(mddev->thread);
 		raid10_log(mddev, "wait queued");
@@ -1300,8 +1280,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 			int bad_sectors;
 			int is_bad;
 
-			is_bad = is_badblock(rdev, dev_sector,
-					     max_sectors,
+			is_bad = is_badblock(rdev, dev_sector, max_sectors,
 					     &first_bad, &bad_sectors);
 			if (is_bad < 0) {
 				/* Mustn't write here until the bad block
@@ -1405,8 +1384,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 			r10_bio->devs[i].bio = mbio;
 
 			mbio->bi_iter.bi_sector	= (r10_bio->devs[i].addr+
-					   choose_data_offset(r10_bio,
-							      rdev));
+					   choose_data_offset(r10_bio, rdev));
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
 			bio_set_op_attrs(mbio, op, do_sync | do_fua);
@@ -1457,8 +1435,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 			r10_bio->devs[i].repl_bio = mbio;
 
 			mbio->bi_iter.bi_sector	= (r10_bio->devs[i].addr +
-					   choose_data_offset(
-						   r10_bio, rdev));
+					   choose_data_offset(r10_bio, rdev));
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
 			bio_set_op_attrs(mbio, op, do_sync | do_fua);
@@ -1503,6 +1480,36 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 	one_write_done(r10_bio);
 }
 
+static void __make_request(struct mddev *mddev, struct bio *bio)
+{
+	struct r10conf *conf = mddev->private;
+	struct r10bio *r10_bio;
+
+	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+	r10_bio->master_bio = bio;
+	r10_bio->sectors = bio_sectors(bio);
+
+	r10_bio->mddev = mddev;
+	r10_bio->sector = bio->bi_iter.bi_sector;
+	r10_bio->state = 0;
+
+	/* We might need to issue multiple reads to different
+	 * devices if there are bad blocks around, so we keep
+	 * track of the number of reads in bio->bi_phys_segments.
+	 * If this is 0, there is only one r10_bio and no locking
+	 * will be needed when the request completes.  If it is
+	 * non-zero, then it is the number of not-completed requests.
+	 */
+	bio->bi_phys_segments = 0;
+	bio_clear_flag(bio, BIO_SEG_VALID);
+
+	if (bio_data_dir(bio) == READ)
+		raid10_read_request(mddev, bio, r10_bio);
+	else
+		raid10_write_request(mddev, bio, r10_bio);
+}
+
 static void raid10_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct r10conf *conf = mddev->private;
-- 
2.10.2


^ permalink raw reply related

* [PATCH v3 1/2] md/raid1: Refactor raid1_make_request
From: Robert LeBlanc @ 2016-12-05 20:02 UTC (permalink / raw)
  To: linux-raid; +Cc: Robert LeBlanc
In-Reply-To: <20161205200258.6653-1-robert@leblancnet.us>

Refactor raid1_make_request to make read and write code in their own
functions to clean up the code.

Signed-off-by: Robert LeBlanc <robert@leblancnet.us>
---
 drivers/md/raid1.c | 259 +++++++++++++++++++++++++++--------------------------
 1 file changed, 134 insertions(+), 125 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 94e0afc..4edefb2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1066,17 +1066,106 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	kfree(plug);
 }
 
-static void raid1_make_request(struct mddev *mddev, struct bio * bio)
+static void raid1_read_request(struct mddev *mddev, struct bio *bio,
+				 struct r1bio *r1_bio)
 {
 	struct r1conf *conf = mddev->private;
 	struct raid1_info *mirror;
-	struct r1bio *r1_bio;
 	struct bio *read_bio;
+	struct bitmap *bitmap = mddev->bitmap;
+	const int op = bio_op(bio);
+	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
+	int sectors_handled;
+	int max_sectors;
+	int rdisk;
+
+	wait_barrier(conf, bio);
+
+read_again:
+	rdisk = read_balance(conf, r1_bio, &max_sectors);
+
+	if (rdisk < 0) {
+		/* couldn't find anywhere to read from */
+		raid_end_bio_io(r1_bio);
+		return;
+	}
+	mirror = conf->mirrors + rdisk;
+
+	if (test_bit(WriteMostly, &mirror->rdev->flags) &&
+	    bitmap) {
+		/* Reading from a write-mostly device must
+		 * take care not to over-take any writes
+		 * that are 'behind'
+		 */
+		raid1_log(mddev, "wait behind writes");
+		wait_event(bitmap->behind_wait,
+			   atomic_read(&bitmap->behind_writes) == 0);
+	}
+	r1_bio->read_disk = rdisk;
+	r1_bio->start_next_window = 0;
+
+	read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+	bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
+		 max_sectors);
+
+	r1_bio->bios[rdisk] = read_bio;
+
+	read_bio->bi_iter.bi_sector = r1_bio->sector +
+		mirror->rdev->data_offset;
+	read_bio->bi_bdev = mirror->rdev->bdev;
+	read_bio->bi_end_io = raid1_end_read_request;
+	bio_set_op_attrs(read_bio, op, do_sync);
+	if (test_bit(FailFast, &mirror->rdev->flags) &&
+	    test_bit(R1BIO_FailFast, &r1_bio->state))
+	        read_bio->bi_opf |= MD_FAILFAST;
+	read_bio->bi_private = r1_bio;
+
+	if (mddev->gendisk)
+	        trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
+	                              read_bio, disk_devt(mddev->gendisk),
+	                              r1_bio->sector);
+
+	if (max_sectors < r1_bio->sectors) {
+		/* could not read all from this device, so we will
+		 * need another r1_bio.
+		 */
+
+		sectors_handled = (r1_bio->sector + max_sectors
+				   - bio->bi_iter.bi_sector);
+		r1_bio->sectors = max_sectors;
+		spin_lock_irq(&conf->device_lock);
+		if (bio->bi_phys_segments == 0)
+			bio->bi_phys_segments = 2;
+		else
+			bio->bi_phys_segments++;
+		spin_unlock_irq(&conf->device_lock);
+		/* Cannot call generic_make_request directly
+		 * as that will be queued in __make_request
+		 * and subsequent mempool_alloc might block waiting
+		 * for it.  So hand bio over to raid1d.
+		 */
+		reschedule_retry(r1_bio);
+
+		r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+		r1_bio->master_bio = bio;
+		r1_bio->sectors = bio_sectors(bio) - sectors_handled;
+		r1_bio->state = 0;
+		r1_bio->mddev = mddev;
+		r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+		goto read_again;
+	} else
+		generic_make_request(read_bio);
+}
+
+static void raid1_write_request(struct mddev *mddev, struct bio *bio,
+				struct r1bio *r1_bio)
+{
+	struct r1conf *conf = mddev->private;
 	int i, disks;
-	struct bitmap *bitmap;
+	struct bitmap *bitmap = mddev->bitmap;
 	unsigned long flags;
 	const int op = bio_op(bio);
-	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
 	const unsigned long do_flush_fua = (bio->bi_opf &
 						(REQ_PREFLUSH | REQ_FUA));
@@ -1096,12 +1185,11 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
 
 	md_write_start(mddev, bio); /* wait on superblock update early */
 
-	if (bio_data_dir(bio) == WRITE &&
-	    ((bio_end_sector(bio) > mddev->suspend_lo &&
+	if ((bio_end_sector(bio) > mddev->suspend_lo &&
 	    bio->bi_iter.bi_sector < mddev->suspend_hi) ||
 	    (mddev_is_clustered(mddev) &&
 	     md_cluster_ops->area_resyncing(mddev, WRITE,
-		     bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
+		     bio->bi_iter.bi_sector, bio_end_sector(bio)))) {
 		/* As the suspend_* range is controlled by
 		 * userspace, we want an interruptible
 		 * wait.
@@ -1115,128 +1203,15 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
 			    bio->bi_iter.bi_sector >= mddev->suspend_hi ||
 			    (mddev_is_clustered(mddev) &&
 			     !md_cluster_ops->area_resyncing(mddev, WRITE,
-				     bio->bi_iter.bi_sector, bio_end_sector(bio))))
+				     bio->bi_iter.bi_sector,
+				     bio_end_sector(bio))))
 				break;
 			schedule();
 		}
 		finish_wait(&conf->wait_barrier, &w);
 	}
-
 	start_next_window = wait_barrier(conf, bio);
 
-	bitmap = mddev->bitmap;
-
-	/*
-	 * make_request() can abort the operation when read-ahead is being
-	 * used and no empty request is available.
-	 *
-	 */
-	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-
-	r1_bio->master_bio = bio;
-	r1_bio->sectors = bio_sectors(bio);
-	r1_bio->state = 0;
-	r1_bio->mddev = mddev;
-	r1_bio->sector = bio->bi_iter.bi_sector;
-
-	/* We might need to issue multiple reads to different
-	 * devices if there are bad blocks around, so we keep
-	 * track of the number of reads in bio->bi_phys_segments.
-	 * If this is 0, there is only one r1_bio and no locking
-	 * will be needed when requests complete.  If it is
-	 * non-zero, then it is the number of not-completed requests.
-	 */
-	bio->bi_phys_segments = 0;
-	bio_clear_flag(bio, BIO_SEG_VALID);
-
-	if (rw == READ) {
-		/*
-		 * read balancing logic:
-		 */
-		int rdisk;
-
-read_again:
-		rdisk = read_balance(conf, r1_bio, &max_sectors);
-
-		if (rdisk < 0) {
-			/* couldn't find anywhere to read from */
-			raid_end_bio_io(r1_bio);
-			return;
-		}
-		mirror = conf->mirrors + rdisk;
-
-		if (test_bit(WriteMostly, &mirror->rdev->flags) &&
-		    bitmap) {
-			/* Reading from a write-mostly device must
-			 * take care not to over-take any writes
-			 * that are 'behind'
-			 */
-			raid1_log(mddev, "wait behind writes");
-			wait_event(bitmap->behind_wait,
-				   atomic_read(&bitmap->behind_writes) == 0);
-		}
-		r1_bio->read_disk = rdisk;
-		r1_bio->start_next_window = 0;
-
-		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
-			 max_sectors);
-
-		r1_bio->bios[rdisk] = read_bio;
-
-		read_bio->bi_iter.bi_sector = r1_bio->sector +
-			mirror->rdev->data_offset;
-		read_bio->bi_bdev = mirror->rdev->bdev;
-		read_bio->bi_end_io = raid1_end_read_request;
-		bio_set_op_attrs(read_bio, op, do_sync);
-		if (test_bit(FailFast, &mirror->rdev->flags) &&
-		    test_bit(R1BIO_FailFast, &r1_bio->state))
-			read_bio->bi_opf |= MD_FAILFAST;
-		read_bio->bi_private = r1_bio;
-
-		if (mddev->gendisk)
-			trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
-					      read_bio, disk_devt(mddev->gendisk),
-					      r1_bio->sector);
-
-		if (max_sectors < r1_bio->sectors) {
-			/* could not read all from this device, so we will
-			 * need another r1_bio.
-			 */
-
-			sectors_handled = (r1_bio->sector + max_sectors
-					   - bio->bi_iter.bi_sector);
-			r1_bio->sectors = max_sectors;
-			spin_lock_irq(&conf->device_lock);
-			if (bio->bi_phys_segments == 0)
-				bio->bi_phys_segments = 2;
-			else
-				bio->bi_phys_segments++;
-			spin_unlock_irq(&conf->device_lock);
-			/* Cannot call generic_make_request directly
-			 * as that will be queued in __make_request
-			 * and subsequent mempool_alloc might block waiting
-			 * for it.  So hand bio over to raid1d.
-			 */
-			reschedule_retry(r1_bio);
-
-			r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-
-			r1_bio->master_bio = bio;
-			r1_bio->sectors = bio_sectors(bio) - sectors_handled;
-			r1_bio->state = 0;
-			r1_bio->mddev = mddev;
-			r1_bio->sector = bio->bi_iter.bi_sector +
-				sectors_handled;
-			goto read_again;
-		} else
-			generic_make_request(read_bio);
-		return;
-	}
-
-	/*
-	 * WRITE:
-	 */
 	if (conf->pending_count >= max_queued_requests) {
 		md_wakeup_thread(mddev->thread);
 		raid1_log(mddev, "wait queued");
@@ -1280,8 +1255,7 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
 			int bad_sectors;
 			int is_bad;
 
-			is_bad = is_badblock(rdev, r1_bio->sector,
-					     max_sectors,
+			is_bad = is_badblock(rdev, r1_bio->sector, max_sectors,
 					     &first_bad, &bad_sectors);
 			if (is_bad < 0) {
 				/* mustn't write here until the bad block is
@@ -1370,7 +1344,8 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
 			continue;
 
 		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors);
+		bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector,
+			 max_sectors);
 
 		if (first_clone) {
 			/* do behind I/O ?
@@ -1464,6 +1439,40 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
 	wake_up(&conf->wait_barrier);
 }
 
+static void raid1_make_request(struct mddev *mddev, struct bio *bio)
+{
+	struct r1conf *conf = mddev->private;
+	struct r1bio *r1_bio;
+
+	/*
+	 * make_request() can abort the operation when read-ahead is being
+	 * used and no empty request is available.
+	 *
+	 */
+	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+	r1_bio->master_bio = bio;
+	r1_bio->sectors = bio_sectors(bio);
+	r1_bio->state = 0;
+	r1_bio->mddev = mddev;
+	r1_bio->sector = bio->bi_iter.bi_sector;
+
+	/* We might need to issue multiple reads to different
+	 * devices if there are bad blocks around, so we keep
+	 * track of the number of reads in bio->bi_phys_segments.
+	 * If this is 0, there is only one r1_bio and no locking
+	 * will be needed when requests complete.  If it is
+	 * non-zero, then it is the number of not-completed requests.
+	 */
+	bio->bi_phys_segments = 0;
+	bio_clear_flag(bio, BIO_SEG_VALID);
+
+	if (bio_data_dir(bio) == READ)
+		raid1_read_request(mddev, bio, r1_bio);
+	else
+		raid1_write_request(mddev, bio, r1_bio);
+}
+
 static void raid1_status(struct seq_file *seq, struct mddev *mddev)
 {
 	struct r1conf *conf = mddev->private;
-- 
2.10.2


^ permalink raw reply related

* [PATCH v3 0/2] Reorganize raid*_make_request to clean up code
From: Robert LeBlanc @ 2016-12-05 20:02 UTC (permalink / raw)
  To: linux-raid; +Cc: Robert LeBlanc

In response to Christoph, I've broken the read and writes into their own
functions to make the code even cleaner. Since it is such a big change, I broke
up the commits into this series instead of creating a v2 of the previous patch.

Changes since v2:
 Shaohua Li
 * Make md_write_start before wait_barrier
 * Move I/O in recovery stripe test to write path
 * Changed to if/then instead of return in __make_request

Changes since v1:
 
 John Stoffel
 * Changed to if/then instead of return in raid1_make_request

 Neil Brown
 * Moved wait_barrier into raid1_{read,write}_request so that it could be after
   ->suspend_{hi,lo}. This prevents a write blocking a resync until the suspend
   region is moved.

Robert LeBlanc (2):
  md/raid1: Refactor raid1_make_request
  md/raid10: Refactor raid10_make_request

 drivers/md/raid1.c  | 259 +++++++++++++++++++++++++++-------------------------
 drivers/md/raid10.c | 215 ++++++++++++++++++++++---------------------
 2 files changed, 245 insertions(+), 229 deletions(-)

-- 
2.10.2


^ permalink raw reply

* [PATCH v2 7/7] Man page changes for --rwh-policy
From: Artur Paszkiewicz @ 2016-12-05 15:32 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid
In-Reply-To: <20161205153204.7343-1-artur.paszkiewicz@intel.com>

Describe the usage of the --rwh-policy parameter in Create and Misc
modes.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 mdadm.8.in | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/mdadm.8.in b/mdadm.8.in
index aa80f0c..d4456a1 100644
--- a/mdadm.8.in
+++ b/mdadm.8.in
@@ -1015,6 +1015,26 @@ simultaneously. If not specified, this defaults to 4.
 Specify journal device for the RAID-4/5/6 array. The journal device
 should be a SSD with reasonable lifetime.
 
+.TP
+.BR \-\-rwh-policy=
+Specify the RAID Write Hole policy for a RAID-4/5/6 array. Currently supported
+options are
+.BR off ,
+.B journal
+and
+.BR ppl .
+
+The
+.B journal
+policy is implicitly selected when using
+.BR \-\-write-journal .
+
+The
+.B ppl
+policy (Partial Parity Log) is a mechanism that can be used with RAID5 arrays.
+This feature prevents data loss by keeping parity consistent with data even in
+case of drive failure during dirty shutdown. PPL is stored in the metadata
+region of RAID member drives, no additional journal drive is needed.
 
 .SH For assemble:
 
@@ -1705,6 +1725,14 @@ can be found it
 under
 .BR "SCRUBBING AND MISMATCHES" .
 
+.TP
+.BR \-\-rwh-policy=
+Change the RAID Write Hole policy for a RAID-4/5/6 array at runtime. For
+details about the RWH policies, see the description for the same parameter
+under
+.B Create mode
+options.
+
 .SH For Incremental Assembly mode:
 .TP
 .BR \-\-rebuild\-map ", " \-r
-- 
2.10.1


^ permalink raw reply related

* [PATCH v2 6/7] Allow changing the RWH policy for a running array
From: Artur Paszkiewicz @ 2016-12-05 15:32 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid
In-Reply-To: <20161205153204.7343-1-artur.paszkiewicz@intel.com>

This extends the --rwh-policy parameter to work also in Misc mode. Using
it changes the currently active RWH policy in the kernel driver and
updates the metadata to make this change permanent. Updating metadata is
not yet implemented for super1, so this is limited to IMSM for now.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 Manage.c      | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 mdadm.c       |  9 +++++++
 mdadm.h       |  1 +
 super-intel.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 153 insertions(+), 1 deletion(-)

diff --git a/Manage.c b/Manage.c
index 5c3d2b9..0119684 100644
--- a/Manage.c
+++ b/Manage.c
@@ -1823,4 +1823,83 @@ int move_spare(char *from_devname, char *to_devname, dev_t devid)
 	close(fd2);
 	return 0;
 }
+
+int ChangeRwhPolicy(char *dev, char *update, int verbose)
+{
+	struct supertype *st;
+	struct mdinfo *info;
+	char *subarray = NULL;
+	int ret = 0;
+	int fd;
+	int new_policy = map_name(rwh_policies, update);
+
+	if (new_policy == UnSet)
+		return 1;
+
+	fd = open(dev, O_RDONLY);
+	if (fd < 0)
+		return 1;
+
+	st = super_by_fd(fd, &subarray);
+	if (!st) {
+		close(fd);
+		return 1;
+	}
+
+	info = sysfs_read(fd, NULL, GET_RWH_POLICY|GET_LEVEL);
+	close(fd);
+	if (!info) {
+		ret = 1;
+		goto free_st;
+	}
+
+	if (new_policy == RWH_POLICY_PPL && !st->ss->supports_ppl) {
+		pr_err("%s metadata does not support PPL\n", st->ss->name);
+		ret = 1;
+		goto free_info;
+	}
+
+	if (info->array.level < 4 || info->array.level > 6) {
+		pr_err("Operation not supported for array level %d\n",
+				info->array.level);
+		ret = 1;
+		goto free_info;
+	}
+
+	if (info->rwh_policy != (unsigned)new_policy) {
+		if (!st->ss->external && new_policy == RWH_POLICY_PPL) {
+			pr_err("Operation supported for external metadata only.\n");
+			ret = 1;
+			goto free_info;
+		}
+
+		if (sysfs_set_str(info, NULL, "rwh_policy", update)) {
+			pr_err("Failed to change array RWH Policy\n");
+			ret = 1;
+			goto free_info;
+		}
+		info->rwh_policy = new_policy;
+	}
+
+	if (subarray) {
+		char container_dev[PATH_MAX];
+		struct mddev_ident ident;
+
+		sprintf(container_dev, "/dev/%s", st->container_devnm);
+
+		st->info = info;
+		ident.st = st;
+
+		ret = Update_subarray(container_dev, subarray, "rwh-policy",
+				&ident, verbose);
+	}
+
+free_info:
+	sysfs_free(info);
+free_st:
+	free(st);
+	free(subarray);
+
+	return ret;
+}
 #endif
diff --git a/mdadm.c b/mdadm.c
index 2b6d3a2..ed21f1c 100644
--- a/mdadm.c
+++ b/mdadm.c
@@ -252,6 +252,7 @@ int main(int argc, char *argv[])
 		case UpdateSubarray:
 		case UdevRules:
 		case KillOpt:
+		case RwhPolicy:
 			if (!mode)
 				newmode = MISC;
 			break;
@@ -1211,11 +1212,16 @@ int main(int argc, char *argv[])
 			s.journaldisks = 1;
 			continue;
 		case O(CREATE, RwhPolicy):
+		case O(MISC, RwhPolicy):
 			s.rwh_policy = map_name(rwh_policies, optarg);
 			if (s.rwh_policy == UnSet) {
 				pr_err("Invalid RWH policy: %s\n", optarg);
 				exit(2);
 			}
+			if (mode == MISC) {
+				devmode = opt;
+				c.update = optarg;
+			}
 			continue;
 		}
 		/* We have now processed all the valid options. Anything else is
@@ -1927,6 +1933,9 @@ static int misc_list(struct mddev_dev *devlist,
 		case Action:
 			rv |= SetAction(dv->devname, c->action);
 			continue;
+		case RwhPolicy:
+			rv |= ChangeRwhPolicy(dv->devname, c->update, c->verbose);
+			continue;
 		}
 		if (dv->devname[0] == '/')
 			mdfd = open_mddev(dv->devname, 1);
diff --git a/mdadm.h b/mdadm.h
index 6d52bdf..53f8bd1 100644
--- a/mdadm.h
+++ b/mdadm.h
@@ -1367,6 +1367,7 @@ extern int Update_subarray(char *dev, char *subarray, char *update, struct mddev
 extern int Wait(char *dev);
 extern int WaitClean(char *dev, int sock, int verbose);
 extern int SetAction(char *dev, char *action);
+extern int ChangeRwhPolicy(char *dev, char *update, int verbose);
 
 extern int Incremental(struct mddev_dev *devlist, struct context *c,
 		       struct supertype *st);
diff --git a/super-intel.c b/super-intel.c
index e524ef0..3b40429 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -448,6 +448,7 @@ enum imsm_update_type {
 	update_general_migration_checkpoint,
 	update_size_change,
 	update_prealloc_badblocks_mem,
+	update_rwh_policy,
 };
 
 struct imsm_update_activate_spare {
@@ -540,6 +541,12 @@ struct imsm_update_prealloc_bb_mem {
 	enum imsm_update_type type;
 };
 
+struct imsm_update_rwh_policy {
+	enum imsm_update_type type;
+	int new_policy;
+	int dev_idx;
+};
+
 static const char *_sys_dev_type[] = {
 	[SYS_DEV_UNKNOWN] = "Unknown",
 	[SYS_DEV_SAS] = "SAS",
@@ -3175,7 +3182,6 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
 	info->custom_array_size   <<= 32;
 	info->custom_array_size   |= __le32_to_cpu(dev->size_low);
 	info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
-	info->journal_clean = dev->rwh_policy;
 
 	if (is_gen_migration(dev)) {
 		info->reshape_active = 1;
@@ -3347,6 +3353,8 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
 			info->rwh_policy = RWH_POLICY_PPL;
 		else
 			info->rwh_policy = RWH_POLICY_UNKNOWN;
+
+		info->journal_clean = info->rwh_policy == RWH_POLICY_PPL;
 	}
 }
 
@@ -7183,6 +7191,41 @@ static int update_subarray_imsm(struct supertype *st, char *subarray,
 			}
 			super->updates_pending++;
 		}
+	} else if (strcmp(update, "rwh-policy") == 0) {
+		struct mdinfo *info;
+		int new_policy;
+		char *ep;
+		int vol = strtoul(subarray, &ep, 10);
+
+		if (!ident->st || !ident->st->info)
+			return 2;
+
+		info = ident->st->info;
+
+		if (*ep != '\0' || vol >= super->anchor->num_raid_devs)
+			return 2;
+
+		if (info->rwh_policy == RWH_POLICY_OFF)
+			new_policy = RWH_OFF;
+		else if (info->rwh_policy == RWH_POLICY_PPL)
+			new_policy = RWH_DISTRIBUTED;
+		else
+			return 2;
+
+		if (st->update_tail) {
+			struct imsm_update_rwh_policy *u = xmalloc(sizeof(*u));
+
+			u->type = update_rwh_policy;
+			u->dev_idx = vol;
+			u->new_policy = new_policy;
+			append_metadata_update(st, u, sizeof(*u));
+		} else {
+			struct imsm_dev *dev;
+
+			dev = get_imsm_dev(super, vol);
+			dev->rwh_policy = new_policy;
+			super->updates_pending++;
+		}
 	} else
 		return 2;
 
@@ -9400,6 +9443,21 @@ static void imsm_process_update(struct supertype *st,
 	}
 	case update_prealloc_badblocks_mem:
 		break;
+	case update_rwh_policy: {
+		struct imsm_update_rwh_policy *u = (void *)update->buf;
+		int target = u->dev_idx;
+		struct imsm_dev *dev = get_imsm_dev(super, target);
+		if (!dev) {
+			dprintf("could not find subarray-%d\n", target);
+			break;
+		}
+
+		if (dev->rwh_policy != u->new_policy) {
+			dev->rwh_policy = u->new_policy;
+			super->updates_pending++;
+		}
+		break;
+	}
 	default:
 		pr_err("error: unsuported process update type:(type: %d)\n",	type);
 	}
@@ -9645,6 +9703,11 @@ static int imsm_prepare_update(struct supertype *st,
 		super->extra_space += sizeof(struct bbm_log) -
 			get_imsm_bbm_log_size(super->bbm_log);
 		break;
+	case update_rwh_policy: {
+		if (update->len < (int)sizeof(struct imsm_update_rwh_policy))
+			return 0;
+		break;
+	}
 	default:
 		return 0;
 	}
-- 
2.10.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox