Linux RAID subsystem development
 help / color / mirror / Atom feed
* Re: [PATCH 1/7] imsm: metadata changes for PPL
From: Artur Paszkiewicz @ 2016-11-29 10:47 UTC (permalink / raw)
  To: Jes Sorensen; +Cc: linux-raid
In-Reply-To: <wrfjpolfjibn.fsf@redhat.com>

On 11/29/2016 12:27 AM, Jes Sorensen wrote:
> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>> Updates for the IMSM metadata format, including PPL header structures.
>>
>> Extend imsm_vol dirty field adding a third value, which is required to
>> enable PPL recovery in Windows and UEFI drivers.
>>
>> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
>> ---
>>  super-intel.c | 48 ++++++++++++++++++++++++++++++++++++++++--------
>>  1 file changed, 40 insertions(+), 8 deletions(-)
> 
> A couple of comments on this change:
> 
>> diff --git a/super-intel.c b/super-intel.c
>> index 5740088..69f6201 100644
>> --- a/super-intel.c
>> +++ b/super-intel.c
>> @@ -154,6 +157,9 @@ struct imsm_vol {
>>  #define MIGR_STATE_CHANGE 4
>>  #define MIGR_REPAIR 5
>>  	__u8  migr_type;	/* Initializing, Rebuilding, ... */
>> +#define RAIDVOL_CLEAN          0
>> +#define RAIDVOL_DIRTY          1
>> +#define RAIDVOL_DSRECORD_VALID 2
>>  	__u8  dirty;
>>  	__u8  fs_state;		/* fast-sync state for CnG (0xff == disabled) */
>>  	__u16 verify_errors;	/* number of mismatches */
>> @@ -189,7 +195,30 @@ struct imsm_dev {
>>  	__u16 cache_policy;
>>  	__u8  cng_state;
>>  	__u8  cng_sub_state;
>> -#define IMSM_DEV_FILLERS 10
>> +	__u16 my_vol_raid_dev_num; /* Used in Unique volume Id for this RaidDev */
>> +
>> +	/* NVM_EN */
>> +	__u8 nv_cache_mode;
>> +	union {
>> +		__u8 nv_cache_flags;
>> +		struct {
>> +		    __u8 dirty:1; /* 1 - cache is dirty, 0 - clean */
>> +		    __u8 nvc_health:2;
>> +		    __u8 expansion_bytes:5;
>> +		} nvCache;
>> +	};
> 
> This sets off alarm bells here - you declare a union of a u8 and a
> matching bitfield, but do not handle bit endian bitfields. If someone
> tried to use this on a big endian system this could get messy.

Those fields are not used in the code at all, the only thing added to
this structure that is used is 'rwh_policy'. The rest is to fill the
gaps between IMSM format in UEFI/Windows.

> 
>> @@ -7565,12 +7594,15 @@ mark_checkpoint:
>>  
>>  skip_mark_checkpoint:
>>  	/* mark dirty / clean */
>> -	if (dev->vol.dirty != !consistent) {
>> +	if ((dev->vol.dirty & RAIDVOL_DIRTY) != !consistent) {
> 
> This part makes my head spin (to be honest, the original code did
> too). I had to go write a small snipped of C to figure out what the
> compiler does with !x given x > 0, since consistent can be 0, 1, and 2
> here.
> 
> Basically for RAIDVOL_CLEAN and consistent = 0, and RAIDVOL_DIRTY and
> consistent = 1 and 2, this statement triggers. Maybe I am just terrible
> dense when it comes to reading obfuscated C, but could we change this
> round to a construct that is a little easier to read?

Yes, I admit it is confusing. Does this look better?

-       if (dev->vol.dirty != !consistent) {
+       if (((dev->vol.dirty & RAIDVOL_DIRTY) && consistent) ||
+           (!(dev->vol.dirty & RAIDVOL_DIRTY) && !consistent)) {

Thanks,
Artur


^ permalink raw reply

* Re: [PATCH 3/7] imsm: PPL support
From: Artur Paszkiewicz @ 2016-11-29 11:02 UTC (permalink / raw)
  To: Jes Sorensen; +Cc: linux-raid
In-Reply-To: <wrfjlgw3jh8c.fsf@redhat.com>

On 11/29/2016 12:51 AM, Jes Sorensen wrote:
> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>> Enable creating and assembling IMSM raid5 arrays with PPL.
>>
>> Write the IMSM MPB location for a device to the newly added rdev
>> sb_start sysfs attribute and 'journal_ppl' to 'state' attribute for
>> every active member.
>>
>> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
>> ---
>>  mdadm.h       |  1 +
>>  super-intel.c | 33 +++++++++++++++++++++++++++++++++
>>  sysfs.c       |  4 ++++
>>  3 files changed, 38 insertions(+)
>>
>> diff --git a/mdadm.h b/mdadm.h
>> index 4eabf59..5600341 100755
>> --- a/mdadm.h
>> +++ b/mdadm.h
>> @@ -252,6 +252,7 @@ struct mdinfo {
>>  	unsigned long long	custom_array_size; /* size for non-default sized
>>  						    * arrays (in sectors)
>>  						    */
>> +	unsigned long long	sb_start;
>>  #define NO_RESHAPE		0
>>  #define VOLUME_RESHAPE		1
>>  #define CONTAINER_RESHAPE	2
>> diff --git a/super-intel.c b/super-intel.c
>> index df09272..79a3d78 100644
>> --- a/super-intel.c
>> +++ b/super-intel.c
>> @@ -1261,6 +1261,15 @@ static void print_imsm_dev(struct intel_super *super,
>>  	}
>>  	printf("\n");
>>  	printf("    Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean");
>> +	printf("     RWH Policy : ");
>> +	if (dev->rwh_policy == RWH_OFF)
>> +		printf("off\n");
>> +	else if (dev->rwh_policy == RWH_DISTRIBUTED)
>> +		printf("PPL distributed\n");
>> +	else if (dev->rwh_policy == RWH_JOURNALING_DRIVE)
>> +		printf("PPL journaling drive\n");
>> +	else
>> +		printf("<unknown:%d>\n", dev->rwh_policy);
>>  }
>>  
>>  static void print_imsm_disk(struct imsm_disk *disk, int index, __u32 reserved)
>> @@ -3043,6 +3052,15 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>>  			}
>>  		}
>>  	}
>> +
>> +	if (info->array.level == 5) {
>> +		if (dev->rwh_policy == RWH_OFF)
>> +			info->rwh_policy = RWH_POLICY_OFF;
>> +		else if (dev->rwh_policy == RWH_DISTRIBUTED)
>> +			info->rwh_policy = RWH_POLICY_PPL;
>> +		else
>> +			info->rwh_policy = RWH_POLICY_UNKNOWN;
>> +	}
>>  }
>>  
>>  static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev,
>> @@ -3177,6 +3195,9 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *
>>  
>>  		disk = &super->disks->disk;
>>  		info->data_offset = total_blocks(&super->disks->disk) - reserved;
>> +		/* mpb anchor sector - see store_imsm_mpb() */
>> +		info->sb_start = total_blocks(&super->disks->disk) -
>> +				 ((2 * super->sector_size) >> 9);
>>  		info->component_size = reserved;
>>  		info->disk.state  = is_configured(disk) ? (1 << MD_DISK_ACTIVE) : 0;
>>  		/* we don't change info->disk.raid_disk here because
> 
> Hi Artur,
> 
> I have been sitting staring at the above for a while, and looking at
> store_imsm_mpb() it is not clear to me what is meant to happen here.
> 
> For 4k sector drives, aren't you pushing back sb_start way further than
> you are for 512 byte sector drives? Ie. you are subtracting 16 sectors
> for the 4k drive but only two sectors for the 512 byte sector drive?
> 
> Maybe it's because it's Monday or I lost the last of my marbles, but
> could you possibly enlighten me here please?

Jes,

You read it correctly. The reason for this is that the IMSM anchor is
located in the second _logical_ sector from the end of the drive. So for
4k drives this will be 16 512-byte sectors from the end.

Thanks,
Artur


^ permalink raw reply

* Re: [BUG] MD/RAID1 hung forever on freeze_array
From: Jinpu Wang @ 2016-11-29 11:15 UTC (permalink / raw)
  To: Coly Li; +Cc: linux-raid, Shaohua Li, Neil Brown, Nate Dailey
In-Reply-To: <dd068837-6873-ab1b-3c25-d4dbf4ad2ec4@suse.de>

On Mon, Nov 28, 2016 at 10:10 AM, Coly Li <colyli@suse.de> wrote:
> On 2016/11/28 下午5:02, Jinpu Wang wrote:
>> On Mon, Nov 28, 2016 at 9:54 AM, Coly Li <colyli@suse.de> wrote:
>>> On 2016/11/28 下午4:24, Jinpu Wang wrote:
>>>> snip
>>>>>>>
>>>>>>> every time nr_pending is 1 bigger then (nr_queued + 1), so seems we
>>>>>>> forgot to increase nr_queued somewhere?
>>>>>>>
>>>>>>> I've noticed (commit ccfc7bf1f09d61)raid1: include bio_end_io_list in
>>>>>>> nr_queued to prevent freeze_array hang. Seems it fixed similar bug.
>>>>>>>
>>>>>>> Could you give your suggestion?
>>>>>>>
>>>>>> Sorry, forgot to mention kernel version is 4.4.28
>>>>>
>>>>> This commit is Cced to stable@vger.kernel.org for v4.3+, do you use a
>>>>> stable kernel or a distribution with 4.4.28 kernel ?
>>>>>
>>>>> Coly
>>>>>
>>>>>
>>>> Hi Coly,
>>>>
>>>> I'm using Debian8 with 4.4.28 kernel.
>>>
>>> Hi Jinpu,
>>>
>>> Is it possible for your to run a upstream kernel or vanilla kernel to
>>> test whether the issue still can be reproduced ? Then we can know
>>> whether it is an upstream bug or a distro issue.
>>>
>>> Thanks.
>>>
>>> Coly
>>
>> Hi Coly,
>>
>> I did run kernel 4.4.34 (I download from kernel.org), I can reproduce
>> the same bug.
>>
>> I can also try latest 4.8 or 4.9 rc kernel, if you think it's necessary?
>>
> Yes, please. If it can be reproduced on upstream kernel by a set of
> scripts, it will be very helpful to debug and fix this issue.
>
> Thanks in advance.
>
> Coly

Hi Coly,

I tried with kernel 4.9-cr7, I can't reproduce it with my testcase anymore.

It's hard to say the bug is fixed or harder to reproduce because code
changed a lot.

-- 
Jinpu Wang
Linux Kernel Developer

ProfitBricks GmbH
Greifswalder Str. 207
D - 10405 Berlin

Tel:       +49 30 577 008  042
Fax:      +49 30 577 008 299
Email:    jinpu.wang@profitbricks.com
URL:      https://www.profitbricks.de

Sitz der Gesellschaft: Berlin
Registergericht: Amtsgericht Charlottenburg, HRB 125506 B
Geschäftsführer: Achim Weiss

^ permalink raw reply

* Is trim/discard supported on LVM RAID logical volume?
From: Patrick Dung @ 2016-11-29 12:59 UTC (permalink / raw)
  To: linux-raid

Hello,

Sorry if it is a cross posting.
I had post my question in linux-lvm mailing list but did not have reply.

In my old setting, fstrim is supported:
ext4 over LVM over MD software raid 1 (mdadm) > SSD.

After reading the recommendation in RHEL 6
https://access.redhat.com/documentation/en-US/Red_Hat_Enterprise_Linux/6/html/Storage_Administration_Guide/ch-ssd.html

I changed to use raid 1 mirror using 'LVM RAID logical volume'. It
make use of MD raid internally.
But I found fstrim does not work anymore. Is the behavior expected?

Thanks and regards,
Patrick

^ permalink raw reply

* [PATCH v2 0/9] Bad block support for IMSM metadata
From: Tomasz Majchrzak @ 2016-11-29 13:02 UTC (permalink / raw)
  To: linux-raid; +Cc: Jes.Sorensen, Tomasz Majchrzak

This series of patches implements bad block support for IMSM metadata.

Requested changes have been applied to the first 2 patches. The whole set is
resent because the previous patches don't apply cleanly on the latest upstream
branch. An extra patch has been added which adds 4kn support for bad block log.

Regards,

Tomek

Tomasz Majchrzak (9):
  imsm: parse bad block log in metadata on startup
  imsm: write bad block log on metadata sync
  imsm: give md list of known bad blocks on startup
  imsm: record new bad block in bad block log
  imsm: clear bad block from bad block log
  imsm: clear bad blocks if disk becomes unavailable
  imsm: provide list of bad blocks for an array
  imsm: implement "--examine-badblocks" command
  imsm: 4kn support for bad block log

 super-intel.c | 640 ++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 574 insertions(+), 66 deletions(-)

-- 
1.8.3.1


^ permalink raw reply

* [PATCH v2 1/9] imsm: parse bad block log in metadata on startup
From: Tomasz Majchrzak @ 2016-11-29 13:02 UTC (permalink / raw)
  To: linux-raid; +Cc: Jes.Sorensen, Tomasz Majchrzak
In-Reply-To: <1480424555-31509-1-git-send-email-tomasz.majchrzak@intel.com>

Always allocate memory for all log entries to avoid a need for memory
allocation when monitor requests to record a bad block.

Also some extra checks added to make static code analyzer happy.

Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
---
 super-intel.c | 159 +++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 113 insertions(+), 46 deletions(-)

diff --git a/super-intel.c b/super-intel.c
index 5740088..19ca420 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -218,22 +218,24 @@ struct imsm_super {
 } __attribute__ ((packed));
 
 #define BBM_LOG_MAX_ENTRIES 254
+#define BBM_LOG_MAX_LBA_ENTRY_VAL 256		/* Represents 256 LBAs */
+#define BBM_LOG_SIGNATURE 0xabadb10c
+
+struct bbm_log_block_addr {
+	__u16 w1;
+	__u32 dw1;
+} __attribute__ ((__packed__));
 
 struct bbm_log_entry {
-	__u64 defective_block_start;
-#define UNREADABLE 0xFFFFFFFF
-	__u32 spare_block_offset;
-	__u16 remapped_marked_count;
-	__u16 disk_ordinal;
+	__u8 marked_count;		/* Number of blocks marked - 1 */
+	__u8 disk_ordinal;		/* Disk entry within the imsm_super */
+	struct bbm_log_block_addr defective_block_start;
 } __attribute__ ((__packed__));
 
 struct bbm_log {
 	__u32 signature; /* 0xABADB10C */
 	__u32 entry_count;
-	__u32 reserved_spare_block_count; /* 0 */
-	__u32 reserved; /* 0xFFFF */
-	__u64 first_spare_lba;
-	struct bbm_log_entry mapped_block_entries[BBM_LOG_MAX_ENTRIES];
+	struct bbm_log_entry marked_block_entries[BBM_LOG_MAX_ENTRIES];
 } __attribute__ ((__packed__));
 
 #ifndef MDASSEMBLE
@@ -788,6 +790,93 @@ static struct imsm_dev *get_imsm_dev(struct intel_super *super, __u8 index)
 	return NULL;
 }
 
+#if BYTE_ORDER == LITTLE_ENDIAN
+static inline unsigned long long __le48_to_cpu(const struct bbm_log_block_addr
+					       *addr)
+{
+	return ((((__u64)addr->dw1) << 16) | addr->w1);
+}
+
+static inline struct bbm_log_block_addr __cpu_to_le48(unsigned long long sec)
+{
+	struct bbm_log_block_addr addr;
+
+	addr.w1 =  (__u16)(sec & 0xffff);
+	addr.dw1 = (__u32)((sec >> 16) & 0xffffffff);
+	return addr;
+}
+#elif BYTE_ORDER == BIG_ENDIAN
+static inline unsigned long long __le48_to_cpu(const struct bbm_log_block_addr
+					       *addr)
+{
+	return ((((__u64)__le32_to_cpu(addr->dw1)) << 16) |
+		__le16_to_cpu(addr->w1));
+}
+
+static inline struct bbm_log_block_addr __cpu_to_le48(unsigned long long sec)
+{
+	struct bbm_log_block_addr addr;
+
+	addr.w1 =  __cpu_to_le16((__u16)(sec & 0xffff));
+	addr.dw1 = __cpu_to_le32((__u32)(sec >> 16) & 0xffffffff);
+	return addr;
+}
+#else
+#  error "unknown endianness."
+#endif
+
+#ifndef MDASSEMBLE
+/* get size of the bbm log */
+static __u32 get_imsm_bbm_log_size(struct bbm_log *log)
+{
+	if (!log || log->entry_count == 0)
+		return 0;
+
+	return sizeof(log->signature) +
+		sizeof(log->entry_count) +
+		log->entry_count * sizeof(struct bbm_log_entry);
+}
+#endif /* MDASSEMBLE */
+
+/* allocate and load BBM log from metadata */
+static int load_bbm_log(struct intel_super *super)
+{
+	struct imsm_super *mpb = super->anchor;
+	__u32 bbm_log_size =  __le32_to_cpu(mpb->bbm_log_size);
+
+	super->bbm_log = xcalloc(1, sizeof(struct bbm_log));
+	if (!super->bbm_log)
+		return 1;
+
+	if (bbm_log_size) {
+		struct bbm_log *log = (void *)mpb +
+			__le32_to_cpu(mpb->mpb_size) - bbm_log_size;
+
+		__u32 entry_count;
+
+		if (bbm_log_size < sizeof(log->signature) +
+		    sizeof(log->entry_count))
+			return 2;
+
+		entry_count = __le32_to_cpu(log->entry_count);
+		if ((__le32_to_cpu(log->signature) != BBM_LOG_SIGNATURE) ||
+		    (entry_count > BBM_LOG_MAX_ENTRIES))
+			return 3;
+
+		if (bbm_log_size !=
+		    sizeof(log->signature) + sizeof(log->entry_count) +
+		    entry_count * sizeof(struct bbm_log_entry))
+			return 4;
+
+		memcpy(super->bbm_log, log, bbm_log_size);
+	} else {
+		super->bbm_log->signature = __cpu_to_le32(BBM_LOG_SIGNATURE);
+		super->bbm_log->entry_count = 0;
+	}
+
+	return 0;
+}
+
 /*
  * for second_map:
  *  == MAP_0 get first map
@@ -1546,7 +1635,7 @@ static void examine_super_imsm(struct supertype *st, char *homehost)
 	printf("          Disks : %d\n", mpb->num_disks);
 	printf("   RAID Devices : %d\n", mpb->num_raid_devs);
 	print_imsm_disk(__get_imsm_disk(mpb, super->disks->index), super->disks->index, reserved);
-	if (super->bbm_log) {
+	if (get_imsm_bbm_log_size(super->bbm_log)) {
 		struct bbm_log *log = super->bbm_log;
 
 		printf("\n");
@@ -1554,9 +1643,6 @@ static void examine_super_imsm(struct supertype *st, char *homehost)
 		printf("       Log Size : %d\n", __le32_to_cpu(mpb->bbm_log_size));
 		printf("      Signature : %x\n", __le32_to_cpu(log->signature));
 		printf("    Entry Count : %d\n", __le32_to_cpu(log->entry_count));
-		printf("   Spare Blocks : %d\n",  __le32_to_cpu(log->reserved_spare_block_count));
-		printf("    First Spare : %llx\n",
-		       (unsigned long long) __le64_to_cpu(log->first_spare_lba));
 	}
 	for (i = 0; i < mpb->num_raid_devs; i++) {
 		struct mdinfo info;
@@ -3757,19 +3843,6 @@ static int parse_raid_devices(struct intel_super *super)
 	return 0;
 }
 
-/* retrieve a pointer to the bbm log which starts after all raid devices */
-struct bbm_log *__get_imsm_bbm_log(struct imsm_super *mpb)
-{
-	void *ptr = NULL;
-
-	if (__le32_to_cpu(mpb->bbm_log_size)) {
-		ptr = mpb;
-		ptr += mpb->mpb_size - __le32_to_cpu(mpb->bbm_log_size);
-	}
-
-	return ptr;
-}
-
 /*******************************************************************************
  * Function:	check_mpb_migr_compatibility
  * Description:	Function checks for unsupported migration features:
@@ -3922,12 +3995,6 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname)
 		return 3;
 	}
 
-	/* FIXME the BBM log is disk specific so we cannot use this global
-	 * buffer for all disks.  Ok for now since we only look at the global
-	 * bbm_log_size parameter to gate assembly
-	 */
-	super->bbm_log = __get_imsm_bbm_log(super->anchor);
-
 	return 0;
 }
 
@@ -3973,6 +4040,9 @@ load_and_parse_mpb(int fd, struct intel_super *super, char *devname, int keep_fd
 	if (err)
 		return err;
 	err = parse_raid_devices(super);
+	if (err)
+		return err;
+	err = load_bbm_log(super);
 	clear_hi(super);
 	return err;
 }
@@ -4037,6 +4107,8 @@ static void __free_imsm(struct intel_super *super, int free_disks)
 		free(elem);
 		elem = next;
 	}
+	if (super->bbm_log)
+		free(super->bbm_log);
 	super->hba = NULL;
 }
 
@@ -4643,7 +4715,7 @@ static int get_super_block(struct intel_super **super_list, char *devnm, char *d
 		*super_list = s;
 	} else {
 		if (s)
-			free(s);
+			free_imsm(s);
 		if (dfd >= 0)
 			close(dfd);
 	}
@@ -4706,6 +4778,8 @@ static int load_super_imsm(struct supertype *st, int fd, char *devname)
 
 	super = alloc_super();
 	get_dev_sector_size(fd, NULL, &super->sector_size);
+	if (!super)
+		return 1;
 	/* Load hba and capabilities if they exist.
 	 * But do not preclude loading metadata in case capabilities or hba are
 	 * non-compliant and ignore_hw_compat is set.
@@ -5050,7 +5124,7 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
 	super = alloc_super();
 	if (super &&
 	    posix_memalign(&super->buf, MAX_SECTOR_SIZE, mpb_size) != 0) {
-		free(super);
+		free_imsm(super);
 		super = NULL;
 	}
 	if (!super) {
@@ -5061,7 +5135,7 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
 	    MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE) != 0) {
 		pr_err("could not allocate migr_rec buffer\n");
 		free(super->buf);
-		free(super);
+		free_imsm(super);
 		return 0;
 	}
 	memset(super->buf, 0, mpb_size);
@@ -5656,11 +5730,6 @@ static int store_super_imsm(struct supertype *st, int fd)
 #endif
 }
 
-static int imsm_bbm_log_size(struct imsm_super *mpb)
-{
-	return __le32_to_cpu(mpb->bbm_log_size);
-}
-
 #ifndef MDASSEMBLE
 static int validate_geometry_imsm_container(struct supertype *st, int level,
 					    int layout, int raiddisks, int chunk,
@@ -5696,6 +5765,10 @@ static int validate_geometry_imsm_container(struct supertype *st, int level,
 	 * note that there is no fd for the disks in array.
 	 */
 	super = alloc_super();
+	if (!super) {
+		close(fd);
+		return 0;
+	}
 	if (!get_dev_sector_size(fd, NULL, &super->sector_size)) {
 		close(fd);
 		free_imsm(super);
@@ -6933,12 +7006,6 @@ static struct mdinfo *container_content_imsm(struct supertype *st, char *subarra
 		pr_err("Unsupported attributes in IMSM metadata.Arrays activation is blocked.\n");
 	}
 
-	/* check for bad blocks */
-	if (imsm_bbm_log_size(super->anchor)) {
-		pr_err("BBM log found in IMSM metadata.Arrays activation is blocked.\n");
-		sb_errors = 1;
-	}
-
 	/* count spare devices, not used in maps
 	 */
 	for (d = super->disks; d; d = d->next)
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH v2 2/9] imsm: write bad block log on metadata sync
From: Tomasz Majchrzak @ 2016-11-29 13:02 UTC (permalink / raw)
  To: linux-raid; +Cc: Jes.Sorensen, Tomasz Majchrzak
In-Reply-To: <1480424555-31509-1-git-send-email-tomasz.majchrzak@intel.com>

Pre-allocate memory for largest possible bad block section when monitor
is being opened to avoid a need for memory allocation on metadata sync.

If memory for a structure has been allocated in mpb buffer but it hasn't
been used yet, it will be taken by next buffer grow, leading to
insufficient memory on metadata flush. Start tracking such memory and
take it into calculation when growing a buffer. Also assert has been
added to debug mode to warn when more metadata has been written than
memory allocated.

Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
---
 super-intel.c | 48 +++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 43 insertions(+), 5 deletions(-)

diff --git a/super-intel.c b/super-intel.c
index 19ca420..421bfbc 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -81,7 +81,8 @@
 					MPB_ATTRIB_RAID1           | \
 					MPB_ATTRIB_RAID10          | \
 					MPB_ATTRIB_RAID5           | \
-					MPB_ATTRIB_EXP_STRIPE_SIZE)
+					MPB_ATTRIB_EXP_STRIPE_SIZE | \
+					MPB_ATTRIB_BBM)
 
 /* Define attributes that are unused but not harmful */
 #define MPB_ATTRIB_IGNORED		(MPB_ATTRIB_NEVER_USE)
@@ -363,6 +364,7 @@ struct intel_super {
 		array, it indicates that mdmon is allowed to clean migration
 		record */
 	size_t len; /* size of the 'buf' allocation */
+	size_t extra_space; /* extra space in 'buf' that is not used yet */
 	void *next_buf; /* for realloc'ing buf from the manager */
 	size_t next_len;
 	int updates_pending; /* count of pending updates for mdmon */
@@ -423,6 +425,7 @@ enum imsm_update_type {
 	update_takeover,
 	update_general_migration_checkpoint,
 	update_size_change,
+	update_prealloc_badblocks_mem,
 };
 
 struct imsm_update_activate_spare {
@@ -511,6 +514,10 @@ struct imsm_update_add_remove_disk {
 	enum imsm_update_type type;
 };
 
+struct imsm_update_prealloc_bb_mem {
+	enum imsm_update_type type;
+};
+
 static const char *_sys_dev_type[] = {
 	[SYS_DEV_UNKNOWN] = "Unknown",
 	[SYS_DEV_SAS] = "SAS",
@@ -3379,6 +3386,8 @@ static size_t disks_to_mpb_size(int disks)
 	size += (4 - 2) * sizeof(struct imsm_map);
 	/* 4 possible disk_ord_tbl's */
 	size += 4 * (disks - 1) * sizeof(__u32);
+	/* maximum bbm log */
+	size += sizeof(struct bbm_log);
 
 	return size;
 }
@@ -3840,6 +3849,8 @@ static int parse_raid_devices(struct intel_super *super)
 		super->len = len;
 	}
 
+	super->extra_space += space_needed;
+
 	return 0;
 }
 
@@ -4982,6 +4993,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
 		super->anchor = mpb_new;
 		mpb->mpb_size = __cpu_to_le32(size_new);
 		memset(mpb_new + size_old, 0, size_round - size_old);
+		super->len = size_round;
 	}
 	super->current_vol = idx;
 
@@ -5542,6 +5554,7 @@ static int write_super_imsm(struct supertype *st, int doclose)
 	__u32 mpb_size = sizeof(struct imsm_super) - sizeof(struct imsm_disk);
 	int num_disks = 0;
 	int clear_migration_record = 1;
+	__u32 bbm_log_size;
 
 	/* 'generation' is incremented everytime the metadata is written */
 	generation = __le32_to_cpu(mpb->generation_num);
@@ -5579,9 +5592,23 @@ static int write_super_imsm(struct supertype *st, int doclose)
 		if (is_gen_migration(dev2))
 			clear_migration_record = 0;
 	}
-	mpb_size += __le32_to_cpu(mpb->bbm_log_size);
+
+	bbm_log_size = get_imsm_bbm_log_size(super->bbm_log);
+
+	if (bbm_log_size) {
+		memcpy((void *)mpb + mpb_size, super->bbm_log, bbm_log_size);
+		mpb->attributes |= MPB_ATTRIB_BBM;
+	} else
+		mpb->attributes &= ~MPB_ATTRIB_BBM;
+
+	super->anchor->bbm_log_size = __cpu_to_le32(bbm_log_size);
+	mpb_size += bbm_log_size;
 	mpb->mpb_size = __cpu_to_le32(mpb_size);
 
+#ifdef DEBUG
+	assert(super->len == 0 || mpb_size <= super->len);
+#endif
+
 	/* recalculate checksum */
 	sum = __gen_imsm_checksum(mpb);
 	mpb->check_sum = __cpu_to_le32(sum);
@@ -7278,6 +7305,7 @@ static int imsm_open_new(struct supertype *c, struct active_array *a,
 {
 	struct intel_super *super = c->sb;
 	struct imsm_super *mpb = super->anchor;
+	struct imsm_update_prealloc_bb_mem u;
 
 	if (atoi(inst) >= mpb->num_raid_devs) {
 		pr_err("subarry index %d, out of range\n", atoi(inst));
@@ -7286,6 +7314,10 @@ static int imsm_open_new(struct supertype *c, struct active_array *a,
 
 	dprintf("imsm: open_new %s\n", inst);
 	a->info.container_member = atoi(inst);
+
+	u.type = update_prealloc_badblocks_mem;
+	imsm_update_metadata_locally(c, &u, sizeof(u));
+
 	return 0;
 }
 
@@ -9031,6 +9063,8 @@ static void imsm_process_update(struct supertype *st,
 		}
 		break;
 	}
+	case update_prealloc_badblocks_mem:
+		break;
 	default:
 		pr_err("error: unsuported process update type:(type: %d)\n",	type);
 	}
@@ -9272,6 +9306,10 @@ static int imsm_prepare_update(struct supertype *st,
 	case update_add_remove_disk:
 		/* no update->len needed */
 		break;
+	case update_prealloc_badblocks_mem:
+		super->extra_space += sizeof(struct bbm_log) -
+			get_imsm_bbm_log_size(super->bbm_log);
+		break;
 	default:
 		return 0;
 	}
@@ -9282,13 +9320,13 @@ static int imsm_prepare_update(struct supertype *st,
 	else
 		buf_len = super->len;
 
-	if (__le32_to_cpu(mpb->mpb_size) + len > buf_len) {
+	if (__le32_to_cpu(mpb->mpb_size) + super->extra_space + len > buf_len) {
 		/* ok we need a larger buf than what is currently allocated
 		 * if this allocation fails process_update will notice that
 		 * ->next_len is set and ->next_buf is NULL
 		 */
-		buf_len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + len,
-				  sector_size);
+		buf_len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) +
+				   super->extra_space + len, sector_size);
 		if (super->next_buf)
 			free(super->next_buf);
 
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH v2 3/9] imsm: give md list of known bad blocks on startup
From: Tomasz Majchrzak @ 2016-11-29 13:02 UTC (permalink / raw)
  To: linux-raid; +Cc: Jes.Sorensen, Tomasz Majchrzak
In-Reply-To: <1480424555-31509-1-git-send-email-tomasz.majchrzak@intel.com>

On create set bad block support flag for each drive. On assmble also
provide a list of known bad blocks. Bad blocks are stored in metadata
per disk so they have to be checked against volume boundaries
beforehand.

Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
---
 super-intel.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/super-intel.c b/super-intel.c
index 421bfbc..b2afdff 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -884,6 +884,56 @@ static int load_bbm_log(struct intel_super *super)
 	return 0;
 }
 
+/* checks if bad block is within volume boundaries */
+static int is_bad_block_in_volume(const struct bbm_log_entry *entry,
+			const unsigned long long start_sector,
+			const unsigned long long size)
+{
+	unsigned long long bb_start;
+	unsigned long long bb_end;
+
+	bb_start = __le48_to_cpu(&entry->defective_block_start);
+	bb_end = bb_start + (entry->marked_count + 1);
+
+	if (((bb_start >= start_sector) && (bb_start < start_sector + size)) ||
+	    ((bb_end >= start_sector) && (bb_end <= start_sector + size)))
+		return 1;
+
+	return 0;
+}
+
+/* get list of bad blocks on a drive for a volume */
+static void get_volume_badblocks(const struct bbm_log *log, const __u8 idx,
+			const unsigned long long start_sector,
+			const unsigned long long size,
+			struct md_bb *bbs)
+{
+	__u32 count = 0;
+	__u32 i;
+
+	for (i = 0; i < log->entry_count; i++) {
+		const struct bbm_log_entry *ent =
+			&log->marked_block_entries[i];
+		struct md_bb_entry *bb;
+
+		if ((ent->disk_ordinal == idx) &&
+		    is_bad_block_in_volume(ent, start_sector, size)) {
+
+			if (!bbs->entries) {
+				bbs->entries = xmalloc(BBM_LOG_MAX_ENTRIES *
+						     sizeof(*bb));
+				if (!bbs->entries)
+					break;
+			}
+
+			bb = &bbs->entries[count++];
+			bb->sector = __le48_to_cpu(&ent->defective_block_start);
+			bb->length = ent->marked_count + 1;
+		}
+	}
+	bbs->count = count;
+}
+
 /*
  * for second_map:
  *  == MAP_0 get first map
@@ -3016,6 +3066,7 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
 							info->array.chunk_size,
 							super->sector_size,
 							info->component_size);
+	info->bb.supported = 0;
 
 	memset(info->uuid, 0, sizeof(info->uuid));
 	info->recovery_start = MaxSector;
@@ -3182,6 +3233,7 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *
 	info->name[0] = 0;
 	info->recovery_start = MaxSector;
 	info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
+	info->bb.supported = 0;
 
 	/* do we have the all the insync disks that we expect? */
 	mpb = super->anchor;
@@ -7160,6 +7212,12 @@ static struct mdinfo *container_content_imsm(struct supertype *st, char *subarra
 			info_d->events = __le32_to_cpu(mpb->generation_num);
 			info_d->data_offset = pba_of_lba0(map);
 			info_d->component_size = blocks_per_member(map);
+
+			info_d->bb.supported = 0;
+			get_volume_badblocks(super->bbm_log, ord_to_idx(ord),
+					     info_d->data_offset,
+					     info_d->component_size,
+					     &info_d->bb);
 		}
 		/* now that the disk list is up-to-date fixup recovery_start */
 		update_recovery_start(super, dev, this);
@@ -8166,6 +8224,7 @@ static struct mdinfo *imsm_activate_spare(struct active_array *a,
 		di->data_offset = pba_of_lba0(map);
 		di->component_size = a->info.component_size;
 		di->container_member = inst;
+		di->bb.supported = 0;
 		super->random = random32();
 		di->next = rv;
 		rv = di;
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH v2 4/9] imsm: record new bad block in bad block log
From: Tomasz Majchrzak @ 2016-11-29 13:02 UTC (permalink / raw)
  To: linux-raid; +Cc: Jes.Sorensen, Tomasz Majchrzak
In-Reply-To: <1480424555-31509-1-git-send-email-tomasz.majchrzak@intel.com>

Check for a duplicate first or try to merge it with existing bad block.
If block range exceeds BBM_LOG_MAX_LBA_ENTRY_VAL (256) blocks, it must
be split into multiple ranges. Fail if maximum number of bad blocks has
been already reached.

Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
---
 super-intel.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 134 insertions(+), 8 deletions(-)

diff --git a/super-intel.c b/super-intel.c
index b2afdff..2af44d9 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -843,6 +843,86 @@ static __u32 get_imsm_bbm_log_size(struct bbm_log *log)
 		sizeof(log->entry_count) +
 		log->entry_count * sizeof(struct bbm_log_entry);
 }
+
+/* check if bad block is not partially stored in bbm log */
+static int is_stored_in_bbm(struct bbm_log *log, const __u8 idx, const unsigned
+			    long long sector, const int length, __u32 *pos)
+{
+	__u32 i;
+
+	for (i = *pos; i < log->entry_count; i++) {
+		struct bbm_log_entry *entry = &log->marked_block_entries[i];
+		unsigned long long bb_start;
+		unsigned long long bb_end;
+
+		bb_start = __le48_to_cpu(&entry->defective_block_start);
+		bb_end = bb_start + (entry->marked_count + 1);
+
+		if ((entry->disk_ordinal == idx) && (bb_start >= sector) &&
+		    (bb_end <= sector + length)) {
+			*pos = i;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/* record new bad block in bbm log */
+static int record_new_badblock(struct bbm_log *log, const __u8 idx, unsigned
+			       long long sector, int length)
+{
+	int new_bb = 0;
+	__u32 pos = 0;
+	struct bbm_log_entry *entry = NULL;
+
+	while (is_stored_in_bbm(log, idx, sector, length, &pos)) {
+		struct bbm_log_entry *e = &log->marked_block_entries[pos];
+
+		if ((e->marked_count + 1 == BBM_LOG_MAX_LBA_ENTRY_VAL) &&
+		    (__le48_to_cpu(&e->defective_block_start) == sector)) {
+			sector += BBM_LOG_MAX_LBA_ENTRY_VAL;
+			length -= BBM_LOG_MAX_LBA_ENTRY_VAL;
+			pos = pos + 1;
+			continue;
+		}
+		entry = e;
+		break;
+	}
+
+	if (entry) {
+		int cnt = (length <= BBM_LOG_MAX_LBA_ENTRY_VAL) ? length :
+			BBM_LOG_MAX_LBA_ENTRY_VAL;
+		entry->defective_block_start = __cpu_to_le48(sector);
+		entry->marked_count = cnt - 1;
+		if (cnt == length)
+			return 1;
+		sector += cnt;
+		length -= cnt;
+	}
+
+	new_bb = ROUND_UP(length, BBM_LOG_MAX_LBA_ENTRY_VAL) /
+		BBM_LOG_MAX_LBA_ENTRY_VAL;
+	if (log->entry_count + new_bb > BBM_LOG_MAX_ENTRIES)
+		return 0;
+
+	while (length > 0) {
+		int cnt = (length <= BBM_LOG_MAX_LBA_ENTRY_VAL) ? length :
+			BBM_LOG_MAX_LBA_ENTRY_VAL;
+		struct bbm_log_entry *entry =
+			&log->marked_block_entries[log->entry_count];
+
+		entry->defective_block_start = __cpu_to_le48(sector);
+		entry->marked_count = cnt - 1;
+		entry->disk_ordinal = idx;
+
+		sector += cnt;
+		length -= cnt;
+
+		log->entry_count++;
+	}
+
+	return new_bb;
+}
 #endif /* MDASSEMBLE */
 
 /* allocate and load BBM log from metadata */
@@ -7734,6 +7814,25 @@ skip_mark_checkpoint:
 	return consistent;
 }
 
+static int imsm_disk_slot_to_ord(struct active_array *a, int slot)
+{
+	int inst = a->info.container_member;
+	struct intel_super *super = a->container->sb;
+	struct imsm_dev *dev = get_imsm_dev(super, inst);
+	struct imsm_map *map = get_imsm_map(dev, MAP_0);
+
+	if (slot > map->num_members) {
+		pr_err("imsm: imsm_disk_slot_to_ord %d out of range 0..%d\n",
+		       slot, map->num_members - 1);
+		return -1;
+	}
+
+	if (slot < 0)
+		return -1;
+
+	return get_imsm_ord_tbl_ent(dev, slot, MAP_0);
+}
+
 static void imsm_set_disk(struct active_array *a, int n, int state)
 {
 	int inst = a->info.container_member;
@@ -7744,19 +7843,14 @@ static void imsm_set_disk(struct active_array *a, int n, int state)
 	struct mdinfo *mdi;
 	int recovery_not_finished = 0;
 	int failed;
-	__u32 ord;
+	int ord;
 	__u8 map_state;
 
-	if (n > map->num_members)
-		pr_err("imsm: set_disk %d out of range 0..%d\n",
-			n, map->num_members - 1);
-
-	if (n < 0)
+	ord = imsm_disk_slot_to_ord(a, n);
+	if (ord < 0)
 		return;
 
 	dprintf("imsm: set_disk %d:%x\n", n, state);
-
-	ord = get_imsm_ord_tbl_ent(dev, n, MAP_0);
 	disk = get_imsm_disk(super, ord_to_idx(ord));
 
 	/* check for new failures */
@@ -9653,6 +9747,37 @@ int validate_container_imsm(struct mdinfo *info)
 }
 #ifndef MDASSEMBLE
 /*******************************************************************************
+* Function:   imsm_record_badblock
+* Description: This routine stores new bad block record in BBM log
+*
+* Parameters:
+*     a		: array containing a bad block
+*     slot	: disk number containing a bad block
+*     sector	: bad block sector
+*     length	: bad block sectors range
+* Returns:
+*     1 : Success
+*     0 : Error
+******************************************************************************/
+static int imsm_record_badblock(struct active_array *a, int slot,
+			  unsigned long long sector, int length)
+{
+	struct intel_super *super = a->container->sb;
+	int ord;
+	int ret;
+
+	ord = imsm_disk_slot_to_ord(a, slot);
+	if (ord < 0)
+		return 0;
+
+	ret = record_new_badblock(super->bbm_log, ord_to_idx(ord), sector,
+				   length);
+	if (ret)
+		super->updates_pending++;
+
+	return ret;
+}
+/*******************************************************************************
  * Function:	init_migr_record_imsm
  * Description:	Function inits imsm migration record
  * Parameters:
@@ -11205,5 +11330,6 @@ struct superswitch super_imsm = {
 	.activate_spare = imsm_activate_spare,
 	.process_update = imsm_process_update,
 	.prepare_update = imsm_prepare_update,
+	.record_bad_block = imsm_record_badblock,
 #endif /* MDASSEMBLE */
 };
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH v2 5/9] imsm: clear bad block from bad block log
From: Tomasz Majchrzak @ 2016-11-29 13:02 UTC (permalink / raw)
  To: linux-raid; +Cc: Jes.Sorensen, Tomasz Majchrzak
In-Reply-To: <1480424555-31509-1-git-send-email-tomasz.majchrzak@intel.com>

Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
---
 super-intel.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/super-intel.c b/super-intel.c
index 2af44d9..ec592c2 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -923,6 +923,28 @@ static int record_new_badblock(struct bbm_log *log, const __u8 idx, unsigned
 
 	return new_bb;
 }
+
+/* clear given bad block */
+static int clear_badblock(struct bbm_log *log, const __u8 idx, const unsigned
+			  long long sector, const int length) {
+	__u32 i = 0;
+
+	while (i < log->entry_count) {
+		struct bbm_log_entry *entries = log->marked_block_entries;
+
+		if ((entries[i].disk_ordinal == idx) &&
+		    (__le48_to_cpu(&entries[i].defective_block_start) ==
+		     sector) && (entries[i].marked_count + 1 == length)) {
+			if (i < log->entry_count - 1)
+				entries[i] = entries[log->entry_count - 1];
+			log->entry_count--;
+			break;
+		}
+		i++;
+	}
+
+	return 1;
+}
 #endif /* MDASSEMBLE */
 
 /* allocate and load BBM log from metadata */
@@ -9778,6 +9800,36 @@ static int imsm_record_badblock(struct active_array *a, int slot,
 	return ret;
 }
 /*******************************************************************************
+* Function:   imsm_clear_badblock
+* Description: This routine clears bad block record from BBM log
+*
+* Parameters:
+*     a		: array containing a bad block
+*     slot	: disk number containing a bad block
+*     sector	: bad block sector
+*     length	: bad block sectors range
+* Returns:
+*     1 : Success
+*     0 : Error
+******************************************************************************/
+static int imsm_clear_badblock(struct active_array *a, int slot,
+			unsigned long long sector, int length)
+{
+	struct intel_super *super = a->container->sb;
+	int ord;
+	int ret;
+
+	ord = imsm_disk_slot_to_ord(a, slot);
+	if (ord < 0)
+		return 0;
+
+	ret = clear_badblock(super->bbm_log, ord_to_idx(ord), sector, length);
+	if (ret)
+		super->updates_pending++;
+
+	return ret;
+}
+/*******************************************************************************
  * Function:	init_migr_record_imsm
  * Description:	Function inits imsm migration record
  * Parameters:
@@ -11331,5 +11383,6 @@ struct superswitch super_imsm = {
 	.process_update = imsm_process_update,
 	.prepare_update = imsm_prepare_update,
 	.record_bad_block = imsm_record_badblock,
+	.clear_bad_block  = imsm_clear_badblock,
 #endif /* MDASSEMBLE */
 };
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH v2 6/9] imsm: clear bad blocks if disk becomes unavailable
From: Tomasz Majchrzak @ 2016-11-29 13:02 UTC (permalink / raw)
  To: linux-raid; +Cc: Jes.Sorensen, Tomasz Majchrzak
In-Reply-To: <1480424555-31509-1-git-send-email-tomasz.majchrzak@intel.com>

If a disk fails or goes missing, clear the bad blocks associated with it
from metadata. If necessary, update disk ordinals.

Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
---
 super-intel.c | 46 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/super-intel.c b/super-intel.c
index ec592c2..731878c 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -924,6 +924,24 @@ static int record_new_badblock(struct bbm_log *log, const __u8 idx, unsigned
 	return new_bb;
 }
 
+/* clear all bad blocks for given disk */
+static void clear_disk_badblocks(struct bbm_log *log, const __u8 idx)
+{
+	__u32 i = 0;
+
+	while (i < log->entry_count) {
+		struct bbm_log_entry *entries = log->marked_block_entries;
+
+		if (entries[i].disk_ordinal == idx) {
+			if (i < log->entry_count - 1)
+				entries[i] = entries[log->entry_count - 1];
+			log->entry_count--;
+		} else {
+			i++;
+		}
+	}
+}
+
 /* clear given bad block */
 static int clear_badblock(struct bbm_log *log, const __u8 idx, const unsigned
 			  long long sector, const int length) {
@@ -7505,7 +7523,8 @@ static int is_resyncing(struct imsm_dev *dev)
 }
 
 /* return true if we recorded new information */
-static int mark_failure(struct imsm_dev *dev, struct imsm_disk *disk, int idx)
+static int mark_failure(struct intel_super *super,
+			struct imsm_dev *dev, struct imsm_disk *disk, int idx)
 {
 	__u32 ord;
 	int slot;
@@ -7547,12 +7566,16 @@ static int mark_failure(struct imsm_dev *dev, struct imsm_disk *disk, int idx)
 	}
 	if (map->failed_disk_num == 0xff)
 		map->failed_disk_num = slot;
+
+	clear_disk_badblocks(super->bbm_log, ord_to_idx(ord));
+
 	return 1;
 }
 
-static void mark_missing(struct imsm_dev *dev, struct imsm_disk *disk, int idx)
+static void mark_missing(struct intel_super *super,
+			 struct imsm_dev *dev, struct imsm_disk *disk, int idx)
 {
-	mark_failure(dev, disk, idx);
+	mark_failure(super, dev, disk, idx);
 
 	if (disk->scsi_id == __cpu_to_le32(~(__u32)0))
 		return;
@@ -7588,7 +7611,7 @@ static void handle_missing(struct intel_super *super, struct imsm_dev *dev)
 			end_migration(dev, super, map_state);
 	}
 	for (dl = super->missing; dl; dl = dl->next)
-		mark_missing(dev, &dl->disk, dl->index);
+		mark_missing(super, dev, &dl->disk, dl->index);
 	super->updates_pending++;
 }
 
@@ -7877,7 +7900,7 @@ static void imsm_set_disk(struct active_array *a, int n, int state)
 
 	/* check for new failures */
 	if (state & DS_FAULTY) {
-		if (mark_failure(dev, disk, ord_to_idx(ord)))
+		if (mark_failure(super, dev, disk, ord_to_idx(ord)))
 			super->updates_pending++;
 	}
 
@@ -8947,7 +8970,7 @@ static int apply_takeover_update(struct imsm_update_takeover *u,
 	for (du = super->missing; du; du = du->next)
 		if (du->index >= 0) {
 			set_imsm_ord_tbl_ent(map, du->index, du->index);
-			mark_missing(dv->dev, &du->disk, du->index);
+			mark_missing(super, dv->dev, &du->disk, du->index);
 		}
 
 	return 1;
@@ -9521,8 +9544,9 @@ static void imsm_delete(struct intel_super *super, struct dl **dlp, unsigned ind
 	struct dl *iter;
 	struct imsm_dev *dev;
 	struct imsm_map *map;
-	int i, j, num_members;
+	unsigned int i, j, num_members;
 	__u32 ord;
+	struct bbm_log *log = super->bbm_log;
 
 	dprintf("deleting device[%d] from imsm_super\n", index);
 
@@ -9555,6 +9579,14 @@ static void imsm_delete(struct intel_super *super, struct dl **dlp, unsigned ind
 		}
 	}
 
+	for (i = 0; i < log->entry_count; i++) {
+		struct bbm_log_entry *entry = &log->marked_block_entries[i];
+
+		if (entry->disk_ordinal <= index)
+			continue;
+		entry->disk_ordinal--;
+	}
+
 	mpb->num_disks--;
 	super->updates_pending++;
 	if (*dlp) {
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH v2 7/9] imsm: provide list of bad blocks for an array
From: Tomasz Majchrzak @ 2016-11-29 13:02 UTC (permalink / raw)
  To: linux-raid; +Cc: Jes.Sorensen, Tomasz Majchrzak
In-Reply-To: <1480424555-31509-1-git-send-email-tomasz.majchrzak@intel.com>

Provide list of bad blocks using memory allocated in advance so it's
safe to call it from monitor.

Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
---
 super-intel.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/super-intel.c b/super-intel.c
index 731878c..fd2145c 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -393,6 +393,7 @@ struct intel_super {
 	struct intel_hba *hba; /* device path of the raid controller for this metadata */
 	const struct imsm_orom *orom; /* platform firmware support */
 	struct intel_super *next; /* (temp) list for disambiguating family_num */
+	struct md_bb bb;	/* memory for get_bad_blocks call */
 };
 
 struct intel_disk {
@@ -4298,6 +4299,7 @@ static void __free_imsm(struct intel_super *super, int free_disks)
 static void free_imsm(struct intel_super *super)
 {
 	__free_imsm(super, 1);
+	free(super->bb.entries);
 	free(super);
 }
 
@@ -4318,6 +4320,14 @@ static struct intel_super *alloc_super(void)
 
 	super->current_vol = -1;
 	super->create_offset = ~((unsigned long long) 0);
+
+	super->bb.entries = malloc(BBM_LOG_MAX_ENTRIES *
+				   sizeof(struct md_bb_entry));
+	if (!super->bb.entries) {
+		free(super);
+		return NULL;
+	}
+
 	return super;
 }
 
@@ -9862,6 +9872,34 @@ static int imsm_clear_badblock(struct active_array *a, int slot,
 	return ret;
 }
 /*******************************************************************************
+* Function:   imsm_get_badblocks
+* Description: This routine get list of bad blocks for an array
+*
+* Parameters:
+*     a		: array
+*     slot	: disk number
+* Returns:
+*     bb	: structure containing bad blocks
+*     NULL	: error
+******************************************************************************/
+static struct md_bb *imsm_get_badblocks(struct active_array *a, int slot)
+{
+	int inst = a->info.container_member;
+	struct intel_super *super = a->container->sb;
+	struct imsm_dev *dev = get_imsm_dev(super, inst);
+	struct imsm_map *map = get_imsm_map(dev, MAP_0);
+	int ord;
+
+	ord = imsm_disk_slot_to_ord(a, slot);
+	if (ord < 0)
+		return NULL;
+
+	get_volume_badblocks(super->bbm_log, ord_to_idx(ord), pba_of_lba0(map),
+			     blocks_per_member(map), &super->bb);
+
+	return &super->bb;
+}
+/*******************************************************************************
  * Function:	init_migr_record_imsm
  * Description:	Function inits imsm migration record
  * Parameters:
@@ -11416,5 +11454,6 @@ struct superswitch super_imsm = {
 	.prepare_update = imsm_prepare_update,
 	.record_bad_block = imsm_record_badblock,
 	.clear_bad_block  = imsm_clear_badblock,
+	.get_bad_blocks   = imsm_get_badblocks,
 #endif /* MDASSEMBLE */
 };
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH v2 8/9] imsm: implement "--examine-badblocks" command
From: Tomasz Majchrzak @ 2016-11-29 13:02 UTC (permalink / raw)
  To: linux-raid; +Cc: Jes.Sorensen, Tomasz Majchrzak
In-Reply-To: <1480424555-31509-1-git-send-email-tomasz.majchrzak@intel.com>

Implement "--examine-badblocks" command to provide list of bad blocks in
metadata for a disk.

Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
---
 super-intel.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/super-intel.c b/super-intel.c
index fd2145c..ab14fce 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -9900,6 +9900,61 @@ static struct md_bb *imsm_get_badblocks(struct active_array *a, int slot)
 	return &super->bb;
 }
 /*******************************************************************************
+* Function:   examine_badblocks_imsm
+* Description: Prints list of bad blocks on a disk to the standard output
+*
+* Parameters:
+*     st	: metadata handler
+*     fd	: open file descriptor for device
+*     devname	: device name
+* Returns:
+*     0 : Success
+*     1 : Error
+******************************************************************************/
+static int examine_badblocks_imsm(struct supertype *st, int fd, char *devname)
+{
+	struct intel_super *super = st->sb;
+	struct bbm_log *log = super->bbm_log;
+	struct dl *d = NULL;
+	int any = 0;
+
+	for (d = super->disks; d ; d = d->next) {
+		if (strcmp(d->devname, devname) == 0)
+			break;
+	}
+
+	if ((d == NULL) || (d->index < 0)) { /* serial mismatch probably */
+		pr_err("%s doesn't appear to be part of a raid array\n",
+		       devname);
+		return 1;
+	}
+
+	if (log != NULL) {
+		unsigned int i;
+		struct bbm_log_entry *entry = &log->marked_block_entries[0];
+
+		for (i = 0; i < log->entry_count; i++) {
+			if (entry[i].disk_ordinal == d->index) {
+				unsigned long long sector = __le48_to_cpu(
+					&entry[i].defective_block_start);
+				int cnt = entry[i].marked_count + 1;
+
+				if (!any) {
+					printf("Bad-blocks on %s:\n", devname);
+					any = 1;
+				}
+
+				printf("%20llu for %d sectors\n", sector, cnt);
+			}
+		}
+	}
+
+	if (!any)
+		printf("No bad-blocks list configured on %s\n", devname);
+
+	return 0;
+}
+/*******************************************************************************
  * Function:	init_migr_record_imsm
  * Description:	Function inits imsm migration record
  * Parameters:
@@ -11420,6 +11475,7 @@ struct superswitch super_imsm = {
 	.manage_reshape = imsm_manage_reshape,
 	.recover_backup = recover_backup_imsm,
 	.copy_metadata = copy_metadata_imsm,
+	.examine_badblocks = examine_badblocks_imsm,
 #endif
 	.match_home	= match_home_imsm,
 	.uuid_from_super= uuid_from_super_imsm,
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH v2 9/9] imsm: 4kn support for bad block log
From: Tomasz Majchrzak @ 2016-11-29 13:02 UTC (permalink / raw)
  To: linux-raid; +Cc: Jes.Sorensen, Tomasz Majchrzak
In-Reply-To: <1480424555-31509-1-git-send-email-tomasz.majchrzak@intel.com>

Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
---
 super-intel.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/super-intel.c b/super-intel.c
index ab14fce..04f14ee 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -1547,6 +1547,7 @@ void convert_to_4k(struct intel_super *super)
 	struct imsm_super *mpb = super->anchor;
 	struct imsm_disk *disk;
 	int i;
+	__u32 bbm_log_size = __le32_to_cpu(mpb->bbm_log_size);
 
 	for (i = 0; i < mpb->num_disks ; i++) {
 		disk = __get_imsm_disk(mpb, i);
@@ -1575,6 +1576,24 @@ void convert_to_4k(struct intel_super *super)
 			set_pba_of_lba0(map, pba_of_lba0(map)/IMSM_4K_DIV);
 		}
 	}
+	if (bbm_log_size) {
+		struct bbm_log *log = (void *)mpb +
+			__le32_to_cpu(mpb->mpb_size) - bbm_log_size;
+		__u32 i;
+
+		for (i = 0; i < log->entry_count; i++) {
+			struct bbm_log_entry *entry =
+				&log->marked_block_entries[i];
+
+			__u8 count = entry->marked_count + 1;
+			unsigned long long sector =
+				__le48_to_cpu(&entry->defective_block_start);
+
+			entry->defective_block_start =
+				__cpu_to_le48(sector/IMSM_4K_DIV);
+			entry->marked_count = max(count/IMSM_4K_DIV, 1) - 1;
+		}
+	}
 
 	mpb->check_sum = __gen_imsm_checksum(mpb);
 }
@@ -1656,6 +1675,7 @@ void convert_from_4k(struct intel_super *super)
 	struct imsm_super *mpb = super->anchor;
 	struct imsm_disk *disk;
 	int i;
+	__u32 bbm_log_size = __le32_to_cpu(mpb->bbm_log_size);
 
 	for (i = 0; i < mpb->num_disks ; i++) {
 		disk = __get_imsm_disk(mpb, i);
@@ -1685,6 +1705,24 @@ void convert_from_4k(struct intel_super *super)
 			set_pba_of_lba0(map, pba_of_lba0(map)*IMSM_4K_DIV);
 		}
 	}
+	if (bbm_log_size) {
+		struct bbm_log *log = (void *)mpb +
+			__le32_to_cpu(mpb->mpb_size) - bbm_log_size;
+		__u32 i;
+
+		for (i = 0; i < log->entry_count; i++) {
+			struct bbm_log_entry *entry =
+				&log->marked_block_entries[i];
+
+			__u8 count = entry->marked_count + 1;
+			unsigned long long sector =
+				__le48_to_cpu(&entry->defective_block_start);
+
+			entry->defective_block_start =
+				__cpu_to_le48(sector*IMSM_4K_DIV);
+			entry->marked_count = count*IMSM_4K_DIV - 1;
+		}
+	}
 
 	mpb->check_sum = __gen_imsm_checksum(mpb);
 }
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH 1/8] imsm: parse bad block log in metadata on startup
From: Tomasz Majchrzak @ 2016-11-29 13:17 UTC (permalink / raw)
  To: Jes Sorensen; +Cc: linux-raid
In-Reply-To: <wrfjh977mqnt.fsf@redhat.com>

On Wed, Nov 16, 2016 at 09:43:18AM -0500, Jes Sorensen wrote:
> Tomasz Majchrzak <tomasz.majchrzak@intel.com> writes:
> > Always allocate memory for all log entries to avoid a need for memory
> > allocation when monitor requests to record a bad block.
> >
> > Also some extra checks added to make static code analyzer happy.
> >
> > Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
> > ---
> >  super-intel.c | 158 +++++++++++++++++++++++++++++++++++++++++-----------------
> >  1 file changed, 112 insertions(+), 46 deletions(-)
> 
> > @@ -785,6 +787,92 @@ static struct imsm_dev *get_imsm_dev(struct intel_super *super, __u8 index)
> >  	return NULL;
> >  }
> >  
> > +#if BYTE_ORDER == LITTLE_ENDIAN
> > +static inline unsigned long long __le48_to_cpu(const struct bbm_log_block_addr
> > +					       *addr)
> > +{
> > +	return ((((__u64)addr->dw1) << 16) | addr->w1);
> > +}
> > +
> > +static inline struct bbm_log_block_addr __cpu_to_le48(unsigned long long sec)
> > +{
> > +	struct bbm_log_block_addr addr;
> > +
> > +	addr.w1 =  (__u16)(sec & 0xFFFF);
> > +	addr.dw1 = (__u32)((sec >> 16) & 0xFFFFFFFF);
> > +	return addr;
> > +}
> 
> Again, 0xffff/0xffffffff
> 
> > +#elif BYTE_ORDER == BIG_ENDIAN
> > +static inline unsigned long long __le48_to_cpu(const struct bbm_log_block_addr
> > +					       *addr)
> > +{
> > +	return ((((__u64)__le32_to_cpu(addr->dw1)) << 16) |
> > +		__le16_to_cpu(addr->w1));
> > +}
> > +
> > +static inline struct bbm_log_block_addr __cpu_to_le48(unsigned long long sec)
> > +{
> > +	struct bbm_log_block_addr addr;
> > +
> > +	addr.w1 =  __cpu_to_le16((__u16)(sec & 0xFFFF));
> > +	addr.dw1 = __cpu_to_le32((__u32)(sec >> 16) & 0xFFFFFFFF);
> > +	return addr;
> > +}
> 
> Given that __cpu_to_le*()/__le*_to_cpu*() are no-ops on little-endian,
> you don't really need two versions of these. The big-endian version
> should suffice for both, which makes it far less cluttered. Unless I got
> something wrong here of course.
> 
> > +#else
> > +#  error "unknown endianness."
> > +#endif
> > +

Hi Jes,

Internally IMSM metadata stores bad block address as 48-bit little-endian
value. Those functions provide conversion from/to a structure consisting of
2 fields (32 + 16 bits). For little-endian CPU it's just about shifting bits,
for big-endian CPU bits have to be swapped. IMSM is not available on
big-endian platforms so big-endian implementation is provided for
completeness. I haven't changed it in my latest patch.

Regards,

Tomek

^ permalink raw reply

* Re: [PATCH 1/8] imsm: parse bad block log in metadata on startup
From: Jes Sorensen @ 2016-11-29 15:18 UTC (permalink / raw)
  To: Tomasz Majchrzak; +Cc: linux-raid
In-Reply-To: <20161129131721.GA31745@proton.igk.intel.com>

Tomasz Majchrzak <tomasz.majchrzak@intel.com> writes:
> On Wed, Nov 16, 2016 at 09:43:18AM -0500, Jes Sorensen wrote:
>> Tomasz Majchrzak <tomasz.majchrzak@intel.com> writes:
>> > Always allocate memory for all log entries to avoid a need for memory
>> > allocation when monitor requests to record a bad block.
>> >
>> > Also some extra checks added to make static code analyzer happy.
>> >
>> > Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
>> > ---
>> >  super-intel.c | 158
>> > +++++++++++++++++++++++++++++++++++++++++-----------------
>> >  1 file changed, 112 insertions(+), 46 deletions(-)
>> 
>> > @@ -785,6 +787,92 @@ static struct imsm_dev *get_imsm_dev(struct intel_super *super, __u8 index)
>> >  	return NULL;
>> >  }
>> >  
>> > +#if BYTE_ORDER == LITTLE_ENDIAN
>> > +static inline unsigned long long __le48_to_cpu(const struct bbm_log_block_addr
>> > +					       *addr)
>> > +{
>> > +	return ((((__u64)addr->dw1) << 16) | addr->w1);
>> > +}
>> > +
>> > +static inline struct bbm_log_block_addr __cpu_to_le48(unsigned long long sec)
>> > +{
>> > +	struct bbm_log_block_addr addr;
>> > +
>> > +	addr.w1 =  (__u16)(sec & 0xFFFF);
>> > +	addr.dw1 = (__u32)((sec >> 16) & 0xFFFFFFFF);
>> > +	return addr;
>> > +}
>> 
>> Again, 0xffff/0xffffffff
>> 
>> > +#elif BYTE_ORDER == BIG_ENDIAN
>> > +static inline unsigned long long __le48_to_cpu(const struct
>> > bbm_log_block_addr
>> > +					       *addr)
>> > +{
>> > +	return ((((__u64)__le32_to_cpu(addr->dw1)) << 16) |
>> > +		__le16_to_cpu(addr->w1));
>> > +}
>> > +
>> > +static inline struct bbm_log_block_addr __cpu_to_le48(unsigned
>> > long long sec)
>> > +{
>> > +	struct bbm_log_block_addr addr;
>> > +
>> > +	addr.w1 =  __cpu_to_le16((__u16)(sec & 0xFFFF));
>> > +	addr.dw1 = __cpu_to_le32((__u32)(sec >> 16) & 0xFFFFFFFF);
>> > +	return addr;
>> > +}
>> 
>> Given that __cpu_to_le*()/__le*_to_cpu*() are no-ops on little-endian,
>> you don't really need two versions of these. The big-endian version
>> should suffice for both, which makes it far less cluttered. Unless I got
>> something wrong here of course.
>> 
>> > +#else
>> > +#  error "unknown endianness."
>> > +#endif
>> > +
>
> Hi Jes,
>
> Internally IMSM metadata stores bad block address as 48-bit little-endian
> value. Those functions provide conversion from/to a structure consisting of
> 2 fields (32 + 16 bits). For little-endian CPU it's just about shifting bits,
> for big-endian CPU bits have to be swapped. IMSM is not available on
> big-endian platforms so big-endian implementation is provided for
> completeness. I haven't changed it in my latest patch.

Hi Tomek,

My point is that __cpu_to_le{16,32}() are no-ops on little-endian so the
above function using those macros would be valid for both endianess.
While I do understand that no big endian system is available with IMSM
BIOS support, we should still allow big endian users to put IMSM disks
their systems for recovery purposes etc.

Cheers,
Jes

^ permalink raw reply

* Re: [PATCH 1/7] imsm: metadata changes for PPL
From: Jes Sorensen @ 2016-11-29 15:21 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: linux-raid
In-Reply-To: <f4ded4a5-abf2-f988-f5d0-7063f5d080d1@intel.com>

Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
> On 11/29/2016 12:27 AM, Jes Sorensen wrote:
>> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>>> @@ -189,7 +195,30 @@ struct imsm_dev {
>>>  	__u16 cache_policy;
>>>  	__u8  cng_state;
>>>  	__u8  cng_sub_state;
>>> -#define IMSM_DEV_FILLERS 10
>>> +	__u16 my_vol_raid_dev_num; /* Used in Unique volume Id for this RaidDev */
>>> +
>>> +	/* NVM_EN */
>>> +	__u8 nv_cache_mode;
>>> +	union {
>>> +		__u8 nv_cache_flags;
>>> +		struct {
>>> +		    __u8 dirty:1; /* 1 - cache is dirty, 0 - clean */
>>> +		    __u8 nvc_health:2;
>>> +		    __u8 expansion_bytes:5;
>>> +		} nvCache;
>>> +	};
>> 
>> This sets off alarm bells here - you declare a union of a u8 and a
>> matching bitfield, but do not handle bit endian bitfields. If someone
>> tried to use this on a big endian system this could get messy.
>
> Those fields are not used in the code at all, the only thing added to
> this structure that is used is 'rwh_policy'. The rest is to fill the
> gaps between IMSM format in UEFI/Windows.

Hi Artur,

I did notice the code wasn't actually used, sorry I didn't mention
that. However I would still prefer to see at least a comment in the code
indicating that this would fail on BE systems.

>>> @@ -7565,12 +7594,15 @@ mark_checkpoint:
>>>  
>>>  skip_mark_checkpoint:
>>>  	/* mark dirty / clean */
>>> -	if (dev->vol.dirty != !consistent) {
>>> +	if ((dev->vol.dirty & RAIDVOL_DIRTY) != !consistent) {
>> 
>> This part makes my head spin (to be honest, the original code did
>> too). I had to go write a small snipped of C to figure out what the
>> compiler does with !x given x > 0, since consistent can be 0, 1, and 2
>> here.
>> 
>> Basically for RAIDVOL_CLEAN and consistent = 0, and RAIDVOL_DIRTY and
>> consistent = 1 and 2, this statement triggers. Maybe I am just terrible
>> dense when it comes to reading obfuscated C, but could we change this
>> round to a construct that is a little easier to read?
>
> Yes, I admit it is confusing. Does this look better?
>
> -       if (dev->vol.dirty != !consistent) {
> +       if (((dev->vol.dirty & RAIDVOL_DIRTY) && consistent) ||
> +           (!(dev->vol.dirty & RAIDVOL_DIRTY) && !consistent)) {

This I can read without getting headache, so yes :)

Cheers,
Jes

^ permalink raw reply

* Re: [PATCH 3/7] imsm: PPL support
From: Jes Sorensen @ 2016-11-29 15:23 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: linux-raid
In-Reply-To: <bdf93ddb-689e-fa11-ebaa-3d5d19480d82@intel.com>

Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
> On 11/29/2016 12:51 AM, Jes Sorensen wrote:
>>> @@ -3177,6 +3195,9 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *
>>>  
>>>  		disk = &super->disks->disk;
>>>  		info->data_offset = total_blocks(&super->disks->disk) - reserved;
>>> +		/* mpb anchor sector - see store_imsm_mpb() */
>>> +		info->sb_start = total_blocks(&super->disks->disk) -
>>> +				 ((2 * super->sector_size) >> 9);
>>>  		info->component_size = reserved;
>>>  		info->disk.state  = is_configured(disk) ? (1 << MD_DISK_ACTIVE) : 0;
>>>  		/* we don't change info->disk.raid_disk here because
>> 
>> Hi Artur,
>> 
>> I have been sitting staring at the above for a while, and looking at
>> store_imsm_mpb() it is not clear to me what is meant to happen here.
>> 
>> For 4k sector drives, aren't you pushing back sb_start way further than
>> you are for 512 byte sector drives? Ie. you are subtracting 16 sectors
>> for the 4k drive but only two sectors for the 512 byte sector drive?
>> 
>> Maybe it's because it's Monday or I lost the last of my marbles, but
>> could you possibly enlighten me here please?
>
> Jes,
>
> You read it correctly. The reason for this is that the IMSM anchor is
> located in the second _logical_ sector from the end of the drive. So for
> 4k drives this will be 16 512-byte sectors from the end.

I see, so the IMSM implementation uses 512 byte logical sectors on top
of 4k drives? Could I ask you to add a note explaining this in the code?

Thanks,
Jes

^ permalink raw reply

* [PATCH v3 1/9] imsm: parse bad block log in metadata on startup
From: Tomasz Majchrzak @ 2016-11-29 15:40 UTC (permalink / raw)
  To: linux-raid; +Cc: Jes.Sorensen, Tomasz Majchrzak
In-Reply-To: <1480424555-31509-2-git-send-email-tomasz.majchrzak@intel.com>

Always allocate memory for all log entries to avoid a need for memory
allocation when monitor requests to record a bad block.

Also some extra checks added to make static code analyzer happy.

Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
---
 super-intel.c | 140 +++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 94 insertions(+), 46 deletions(-)

diff --git a/super-intel.c b/super-intel.c
index 5740088..eeeca24 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -218,22 +218,24 @@ struct imsm_super {
 } __attribute__ ((packed));
 
 #define BBM_LOG_MAX_ENTRIES 254
+#define BBM_LOG_MAX_LBA_ENTRY_VAL 256		/* Represents 256 LBAs */
+#define BBM_LOG_SIGNATURE 0xabadb10c
+
+struct bbm_log_block_addr {
+	__u16 w1;
+	__u32 dw1;
+} __attribute__ ((__packed__));
 
 struct bbm_log_entry {
-	__u64 defective_block_start;
-#define UNREADABLE 0xFFFFFFFF
-	__u32 spare_block_offset;
-	__u16 remapped_marked_count;
-	__u16 disk_ordinal;
+	__u8 marked_count;		/* Number of blocks marked - 1 */
+	__u8 disk_ordinal;		/* Disk entry within the imsm_super */
+	struct bbm_log_block_addr defective_block_start;
 } __attribute__ ((__packed__));
 
 struct bbm_log {
 	__u32 signature; /* 0xABADB10C */
 	__u32 entry_count;
-	__u32 reserved_spare_block_count; /* 0 */
-	__u32 reserved; /* 0xFFFF */
-	__u64 first_spare_lba;
-	struct bbm_log_entry mapped_block_entries[BBM_LOG_MAX_ENTRIES];
+	struct bbm_log_entry marked_block_entries[BBM_LOG_MAX_ENTRIES];
 } __attribute__ ((__packed__));
 
 #ifndef MDASSEMBLE
@@ -788,6 +790,74 @@ static struct imsm_dev *get_imsm_dev(struct intel_super *super, __u8 index)
 	return NULL;
 }
 
+static inline unsigned long long __le48_to_cpu(const struct bbm_log_block_addr
+					       *addr)
+{
+	return ((((__u64)__le32_to_cpu(addr->dw1)) << 16) |
+		__le16_to_cpu(addr->w1));
+}
+
+static inline struct bbm_log_block_addr __cpu_to_le48(unsigned long long sec)
+{
+	struct bbm_log_block_addr addr;
+
+	addr.w1 =  __cpu_to_le16((__u16)(sec & 0xffff));
+	addr.dw1 = __cpu_to_le32((__u32)(sec >> 16) & 0xffffffff);
+	return addr;
+}
+
+#ifndef MDASSEMBLE
+/* get size of the bbm log */
+static __u32 get_imsm_bbm_log_size(struct bbm_log *log)
+{
+	if (!log || log->entry_count == 0)
+		return 0;
+
+	return sizeof(log->signature) +
+		sizeof(log->entry_count) +
+		log->entry_count * sizeof(struct bbm_log_entry);
+}
+#endif /* MDASSEMBLE */
+
+/* allocate and load BBM log from metadata */
+static int load_bbm_log(struct intel_super *super)
+{
+	struct imsm_super *mpb = super->anchor;
+	__u32 bbm_log_size =  __le32_to_cpu(mpb->bbm_log_size);
+
+	super->bbm_log = xcalloc(1, sizeof(struct bbm_log));
+	if (!super->bbm_log)
+		return 1;
+
+	if (bbm_log_size) {
+		struct bbm_log *log = (void *)mpb +
+			__le32_to_cpu(mpb->mpb_size) - bbm_log_size;
+
+		__u32 entry_count;
+
+		if (bbm_log_size < sizeof(log->signature) +
+		    sizeof(log->entry_count))
+			return 2;
+
+		entry_count = __le32_to_cpu(log->entry_count);
+		if ((__le32_to_cpu(log->signature) != BBM_LOG_SIGNATURE) ||
+		    (entry_count > BBM_LOG_MAX_ENTRIES))
+			return 3;
+
+		if (bbm_log_size !=
+		    sizeof(log->signature) + sizeof(log->entry_count) +
+		    entry_count * sizeof(struct bbm_log_entry))
+			return 4;
+
+		memcpy(super->bbm_log, log, bbm_log_size);
+	} else {
+		super->bbm_log->signature = __cpu_to_le32(BBM_LOG_SIGNATURE);
+		super->bbm_log->entry_count = 0;
+	}
+
+	return 0;
+}
+
 /*
  * for second_map:
  *  == MAP_0 get first map
@@ -1546,7 +1616,7 @@ static void examine_super_imsm(struct supertype *st, char *homehost)
 	printf("          Disks : %d\n", mpb->num_disks);
 	printf("   RAID Devices : %d\n", mpb->num_raid_devs);
 	print_imsm_disk(__get_imsm_disk(mpb, super->disks->index), super->disks->index, reserved);
-	if (super->bbm_log) {
+	if (get_imsm_bbm_log_size(super->bbm_log)) {
 		struct bbm_log *log = super->bbm_log;
 
 		printf("\n");
@@ -1554,9 +1624,6 @@ static void examine_super_imsm(struct supertype *st, char *homehost)
 		printf("       Log Size : %d\n", __le32_to_cpu(mpb->bbm_log_size));
 		printf("      Signature : %x\n", __le32_to_cpu(log->signature));
 		printf("    Entry Count : %d\n", __le32_to_cpu(log->entry_count));
-		printf("   Spare Blocks : %d\n",  __le32_to_cpu(log->reserved_spare_block_count));
-		printf("    First Spare : %llx\n",
-		       (unsigned long long) __le64_to_cpu(log->first_spare_lba));
 	}
 	for (i = 0; i < mpb->num_raid_devs; i++) {
 		struct mdinfo info;
@@ -3757,19 +3824,6 @@ static int parse_raid_devices(struct intel_super *super)
 	return 0;
 }
 
-/* retrieve a pointer to the bbm log which starts after all raid devices */
-struct bbm_log *__get_imsm_bbm_log(struct imsm_super *mpb)
-{
-	void *ptr = NULL;
-
-	if (__le32_to_cpu(mpb->bbm_log_size)) {
-		ptr = mpb;
-		ptr += mpb->mpb_size - __le32_to_cpu(mpb->bbm_log_size);
-	}
-
-	return ptr;
-}
-
 /*******************************************************************************
  * Function:	check_mpb_migr_compatibility
  * Description:	Function checks for unsupported migration features:
@@ -3922,12 +3976,6 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname)
 		return 3;
 	}
 
-	/* FIXME the BBM log is disk specific so we cannot use this global
-	 * buffer for all disks.  Ok for now since we only look at the global
-	 * bbm_log_size parameter to gate assembly
-	 */
-	super->bbm_log = __get_imsm_bbm_log(super->anchor);
-
 	return 0;
 }
 
@@ -3973,6 +4021,9 @@ load_and_parse_mpb(int fd, struct intel_super *super, char *devname, int keep_fd
 	if (err)
 		return err;
 	err = parse_raid_devices(super);
+	if (err)
+		return err;
+	err = load_bbm_log(super);
 	clear_hi(super);
 	return err;
 }
@@ -4037,6 +4088,8 @@ static void __free_imsm(struct intel_super *super, int free_disks)
 		free(elem);
 		elem = next;
 	}
+	if (super->bbm_log)
+		free(super->bbm_log);
 	super->hba = NULL;
 }
 
@@ -4643,7 +4696,7 @@ static int get_super_block(struct intel_super **super_list, char *devnm, char *d
 		*super_list = s;
 	} else {
 		if (s)
-			free(s);
+			free_imsm(s);
 		if (dfd >= 0)
 			close(dfd);
 	}
@@ -4706,6 +4759,8 @@ static int load_super_imsm(struct supertype *st, int fd, char *devname)
 
 	super = alloc_super();
 	get_dev_sector_size(fd, NULL, &super->sector_size);
+	if (!super)
+		return 1;
 	/* Load hba and capabilities if they exist.
 	 * But do not preclude loading metadata in case capabilities or hba are
 	 * non-compliant and ignore_hw_compat is set.
@@ -5050,7 +5105,7 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
 	super = alloc_super();
 	if (super &&
 	    posix_memalign(&super->buf, MAX_SECTOR_SIZE, mpb_size) != 0) {
-		free(super);
+		free_imsm(super);
 		super = NULL;
 	}
 	if (!super) {
@@ -5061,7 +5116,7 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
 	    MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE) != 0) {
 		pr_err("could not allocate migr_rec buffer\n");
 		free(super->buf);
-		free(super);
+		free_imsm(super);
 		return 0;
 	}
 	memset(super->buf, 0, mpb_size);
@@ -5656,11 +5711,6 @@ static int store_super_imsm(struct supertype *st, int fd)
 #endif
 }
 
-static int imsm_bbm_log_size(struct imsm_super *mpb)
-{
-	return __le32_to_cpu(mpb->bbm_log_size);
-}
-
 #ifndef MDASSEMBLE
 static int validate_geometry_imsm_container(struct supertype *st, int level,
 					    int layout, int raiddisks, int chunk,
@@ -5696,6 +5746,10 @@ static int validate_geometry_imsm_container(struct supertype *st, int level,
 	 * note that there is no fd for the disks in array.
 	 */
 	super = alloc_super();
+	if (!super) {
+		close(fd);
+		return 0;
+	}
 	if (!get_dev_sector_size(fd, NULL, &super->sector_size)) {
 		close(fd);
 		free_imsm(super);
@@ -6933,12 +6987,6 @@ static struct mdinfo *container_content_imsm(struct supertype *st, char *subarra
 		pr_err("Unsupported attributes in IMSM metadata.Arrays activation is blocked.\n");
 	}
 
-	/* check for bad blocks */
-	if (imsm_bbm_log_size(super->anchor)) {
-		pr_err("BBM log found in IMSM metadata.Arrays activation is blocked.\n");
-		sb_errors = 1;
-	}
-
 	/* count spare devices, not used in maps
 	 */
 	for (d = super->disks; d; d = d->next)
-- 
1.8.3.1


^ permalink raw reply related

* Re: [RFC PATCH 1/2] RAID1: a new I/O barrier implementation to remove resync window
From: Shaohua Li @ 2016-11-29 19:29 UTC (permalink / raw)
  To: Coly Li
  Cc: linux-raid, Shaohua Li, Neil Brown, Johannes Thumshirn,
	Guoqing Jiang
In-Reply-To: <c83399e6-2fbc-9b5d-70b5-0000a24f0653@suse.de>

On Mon, Nov 28, 2016 at 02:42:22PM +0800, Coly Li wrote:
> On 2016/11/23 上午5:35, Shaohua Li wrote:
> > On Tue, Nov 22, 2016 at 05:54:00AM +0800, Coly Li wrote:
> >> 'Commit 79ef3a8aa1cb ("raid1: Rewrite the implementation of iobarrier.")'
> >> introduces a sliding resync window for raid1 I/O barrier, this idea limits
> >> I/O barriers to happen only inside a slidingresync window, for regular
> >> I/Os out of this resync window they don't need to wait for barrier any
> >> more. On large raid1 device, it helps a lot to improve parallel writing
> >> I/O throughput when there are background resync I/Os performing at
> >> same time. 
> >>
> >> The idea of sliding resync widow is awesome, but there are several
> >> challenges are very difficult to solve,
> >>  - code complexity
> >>    Sliding resync window requires several veriables to work collectively,
> >>    this is complexed and very hard to make it work correctly. Just grep
> >>    "Fixes: 79ef3a8aa1" in kernel git log, there are 8 more patches to fix
> >>    the original resync window patch. This is not the end, any further
> >>    related modification may easily introduce more regreassion.
> >>  - multiple sliding resync windows
> >>    Currently raid1 code only has a single sliding resync window, we cannot
> >>    do parallel resync with current I/O barrier implementation.
> >>    Implementing multiple resync windows are much more complexed, and very
> >>    hard to make it correctly.
> >>
> >> Therefore I decide to implement a much simpler raid1 I/O barrier, by
> >> removing resync window code, I believe life will be much easier.
> >>
> >> The brief idea of the simpler barrier is,
> >>  - Do not maintain a logbal unique resync window
> >>  - Use multiple hash buckets to reduce I/O barrier conflictions, regular
> >>    I/O only has to wait for a resync I/O when both them have same barrier
> >>    bucket index, vice versa.
> >>  - I/O barrier can be recuded to an acceptable number if there are enought
> >>    barrier buckets
> >>
> >> Here I explain how the barrier buckets are designed,
> >>  - BARRIER_UNIT_SECTOR_SIZE
> >>    The whole LBA address space of a raid1 device is divided into multiple
> >>    barrier units, by the size of BARRIER_UNIT_SECTOR_SIZE.
> >>    Bio request won't go across border of barrier unit size, that means
> >>    maximum bio size is BARRIER_UNIT_SECTOR_SIZE<<9 in bytes.
> >>  - BARRIER_BUCKETS_NR
> >>    There are BARRIER_BUCKETS_NR buckets in total, if multiple I/O requests
> >>    hit different barrier units, they only need to compete I/O barrier with
> >>    other I/Os which hit the same barrier bucket index with each other. The
> >>    index of a barrier bucket which a bio should look for is calculated by
> >>    get_barrier_bucket_idx(),
> >> 	(sector >> BARRIER_UNIT_SECTOR_BITS) % BARRIER_BUCKETS_NR
> >>    sector is the start sector number of a bio. align_to_barrier_unit_end()
> >>    will make sure the finall bio sent into generic_make_request() won't
> >>    exceed border of the barrier unit size.
> >>  - RRIER_BUCKETS_NR
> >>    Number of barrier buckets is defined by,
> >> 	#define BARRIER_BUCKETS_NR	(PAGE_SIZE/sizeof(long))
> >>    For 4KB page size, there are 512 buckets for each raid1 device. That
> >>    means the propobility of full random I/O barrier confliction may be
> >>    reduced down to 1/512.
> > 
> > Thanks! The idea is awesome and does makes the code easier to understand.
> > 
> > For hash conflict, I'm worrying about one case. resync does sequential access.
> > Say we have a sequential normal IO. If the first normal IO and resync IO have
> > conflict, it's possible they will have conflict subsequently, since both are
> > doing sequential access. We can change the hash to something like this:
> > 
> > for the first 64M * 512, the hash is 0, 1, ... 511
> > For the second 64M * 512, the hash is 1, 3, 5 ...
> > The third 64M * 512, the hash is 2, 5, 8 ...
> > 
> > It should be very easy to implement something like this, and this should reduce
> > the conflict of sequential access.
> >  
> 
> Hi Shaohua,
> 
> What I concerned was memory footprint. For very fast sequential I/O,
> lineal mapping buckets means each (64 bytes) cache line contains 8 long
> integer, it is very friendly for CPU caching,
>  - sequential writing 8 barrier units only requires 1 memory fetching
> for each barrier variables (barrier[], nr_pending[], nr_waiting[],
> nr_queued[]).
>  - memory prefetch may have positive effect in sequential I/O.
> 
> It will be very rare that resync I/O and regular I/O are always
> conflicted in same barrier bucket: resync I/O throughput usually is
> slower than regular I/O, it is very hard to keep them always conflicting
> in same bucket. If this condition really happens, current sliding resync
> window implementation may also have a similar conflict (regular I/O
> always hits resync window). So the barrier buckets don't make things worse.
> 
> If we use a non-continuous mapping, memory fingerprint of the buckets
> will be quite distributed, and occupies more cache lines for a
> sequential I/O, which will increase the probability of more cache bounce
> if there are multiple sequential I/O (on different offset) happen on
> different CPU cores.
> 
> This is why I use a very simple linear buckets hashing.

Don't think memory footprint matters much. And wasting a little memory (eg,
make the barrier and stuffes cacheline aligned) doesn't matter here too if
necessary. If we could reduce the hash confilict in an easy way, why not? I
think Neil's suggestion (using hash_long) makes sense.
 
> >> Comparing to single sliding resync window,
> >>  - Currently resync I/O grows linearly, therefore regular and resync I/O
> >>    will have confliction within a single barrier units. So it is similar to
> >>    single sliding resync window.
> >>  - But a barrier unit bucket is shared by all barrier units with identical
> >>    barrier uinit index, the probability of confliction might be higher
> >>    than single sliding resync window, in condition that writing I/Os
> >>    always hit barrier units which have identical barrier bucket index with
> >>    the resync I/Os. This is a very rare condition in real I/O work loads,
> >>    I cannot imagine how it could happen in practice.
> >>  - Therefore we can achieve a good enough low confliction rate with much
> >>    simpler barrier algorithm and implementation.
> >>
> >> If user has a (realy) large raid1 device, for example 10PB size, we may
> >> just increase the buckets number BARRIER_BUCKETS_NR. Now this is a macro,
> >> it is possible to be a raid1-created-time-defined variable in future.
> >>
> >> There are two changes should be noticed,
> >>  - In raid1d(), I change the code to decrease conf->nr_pending[idx] into
> >>    single loop, it looks like this,
> >> 	spin_lock_irqsave(&conf->device_lock, flags);
> >> 	conf->nr_queued[idx]--;
> >> 	spin_unlock_irqrestore(&conf->device_lock, flags);
> >>    This change generates more spin lock operations, but in next patch of
> >>    this patch set, it will be replaced by a single line code,
> >> 	atomic_dec(conf->nr_queueud[idx]);
> >>    So we don't need to worry about spin lock cost here.
> >>  - In raid1_make_request(), wait_barrier() is replaced by,
> >>    a) wait_read_barrier(): wait barrier in regular read I/O code path
> >>    b) wait_barrier(): wait barrier in regular write I/O code path
> >>    The differnece is wait_read_barrier() only waits if array is frozen, I
> >>    am not able to combile them into one function, because they must receive
> >>    differnet data types in their arguments list.
> >>  - align_to_barrier_unit_end() is called to make sure both regular and
> >>    resync I/O won't go across the border of a barrier unit size.
> >>  
> >> Open question:
> >>  - Need review from md clustring developer, I don't touch related code now.
> > 
> > Don't think it matters, but please open eyes, Guoqing!
> >  
> >> Signed-off-by: Coly Li <colyli@suse.de>
> >> Cc: Shaohua Li <shli@fb.com>
> >> Cc: Neil Brown <neilb@suse.de>
> >> Cc: Johannes Thumshirn <jthumshirn@suse.de>
> >> Cc: Guoqing Jiang <gqjiang@suse.com>
> >> ---
> >>  drivers/md/raid1.c | 389 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------------------
> >>  drivers/md/raid1.h |  42 +++++-------
> >>  2 files changed, 242 insertions(+), 189 deletions(-)
> >>
> >> Index: linux-raid1/drivers/md/raid1.c
> >> ===================================================================
> >> --- linux-raid1.orig/drivers/md/raid1.c
> >> +++ linux-raid1/drivers/md/raid1.c
> >> @@ -66,9 +66,8 @@
> >>   */
> >>  static int max_queued_requests = 1024;
> >>  
> >> -static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
> >> -			  sector_t bi_sector);
> >> -static void lower_barrier(struct r1conf *conf);
> >> +static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
> >> +static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
> >>  
> >>  static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
> >>  {
> >> @@ -92,7 +91,6 @@ static void r1bio_pool_free(void *r1_bio
> >>  #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
> >>  #define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
> >>  #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
> >> -#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
> >>  
> >>  static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
> >>  {
> >> @@ -198,6 +196,7 @@ static void put_buf(struct r1bio *r1_bio
> >>  {
> >>  	struct r1conf *conf = r1_bio->mddev->private;
> >>  	int i;
> >> +	sector_t sector_nr = r1_bio->sector;
> >>  
> >>  	for (i = 0; i < conf->raid_disks * 2; i++) {
> >>  		struct bio *bio = r1_bio->bios[i];
> >> @@ -207,7 +206,7 @@ static void put_buf(struct r1bio *r1_bio
> >>  
> >>  	mempool_free(r1_bio, conf->r1buf_pool);
> >>  
> >> -	lower_barrier(conf);
> >> +	lower_barrier(conf, sector_nr);
> >>  }
> >>  
> >>  static void reschedule_retry(struct r1bio *r1_bio)
> >> @@ -215,10 +214,15 @@ static void reschedule_retry(struct r1bi
> >>  	unsigned long flags;
> >>  	struct mddev *mddev = r1_bio->mddev;
> >>  	struct r1conf *conf = mddev->private;
> >> +	sector_t sector_nr;
> >> +	long idx;
> >> +
> >> +	sector_nr = r1_bio->sector;
> >> +	idx = get_barrier_bucket_idx(sector_nr);
> >>  
> >>  	spin_lock_irqsave(&conf->device_lock, flags);
> >>  	list_add(&r1_bio->retry_list, &conf->retry_list);
> >> -	conf->nr_queued ++;
> >> +	conf->nr_queued[idx]++;
> >>  	spin_unlock_irqrestore(&conf->device_lock, flags);
> >>  
> >>  	wake_up(&conf->wait_barrier);
> >> @@ -235,8 +239,6 @@ static void call_bio_endio(struct r1bio
> >>  	struct bio *bio = r1_bio->master_bio;
> >>  	int done;
> >>  	struct r1conf *conf = r1_bio->mddev->private;
> >> -	sector_t start_next_window = r1_bio->start_next_window;
> >> -	sector_t bi_sector = bio->bi_iter.bi_sector;
> >>  
> >>  	if (bio->bi_phys_segments) {
> >>  		unsigned long flags;
> >> @@ -255,19 +257,14 @@ static void call_bio_endio(struct r1bio
> >>  	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
> >>  		bio->bi_error = -EIO;
> >>  
> >> -	if (done) {
> >> +	if (done)
> >>  		bio_endio(bio);
> >> -		/*
> >> -		 * Wake up any possible resync thread that waits for the device
> >> -		 * to go idle.
> >> -		 */
> >> -		allow_barrier(conf, start_next_window, bi_sector);
> >> -	}
> >>  }
> >>  
> >>  static void raid_end_bio_io(struct r1bio *r1_bio)
> >>  {
> >>  	struct bio *bio = r1_bio->master_bio;
> >> +	struct r1conf *conf = r1_bio->mddev->private;
> >>  
> >>  	/* if nobody has done the final endio yet, do it now */
> >>  	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
> >> @@ -278,6 +275,12 @@ static void raid_end_bio_io(struct r1bio
> >>  
> >>  		call_bio_endio(r1_bio);
> >>  	}
> >> +
> >> +	/*
> >> +	 * Wake up any possible resync thread that waits for the device
> >> +	 * to go idle.
> >> +	 */
> >> +	allow_barrier(conf, r1_bio->sector);
> > Why this change?
> > 
> 
> I move allow_barrier() here on purpose. Because current raid1 code
> raises nr_pending for each master bio, the barrier buckets patch raises
> nr_pending[idx] for each r1_bio.
> 
> Current code increases nr_pending for each master bio (the bio structure
> received by raid1_make_request()). A master bio may be split by multiple
> r1_bio structures, when all r1_bio completes, nr_pending is decreased
> for the original master bio.
> 
> Since the master bio may go across multiple barrier units, in this patch
> master bio will be split into multiple r1_bio structures for each
> barrier units. In this case, we need to call wait_barrer() for each
> r1_bio, and call allow_barrier() on each r1_bio completion. This is why
> allow_barrier() is moved form master bio completion time to r1_bio
> completion time.
> 
> A master bio may also be split into multiple r1_bio if bad blocks
> encountered, and these r1_bio may stay in same barrier unit. In this
> case the different r1_bio does increase nr_pending[] for same bucket
> index, we should also call wait_barrier() for each r1_bio and call
> allow_barrier() at its completion time.

I think if you do bio_split before raid1_make_request, these issues go away.

> >>  	free_r1bio(r1_bio);
> >>  }
> >>  
> >> @@ -311,6 +314,7 @@ static int find_bio_disk(struct r1bio *r
> >>  	return mirror;
> >>  }
> >>  
> >> +/* bi_end_io callback for regular READ bio */
> >>  static void raid1_end_read_request(struct bio *bio)
> >>  {
> >>  	int uptodate = !bio->bi_error;
> >> @@ -490,6 +494,25 @@ static void raid1_end_write_request(stru
> >>  		bio_put(to_put);
> >>  }
> >>  
> >> +static sector_t align_to_barrier_unit_end(sector_t start_sector,
> >> +					  sector_t sectors)
> >> +{
> >> +	sector_t len;
> >> +
> >> +	WARN_ON(sectors == 0);
> >> +	/* len is the number of sectors from start_sector to end of the
> >> +	 * barrier unit which start_sector belongs to.
> >> +	 */
> >> +	len = ((start_sector + sectors + (1<<BARRIER_UNIT_SECTOR_BITS) - 1) &
> >> +	       (~(BARRIER_UNIT_SECTOR_SIZE - 1))) -
> >> +	      start_sector;
> >> +
> >> +	if (len > sectors)
> >> +		len = sectors;
> >> +
> >> +	return len;
> >> +}
> > 
> > I'd prefer calling bio_split at the early of raid1_make_request and split the
> > bio if it crosses the bucket boundary. please see raid10_make_request for
> > example. resync will not cross the boundary. So the code will not need to worry
> > about the boundary. I believe this will make the code simpiler (all the
> > align_to_barrier_unit_end calling can removed) and easy to understand.
> > 
> 
> Indeed, my first implementation uses bio_split(). The reasons why I
> don't use it later are,
> - indeed I need to write more code.
> - I can't simply use existing r1_bio completion code to call
> bio_end_io() to the master bio when bio->bi_phys_segments is zero (see
> call_bio_endio()). Because r1_bio->master_bio is the split bio, not the
> original master bio received by raid1_make_request()
> 
> Therefore finally I decide to use align_to_barrier_unit_end() to
> generate more r1_bio structures if the original master bio goes across
> barrier unit size, it makes less modification in this patch.

That bi_phys_segments issue doesn't make sense to me. Please see how raid10 use
bio_split.
- rename raid1_make_request to __raid1_make_request
- add a raid1_make_request, split bio there, and then call __raid1_make_request

the __raid1_make_request does everthing it currently does, the bi_phys_segments
should still work. This might add more code. But now we don't need to worry
about the boundary problem everywhere else except the new raid1_make_request.
For example, the allow_barrier issue goes away with this approach. This will
make the code more understandable and less error prone.
 
> >> +
> >>  /*
> >>   * This routine returns the disk from which the requested read should
> >>   * be done. There is a per-array 'next expected sequential IO' sector
> >> @@ -691,6 +714,7 @@ static int read_balance(struct r1conf *c
> >>  		conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
> >>  	}
> >>  	rcu_read_unlock();
> >> +	sectors = align_to_barrier_unit_end(this_sector, sectors);
> >>  	*max_sectors = sectors;
> >>  
> >>  	return best_disk;
> >> @@ -779,168 +803,174 @@ static void flush_pending_writes(struct
> >>   *    there is no normal IO happeing.  It must arrange to call
> >>   *    lower_barrier when the particular background IO completes.
> >>   */
> >> +
> >>  static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
> >>  {
> >> +	long idx = get_barrier_bucket_idx(sector_nr);
> >> +
> >>  	spin_lock_irq(&conf->resync_lock);
> >>  
> >>  	/* Wait until no block IO is waiting */
> >> -	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
> >> +	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting[idx],
> >>  			    conf->resync_lock);
> >>  
> >>  	/* block any new IO from starting */
> >> -	conf->barrier++;
> >> -	conf->next_resync = sector_nr;
> >> +	conf->barrier[idx]++;
> >>  
> >>  	/* For these conditions we must wait:
> >>  	 * A: while the array is in frozen state
> >> -	 * B: while barrier >= RESYNC_DEPTH, meaning resync reach
> >> -	 *    the max count which allowed.
> >> -	 * C: next_resync + RESYNC_SECTORS > start_next_window, meaning
> >> -	 *    next resync will reach to the window which normal bios are
> >> -	 *    handling.
> >> -	 * D: while there are any active requests in the current window.
> >> +	 * B: while conf->nr_pending[idx] is not 0, meaning regular I/O
> >> +	 *    existing in sector number ranges corresponding to idx.
> >> +	 * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning resync reach
> >> +	 *    the max count which allowed in sector number ranges
> >> +	 *    conrresponding to idx.
> >>  	 */
> >>  	wait_event_lock_irq(conf->wait_barrier,
> >> -			    !conf->array_frozen &&
> >> -			    conf->barrier < RESYNC_DEPTH &&
> >> -			    conf->current_window_requests == 0 &&
> >> -			    (conf->start_next_window >=
> >> -			     conf->next_resync + RESYNC_SECTORS),
> >> +			    !conf->array_frozen && !conf->nr_pending[idx] &&
> >> +			    conf->barrier[idx] < RESYNC_DEPTH,
> >>  			    conf->resync_lock);
> > 
> > So there is a slight behavior change. Originally we guarautee no more than
> > RESYNC_DEPTH sync. Now this only checks 'conf->barrier[idx] < RESYNC_DEPTH'.
> > How about barrier[idx-1], barrier[idx-2]...? It's possible conf->barrier[idx] <
> > RESYNC_DEPTH, but barrier[idx] + barrier[idx-1] > RESYNC_DEPTH. Not sure how
> > important this is, but at least we can check barrier[idx] + barrier[idx-1].
> 
> The original code wants to rise more multiple barriers, but less then
> RESYNC_DEPTH. It helps resync I/O throughput with less resync I/O and
> regular I/O conflict. Considering conf->barrier is a global barrier,
> large RESYNC_DEPTH will starve regular I/O, it is only set to 32 for
> whole raid1 device.
> 
> In the barrier buckets patch, conf->barrier[idx] only takes effect in a
> single bucket. More barriers raised in this barrier buckets won't
> interfere regular I/O in other barrier buckets, therefore we can have
> much more resync I/O barriers to raise, which is good for resync I/O
> throughput without starve more regular I/Os.
> 
> If we have 512 barrier buckets, that means on the whole raid1 device
> there are 512*RESYNC_DEPTH barriers can be raised, and allows more
> parallel resync I/O.
> 
> Before implementing parallel resync, I keep RESYNC_DEPTH as 32 in this
> patch, because we only have a resync I/O hits one barrier buckets, same
> RESYNC_DEPTH won't change barrier behavior now. Latter when we have
> parallel resync I/O, let's see whether we need to modify this value,
> also still keep it as 32.

The problem is resync write could harm latency of regular IO. Even we have
resync limitation mechanism, sending a lot of resync write out could harm
regular IO latency for worst case. If the application has P99 metrics, more
resync IO definitively will harm it. This will not be easy to observe though.
Could we have a global pending counter to record barrier IO and using it in the
same way as the old code? That said I'm not sure how important this is, but
this could make things worse, so I'd like to avoid the regression.
 
> >> -
> >> -	conf->nr_pending++;
> >> +	conf->nr_pending[idx]++;
> >>  	spin_unlock_irq(&conf->resync_lock);
> >>  }
> >>  
> >> -static void lower_barrier(struct r1conf *conf)
> >> +static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
> >>  {
> >>  	unsigned long flags;
> >> -	BUG_ON(conf->barrier <= 0);
> >> +	long idx = get_barrier_bucket_idx(sector_nr);
> >> +
> >> +	BUG_ON(conf->barrier[idx] <= 0);
> >>  	spin_lock_irqsave(&conf->resync_lock, flags);
> >> -	conf->barrier--;
> >> -	conf->nr_pending--;
> >> +	conf->barrier[idx]--;
> >> +	conf->nr_pending[idx]--;
> >>  	spin_unlock_irqrestore(&conf->resync_lock, flags);
> >>  	wake_up(&conf->wait_barrier);
> >>  }
> >>  
> >> -static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
> >> +/* A regular I/O should wait when,
> >> + * - The whole array is frozen (both READ and WRITE)
> >> + * - bio is WRITE and in same barrier bucket conf->barrier[idx] raised
> >> + */
> >> +static void _wait_barrier(struct r1conf *conf, long idx)
> >>  {
> >> -	bool wait = false;
> >> -
> >> -	if (conf->array_frozen || !bio)
> >> -		wait = true;
> >> -	else if (conf->barrier && bio_data_dir(bio) == WRITE) {
> >> -		if ((conf->mddev->curr_resync_completed
> >> -		     >= bio_end_sector(bio)) ||
> >> -		    (conf->next_resync + NEXT_NORMALIO_DISTANCE
> >> -		     <= bio->bi_iter.bi_sector))
> >> -			wait = false;
> >> -		else
> >> -			wait = true;
> >> +	spin_lock_irq(&conf->resync_lock);
> >> +	if (conf->array_frozen || conf->barrier[idx]) {
> >> +		conf->nr_waiting[idx]++;
> >> +		/* Wait for the barrier to drop. */
> >> +		wait_event_lock_irq(
> >> +			conf->wait_barrier,
> >> +			!conf->array_frozen && !conf->barrier[idx],
> >> +			conf->resync_lock);
> >> +		conf->nr_waiting[idx]--;
> >>  	}
> >>  
> >> -	return wait;
> >> +	conf->nr_pending[idx]++;
> >> +	spin_unlock_irq(&conf->resync_lock);
> >>  }
> >>  
> >> -static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
> >> +static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
> >>  {
> >> -	sector_t sector = 0;
> >> +	long idx = get_barrier_bucket_idx(sector_nr);
> >>  
> >>  	spin_lock_irq(&conf->resync_lock);
> >> -	if (need_to_wait_for_sync(conf, bio)) {
> >> -		conf->nr_waiting++;
> >> -		/* Wait for the barrier to drop.
> >> -		 * However if there are already pending
> >> -		 * requests (preventing the barrier from
> >> -		 * rising completely), and the
> >> -		 * per-process bio queue isn't empty,
> >> -		 * then don't wait, as we need to empty
> >> -		 * that queue to allow conf->start_next_window
> >> -		 * to increase.
> >> -		 */
> >> -		wait_event_lock_irq(conf->wait_barrier,
> >> -				    !conf->array_frozen &&
> >> -				    (!conf->barrier ||
> >> -				     ((conf->start_next_window <
> >> -				       conf->next_resync + RESYNC_SECTORS) &&
> >> -				      current->bio_list &&
> >> -				      !bio_list_empty(current->bio_list))),
> >> -				    conf->resync_lock);
> >> -		conf->nr_waiting--;
> >> -	}
> >> -
> >> -	if (bio && bio_data_dir(bio) == WRITE) {
> >> -		if (bio->bi_iter.bi_sector >= conf->next_resync) {
> >> -			if (conf->start_next_window == MaxSector)
> >> -				conf->start_next_window =
> >> -					conf->next_resync +
> >> -					NEXT_NORMALIO_DISTANCE;
> >> -
> >> -			if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
> >> -			    <= bio->bi_iter.bi_sector)
> >> -				conf->next_window_requests++;
> >> -			else
> >> -				conf->current_window_requests++;
> >> -			sector = conf->start_next_window;
> >> -		}
> >> +	if (conf->array_frozen) {
> >> +		conf->nr_waiting[idx]++;
> >> +		/* Wait for array to unfreeze */
> >> +		wait_event_lock_irq(
> >> +			conf->wait_barrier,
> >> +			!conf->array_frozen,
> >> +			conf->resync_lock);
> >> +		conf->nr_waiting[idx]--;
> >>  	}
> >> -
> >> -	conf->nr_pending++;
> >> +	conf->nr_pending[idx]++;
> >>  	spin_unlock_irq(&conf->resync_lock);
> >> -	return sector;
> >>  }
> >>  
> >> -static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
> >> -			  sector_t bi_sector)
> >> +static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
> >> +{
> >> +	long idx = get_barrier_bucket_idx(sector_nr);
> >> +
> >> +	_wait_barrier(conf, idx);
> >> +}
> >> +
> >> +static void wait_all_barriers(struct r1conf *conf)
> >> +{
> >> +	long idx;
> >> +
> >> +	for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
> >> +		_wait_barrier(conf, idx);
> > 
> > This is racy. Since resync is sequential, idealy we only need to check the
> > bucket sequentially. The compiler could do  _wait_barrier(conf, 1) and then
> > _wait_barrier(conf, 0). It's possible the bucket 1 has barrier right after the
> > check of bucket 0. Even the compiler doesn't reorder, there is case the bucket
> > 511 should be check before bucket 0 if the resync is just crossing 512 buckets.
> > It would be better we check the resync position and adjust the bucket wait
> > order according to the position.
> > 
> 
> wait_all_barriers() and allow_all_barriers() are used in close_sync() only,
>  static void close_sync(struct r1conf *conf)
>  {
> -	wait_barrier(conf, NULL);
> -	allow_barrier(conf, 0, 0);
> +	wait_all_barriers(conf);
> +	allow_all_barriers(conf);
> [snip]
>  }
> 
> wait_barrier() and allow_barrier() called here is for a synchronization
> purpose, to make sure before close_sync() returns there is no resync I/O
> existing.
> 
> close_sync() is called in raid1_sync_request() when resync I/O exceeds
> the size of raid1 device, and resync should be closed. In this
> condition, there won't be any new resync I/O generated. If a
> conf->barrier[idx] reaches 0, it won't increase before a new resync
> restarts. Therefore it is safe to call _wait_barrier() one by one, until
> every conf->barrier[idx] reaches to 0.
> 
> This is a usage of wait/allow_barrier() which is not mentioned in the
> code. They are not for regular I/O waiting for resync I/O, they are used
> as a synchronization to make sure all on-flying resync I/O to complete.

thanks, this makes sense.

> >>  	int good_sectors = RESYNC_SECTORS;
> >>  	int min_bad = 0; /* number of sectors that are bad in all devices */
> >> +	long idx = get_barrier_bucket_idx(sector_nr);
> >>  
> >>  	if (!conf->r1buf_pool)
> >>  		if (init_resync(conf))
> >> @@ -2535,7 +2571,7 @@ static sector_t raid1_sync_request(struc
> >>  	 * If there is non-resync activity waiting for a turn, then let it
> >>  	 * though before starting on this new sync request.
> >>  	 */
> >> -	if (conf->nr_waiting)
> >> +	if (conf->nr_waiting[idx])
> >>  		schedule_timeout_uninterruptible(1);
> >>  
> >>  	/* we are incrementing sector_nr below. To be safe, we check against
> >> @@ -2562,6 +2598,8 @@ static sector_t raid1_sync_request(struc
> >>  	r1_bio->sector = sector_nr;
> >>  	r1_bio->state = 0;
> >>  	set_bit(R1BIO_IsSync, &r1_bio->state);
> >> +	/* make sure good_sectors won't go across barrier unit border */
> >> +	good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors);
> >>  
> >>  	for (i = 0; i < conf->raid_disks * 2; i++) {
> >>  		struct md_rdev *rdev;
> >> @@ -2786,6 +2824,22 @@ static struct r1conf *setup_conf(struct
> >>  	if (!conf)
> >>  		goto abort;
> >>  
> >> +	conf->nr_pending = kzalloc(PAGE_SIZE, GFP_KERNEL);
> >> +	if (!conf->nr_pending)
> >> +		goto abort;
> > 
> > This allocation is a little wierd. I'd define a count and uses
> > sizeof(nr_pending) * count to do the allocation. There is nothing related to
> > PAGE_SIZE. Alternatively, just make nr_pending an array in r1conf.
> 
> This is just for better memory usage, r1conf is allocated by slab
> allocator, large not aligned size may cause internal memory
> fragmentation inside slab's pages. call kzalloc() with PAGE_SIZE will
> avoid such issue.

Not sure if slab has closest slab for this, but that's not important. I think
sizeof(nr_pending) * count makes more sense here, even sizeof(nr_pending)
* count == PAGE_SIZE.

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH 1/2] raid5-cache: add another check conditon before replaying one stripe
From: Shaohua Li @ 2016-11-29 19:53 UTC (permalink / raw)
  To: Zhengyuan Liu; +Cc: shli, songliubraving, neilb, linux-raid, liuyun01
In-Reply-To: <1480129034-14700-1-git-send-email-liuzhengyuan@kylinos.cn>

On Sat, Nov 26, 2016 at 10:57:13AM +0800, Zhengyuan Liu wrote:
> New stripe that was just allocated has no STRIPE_R5C_CACHING state too,
> add this check condition could avoid unnecessary replaying for empty stripe.
> 
> r5l_recovery_replay_one_stripe would reset stripe for any case, delete it
> to make code more clean.

applied, thanks! 
> Signed-off-by: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
> ---
>  drivers/md/raid5-cache.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 5f817bd..9911164 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -1887,9 +1887,9 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
>  		}
>  
>  		if (payload->header.type == R5LOG_PAYLOAD_DATA) {
> -			if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
> +			if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
> +			    test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
>  				r5l_recovery_replay_one_stripe(conf, sh, ctx);
> -				r5l_recovery_reset_stripe(sh);
>  				sh->log_start = ctx->pos;
>  				list_move_tail(&sh->lru, cached_stripe_list);
>  			}
> -- 
> 2.7.4
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 2/2] raid5-cache: don't set STRIPE_R5C_PARTIAL_STRIPE flag while load stripe into cache
From: Shaohua Li @ 2016-11-29 19:55 UTC (permalink / raw)
  To: Zhengyuan Liu; +Cc: shli, songliubraving, neilb, linux-raid, liuyun01
In-Reply-To: <1480129034-14700-2-git-send-email-liuzhengyuan@kylinos.cn>

On Sat, Nov 26, 2016 at 10:57:14AM +0800, Zhengyuan Liu wrote:
> r5c_recovery_load_one_stripe should not set STRIPE_R5C_PARTIAL_STRIPE flag,as
> the data-only stripe may be STRIPE_R5C_FULL_STRIPE stripe. The state machine
> would release the stripe later and add it into neither r5c_cached_full_stripes
> list or r5c_cached_partial_stripes list and set correct flag. Also we should
> fix the counter corresponding.
> 
> Reviewed-by: JackieLiu <liuyun01@kylinos.cn>
> Signed-off-by: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
> ---
>  drivers/md/raid5-cache.c |  3 +--
>  drivers/md/raid5.c       | 11 +++++++++++
>  2 files changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 9911164..e0ac758 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -1930,9 +1930,8 @@ static void r5c_recovery_load_one_stripe(struct r5l_log *log,
>  			set_bit(R5_UPTODATE, &dev->flags);
>  		}
>  	}
> -	set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state);
> -	atomic_inc(&conf->r5c_cached_partial_stripes);
>  	list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
> +	atomic_inc(&log->stripe_in_journal_count);
>  }

Applied this part and discarded the latter. Thanks!
 
>  /*
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index dbab8c7..8120ce4 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -677,12 +677,23 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
>  					atomic_inc(&conf->active_stripes);
>  				BUG_ON(list_empty(&sh->lru) &&
>  				       !test_bit(STRIPE_EXPANDING, &sh->state));
> +
>  				inc_empty_inactive_list_flag = 0;
>  				if (!list_empty(conf->inactive_list + hash))
>  					inc_empty_inactive_list_flag = 1;
>  				list_del_init(&sh->lru);
>  				if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag)
>  					atomic_inc(&conf->empty_inactive_list_nr);
> +
> +				if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
> +					WARN_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
> +					atomic_dec(&conf->r5c_cached_partial_stripes);
> +				}
> +				if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
> +					WARN_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
> +					atomic_dec(&conf->r5c_cached_full_stripes);
> +				}
> +
>  				if (sh->group) {
>  					sh->group->stripes_cnt--;
>  					sh->group = NULL;
> -- 
> 2.7.4
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 1/4] md/raid5-cache: remove unnecessary function parameters
From: Song Liu @ 2016-11-29 20:14 UTC (permalink / raw)
  To: linux-raid; +Cc: Song Liu
In-Reply-To: <20161128081921.5641-1-liuyun01@kylinos.cn>

Reviewed-by: Song Liu <songliubraving@fb.com>



^ permalink raw reply

* Re: Is trim/discard supported on LVM RAID logical volume?
From: Chris Murphy @ 2016-11-29 20:20 UTC (permalink / raw)
  To: Patrick Dung; +Cc: Linux-RAID
In-Reply-To: <CAEtPA0ByX8yYx8xNc_=4SP0bqXVz-D4o-bM5XZF8Y6NcnqsEKQ@mail.gmail.com>

Seen /etc/lvm/lvm.conf issue_discards is probably set to 0.


Chris Murphy

^ permalink raw reply

* Re: [PATCH 2/4] md/raid5-cache: use ring add to prevent overflow
From: Song Liu @ 2016-11-29 20:31 UTC (permalink / raw)
  To: linux-raid; +Cc: Song Liu
In-Reply-To: <20161128081921.5641-2-liuyun01@kylinos.cn>

Reviewed-by: Song Liu <songliubraving@fb.com>



^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox