Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [PATCH v2 4/7] super1: PPL support
From: Artur Paszkiewicz @ 2016-12-05 15:32 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid
In-Reply-To: <20161205153204.7343-1-artur.paszkiewicz@intel.com>

Enable creating and assembling raid5 arrays with PPL for 1.1 and 1.2
metadata.

When creating, reserve enough space for PPL and store its size and
location in the superblock and set MD_FEATURE_PPL bit. PPL is stored in
the metadata region reserved for internal write-intent bitmap, so don't
allow using bitmap and PPL together.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 Create.c      | 19 +++++++++++--
 Grow.c        | 15 +++++++++-
 mdadm.h       |  4 ++-
 super-ddf.c   |  2 +-
 super-gpt.c   |  2 +-
 super-intel.c | 10 +++++--
 super-mbr.c   |  2 +-
 super0.c      |  2 +-
 super1.c      | 88 +++++++++++++++++++++++++++++++++++++++++++++++++----------
 9 files changed, 118 insertions(+), 26 deletions(-)

diff --git a/Create.c b/Create.c
index ebbdd94..0168c3a 100644
--- a/Create.c
+++ b/Create.c
@@ -201,6 +201,15 @@ int Create(struct supertype *st, char *mddev,
 		return 1;
 	}
 
+	if (s->rwh_policy == RWH_POLICY_PPL) {
+		if (s->bitmap_file) {
+			pr_err("PPL is not compatible with bitmap\n");
+			return 1;
+		} else {
+			s->bitmap_file = "none";
+		}
+	}
+
 	/* now set some defaults */
 
 	if (s->layout == UnSet) {
@@ -259,7 +268,8 @@ int Create(struct supertype *st, char *mddev,
 	if (st && ! st->ss->validate_geometry(st, s->level, s->layout, s->raiddisks,
 					      &s->chunk, s->size*2,
 					      data_offset, NULL,
-					      &newsize, c->verbose>=0))
+					      &newsize, s->rwh_policy,
+					      c->verbose>=0))
 		return 1;
 
 	if (s->chunk && s->chunk != UnSet) {
@@ -358,7 +368,8 @@ int Create(struct supertype *st, char *mddev,
 						st, s->level, s->layout, s->raiddisks,
 						&s->chunk, s->size*2,
 						dv->data_offset, dname,
-						&freesize, c->verbose > 0)) {
+						&freesize, s->rwh_policy,
+						c->verbose > 0)) {
 				case -1: /* Not valid, message printed, and not
 					  * worth checking any further */
 					exit(2);
@@ -395,6 +406,7 @@ int Create(struct supertype *st, char *mddev,
 						       &s->chunk, s->size*2,
 						       dv->data_offset,
 						       dname, &freesize,
+						       s->rwh_policy,
 						       c->verbose >= 0)) {
 
 				pr_err("%s is not suitable for this array.\n",
@@ -501,7 +513,8 @@ int Create(struct supertype *st, char *mddev,
 						       s->raiddisks,
 						       &s->chunk, minsize*2,
 						       data_offset,
-						       NULL, NULL, 0)) {
+						       NULL, NULL,
+						       s->rwh_policy, 0)) {
 				pr_err("devices too large for RAID level %d\n", s->level);
 				return 1;
 			}
diff --git a/Grow.c b/Grow.c
index 455c5f9..8bfb09c 100755
--- a/Grow.c
+++ b/Grow.c
@@ -290,6 +290,7 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
 	int major = BITMAP_MAJOR_HI;
 	int vers = md_get_version(fd);
 	unsigned long long bitmapsize, array_size;
+	struct mdinfo *mdi;
 
 	if (vers < 9003) {
 		major = BITMAP_MAJOR_HOSTENDIAN;
@@ -389,12 +390,23 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
 		free(st);
 		return 1;
 	}
+
+	mdi = sysfs_read(fd, NULL, GET_RWH_POLICY);
+	if (mdi) {
+		if (mdi->rwh_policy == RWH_POLICY_PPL) {
+			pr_err("Cannot add bitmap to array with PPL\n");
+			free(mdi);
+			free(st);
+			return 1;
+		}
+		free(mdi);
+	}
+
 	if (strcmp(s->bitmap_file, "internal") == 0 ||
 	    strcmp(s->bitmap_file, "clustered") == 0) {
 		int rv;
 		int d;
 		int offset_setable = 0;
-		struct mdinfo *mdi;
 		if (st->ss->add_internal_bitmap == NULL) {
 			pr_err("Internal bitmaps not supported with %s metadata\n", st->ss->name);
 			return 1;
@@ -446,6 +458,7 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
 			sysfs_init(mdi, fd, NULL);
 			rv = sysfs_set_num_signed(mdi, NULL, "bitmap/location",
 						  mdi->bitmap_offset);
+			free(mdi);
 		} else {
 			if (strcmp(s->bitmap_file, "clustered") == 0)
 				array.state |= (1 << MD_SB_CLUSTERED);
diff --git a/mdadm.h b/mdadm.h
index 03de402..6d52bdf 100644
--- a/mdadm.h
+++ b/mdadm.h
@@ -299,6 +299,8 @@ struct mdinfo {
 		#define MaxSector  (~0ULL) /* resync/recovery complete position */
 	};
 	long			bitmap_offset;	/* 0 == none, 1 == a file */
+	unsigned int		ppl_offset;
+	unsigned int		ppl_sectors;
 	unsigned long		safe_mode_delay; /* ms delay to mark clean */
 	int			new_level, delta_disks, new_layout, new_chunk;
 	int			errors;
@@ -972,7 +974,7 @@ extern struct superswitch {
 				 int *chunk, unsigned long long size,
 				 unsigned long long data_offset,
 				 char *subdev, unsigned long long *freesize,
-				 int verbose);
+				 int rwh_policy, int verbose);
 
 	/* Return a linked list of 'mdinfo' structures for all arrays
 	 * in the container.  For non-containers, it is like
diff --git a/super-ddf.c b/super-ddf.c
index 18e1e77..6184a73 100644
--- a/super-ddf.c
+++ b/super-ddf.c
@@ -3347,7 +3347,7 @@ static int validate_geometry_ddf(struct supertype *st,
 				 int *chunk, unsigned long long size,
 				 unsigned long long data_offset,
 				 char *dev, unsigned long long *freesize,
-				 int verbose)
+				 int rwh_policy, int verbose)
 {
 	int fd;
 	struct mdinfo *sra;
diff --git a/super-gpt.c b/super-gpt.c
index 1a2adce..efb0c00 100644
--- a/super-gpt.c
+++ b/super-gpt.c
@@ -195,7 +195,7 @@ static int validate_geometry(struct supertype *st, int level,
 			     int *chunk, unsigned long long size,
 			     unsigned long long data_offset,
 			     char *subdev, unsigned long long *freesize,
-			     int verbose)
+			     int rwh_policy, int verbose)
 {
 	pr_err("gpt metadata cannot be used this way\n");
 	return 0;
diff --git a/super-intel.c b/super-intel.c
index 0c4726a..eb6b543 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -3482,9 +3482,13 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *
 
 		disk = &super->disks->disk;
 		info->data_offset = total_blocks(&super->disks->disk) - reserved;
-		/* mpb anchor sector - see store_imsm_mpb() */
+		/* mpb anchor is located in the second _logical_ sector from
+		 * the end of the drive, here we convert it to 512 byte sectors
+		 * because this value will be passed to the kernel
+		 */
 		info->sb_start = total_blocks(&super->disks->disk) -
 				 ((2 * super->sector_size) >> 9);
+
 		info->component_size = reserved;
 		info->disk.state  = is_configured(disk) ? (1 << MD_DISK_ACTIVE) : 0;
 		/* we don't change info->disk.raid_disk here because
@@ -6935,7 +6939,7 @@ static int validate_geometry_imsm(struct supertype *st, int level, int layout,
 				  int raiddisks, int *chunk, unsigned long long size,
 				  unsigned long long data_offset,
 				  char *dev, unsigned long long *freesize,
-				  int verbose)
+				  int rwh_policy, int verbose)
 {
 	int fd, cfd;
 	struct mdinfo *sra;
@@ -10971,7 +10975,7 @@ enum imsm_reshape_type imsm_analyze_change(struct supertype *st,
 				    geo->raid_disks + devNumChange,
 				    &chunk,
 				    geo->size, INVALID_SECTORS,
-				    0, 0, 1))
+				    0, 0, info.rwh_policy, 1))
 		change = -1;
 
 	if (check_devs) {
diff --git a/super-mbr.c b/super-mbr.c
index f5e4cea..66d984c 100644
--- a/super-mbr.c
+++ b/super-mbr.c
@@ -193,7 +193,7 @@ static int validate_geometry(struct supertype *st, int level,
 			     int *chunk, unsigned long long size,
 			     unsigned long long data_offset,
 			     char *subdev, unsigned long long *freesize,
-			     int verbose)
+			     int rwh_policy, int verbose)
 {
 	pr_err("mbr metadata cannot be used this way\n");
 	return 0;
diff --git a/super0.c b/super0.c
index e252c88..cc50fb4 100644
--- a/super0.c
+++ b/super0.c
@@ -1267,7 +1267,7 @@ static int validate_geometry0(struct supertype *st, int level,
 			      int *chunk, unsigned long long size,
 			      unsigned long long data_offset,
 			      char *subdev, unsigned long long *freesize,
-			      int verbose)
+			      int rwh_policy, int verbose)
 {
 	unsigned long long ldsize;
 	int fd;
diff --git a/super1.c b/super1.c
index edd4a1d..7cf46a8 100644
--- a/super1.c
+++ b/super1.c
@@ -48,10 +48,18 @@ struct mdp_superblock_1 {
 
 	__u32	chunksize;	/* in 512byte sectors */
 	__u32	raid_disks;
-	__u32	bitmap_offset;	/* sectors after start of superblock that bitmap starts
-				 * NOTE: signed, so bitmap can be before superblock
-				 * only meaningful of feature_map[0] is set.
-				 */
+	union {
+		__u32	bitmap_offset;	/* sectors after start of superblock that bitmap starts
+					 * NOTE: signed, so bitmap can be before superblock
+					 * only meaningful of feature_map[0] is set.
+					 */
+
+		/* only meaningful when feature_map[MD_FEATURE_PPL] is set */
+		struct {
+			__u16 offset; /* sectors after start of superblock that ppl starts */
+			__u16 size; /* PPL size (including header) in sectors */
+		} ppl;
+	};
 
 	/* These are only valid with feature bit '4' */
 	__u32	new_level;	/* new level we are reshaping to		*/
@@ -131,6 +139,7 @@ struct misc_dev_info {
 #define	MD_FEATURE_NEW_OFFSET		64 /* new_offset must be honoured */
 #define	MD_FEATURE_BITMAP_VERSIONED	256 /* bitmap version number checked properly */
 #define	MD_FEATURE_JOURNAL		512 /* support write journal */
+#define	MD_FEATURE_PPL			1024 /* support PPL */
 #define	MD_FEATURE_ALL			(MD_FEATURE_BITMAP_OFFSET	\
 					|MD_FEATURE_RECOVERY_OFFSET	\
 					|MD_FEATURE_RESHAPE_ACTIVE	\
@@ -140,6 +149,7 @@ struct misc_dev_info {
 					|MD_FEATURE_NEW_OFFSET		\
 					|MD_FEATURE_BITMAP_VERSIONED	\
 					|MD_FEATURE_JOURNAL		\
+					|MD_FEATURE_PPL			\
 					)
 
 #ifndef MDASSEMBLE
@@ -289,6 +299,11 @@ static int awrite(struct align_fd *afd, void *buf, int len)
 	return len;
 }
 
+static inline unsigned int choose_ppl_space(int chunk)
+{
+	return 8 + (chunk > 128*2 ? chunk : 128*2);
+}
+
 #ifndef MDASSEMBLE
 static void examine_super1(struct supertype *st, char *homehost)
 {
@@ -392,6 +407,10 @@ static void examine_super1(struct supertype *st, char *homehost)
 	if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) {
 		printf("Internal Bitmap : %ld sectors from superblock\n",
 		       (long)(int32_t)__le32_to_cpu(sb->bitmap_offset));
+	} else if (sb->feature_map & __cpu_to_le32(MD_FEATURE_PPL)) {
+		printf("            PPL : %u sectors at offset %u sectors from superblock\n",
+		       __le16_to_cpu(sb->ppl.size),
+		       __le16_to_cpu(sb->ppl.offset));
 	}
 	if (sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE)) {
 		printf("  Reshape pos'n : %llu%s\n", (unsigned long long)__le64_to_cpu(sb->reshape_position)/2,
@@ -935,8 +954,12 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
 
 	info->data_offset = __le64_to_cpu(sb->data_offset);
 	info->component_size = __le64_to_cpu(sb->size);
-	if (sb->feature_map & __le32_to_cpu(MD_FEATURE_BITMAP_OFFSET))
+	if (sb->feature_map & __le32_to_cpu(MD_FEATURE_BITMAP_OFFSET)) {
 		info->bitmap_offset = (int32_t)__le32_to_cpu(sb->bitmap_offset);
+	} else if (sb->feature_map & __le32_to_cpu(MD_FEATURE_PPL)) {
+		info->ppl_offset = __le16_to_cpu(sb->ppl.offset);
+		info->ppl_sectors = __le16_to_cpu(sb->ppl.size);
+	}
 
 	info->disk.major = 0;
 	info->disk.minor = 0;
@@ -981,6 +1004,11 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
 			bmend += size;
 			if (bmend > earliest)
 				earliest = bmend;
+		} else if (info->ppl_offset > 0) {
+			unsigned long long pplend = info->ppl_offset +
+						    info->ppl_sectors;
+			if (pplend > earliest)
+				earliest = pplend;
 		}
 		if (sb->bblog_offset && sb->bblog_size) {
 			unsigned long long bbend = super_offset;
@@ -1074,9 +1102,16 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
 	}
 
 	info->array.working_disks = working;
-	if (sb->feature_map & __le32_to_cpu(MD_FEATURE_JOURNAL))
+
+	if (sb->feature_map & __le32_to_cpu(MD_FEATURE_JOURNAL)) {
 		info->journal_device_required = 1;
-	info->journal_clean = 0;
+		info->rwh_policy = RWH_POLICY_JOURNAL;
+	} else if (sb->feature_map & __le32_to_cpu(MD_FEATURE_PPL)) {
+		info->rwh_policy = RWH_POLICY_PPL;
+		info->journal_clean = 1;
+	} else {
+		info->rwh_policy = RWH_POLICY_UNKNOWN;
+	}
 }
 
 static struct mdinfo *container_content1(struct supertype *st, char *subarray)
@@ -1238,6 +1273,9 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
 		if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) {
 			bitmap_offset = (long)__le32_to_cpu(sb->bitmap_offset);
 			bm_sectors = calc_bitmap_size(bms, 4096) >> 9;
+		} else if (sb->feature_map & __cpu_to_le32(MD_FEATURE_PPL)) {
+			bitmap_offset = (long)__le16_to_cpu(sb->ppl.offset);
+			bm_sectors = (long)__le16_to_cpu(sb->ppl.size);
 		}
 #endif
 		if (sb_offset < data_offset) {
@@ -1471,6 +1509,9 @@ static int init_super1(struct supertype *st, mdu_array_info_t *info,
 
 	memset(sb->dev_roles, 0xff, MAX_SB_SIZE - sizeof(struct mdp_superblock_1));
 
+	if (s->rwh_policy == RWH_POLICY_PPL)
+		sb->feature_map |= __cpu_to_le32(MD_FEATURE_PPL);
+
 	return 1;
 }
 
@@ -1672,7 +1713,7 @@ static int write_empty_r5l_meta_block(struct supertype *st, int fd)
 	crc = crc32c_le(crc, (void *)mb, META_BLOCK_SIZE);
 	mb->checksum = crc;
 
-	if (lseek64(fd, (sb->data_offset) * 512, 0) < 0LL) {
+	if (lseek64(fd, __le64_to_cpu(sb->data_offset) * 512, 0) < 0LL) {
 		pr_err("cannot seek to offset of the meta block\n");
 		goto fail_to_write;
 	}
@@ -1705,7 +1746,7 @@ static int write_init_super1(struct supertype *st)
 
 	for (di = st->info; di; di = di->next) {
 		if (di->disk.state & (1 << MD_DISK_JOURNAL))
-			sb->feature_map |= MD_FEATURE_JOURNAL;
+			sb->feature_map |= __cpu_to_le32(MD_FEATURE_JOURNAL);
 	}
 
 	for (di = st->info; di; di = di->next) {
@@ -1780,6 +1821,11 @@ static int write_init_super1(struct supertype *st)
 					(((char *)sb) + MAX_SB_SIZE);
 			bm_space = calc_bitmap_size(bms, 4096) >> 9;
 			bm_offset = (long)__le32_to_cpu(sb->bitmap_offset);
+		} else if (sb->feature_map & __cpu_to_le32(MD_FEATURE_PPL)) {
+			bm_space = choose_ppl_space(__le32_to_cpu(sb->chunksize));
+			bm_offset = 8;
+			sb->ppl.offset = __cpu_to_le16(bm_offset);
+			sb->ppl.size = __cpu_to_le16(bm_space);
 		} else {
 			bm_space = choose_bm_space(array_size);
 			bm_offset = 8;
@@ -1851,8 +1897,10 @@ static int write_init_super1(struct supertype *st)
 				goto error_out;
 		}
 
-		if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1))
+		if (rv == 0 &&
+		    (__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET))
 			rv = st->ss->write_bitmap(st, di->fd, NodeNumUpdate);
+
 		close(di->fd);
 		di->fd = -1;
 		if (rv)
@@ -2120,11 +2168,13 @@ static __u64 avail_size1(struct supertype *st, __u64 devsize,
 		return 0;
 
 #ifndef MDASSEMBLE
-	if (__le32_to_cpu(super->feature_map)&MD_FEATURE_BITMAP_OFFSET) {
+	if (__le32_to_cpu(super->feature_map) & MD_FEATURE_BITMAP_OFFSET) {
 		/* hot-add. allow for actual size of bitmap */
 		struct bitmap_super_s *bsb;
 		bsb = (struct bitmap_super_s *)(((char*)super)+MAX_SB_SIZE);
 		bmspace = calc_bitmap_size(bsb, 4096) >> 9;
+	} else if (__le32_to_cpu(super->feature_map) & MD_FEATURE_PPL) {
+		bmspace = __le16_to_cpu(super->ppl.size);
 	}
 #endif
 	/* Allow space for bad block log */
@@ -2492,7 +2542,7 @@ static int validate_geometry1(struct supertype *st, int level,
 			      int *chunk, unsigned long long size,
 			      unsigned long long data_offset,
 			      char *subdev, unsigned long long *freesize,
-			      int verbose)
+			      int rwh_policy, int verbose)
 {
 	unsigned long long ldsize, devsize;
 	int bmspace;
@@ -2514,6 +2564,14 @@ static int validate_geometry1(struct supertype *st, int level,
 		/* not specified, so time to set default */
 		st->minor_version = 2;
 
+	if (st->minor_version != 2 && st->minor_version != 1 &&
+	    rwh_policy == RWH_POLICY_PPL) {
+		if (verbose)
+			pr_err("1.%d metadata does not support PPL\n",
+			       st->minor_version);
+		return 0;
+	}
+
 	fd = open(subdev, O_RDONLY|O_EXCL, 0);
 	if (fd < 0) {
 		if (verbose)
@@ -2534,8 +2592,9 @@ static int validate_geometry1(struct supertype *st, int level,
 		return 0;
 	}
 
-	/* creating:  allow suitable space for bitmap */
-	bmspace = choose_bm_space(devsize);
+	/* creating:  allow suitable space for bitmap or PPL */
+	bmspace = rwh_policy == RWH_POLICY_PPL ?
+		  choose_ppl_space((*chunk)*2) : choose_bm_space(devsize);
 
 	if (data_offset == INVALID_SECTORS)
 		data_offset = st->data_offset;
@@ -2667,5 +2726,6 @@ struct superswitch super1 = {
 #else
 	.swapuuid = 1,
 #endif
+	.supports_ppl = 1,
 	.name = "1.x",
 };
-- 
2.10.1


^ permalink raw reply related

* [PATCH v2 5/7] imsm: allow to assemble with PPL even if dirty degraded
From: Artur Paszkiewicz @ 2016-12-05 15:32 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid
In-Reply-To: <20161205153204.7343-1-artur.paszkiewicz@intel.com>

From: Pawel Baldysiak <pawel.baldysiak@intel.com>

This is necessary to allow PPL recovery in the kernel driver. If the
recovery succeeds, the array is no longer in dirty state and can be
started normally as degraded.

Signed-off-by: Pawel Baldysiak <pawel.baldysiak@intel.com>
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 Assemble.c    | 4 +++-
 super-intel.c | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/Assemble.c b/Assemble.c
index 3da0903..022d826 100644
--- a/Assemble.c
+++ b/Assemble.c
@@ -1943,7 +1943,9 @@ int assemble_container_content(struct supertype *st, int mdfd,
 		   content->uuid, chosen_name);
 
 	if (enough(content->array.level, content->array.raid_disks,
-		   content->array.layout, content->array.state & 1, avail) == 0) {
+		   content->array.layout,
+		   (content->array.state & 1) || content->journal_clean,
+		   avail) == 0) {
 		if (c->export && result)
 			*result |= INCR_NO;
 		else if (c->verbose >= 0) {
diff --git a/super-intel.c b/super-intel.c
index eb6b543..e524ef0 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -3175,6 +3175,7 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
 	info->custom_array_size   <<= 32;
 	info->custom_array_size   |= __le32_to_cpu(dev->size_low);
 	info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
+	info->journal_clean = dev->rwh_policy;
 
 	if (is_gen_migration(dev)) {
 		info->reshape_active = 1;
-- 
2.10.1


^ permalink raw reply related

* [PATCH v2 6/7] Allow changing the RWH policy for a running array
From: Artur Paszkiewicz @ 2016-12-05 15:32 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid
In-Reply-To: <20161205153204.7343-1-artur.paszkiewicz@intel.com>

This extends the --rwh-policy parameter to work also in Misc mode. Using
it changes the currently active RWH policy in the kernel driver and
updates the metadata to make this change permanent. Updating metadata is
not yet implemented for super1, so this is limited to IMSM for now.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 Manage.c      | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 mdadm.c       |  9 +++++++
 mdadm.h       |  1 +
 super-intel.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 153 insertions(+), 1 deletion(-)

diff --git a/Manage.c b/Manage.c
index 5c3d2b9..0119684 100644
--- a/Manage.c
+++ b/Manage.c
@@ -1823,4 +1823,83 @@ int move_spare(char *from_devname, char *to_devname, dev_t devid)
 	close(fd2);
 	return 0;
 }
+
+int ChangeRwhPolicy(char *dev, char *update, int verbose)
+{
+	struct supertype *st;
+	struct mdinfo *info;
+	char *subarray = NULL;
+	int ret = 0;
+	int fd;
+	int new_policy = map_name(rwh_policies, update);
+
+	if (new_policy == UnSet)
+		return 1;
+
+	fd = open(dev, O_RDONLY);
+	if (fd < 0)
+		return 1;
+
+	st = super_by_fd(fd, &subarray);
+	if (!st) {
+		close(fd);
+		return 1;
+	}
+
+	info = sysfs_read(fd, NULL, GET_RWH_POLICY|GET_LEVEL);
+	close(fd);
+	if (!info) {
+		ret = 1;
+		goto free_st;
+	}
+
+	if (new_policy == RWH_POLICY_PPL && !st->ss->supports_ppl) {
+		pr_err("%s metadata does not support PPL\n", st->ss->name);
+		ret = 1;
+		goto free_info;
+	}
+
+	if (info->array.level < 4 || info->array.level > 6) {
+		pr_err("Operation not supported for array level %d\n",
+				info->array.level);
+		ret = 1;
+		goto free_info;
+	}
+
+	if (info->rwh_policy != (unsigned)new_policy) {
+		if (!st->ss->external && new_policy == RWH_POLICY_PPL) {
+			pr_err("Operation supported for external metadata only.\n");
+			ret = 1;
+			goto free_info;
+		}
+
+		if (sysfs_set_str(info, NULL, "rwh_policy", update)) {
+			pr_err("Failed to change array RWH Policy\n");
+			ret = 1;
+			goto free_info;
+		}
+		info->rwh_policy = new_policy;
+	}
+
+	if (subarray) {
+		char container_dev[PATH_MAX];
+		struct mddev_ident ident;
+
+		sprintf(container_dev, "/dev/%s", st->container_devnm);
+
+		st->info = info;
+		ident.st = st;
+
+		ret = Update_subarray(container_dev, subarray, "rwh-policy",
+				&ident, verbose);
+	}
+
+free_info:
+	sysfs_free(info);
+free_st:
+	free(st);
+	free(subarray);
+
+	return ret;
+}
 #endif
diff --git a/mdadm.c b/mdadm.c
index 2b6d3a2..ed21f1c 100644
--- a/mdadm.c
+++ b/mdadm.c
@@ -252,6 +252,7 @@ int main(int argc, char *argv[])
 		case UpdateSubarray:
 		case UdevRules:
 		case KillOpt:
+		case RwhPolicy:
 			if (!mode)
 				newmode = MISC;
 			break;
@@ -1211,11 +1212,16 @@ int main(int argc, char *argv[])
 			s.journaldisks = 1;
 			continue;
 		case O(CREATE, RwhPolicy):
+		case O(MISC, RwhPolicy):
 			s.rwh_policy = map_name(rwh_policies, optarg);
 			if (s.rwh_policy == UnSet) {
 				pr_err("Invalid RWH policy: %s\n", optarg);
 				exit(2);
 			}
+			if (mode == MISC) {
+				devmode = opt;
+				c.update = optarg;
+			}
 			continue;
 		}
 		/* We have now processed all the valid options. Anything else is
@@ -1927,6 +1933,9 @@ static int misc_list(struct mddev_dev *devlist,
 		case Action:
 			rv |= SetAction(dv->devname, c->action);
 			continue;
+		case RwhPolicy:
+			rv |= ChangeRwhPolicy(dv->devname, c->update, c->verbose);
+			continue;
 		}
 		if (dv->devname[0] == '/')
 			mdfd = open_mddev(dv->devname, 1);
diff --git a/mdadm.h b/mdadm.h
index 6d52bdf..53f8bd1 100644
--- a/mdadm.h
+++ b/mdadm.h
@@ -1367,6 +1367,7 @@ extern int Update_subarray(char *dev, char *subarray, char *update, struct mddev
 extern int Wait(char *dev);
 extern int WaitClean(char *dev, int sock, int verbose);
 extern int SetAction(char *dev, char *action);
+extern int ChangeRwhPolicy(char *dev, char *update, int verbose);
 
 extern int Incremental(struct mddev_dev *devlist, struct context *c,
 		       struct supertype *st);
diff --git a/super-intel.c b/super-intel.c
index e524ef0..3b40429 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -448,6 +448,7 @@ enum imsm_update_type {
 	update_general_migration_checkpoint,
 	update_size_change,
 	update_prealloc_badblocks_mem,
+	update_rwh_policy,
 };
 
 struct imsm_update_activate_spare {
@@ -540,6 +541,12 @@ struct imsm_update_prealloc_bb_mem {
 	enum imsm_update_type type;
 };
 
+struct imsm_update_rwh_policy {
+	enum imsm_update_type type;
+	int new_policy;
+	int dev_idx;
+};
+
 static const char *_sys_dev_type[] = {
 	[SYS_DEV_UNKNOWN] = "Unknown",
 	[SYS_DEV_SAS] = "SAS",
@@ -3175,7 +3182,6 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
 	info->custom_array_size   <<= 32;
 	info->custom_array_size   |= __le32_to_cpu(dev->size_low);
 	info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
-	info->journal_clean = dev->rwh_policy;
 
 	if (is_gen_migration(dev)) {
 		info->reshape_active = 1;
@@ -3347,6 +3353,8 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
 			info->rwh_policy = RWH_POLICY_PPL;
 		else
 			info->rwh_policy = RWH_POLICY_UNKNOWN;
+
+		info->journal_clean = info->rwh_policy == RWH_POLICY_PPL;
 	}
 }
 
@@ -7183,6 +7191,41 @@ static int update_subarray_imsm(struct supertype *st, char *subarray,
 			}
 			super->updates_pending++;
 		}
+	} else if (strcmp(update, "rwh-policy") == 0) {
+		struct mdinfo *info;
+		int new_policy;
+		char *ep;
+		int vol = strtoul(subarray, &ep, 10);
+
+		if (!ident->st || !ident->st->info)
+			return 2;
+
+		info = ident->st->info;
+
+		if (*ep != '\0' || vol >= super->anchor->num_raid_devs)
+			return 2;
+
+		if (info->rwh_policy == RWH_POLICY_OFF)
+			new_policy = RWH_OFF;
+		else if (info->rwh_policy == RWH_POLICY_PPL)
+			new_policy = RWH_DISTRIBUTED;
+		else
+			return 2;
+
+		if (st->update_tail) {
+			struct imsm_update_rwh_policy *u = xmalloc(sizeof(*u));
+
+			u->type = update_rwh_policy;
+			u->dev_idx = vol;
+			u->new_policy = new_policy;
+			append_metadata_update(st, u, sizeof(*u));
+		} else {
+			struct imsm_dev *dev;
+
+			dev = get_imsm_dev(super, vol);
+			dev->rwh_policy = new_policy;
+			super->updates_pending++;
+		}
 	} else
 		return 2;
 
@@ -9400,6 +9443,21 @@ static void imsm_process_update(struct supertype *st,
 	}
 	case update_prealloc_badblocks_mem:
 		break;
+	case update_rwh_policy: {
+		struct imsm_update_rwh_policy *u = (void *)update->buf;
+		int target = u->dev_idx;
+		struct imsm_dev *dev = get_imsm_dev(super, target);
+		if (!dev) {
+			dprintf("could not find subarray-%d\n", target);
+			break;
+		}
+
+		if (dev->rwh_policy != u->new_policy) {
+			dev->rwh_policy = u->new_policy;
+			super->updates_pending++;
+		}
+		break;
+	}
 	default:
 		pr_err("error: unsuported process update type:(type: %d)\n",	type);
 	}
@@ -9645,6 +9703,11 @@ static int imsm_prepare_update(struct supertype *st,
 		super->extra_space += sizeof(struct bbm_log) -
 			get_imsm_bbm_log_size(super->bbm_log);
 		break;
+	case update_rwh_policy: {
+		if (update->len < (int)sizeof(struct imsm_update_rwh_policy))
+			return 0;
+		break;
+	}
 	default:
 		return 0;
 	}
-- 
2.10.1


^ permalink raw reply related

* [PATCH v2 7/7] Man page changes for --rwh-policy
From: Artur Paszkiewicz @ 2016-12-05 15:32 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid
In-Reply-To: <20161205153204.7343-1-artur.paszkiewicz@intel.com>

Describe the usage of the --rwh-policy parameter in Create and Misc
modes.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 mdadm.8.in | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/mdadm.8.in b/mdadm.8.in
index aa80f0c..d4456a1 100644
--- a/mdadm.8.in
+++ b/mdadm.8.in
@@ -1015,6 +1015,26 @@ simultaneously. If not specified, this defaults to 4.
 Specify journal device for the RAID-4/5/6 array. The journal device
 should be a SSD with reasonable lifetime.
 
+.TP
+.BR \-\-rwh-policy=
+Specify the RAID Write Hole policy for a RAID-4/5/6 array. Currently supported
+options are
+.BR off ,
+.B journal
+and
+.BR ppl .
+
+The
+.B journal
+policy is implicitly selected when using
+.BR \-\-write-journal .
+
+The
+.B ppl
+policy (Partial Parity Log) is a mechanism that can be used with RAID5 arrays.
+This feature prevents data loss by keeping parity consistent with data even in
+case of drive failure during dirty shutdown. PPL is stored in the metadata
+region of RAID member drives, no additional journal drive is needed.
 
 .SH For assemble:
 
@@ -1705,6 +1725,14 @@ can be found it
 under
 .BR "SCRUBBING AND MISMATCHES" .
 
+.TP
+.BR \-\-rwh-policy=
+Change the RAID Write Hole policy for a RAID-4/5/6 array at runtime. For
+details about the RWH policies, see the description for the same parameter
+under
+.B Create mode
+options.
+
 .SH For Incremental Assembly mode:
 .TP
 .BR \-\-rebuild\-map ", " \-r
-- 
2.10.1


^ permalink raw reply related

* [PATCH v3 0/2] Reorganize raid*_make_request to clean up code
From: Robert LeBlanc @ 2016-12-05 20:02 UTC (permalink / raw)
  To: linux-raid; +Cc: Robert LeBlanc

In response to Christoph, I've broken the read and writes into their own
functions to make the code even cleaner. Since it is such a big change, I broke
up the commits into this series instead of creating a v2 of the previous patch.

Changes since v2:
 Shaohua Li
 * Make md_write_start before wait_barrier
 * Move I/O in recovery stripe test to write path
 * Changed to if/then instead of return in __make_request

Changes since v1:
 
 John Stoffel
 * Changed to if/then instead of return in raid1_make_request

 Neil Brown
 * Moved wait_barrier into raid1_{read,write}_request so that it could be after
   ->suspend_{hi,lo}. This prevents a write blocking a resync until the suspend
   region is moved.

Robert LeBlanc (2):
  md/raid1: Refactor raid1_make_request
  md/raid10: Refactor raid10_make_request

 drivers/md/raid1.c  | 259 +++++++++++++++++++++++++++-------------------------
 drivers/md/raid10.c | 215 ++++++++++++++++++++++---------------------
 2 files changed, 245 insertions(+), 229 deletions(-)

-- 
2.10.2


^ permalink raw reply

* [PATCH v3 1/2] md/raid1: Refactor raid1_make_request
From: Robert LeBlanc @ 2016-12-05 20:02 UTC (permalink / raw)
  To: linux-raid; +Cc: Robert LeBlanc
In-Reply-To: <20161205200258.6653-1-robert@leblancnet.us>

Refactor raid1_make_request to make read and write code in their own
functions to clean up the code.

Signed-off-by: Robert LeBlanc <robert@leblancnet.us>
---
 drivers/md/raid1.c | 259 +++++++++++++++++++++++++++--------------------------
 1 file changed, 134 insertions(+), 125 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 94e0afc..4edefb2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1066,17 +1066,106 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	kfree(plug);
 }
 
-static void raid1_make_request(struct mddev *mddev, struct bio * bio)
+static void raid1_read_request(struct mddev *mddev, struct bio *bio,
+				 struct r1bio *r1_bio)
 {
 	struct r1conf *conf = mddev->private;
 	struct raid1_info *mirror;
-	struct r1bio *r1_bio;
 	struct bio *read_bio;
+	struct bitmap *bitmap = mddev->bitmap;
+	const int op = bio_op(bio);
+	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
+	int sectors_handled;
+	int max_sectors;
+	int rdisk;
+
+	wait_barrier(conf, bio);
+
+read_again:
+	rdisk = read_balance(conf, r1_bio, &max_sectors);
+
+	if (rdisk < 0) {
+		/* couldn't find anywhere to read from */
+		raid_end_bio_io(r1_bio);
+		return;
+	}
+	mirror = conf->mirrors + rdisk;
+
+	if (test_bit(WriteMostly, &mirror->rdev->flags) &&
+	    bitmap) {
+		/* Reading from a write-mostly device must
+		 * take care not to over-take any writes
+		 * that are 'behind'
+		 */
+		raid1_log(mddev, "wait behind writes");
+		wait_event(bitmap->behind_wait,
+			   atomic_read(&bitmap->behind_writes) == 0);
+	}
+	r1_bio->read_disk = rdisk;
+	r1_bio->start_next_window = 0;
+
+	read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+	bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
+		 max_sectors);
+
+	r1_bio->bios[rdisk] = read_bio;
+
+	read_bio->bi_iter.bi_sector = r1_bio->sector +
+		mirror->rdev->data_offset;
+	read_bio->bi_bdev = mirror->rdev->bdev;
+	read_bio->bi_end_io = raid1_end_read_request;
+	bio_set_op_attrs(read_bio, op, do_sync);
+	if (test_bit(FailFast, &mirror->rdev->flags) &&
+	    test_bit(R1BIO_FailFast, &r1_bio->state))
+	        read_bio->bi_opf |= MD_FAILFAST;
+	read_bio->bi_private = r1_bio;
+
+	if (mddev->gendisk)
+	        trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
+	                              read_bio, disk_devt(mddev->gendisk),
+	                              r1_bio->sector);
+
+	if (max_sectors < r1_bio->sectors) {
+		/* could not read all from this device, so we will
+		 * need another r1_bio.
+		 */
+
+		sectors_handled = (r1_bio->sector + max_sectors
+				   - bio->bi_iter.bi_sector);
+		r1_bio->sectors = max_sectors;
+		spin_lock_irq(&conf->device_lock);
+		if (bio->bi_phys_segments == 0)
+			bio->bi_phys_segments = 2;
+		else
+			bio->bi_phys_segments++;
+		spin_unlock_irq(&conf->device_lock);
+		/* Cannot call generic_make_request directly
+		 * as that will be queued in __make_request
+		 * and subsequent mempool_alloc might block waiting
+		 * for it.  So hand bio over to raid1d.
+		 */
+		reschedule_retry(r1_bio);
+
+		r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+		r1_bio->master_bio = bio;
+		r1_bio->sectors = bio_sectors(bio) - sectors_handled;
+		r1_bio->state = 0;
+		r1_bio->mddev = mddev;
+		r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+		goto read_again;
+	} else
+		generic_make_request(read_bio);
+}
+
+static void raid1_write_request(struct mddev *mddev, struct bio *bio,
+				struct r1bio *r1_bio)
+{
+	struct r1conf *conf = mddev->private;
 	int i, disks;
-	struct bitmap *bitmap;
+	struct bitmap *bitmap = mddev->bitmap;
 	unsigned long flags;
 	const int op = bio_op(bio);
-	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
 	const unsigned long do_flush_fua = (bio->bi_opf &
 						(REQ_PREFLUSH | REQ_FUA));
@@ -1096,12 +1185,11 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
 
 	md_write_start(mddev, bio); /* wait on superblock update early */
 
-	if (bio_data_dir(bio) == WRITE &&
-	    ((bio_end_sector(bio) > mddev->suspend_lo &&
+	if ((bio_end_sector(bio) > mddev->suspend_lo &&
 	    bio->bi_iter.bi_sector < mddev->suspend_hi) ||
 	    (mddev_is_clustered(mddev) &&
 	     md_cluster_ops->area_resyncing(mddev, WRITE,
-		     bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
+		     bio->bi_iter.bi_sector, bio_end_sector(bio)))) {
 		/* As the suspend_* range is controlled by
 		 * userspace, we want an interruptible
 		 * wait.
@@ -1115,128 +1203,15 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
 			    bio->bi_iter.bi_sector >= mddev->suspend_hi ||
 			    (mddev_is_clustered(mddev) &&
 			     !md_cluster_ops->area_resyncing(mddev, WRITE,
-				     bio->bi_iter.bi_sector, bio_end_sector(bio))))
+				     bio->bi_iter.bi_sector,
+				     bio_end_sector(bio))))
 				break;
 			schedule();
 		}
 		finish_wait(&conf->wait_barrier, &w);
 	}
-
 	start_next_window = wait_barrier(conf, bio);
 
-	bitmap = mddev->bitmap;
-
-	/*
-	 * make_request() can abort the operation when read-ahead is being
-	 * used and no empty request is available.
-	 *
-	 */
-	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-
-	r1_bio->master_bio = bio;
-	r1_bio->sectors = bio_sectors(bio);
-	r1_bio->state = 0;
-	r1_bio->mddev = mddev;
-	r1_bio->sector = bio->bi_iter.bi_sector;
-
-	/* We might need to issue multiple reads to different
-	 * devices if there are bad blocks around, so we keep
-	 * track of the number of reads in bio->bi_phys_segments.
-	 * If this is 0, there is only one r1_bio and no locking
-	 * will be needed when requests complete.  If it is
-	 * non-zero, then it is the number of not-completed requests.
-	 */
-	bio->bi_phys_segments = 0;
-	bio_clear_flag(bio, BIO_SEG_VALID);
-
-	if (rw == READ) {
-		/*
-		 * read balancing logic:
-		 */
-		int rdisk;
-
-read_again:
-		rdisk = read_balance(conf, r1_bio, &max_sectors);
-
-		if (rdisk < 0) {
-			/* couldn't find anywhere to read from */
-			raid_end_bio_io(r1_bio);
-			return;
-		}
-		mirror = conf->mirrors + rdisk;
-
-		if (test_bit(WriteMostly, &mirror->rdev->flags) &&
-		    bitmap) {
-			/* Reading from a write-mostly device must
-			 * take care not to over-take any writes
-			 * that are 'behind'
-			 */
-			raid1_log(mddev, "wait behind writes");
-			wait_event(bitmap->behind_wait,
-				   atomic_read(&bitmap->behind_writes) == 0);
-		}
-		r1_bio->read_disk = rdisk;
-		r1_bio->start_next_window = 0;
-
-		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
-			 max_sectors);
-
-		r1_bio->bios[rdisk] = read_bio;
-
-		read_bio->bi_iter.bi_sector = r1_bio->sector +
-			mirror->rdev->data_offset;
-		read_bio->bi_bdev = mirror->rdev->bdev;
-		read_bio->bi_end_io = raid1_end_read_request;
-		bio_set_op_attrs(read_bio, op, do_sync);
-		if (test_bit(FailFast, &mirror->rdev->flags) &&
-		    test_bit(R1BIO_FailFast, &r1_bio->state))
-			read_bio->bi_opf |= MD_FAILFAST;
-		read_bio->bi_private = r1_bio;
-
-		if (mddev->gendisk)
-			trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
-					      read_bio, disk_devt(mddev->gendisk),
-					      r1_bio->sector);
-
-		if (max_sectors < r1_bio->sectors) {
-			/* could not read all from this device, so we will
-			 * need another r1_bio.
-			 */
-
-			sectors_handled = (r1_bio->sector + max_sectors
-					   - bio->bi_iter.bi_sector);
-			r1_bio->sectors = max_sectors;
-			spin_lock_irq(&conf->device_lock);
-			if (bio->bi_phys_segments == 0)
-				bio->bi_phys_segments = 2;
-			else
-				bio->bi_phys_segments++;
-			spin_unlock_irq(&conf->device_lock);
-			/* Cannot call generic_make_request directly
-			 * as that will be queued in __make_request
-			 * and subsequent mempool_alloc might block waiting
-			 * for it.  So hand bio over to raid1d.
-			 */
-			reschedule_retry(r1_bio);
-
-			r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
-
-			r1_bio->master_bio = bio;
-			r1_bio->sectors = bio_sectors(bio) - sectors_handled;
-			r1_bio->state = 0;
-			r1_bio->mddev = mddev;
-			r1_bio->sector = bio->bi_iter.bi_sector +
-				sectors_handled;
-			goto read_again;
-		} else
-			generic_make_request(read_bio);
-		return;
-	}
-
-	/*
-	 * WRITE:
-	 */
 	if (conf->pending_count >= max_queued_requests) {
 		md_wakeup_thread(mddev->thread);
 		raid1_log(mddev, "wait queued");
@@ -1280,8 +1255,7 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
 			int bad_sectors;
 			int is_bad;
 
-			is_bad = is_badblock(rdev, r1_bio->sector,
-					     max_sectors,
+			is_bad = is_badblock(rdev, r1_bio->sector, max_sectors,
 					     &first_bad, &bad_sectors);
 			if (is_bad < 0) {
 				/* mustn't write here until the bad block is
@@ -1370,7 +1344,8 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
 			continue;
 
 		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors);
+		bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector,
+			 max_sectors);
 
 		if (first_clone) {
 			/* do behind I/O ?
@@ -1464,6 +1439,40 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
 	wake_up(&conf->wait_barrier);
 }
 
+static void raid1_make_request(struct mddev *mddev, struct bio *bio)
+{
+	struct r1conf *conf = mddev->private;
+	struct r1bio *r1_bio;
+
+	/*
+	 * make_request() can abort the operation when read-ahead is being
+	 * used and no empty request is available.
+	 *
+	 */
+	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+	r1_bio->master_bio = bio;
+	r1_bio->sectors = bio_sectors(bio);
+	r1_bio->state = 0;
+	r1_bio->mddev = mddev;
+	r1_bio->sector = bio->bi_iter.bi_sector;
+
+	/* We might need to issue multiple reads to different
+	 * devices if there are bad blocks around, so we keep
+	 * track of the number of reads in bio->bi_phys_segments.
+	 * If this is 0, there is only one r1_bio and no locking
+	 * will be needed when requests complete.  If it is
+	 * non-zero, then it is the number of not-completed requests.
+	 */
+	bio->bi_phys_segments = 0;
+	bio_clear_flag(bio, BIO_SEG_VALID);
+
+	if (bio_data_dir(bio) == READ)
+		raid1_read_request(mddev, bio, r1_bio);
+	else
+		raid1_write_request(mddev, bio, r1_bio);
+}
+
 static void raid1_status(struct seq_file *seq, struct mddev *mddev)
 {
 	struct r1conf *conf = mddev->private;
-- 
2.10.2


^ permalink raw reply related

* [PATCH v3 2/2] md/raid10: Refactor raid10_make_request
From: Robert LeBlanc @ 2016-12-05 20:02 UTC (permalink / raw)
  To: linux-raid; +Cc: Robert LeBlanc
In-Reply-To: <20161205200258.6653-1-robert@leblancnet.us>

Refactor raid10_make_request into seperate read and write functions to
clean up the code.

Signed-off-by: Robert LeBlanc <robert@leblancnet.us>
---
 drivers/md/raid10.c | 215 +++++++++++++++++++++++++++-------------------------
 1 file changed, 111 insertions(+), 104 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 525ca99..8698e00 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1087,23 +1087,97 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	kfree(plug);
 }
 
-static void __make_request(struct mddev *mddev, struct bio *bio)
+static void raid10_read_request(struct mddev *mddev, struct bio *bio,
+				struct r10bio *r10_bio)
 {
 	struct r10conf *conf = mddev->private;
-	struct r10bio *r10_bio;
 	struct bio *read_bio;
+	const int op = bio_op(bio);
+	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
+	int sectors_handled;
+	int max_sectors;
+	struct md_rdev *rdev;
+	int slot;
+
+	wait_barrier(conf);
+
+read_again:
+	rdev = read_balance(conf, r10_bio, &max_sectors);
+	if (!rdev) {
+		raid_end_bio_io(r10_bio);
+		return;
+	}
+	slot = r10_bio->read_slot;
+
+	read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+	bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
+		 max_sectors);
+
+	r10_bio->devs[slot].bio = read_bio;
+	r10_bio->devs[slot].rdev = rdev;
+
+	read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
+		choose_data_offset(r10_bio, rdev);
+	read_bio->bi_bdev = rdev->bdev;
+	read_bio->bi_end_io = raid10_end_read_request;
+	bio_set_op_attrs(read_bio, op, do_sync);
+	if (test_bit(FailFast, &rdev->flags) &&
+	    test_bit(R10BIO_FailFast, &r10_bio->state))
+	        read_bio->bi_opf |= MD_FAILFAST;
+	read_bio->bi_private = r10_bio;
+
+	if (mddev->gendisk)
+	        trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
+	                              read_bio, disk_devt(mddev->gendisk),
+	                              r10_bio->sector);
+	if (max_sectors < r10_bio->sectors) {
+		/* Could not read all from this device, so we will
+		 * need another r10_bio.
+		 */
+		sectors_handled = (r10_bio->sector + max_sectors
+				   - bio->bi_iter.bi_sector);
+		r10_bio->sectors = max_sectors;
+		spin_lock_irq(&conf->device_lock);
+		if (bio->bi_phys_segments == 0)
+			bio->bi_phys_segments = 2;
+		else
+			bio->bi_phys_segments++;
+		spin_unlock_irq(&conf->device_lock);
+		/* Cannot call generic_make_request directly
+		 * as that will be queued in __generic_make_request
+		 * and subsequent mempool_alloc might block
+		 * waiting for it.  so hand bio over to raid10d.
+		 */
+		reschedule_retry(r10_bio);
+
+		r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+		r10_bio->master_bio = bio;
+		r10_bio->sectors = bio_sectors(bio) - sectors_handled;
+		r10_bio->state = 0;
+		r10_bio->mddev = mddev;
+		r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
+		goto read_again;
+	} else
+		generic_make_request(read_bio);
+	return;
+}
+
+static void raid10_write_request(struct mddev *mddev, struct bio *bio,
+				 struct r10bio *r10_bio)
+{
+	struct r10conf *conf = mddev->private;
 	int i;
 	const int op = bio_op(bio);
-	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
 	const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
 	unsigned long flags;
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
 	struct raid10_plug_cb *plug = NULL;
+	sector_t sectors;
 	int sectors_handled;
 	int max_sectors;
-	int sectors;
 
 	md_write_start(mddev, bio);
 
@@ -1130,7 +1204,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 		wait_barrier(conf);
 	}
 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
-	    bio_data_dir(bio) == WRITE &&
 	    (mddev->reshape_backwards
 	     ? (bio->bi_iter.bi_sector < conf->reshape_safe &&
 		bio->bi_iter.bi_sector + sectors > conf->reshape_progress)
@@ -1147,99 +1220,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 
 		conf->reshape_safe = mddev->reshape_position;
 	}
-
-	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
-	r10_bio->master_bio = bio;
-	r10_bio->sectors = sectors;
-
-	r10_bio->mddev = mddev;
-	r10_bio->sector = bio->bi_iter.bi_sector;
-	r10_bio->state = 0;
-
-	/* We might need to issue multiple reads to different
-	 * devices if there are bad blocks around, so we keep
-	 * track of the number of reads in bio->bi_phys_segments.
-	 * If this is 0, there is only one r10_bio and no locking
-	 * will be needed when the request completes.  If it is
-	 * non-zero, then it is the number of not-completed requests.
-	 */
-	bio->bi_phys_segments = 0;
-	bio_clear_flag(bio, BIO_SEG_VALID);
-
-	if (rw == READ) {
-		/*
-		 * read balancing logic:
-		 */
-		struct md_rdev *rdev;
-		int slot;
-
-read_again:
-		rdev = read_balance(conf, r10_bio, &max_sectors);
-		if (!rdev) {
-			raid_end_bio_io(r10_bio);
-			return;
-		}
-		slot = r10_bio->read_slot;
-
-		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
-			 max_sectors);
-
-		r10_bio->devs[slot].bio = read_bio;
-		r10_bio->devs[slot].rdev = rdev;
-
-		read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
-			choose_data_offset(r10_bio, rdev);
-		read_bio->bi_bdev = rdev->bdev;
-		read_bio->bi_end_io = raid10_end_read_request;
-		bio_set_op_attrs(read_bio, op, do_sync);
-		if (test_bit(FailFast, &rdev->flags) &&
-		    test_bit(R10BIO_FailFast, &r10_bio->state))
-			read_bio->bi_opf |= MD_FAILFAST;
-		read_bio->bi_private = r10_bio;
-
-		if (mddev->gendisk)
-			trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
-					      read_bio, disk_devt(mddev->gendisk),
-					      r10_bio->sector);
-		if (max_sectors < r10_bio->sectors) {
-			/* Could not read all from this device, so we will
-			 * need another r10_bio.
-			 */
-			sectors_handled = (r10_bio->sector + max_sectors
-					   - bio->bi_iter.bi_sector);
-			r10_bio->sectors = max_sectors;
-			spin_lock_irq(&conf->device_lock);
-			if (bio->bi_phys_segments == 0)
-				bio->bi_phys_segments = 2;
-			else
-				bio->bi_phys_segments++;
-			spin_unlock_irq(&conf->device_lock);
-			/* Cannot call generic_make_request directly
-			 * as that will be queued in __generic_make_request
-			 * and subsequent mempool_alloc might block
-			 * waiting for it.  so hand bio over to raid10d.
-			 */
-			reschedule_retry(r10_bio);
-
-			r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
-
-			r10_bio->master_bio = bio;
-			r10_bio->sectors = bio_sectors(bio) - sectors_handled;
-			r10_bio->state = 0;
-			r10_bio->mddev = mddev;
-			r10_bio->sector = bio->bi_iter.bi_sector +
-				sectors_handled;
-			goto read_again;
-		} else
-			generic_make_request(read_bio);
-		return;
-	}
-
-	/*
-	 * WRITE:
-	 */
 	if (conf->pending_count >= max_queued_requests) {
 		md_wakeup_thread(mddev->thread);
 		raid10_log(mddev, "wait queued");
@@ -1300,8 +1280,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 			int bad_sectors;
 			int is_bad;
 
-			is_bad = is_badblock(rdev, dev_sector,
-					     max_sectors,
+			is_bad = is_badblock(rdev, dev_sector, max_sectors,
 					     &first_bad, &bad_sectors);
 			if (is_bad < 0) {
 				/* Mustn't write here until the bad block
@@ -1405,8 +1384,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 			r10_bio->devs[i].bio = mbio;
 
 			mbio->bi_iter.bi_sector	= (r10_bio->devs[i].addr+
-					   choose_data_offset(r10_bio,
-							      rdev));
+					   choose_data_offset(r10_bio, rdev));
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
 			bio_set_op_attrs(mbio, op, do_sync | do_fua);
@@ -1457,8 +1435,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 			r10_bio->devs[i].repl_bio = mbio;
 
 			mbio->bi_iter.bi_sector	= (r10_bio->devs[i].addr +
-					   choose_data_offset(
-						   r10_bio, rdev));
+					   choose_data_offset(r10_bio, rdev));
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
 			bio_set_op_attrs(mbio, op, do_sync | do_fua);
@@ -1503,6 +1480,36 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 	one_write_done(r10_bio);
 }
 
+static void __make_request(struct mddev *mddev, struct bio *bio)
+{
+	struct r10conf *conf = mddev->private;
+	struct r10bio *r10_bio;
+
+	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+	r10_bio->master_bio = bio;
+	r10_bio->sectors = bio_sectors(bio);
+
+	r10_bio->mddev = mddev;
+	r10_bio->sector = bio->bi_iter.bi_sector;
+	r10_bio->state = 0;
+
+	/* We might need to issue multiple reads to different
+	 * devices if there are bad blocks around, so we keep
+	 * track of the number of reads in bio->bi_phys_segments.
+	 * If this is 0, there is only one r10_bio and no locking
+	 * will be needed when the request completes.  If it is
+	 * non-zero, then it is the number of not-completed requests.
+	 */
+	bio->bi_phys_segments = 0;
+	bio_clear_flag(bio, BIO_SEG_VALID);
+
+	if (bio_data_dir(bio) == READ)
+		raid10_read_request(mddev, bio, r10_bio);
+	else
+		raid10_write_request(mddev, bio, r10_bio);
+}
+
 static void raid10_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct r10conf *conf = mddev->private;
-- 
2.10.2


^ permalink raw reply related

* Re: [PATCH v3 0/2] Reorganize raid*_make_request to clean up code
From: Robert LeBlanc @ 2016-12-05 20:04 UTC (permalink / raw)
  To: linux-raid; +Cc: Robert LeBlanc
In-Reply-To: <20161205200258.6653-1-robert@leblancnet.us>

And also rebased against for-next.
----------------
Robert LeBlanc
PGP Fingerprint 79A2 9CA4 6CC4 45DD A904  C70E E654 3BB2 FA62 B9F1


On Mon, Dec 5, 2016 at 1:02 PM, Robert LeBlanc <robert@leblancnet.us> wrote:
> In response to Christoph, I've broken the read and writes into their own
> functions to make the code even cleaner. Since it is such a big change, I broke
> up the commits into this series instead of creating a v2 of the previous patch.
>
> Changes since v2:
>  Shaohua Li
>  * Make md_write_start before wait_barrier
>  * Move I/O in recovery stripe test to write path
>  * Changed to if/then instead of return in __make_request
>
> Changes since v1:
>
>  John Stoffel
>  * Changed to if/then instead of return in raid1_make_request
>
>  Neil Brown
>  * Moved wait_barrier into raid1_{read,write}_request so that it could be after
>    ->suspend_{hi,lo}. This prevents a write blocking a resync until the suspend
>    region is moved.
>
> Robert LeBlanc (2):
>   md/raid1: Refactor raid1_make_request
>   md/raid10: Refactor raid10_make_request
>
>  drivers/md/raid1.c  | 259 +++++++++++++++++++++++++++-------------------------
>  drivers/md/raid10.c | 215 ++++++++++++++++++++++---------------------
>  2 files changed, 245 insertions(+), 229 deletions(-)
>
> --
> 2.10.2
>

^ permalink raw reply

* Re: MD Remnants After --stop
From: Marc Smith @ 2016-12-05 21:37 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid
In-Reply-To: <87r35n5hsn.fsf@notabene.neil.brown.name>

On Sun, Dec 4, 2016 at 7:41 PM, NeilBrown <neilb@suse.com> wrote:
> On Sat, Dec 03 2016, Marc Smith wrote:
>
>> Finally, I got it! Why is it when I want it to break, it doesn't. =)
>
> welcome to my world :-)
>
>
>>
>> I will say, using the modified mdadm that prevents the synthesized
>> CHANGE event, it seems to not induce the problem as regularly.
>>
>> Below are the kernel logs after stopping an array:
>
> Thank you so much for persisting with this.
> The logs you provide make it clear that two separate processes (494 and
> 31178) increment the ->active count by opening the device, but never
> decrement that count by closing the device.
> It seems too unlikely that either process would be holding the
> file descriptor open indefinitely, so something must be going wrong
> either as part of 'open', or as part of 'close'.
>
> Now that I know where to look, the bug is obvious.  Why didn't I see
> that before?
>
> The open request is failing, almost certainly because MD_CLOSING is set,
> but the ->active count isn't being decremented on failure.
> This patch should fix it.
>
> Please test and report results.
>
> Thanks,
> NeilBrown
>
> Fixes: af8d8e6f0315 ("md: changes for MD_STILL_CLOSED flag" v4.9-rc1)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 2089d46b0eb8..a8e07eb2ca5f 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -7087,11 +7087,14 @@ static int md_open(struct block_device *bdev, fmode_t mode)
>         }
>         BUG_ON(mddev != bdev->bd_disk->private_data);
>
> -       if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
> +       if ((err = mutex_lock_interruptible(&mddev->open_mutex))) {
> +               mddev_put(mddev);
>                 goto out;
> +       }
>
>         if (test_bit(MD_CLOSING, &mddev->flags)) {
>                 mutex_unlock(&mddev->open_mutex);
> +               mddev_put(mddev);
>                 return -ENODEV;
>         }
>

That did the trick! I ran 'mdadm --stop' eight different times on two
nodes, and every time it was removed completely from /dev and
/sys/block just like expected. =)

Thanks for your effort and time on this. I think I read in the other
thread RE: this patch, it may make it into 4.9?


--Marc

^ permalink raw reply

* Re: [PATCH v2 6/7] Allow changing the RWH policy for a running array
From: Jes Sorensen @ 2016-12-05 21:50 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: linux-raid
In-Reply-To: <20161205153204.7343-7-artur.paszkiewicz@intel.com>

Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
> This extends the --rwh-policy parameter to work also in Misc mode. Using
> it changes the currently active RWH policy in the kernel driver and
> updates the metadata to make this change permanent. Updating metadata is
> not yet implemented for super1, so this is limited to IMSM for now.
>
> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>

Hi Artur,

It looked good all the way up until 6/7, but there is a nit here:

> ---
>  Manage.c      | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  mdadm.c       |  9 +++++++
>  mdadm.h       |  1 +
>  super-intel.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++-
>  4 files changed, 153 insertions(+), 1 deletion(-)
[snip]
> diff --git a/super-intel.c b/super-intel.c
> index e524ef0..3b40429 100644
> --- a/super-intel.c
> +++ b/super-intel.c
> @@ -448,6 +448,7 @@ enum imsm_update_type {
>  	update_general_migration_checkpoint,
>  	update_size_change,
>  	update_prealloc_badblocks_mem,
> +	update_rwh_policy,
>  };
>  
>  struct imsm_update_activate_spare {
> @@ -540,6 +541,12 @@ struct imsm_update_prealloc_bb_mem {
>  	enum imsm_update_type type;
>  };
>  
> +struct imsm_update_rwh_policy {
> +	enum imsm_update_type type;
> +	int new_policy;
> +	int dev_idx;
> +};
> +
>  static const char *_sys_dev_type[] = {
>  	[SYS_DEV_UNKNOWN] = "Unknown",
>  	[SYS_DEV_SAS] = "SAS",
> @@ -3175,7 +3182,6 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>  	info->custom_array_size   <<= 32;
>  	info->custom_array_size   |= __le32_to_cpu(dev->size_low);
>  	info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
> -	info->journal_clean = dev->rwh_policy;
>  
>  	if (is_gen_migration(dev)) {
>  		info->reshape_active = 1;
> @@ -3347,6 +3353,8 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>  			info->rwh_policy = RWH_POLICY_PPL;
>  		else
>  			info->rwh_policy = RWH_POLICY_UNKNOWN;
> +
> +		info->journal_clean = info->rwh_policy == RWH_POLICY_PPL;
>  	}
>  }

This part doesn't make sense, first you set info->rwh_policy based on
sb->feature_map to RWH_POLICY_PPL or RWH_POLICY_UNKNOWN and then right
after you hard set it to RWH_POLICY_PPL.

In general I really would prefer not to see any of those double
assignments if it can be avoided.

Cheers,
Jes

^ permalink raw reply

* Re: [PATCH] md: fix refcount problem on mddev when stopping array.
From: Shaohua Li @ 2016-12-06  0:39 UTC (permalink / raw)
  To: NeilBrown; +Cc: Guoqing Jiang, linux-raid, Marc Smith
In-Reply-To: <87h96j53x9.fsf@notabene.neil.brown.name>

On Mon, Dec 05, 2016 at 04:40:50PM +1100, Neil Brown wrote:
> 
> md_open() gets a counted reference on an mddev using mddev_find().
> If it ends up returning an error, it must drop this reference.
> 
> There are two error paths where the reference is not dropped.
> One only happens if the process is signalled and an awkward time,
> which is quite unlikely.
> The other was introduced recently in commit af8d8e6f0.
> 
> Change the code to ensure the drop the reference when returning an error,
> and make it harded to re-introduce this sort of bug in the future.
> 
> Reported-by: Marc Smith <marc.smith@mcc.edu>
> Fixes: af8d8e6f0315 ("md: changes for MD_STILL_CLOSED flag")
> Signed-off-by: NeilBrown <neilb@suse.com>
> ---
> 
> Hi Shaohua,
>  as this bug was introduced in v4.9-rc1, it would be great if this
>  patch could get to Linus before v4.9-final comes out (on Sunday?).

Applied to the for-next tree. This sounds not significant enough, so I'll push
it to 4.10.

Thanks,
Shaohua
 
> Thanks,
> NeilBrown
> 
> 
>  drivers/md/md.c | 5 ++++-
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 2089d46b0eb8..d1a291ac2a75 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -7092,7 +7092,8 @@ static int md_open(struct block_device *bdev, fmode_t mode)
>  
>  	if (test_bit(MD_CLOSING, &mddev->flags)) {
>  		mutex_unlock(&mddev->open_mutex);
> -		return -ENODEV;
> +		err = -ENODEV;
> +		goto out;
>  	}
>  
>  	err = 0;
> @@ -7101,6 +7102,8 @@ static int md_open(struct block_device *bdev, fmode_t mode)
>  
>  	check_disk_change(bdev);
>   out:
> +	if (err)
> +		mddev_put(mddev);
>  	return err;
>  }
>  
> -- 
> 2.10.2
> 



^ permalink raw reply

* Re: [PATCH 4/4] md/raid5-cache: adjust the write position of the empty block and mark it as a checkpoint
From: Shaohua Li @ 2016-12-06  1:00 UTC (permalink / raw)
  To: JackieLiu; +Cc: songliubraving, 刘正元, linux-raid
In-Reply-To: <D667B2F7-64DE-434D-B793-F4F514BB2714@kylinos.cn>

On Mon, Dec 05, 2016 at 11:58:53AM +0800, JackieLiu wrote:
> 
> > 在 2016年12月3日，04:10，Shaohua Li <shli@kernel.org> 写道：
> > 
> > confusing reading the code. Don't think a 'if () write_empty_block' makes the
> 
> Hi ShaoHua. 
> 
> I rewrote this part of the code, now without data_only_stripes, we write an empty block here,
> with data_only_stripes, mark the first cache block as last_checkpoint.

Next time please send formal patch and fix your mail client (it skews up tabs).
I manually applied this patch this time. Thanks!

Best Regards,
Shaohua

 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 9e72180..5697724 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -2072,7 +2072,6 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
>                 return -ENOMEM;
>         }
> 
> -       ctx->seq += 10;
>         list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
>                 struct r5l_meta_block *mb;
>                 int i;
> @@ -2132,6 +2131,8 @@ static int r5l_recovery_log(struct r5l_log *log)
>         struct mddev *mddev = log->rdev->mddev;
>         struct r5l_recovery_ctx ctx;
>         int ret;
> +       sector_t pos;
> +       struct stripe_head *sh;
> 
>         ctx.pos = log->last_checkpoint;
>         ctx.seq = log->last_cp_seq;
> @@ -2149,6 +2150,18 @@ static int r5l_recovery_log(struct r5l_log *log)
>         if (ret)
>                 return ret;
> 
> +       pos = ctx.pos;
> +       ctx.seq += 10;
> +
> +       if (ctx.data_only_stripes == 0) {
> +               log->next_checkpoint = ctx.pos;
> +               r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
> +               ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
> +       } else {
> +               sh = list_last_entry(&ctx.cached_list, struct stripe_head, lru);
> +               log->next_checkpoint = sh->log_start;
> +       }
> +
>         if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
>                 pr_debug("md/raid:%s: starting from clean shutdown\n",
>                          mdname(mddev));
> @@ -2166,10 +2179,9 @@ static int r5l_recovery_log(struct r5l_log *log)
>         }
> 
>         log->log_start = ctx.pos;
> -       log->next_checkpoint = ctx.pos;
>         log->seq = ctx.seq;
> -       r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq);
> -       r5l_write_super(log, ctx.pos);
> +       log->last_checkpoint = pos;
> +       r5l_write_super(log, pos);
>         return 0;
>  }
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v2 05/12] raid5-ppl: Partial Parity Log implementation
From: kbuild test robot @ 2016-12-06  1:06 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: kbuild-all, shli, linux-raid
In-Reply-To: <20161205153113.7268-6-artur.paszkiewicz@intel.com>

[-- Attachment #1: Type: text/plain, Size: 1574 bytes --]

Hi Artur,

[auto build test ERROR on next-20161205]
[cannot apply to v4.9-rc8 v4.9-rc7 v4.9-rc6 v4.9-rc8]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Artur-Paszkiewicz/Partial-Parity-Log-for-MD-RAID-5/20161206-081540
config: i386-randconfig-r0-201649 (attached as .config)
compiler: gcc-5 (Debian 5.4.1-2) 5.4.1 20160904
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   drivers/md/raid5-ppl.c: In function 'ppl_init_log_child':
>> drivers/md/raid5-ppl.c:459:2: error: too few arguments to function 'bio_init'
     bio_init(&log->flush_bio);
     ^
   In file included from include/linux/blkdev.h:19:0,
                    from drivers/md/raid5-ppl.c:16:
   include/linux/bio.h:426:13: note: declared here
    extern void bio_init(struct bio *bio, struct bio_vec *table,
                ^

vim +/bio_init +459 drivers/md/raid5-ppl.c

   453		spin_lock_init(&log->io_list_lock);
   454		INIT_LIST_HEAD(&log->running_ios);
   455		INIT_LIST_HEAD(&log->io_end_ios);
   456		INIT_LIST_HEAD(&log->flushing_ios);
   457		INIT_LIST_HEAD(&log->finished_ios);
   458		INIT_LIST_HEAD(&log->no_mem_stripes);
 > 459		bio_init(&log->flush_bio);
   460	
   461		log->io_kc = log_parent->io_kc;
   462		log->io_pool = log_parent->io_pool;

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 31114 bytes --]

^ permalink raw reply

* Re: [PATCH 1/2] md/r5cache: do r5c_update_log_state after log recovery
From: Shaohua Li @ 2016-12-06  1:10 UTC (permalink / raw)
  To: Zhengyuan Liu; +Cc: linux-raid
In-Reply-To: <1480841385-21180-1-git-send-email-liuzhengyuan@kylinos.cn>

On Sun, Dec 04, 2016 at 04:49:44PM +0800, Zhengyuan Liu wrote:
> We should update log state after we did a log recovery, current completion
> may get wrong log state since log->log_start wasn't initalized until we
> called r5l_recovery_log.
> 
> At log recovery stage, no lock needed as there is no race conditon.
> next_checkpoint field will be initialized in r5l_recovery_log too.

applied, thanks!
 
> Signed-off-by: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
> ---
>  drivers/md/raid5-cache.c | 8 +++-----
>  1 file changed, 3 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index fa3319c..07bce0e 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -2522,14 +2522,12 @@ static int r5l_load_log(struct r5l_log *log)
>  	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
>  		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
>  	log->last_checkpoint = cp;
> -	log->next_checkpoint = cp;
> -	mutex_lock(&log->io_mutex);
> -	r5c_update_log_state(log);
> -	mutex_unlock(&log->io_mutex);
>  
>  	__free_page(page);
>  
> -	return r5l_recovery_log(log);
> +	ret = r5l_recovery_log(log);
> +	r5c_update_log_state(log);
> +	return ret;
>  ioerr:
>  	__free_page(page);
>  	return ret;
> -- 
> 2.7.4
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH 1/2] md/r5cache: sh->log_start in recovery
From: Song Liu @ 2016-12-06  1:46 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuan,
	Song Liu

We only need to update sh->log_start at the end of recovery,
which is r5c_recovery_rewrite_data_only_stripes().

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index c3b3124..93f3310 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1681,8 +1681,7 @@ r5l_recovery_replay_one_stripe(struct r5conf *conf,
 
 static struct stripe_head *
 r5c_recovery_alloc_stripe(struct r5conf *conf,
-			  sector_t stripe_sect,
-			  sector_t log_start)
+			  sector_t stripe_sect)
 {
 	struct stripe_head *sh;
 
@@ -1691,7 +1690,6 @@ r5c_recovery_alloc_stripe(struct r5conf *conf,
 		return NULL;  /* no more stripe available */
 
 	r5l_recovery_reset_stripe(sh);
-	sh->log_start = log_start;
 
 	return sh;
 }
@@ -1861,7 +1859,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
 						stripe_sect);
 
 		if (!sh) {
-			sh = r5c_recovery_alloc_stripe(conf, stripe_sect, ctx->pos);
+			sh = r5c_recovery_alloc_stripe(conf, stripe_sect);
 			/*
 			 * cannot get stripe from raid5_get_active_stripe
 			 * try replay some stripes
@@ -1870,7 +1868,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
 				r5c_recovery_replay_stripes(
 					cached_stripe_list, ctx);
 				sh = r5c_recovery_alloc_stripe(
-					conf, stripe_sect, ctx->pos);
+					conf, stripe_sect);
 			}
 			if (!sh) {
 				pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",
@@ -1878,8 +1876,8 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
 					conf->min_nr_stripes * 2);
 				raid5_set_cache_size(mddev,
 						     conf->min_nr_stripes * 2);
-				sh = r5c_recovery_alloc_stripe(
-					conf, stripe_sect, ctx->pos);
+				sh = r5c_recovery_alloc_stripe(conf,
+							       stripe_sect);
 			}
 			if (!sh) {
 				pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
@@ -1893,7 +1891,6 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
 			if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
 			    test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
 				r5l_recovery_replay_one_stripe(conf, sh, ctx);
-				sh->log_start = ctx->pos;
 				list_move_tail(&sh->lru, cached_stripe_list);
 			}
 			r5l_recovery_load_data(log, sh, ctx, payload,
@@ -1932,8 +1929,6 @@ static void r5c_recovery_load_one_stripe(struct r5l_log *log,
 			set_bit(R5_UPTODATE, &dev->flags);
 		}
 	}
-	list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
-	atomic_inc(&log->stripe_in_journal_count);
 }
 
 /*
@@ -2121,6 +2116,8 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
 		sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
 			     REQ_OP_WRITE, WRITE_FUA, false);
 		sh->log_start = ctx->pos;
+		list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
+		atomic_inc(&log->stripe_in_journal_count);
 		ctx->pos = write_pos;
 		ctx->seq += 1;
 
-- 
2.9.3


^ permalink raw reply related

* [PATCH 2/2] md/r5cache: flush data only stripes in r5l_recovery_log()
From: Song Liu @ 2016-12-06  1:46 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuan,
	Song Liu
In-Reply-To: <20161206014657.3592902-1-songliubraving@fb.com>

When there is data only stripes in the journal, we flush them out in
r5l_recovery_log().

We need conf->log in r5l_load_log(), so we need to set it before calling
r5l_load_log(). If r5l_load_log() fails, we set conf->log back to NULL.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 22 ++++++++++++++++++++--
 drivers/md/raid5.c       |  8 +++++++-
 drivers/md/raid5.h       |  4 ++++
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 93f3310..519a680 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2131,10 +2131,12 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
 static int r5l_recovery_log(struct r5l_log *log)
 {
 	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
 	struct r5l_recovery_ctx ctx;
 	int ret;
 	sector_t pos;
 	struct stripe_head *sh;
+	unsigned long flags;
 
 	ctx.pos = log->last_checkpoint;
 	ctx.seq = log->last_cp_seq;
@@ -2172,12 +2174,26 @@ static int r5l_recovery_log(struct r5l_log *log)
 			 mdname(mddev), ctx.data_only_stripes,
 			 ctx.data_parity_stripes);
 
-		if (ctx.data_only_stripes > 0)
+		if (ctx.data_only_stripes > 0) {
+			log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;
 			if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
 				pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
 				       mdname(mddev));
 				return -EIO;
 			}
+
+			set_bit(R5C_PRE_INIT_FLUSH, &conf->cache_state);
+			spin_lock_irqsave(&conf->device_lock, flags);
+			r5c_flush_cache(conf, INT_MAX);
+			spin_unlock_irqrestore(&conf->device_lock, flags);
+			md_wakeup_thread(conf->mddev->thread);
+			wait_event(conf->wait_for_r5c_pre_init_flush,
+				   atomic_read(&conf->active_stripes) == 0 &&
+				   atomic_read(&conf->r5c_cached_full_stripes) == 0 &&
+				   atomic_read(&conf->r5c_cached_partial_stripes) == 0);
+			clear_bit(R5C_PRE_INIT_FLUSH, &conf->cache_state);
+			log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
+		}
 	}
 
 	log->log_start = ctx.pos;
@@ -2628,14 +2644,16 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	spin_lock_init(&log->stripe_in_journal_lock);
 	atomic_set(&log->stripe_in_journal_count, 0);
 
+	rcu_assign_pointer(conf->log, log);
+
 	if (r5l_load_log(log))
 		goto error;
 
-	rcu_assign_pointer(conf->log, log);
 	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
 	return 0;
 
 error:
+	rcu_assign_pointer(conf->log, NULL);
 	md_unregister_thread(&log->reclaim_thread);
 reclaim_thread:
 	mempool_destroy(log->meta_pool);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6bf3c26..279f213 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -232,7 +232,9 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 	 * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
 	 * data in journal, so they are not released to cached lists
 	 */
-	if (conf->quiesce && r5c_is_writeback(conf->log) &&
+	if ((conf->quiesce ||
+	     test_bit(R5C_PRE_INIT_FLUSH, &conf->cache_state)) &&
+	    r5c_is_writeback(conf->log) &&
 	    !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
 		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
 			r5c_make_stripe_write_out(sh);
@@ -264,6 +266,9 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 			    < IO_THRESHOLD)
 				md_wakeup_thread(conf->mddev->thread);
 		atomic_dec(&conf->active_stripes);
+		if (test_bit(R5C_PRE_INIT_FLUSH, &conf->cache_state))
+		    	wake_up(&sh->raid_conf->wait_for_r5c_pre_init_flush);
+
 		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
 			if (!r5c_is_writeback(conf->log))
 				list_add_tail(&sh->lru, temp_inactive_list);
@@ -6638,6 +6643,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	init_waitqueue_head(&conf->wait_for_quiescent);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
+	init_waitqueue_head(&conf->wait_for_r5c_pre_init_flush);
 	INIT_LIST_HEAD(&conf->handle_list);
 	INIT_LIST_HEAD(&conf->hold_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ed8e136..b39fe46 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -564,6 +564,9 @@ enum r5_cache_state {
 	R5C_EXTRA_PAGE_IN_USE,	/* a stripe is using disk_info.extra_page
 				 * for prexor
 				 */
+	R5C_PRE_INIT_FLUSH,	/* flushing data only stripes recovered from
+				 * the journal
+				 */
 };
 
 struct r5conf {
@@ -679,6 +682,7 @@ struct r5conf {
 	int			group_cnt;
 	int			worker_cnt_per_group;
 	struct r5l_log		*log;
+	wait_queue_head_t	wait_for_r5c_pre_init_flush;
 };
 
 
-- 
2.9.3


^ permalink raw reply related

* Re: [PATCH 1/2] md/r5cache: sh->log_start in recovery
From: JackieLiu @ 2016-12-06  4:10 UTC (permalink / raw)
  To: Song Liu
  Cc: linux-raid, neilb, shli, kernel-team, dan.j.williams, hch,
	刘正元
In-Reply-To: <20161206014657.3592902-1-songliubraving@fb.com>


> 在 2016年12月6日，09:46，Song Liu <songliubraving@fb.com> 写道：
> 
> We only need to update sh->log_start at the end of recovery,
> which is r5c_recovery_rewrite_data_only_stripes().
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
> drivers/md/raid5-cache.c | 17 +++++++----------
> 1 file changed, 7 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index c3b3124..93f3310 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -1681,8 +1681,7 @@ r5l_recovery_replay_one_stripe(struct r5conf *conf,
> 
> static struct stripe_head *
> r5c_recovery_alloc_stripe(struct r5conf *conf,
> -			  sector_t stripe_sect,
> -			  sector_t log_start)
> +			  sector_t stripe_sect)
> {
> 	struct stripe_head *sh;
> 
> @@ -1691,7 +1690,6 @@ r5c_recovery_alloc_stripe(struct r5conf *conf,
> 		return NULL;  /* no more stripe available */
> 
> 	r5l_recovery_reset_stripe(sh);
> -	sh->log_start = log_start;

Hi Song, 
the sh->log_start is not only used in r5c_recovery_rewrite_data_only_stripes function, in my new patch, 
https://git.kernel.org/cgit/linux/kernel/git/shli/md.git/tree/drivers/md/raid5-cache.c?h=for-next&id=43b9674832cc41ad0ad7b7e2ec397e47dcd5f6c3#n2167
Also be used. 

Thanks 
Jackie

> 
> 	return sh;
> }
> @@ -1861,7 +1859,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
> 						stripe_sect);
> 
> 		if (!sh) {
> -			sh = r5c_recovery_alloc_stripe(conf, stripe_sect, ctx->pos);
> +			sh = r5c_recovery_alloc_stripe(conf, stripe_sect);
> 			/*
> 			 * cannot get stripe from raid5_get_active_stripe
> 			 * try replay some stripes
> @@ -1870,7 +1868,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
> 				r5c_recovery_replay_stripes(
> 					cached_stripe_list, ctx);
> 				sh = r5c_recovery_alloc_stripe(
> -					conf, stripe_sect, ctx->pos);
> +					conf, stripe_sect);
> 			}
> 			if (!sh) {
> 				pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",
> @@ -1878,8 +1876,8 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
> 					conf->min_nr_stripes * 2);
> 				raid5_set_cache_size(mddev,
> 						     conf->min_nr_stripes * 2);
> -				sh = r5c_recovery_alloc_stripe(
> -					conf, stripe_sect, ctx->pos);
> +				sh = r5c_recovery_alloc_stripe(conf,
> +							       stripe_sect);
> 			}
> 			if (!sh) {
> 				pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
> @@ -1893,7 +1891,6 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
> 			if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
> 			    test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
> 				r5l_recovery_replay_one_stripe(conf, sh, ctx);
> -				sh->log_start = ctx->pos;
> 				list_move_tail(&sh->lru, cached_stripe_list);
> 			}
> 			r5l_recovery_load_data(log, sh, ctx, payload,
> @@ -1932,8 +1929,6 @@ static void r5c_recovery_load_one_stripe(struct r5l_log *log,
> 			set_bit(R5_UPTODATE, &dev->flags);
> 		}
> 	}
> -	list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
> -	atomic_inc(&log->stripe_in_journal_count);
> }
> 
> /*
> @@ -2121,6 +2116,8 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
> 		sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
> 			     REQ_OP_WRITE, WRITE_FUA, false);
> 		sh->log_start = ctx->pos;
> +		list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
> +		atomic_inc(&log->stripe_in_journal_count);
> 		ctx->pos = write_pos;
> 		ctx->seq += 1;
> 
> -- 
> 2.9.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html




^ permalink raw reply

* [PATCH] md/raid5-cache: no recovery is required when create super-block
From: JackieLiu @ 2016-12-06  6:13 UTC (permalink / raw)
  To: songliubraving; +Cc: liuzhengyuan, shli, linux-raid, JackieLiu

When create the super-block information, We do not need to do this
recovery stage, only need to initialize some variables.
---
 drivers/md/raid5-cache.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index c3b3124..96f25df 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2545,7 +2545,13 @@ static int r5l_load_log(struct r5l_log *log)
 
 	__free_page(page);
 
-	ret = r5l_recovery_log(log);
+	if (create_super) {
+		log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS);
+		log->seq = log->last_cp_seq + 1;
+		log->next_checkpoint = cp;
+	} else
+		ret = r5l_recovery_log(log);
+
 	r5c_update_log_state(log);
 	return ret;
 ioerr:
-- 
2.10.2




^ permalink raw reply related

* Re: [PATCH v2 6/7] Allow changing the RWH policy for a running array
From: Artur Paszkiewicz @ 2016-12-06  7:43 UTC (permalink / raw)
  To: Jes Sorensen; +Cc: linux-raid
In-Reply-To: <wrfjd1h69hbn.fsf@redhat.com>

On 12/05/2016 10:50 PM, Jes Sorensen wrote:
> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>> This extends the --rwh-policy parameter to work also in Misc mode. Using
>> it changes the currently active RWH policy in the kernel driver and
>> updates the metadata to make this change permanent. Updating metadata is
>> not yet implemented for super1, so this is limited to IMSM for now.
>>
>> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
> 
> Hi Artur,
> 
> It looked good all the way up until 6/7, but there is a nit here:
> 
>> ---
>>  Manage.c      | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  mdadm.c       |  9 +++++++
>>  mdadm.h       |  1 +
>>  super-intel.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++-
>>  4 files changed, 153 insertions(+), 1 deletion(-)
> [snip]
>> diff --git a/super-intel.c b/super-intel.c
>> index e524ef0..3b40429 100644
>> --- a/super-intel.c
>> +++ b/super-intel.c
>> @@ -448,6 +448,7 @@ enum imsm_update_type {
>>  	update_general_migration_checkpoint,
>>  	update_size_change,
>>  	update_prealloc_badblocks_mem,
>> +	update_rwh_policy,
>>  };
>>  
>>  struct imsm_update_activate_spare {
>> @@ -540,6 +541,12 @@ struct imsm_update_prealloc_bb_mem {
>>  	enum imsm_update_type type;
>>  };
>>  
>> +struct imsm_update_rwh_policy {
>> +	enum imsm_update_type type;
>> +	int new_policy;
>> +	int dev_idx;
>> +};
>> +
>>  static const char *_sys_dev_type[] = {
>>  	[SYS_DEV_UNKNOWN] = "Unknown",
>>  	[SYS_DEV_SAS] = "SAS",
>> @@ -3175,7 +3182,6 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>>  	info->custom_array_size   <<= 32;
>>  	info->custom_array_size   |= __le32_to_cpu(dev->size_low);
>>  	info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
>> -	info->journal_clean = dev->rwh_policy;
>>  
>>  	if (is_gen_migration(dev)) {
>>  		info->reshape_active = 1;
>> @@ -3347,6 +3353,8 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>>  			info->rwh_policy = RWH_POLICY_PPL;
>>  		else
>>  			info->rwh_policy = RWH_POLICY_UNKNOWN;
>> +
>> +		info->journal_clean = info->rwh_policy == RWH_POLICY_PPL;
>>  	}
>>  }
> 
> This part doesn't make sense, first you set info->rwh_policy based on
> sb->feature_map to RWH_POLICY_PPL or RWH_POLICY_UNKNOWN and then right
> after you hard set it to RWH_POLICY_PPL.
> 
> In general I really would prefer not to see any of those double
> assignments if it can be avoided.

This isn't a double assignment, there is a '==' there. I'm setting
info->journal_clean to true only if the policy is PPL. I'm not sure how
this change ended up in this patch, it was supposed to go to 5/7. I must
have overlooked it.

Artur

^ permalink raw reply

* Re: [PATCH v2 6/7] Allow changing the RWH policy for a running array
From: Jes Sorensen @ 2016-12-06 14:30 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: linux-raid
In-Reply-To: <5c74d40c-36c4-97ae-9a67-be533d6abfb1@intel.com>

Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
> On 12/05/2016 10:50 PM, Jes Sorensen wrote:
>> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>>> This extends the --rwh-policy parameter to work also in Misc mode. Using
>>> it changes the currently active RWH policy in the kernel driver and
>>> updates the metadata to make this change permanent. Updating metadata is
>>> not yet implemented for super1, so this is limited to IMSM for now.
>>>
>>> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
>> 
>> Hi Artur,
>> 
>> It looked good all the way up until 6/7, but there is a nit here:
>> 
>>> ---
>>>  Manage.c | 79
>>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  mdadm.c       |  9 +++++++
>>>  mdadm.h       |  1 +
>>>  super-intel.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++-
>>>  4 files changed, 153 insertions(+), 1 deletion(-)
>> [snip]
>>> diff --git a/super-intel.c b/super-intel.c
>>> index e524ef0..3b40429 100644
>>> --- a/super-intel.c
>>> +++ b/super-intel.c
>>> @@ -448,6 +448,7 @@ enum imsm_update_type {
>>>  	update_general_migration_checkpoint,
>>>  	update_size_change,
>>>  	update_prealloc_badblocks_mem,
>>> +	update_rwh_policy,
>>>  };
>>>  
>>>  struct imsm_update_activate_spare {
>>> @@ -540,6 +541,12 @@ struct imsm_update_prealloc_bb_mem {
>>>  	enum imsm_update_type type;
>>>  };
>>>  
>>> +struct imsm_update_rwh_policy {
>>> +	enum imsm_update_type type;
>>> +	int new_policy;
>>> +	int dev_idx;
>>> +};
>>> +
>>>  static const char *_sys_dev_type[] = {
>>>  	[SYS_DEV_UNKNOWN] = "Unknown",
>>>  	[SYS_DEV_SAS] = "SAS",
>>> @@ -3175,7 +3182,6 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>>>  	info->custom_array_size   <<= 32;
>>>  	info->custom_array_size   |= __le32_to_cpu(dev->size_low);
>>>  	info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
>>> -	info->journal_clean = dev->rwh_policy;
>>>  
>>>  	if (is_gen_migration(dev)) {
>>>  		info->reshape_active = 1;
>>> @@ -3347,6 +3353,8 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>>>  			info->rwh_policy = RWH_POLICY_PPL;
>>>  		else
>>>  			info->rwh_policy = RWH_POLICY_UNKNOWN;
>>> +
>>> +		info->journal_clean = info->rwh_policy == RWH_POLICY_PPL;
>>>  	}
>>>  }
>> 
>> This part doesn't make sense, first you set info->rwh_policy based on
>> sb->feature_map to RWH_POLICY_PPL or RWH_POLICY_UNKNOWN and then right
>> after you hard set it to RWH_POLICY_PPL.
>> 
>> In general I really would prefer not to see any of those double
>> assignments if it can be avoided.
>
> This isn't a double assignment, there is a '==' there. I'm setting
> info->journal_clean to true only if the policy is PPL. I'm not sure how
> this change ended up in this patch, it was supposed to go to 5/7. I must
> have overlooked it.

Argh you're right, code obfuscation at it's finest - if this is meant to
be in 5/7 do you want to respin the two?

In addition why not put the info->journal_clean assignments up together
with the info->rhw_policy assignments? Would make it a lot easier to
read without making my mistake :)

cheers,
Jes

^ permalink raw reply

* Re: [PATCH v2 6/7] Allow changing the RWH policy for a running array
From: Artur Paszkiewicz @ 2016-12-06 14:50 UTC (permalink / raw)
  To: Jes Sorensen; +Cc: linux-raid
In-Reply-To: <wrfj1sxl870h.fsf@redhat.com>

On 12/06/2016 03:30 PM, Jes Sorensen wrote:
> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>> On 12/05/2016 10:50 PM, Jes Sorensen wrote:
>>> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>>>> This extends the --rwh-policy parameter to work also in Misc mode. Using
>>>> it changes the currently active RWH policy in the kernel driver and
>>>> updates the metadata to make this change permanent. Updating metadata is
>>>> not yet implemented for super1, so this is limited to IMSM for now.
>>>>
>>>> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
>>>
>>> Hi Artur,
>>>
>>> It looked good all the way up until 6/7, but there is a nit here:
>>>
>>>> ---
>>>>  Manage.c | 79
>>>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  mdadm.c       |  9 +++++++
>>>>  mdadm.h       |  1 +
>>>>  super-intel.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++-
>>>>  4 files changed, 153 insertions(+), 1 deletion(-)
>>> [snip]
>>>> diff --git a/super-intel.c b/super-intel.c
>>>> index e524ef0..3b40429 100644
>>>> --- a/super-intel.c
>>>> +++ b/super-intel.c
>>>> @@ -448,6 +448,7 @@ enum imsm_update_type {
>>>>  	update_general_migration_checkpoint,
>>>>  	update_size_change,
>>>>  	update_prealloc_badblocks_mem,
>>>> +	update_rwh_policy,
>>>>  };
>>>>  
>>>>  struct imsm_update_activate_spare {
>>>> @@ -540,6 +541,12 @@ struct imsm_update_prealloc_bb_mem {
>>>>  	enum imsm_update_type type;
>>>>  };
>>>>  
>>>> +struct imsm_update_rwh_policy {
>>>> +	enum imsm_update_type type;
>>>> +	int new_policy;
>>>> +	int dev_idx;
>>>> +};
>>>> +
>>>>  static const char *_sys_dev_type[] = {
>>>>  	[SYS_DEV_UNKNOWN] = "Unknown",
>>>>  	[SYS_DEV_SAS] = "SAS",
>>>> @@ -3175,7 +3182,6 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>>>>  	info->custom_array_size   <<= 32;
>>>>  	info->custom_array_size   |= __le32_to_cpu(dev->size_low);
>>>>  	info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
>>>> -	info->journal_clean = dev->rwh_policy;
>>>>  
>>>>  	if (is_gen_migration(dev)) {
>>>>  		info->reshape_active = 1;
>>>> @@ -3347,6 +3353,8 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>>>>  			info->rwh_policy = RWH_POLICY_PPL;
>>>>  		else
>>>>  			info->rwh_policy = RWH_POLICY_UNKNOWN;
>>>> +
>>>> +		info->journal_clean = info->rwh_policy == RWH_POLICY_PPL;
>>>>  	}
>>>>  }
>>>
>>> This part doesn't make sense, first you set info->rwh_policy based on
>>> sb->feature_map to RWH_POLICY_PPL or RWH_POLICY_UNKNOWN and then right
>>> after you hard set it to RWH_POLICY_PPL.
>>>
>>> In general I really would prefer not to see any of those double
>>> assignments if it can be avoided.
>>
>> This isn't a double assignment, there is a '==' there. I'm setting
>> info->journal_clean to true only if the policy is PPL. I'm not sure how
>> this change ended up in this patch, it was supposed to go to 5/7. I must
>> have overlooked it.
> 
> Argh you're right, code obfuscation at it's finest - if this is meant to
> be in 5/7 do you want to respin the two?
> 
> In addition why not put the info->journal_clean assignments up together
> with the info->rhw_policy assignments? Would make it a lot easier to
> read without making my mistake :)

Sure, I can change that :) If you agree with the rest I'll just resend
those two patches.

Artur

^ permalink raw reply

* Re: [PATCH v2 6/7] Allow changing the RWH policy for a running array
From: Jes Sorensen @ 2016-12-06 15:08 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: linux-raid
In-Reply-To: <e90fade8-b904-7834-880d-61e1634e5fed@intel.com>

Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
> On 12/06/2016 03:30 PM, Jes Sorensen wrote:
>> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>>> On 12/05/2016 10:50 PM, Jes Sorensen wrote:
>>>> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>>>>> This extends the --rwh-policy parameter to work also in Misc mode. Using
>>>>> it changes the currently active RWH policy in the kernel driver and
>>>>> updates the metadata to make this change permanent. Updating metadata is
>>>>> not yet implemented for super1, so this is limited to IMSM for now.
>>>>>
>>>>> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
>>>>
>>>> Hi Artur,
>>>>
>>>> It looked good all the way up until 6/7, but there is a nit here:
>>>>
>>>>> ---
>>>>>  Manage.c | 79
>>>>> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>  mdadm.c       |  9 +++++++
>>>>>  mdadm.h       |  1 +
>>>>>  super-intel.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++-
>>>>>  4 files changed, 153 insertions(+), 1 deletion(-)
>>>> [snip]
>>>>> diff --git a/super-intel.c b/super-intel.c
>>>>> index e524ef0..3b40429 100644
>>>>> --- a/super-intel.c
>>>>> +++ b/super-intel.c
>>>>> @@ -448,6 +448,7 @@ enum imsm_update_type {
>>>>>  	update_general_migration_checkpoint,
>>>>>  	update_size_change,
>>>>>  	update_prealloc_badblocks_mem,
>>>>> +	update_rwh_policy,
>>>>>  };
>>>>>  
>>>>>  struct imsm_update_activate_spare {
>>>>> @@ -540,6 +541,12 @@ struct imsm_update_prealloc_bb_mem {
>>>>>  	enum imsm_update_type type;
>>>>>  };
>>>>>  
>>>>> +struct imsm_update_rwh_policy {
>>>>> +	enum imsm_update_type type;
>>>>> +	int new_policy;
>>>>> +	int dev_idx;
>>>>> +};
>>>>> +
>>>>>  static const char *_sys_dev_type[] = {
>>>>>  	[SYS_DEV_UNKNOWN] = "Unknown",
>>>>>  	[SYS_DEV_SAS] = "SAS",
>>>>> @@ -3175,7 +3182,6 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>>>>>  	info->custom_array_size   <<= 32;
>>>>>  	info->custom_array_size   |= __le32_to_cpu(dev->size_low);
>>>>>  	info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
>>>>> -	info->journal_clean = dev->rwh_policy;
>>>>>  
>>>>>  	if (is_gen_migration(dev)) {
>>>>>  		info->reshape_active = 1;
>>>>> @@ -3347,6 +3353,8 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
>>>>>  			info->rwh_policy = RWH_POLICY_PPL;
>>>>>  		else
>>>>>  			info->rwh_policy = RWH_POLICY_UNKNOWN;
>>>>> +
>>>>> +		info->journal_clean = info->rwh_policy == RWH_POLICY_PPL;
>>>>>  	}
>>>>>  }
>>>>
>>>> This part doesn't make sense, first you set info->rwh_policy based on
>>>> sb->feature_map to RWH_POLICY_PPL or RWH_POLICY_UNKNOWN and then right
>>>> after you hard set it to RWH_POLICY_PPL.
>>>>
>>>> In general I really would prefer not to see any of those double
>>>> assignments if it can be avoided.
>>>
>>> This isn't a double assignment, there is a '==' there. I'm setting
>>> info->journal_clean to true only if the policy is PPL. I'm not sure how
>>> this change ended up in this patch, it was supposed to go to 5/7. I must
>>> have overlooked it.
>> 
>> Argh you're right, code obfuscation at it's finest - if this is meant to
>> be in 5/7 do you want to respin the two?
>> 
>> In addition why not put the info->journal_clean assignments up together
>> with the info->rhw_policy assignments? Would make it a lot easier to
>> read without making my mistake :)
>
> Sure, I can change that :) If you agree with the rest I'll just resend
> those two patches.

Yes, that would be perfect.

Thanks,
Jes

^ permalink raw reply

* Re: [PATCH v2 6/7] Allow changing the RWH policy for a running array
From: Artur Paszkiewicz @ 2016-12-06 15:30 UTC (permalink / raw)
  To: Jes Sorensen; +Cc: linux-raid
In-Reply-To: <wrfja8c96qo5.fsf@redhat.com>

On 12/06/2016 04:08 PM, Jes Sorensen wrote:
> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>> Sure, I can change that :) If you agree with the rest I'll just resend
>> those two patches.
> 
> Yes, that would be perfect.

OK, but just in case - if you are going to apply them maybe wait until
Shaohua comments on the kernel patches. These mdadm patches won't be any
good if the kernel part doesn't get accepted.

Thanks,
Artur

^ permalink raw reply

* Re: [PATCH v2 6/7] Allow changing the RWH policy for a running array
From: Jes Sorensen @ 2016-12-06 15:38 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: linux-raid
In-Reply-To: <c21e1560-ca0d-67c8-83ca-04d275698d8b@intel.com>

Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
> On 12/06/2016 04:08 PM, Jes Sorensen wrote:
>> Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
>>> Sure, I can change that :) If you agree with the rest I'll just resend
>>> those two patches.
>> 
>> Yes, that would be perfect.
>
> OK, but just in case - if you are going to apply them maybe wait until
> Shaohua comments on the kernel patches. These mdadm patches won't be any
> good if the kernel part doesn't get accepted.

I can cope with that :) I was trying to get in sync as I felt I was
falling behind the last couple of weeks. If I drop the ball or miss it,
please feel free to throw heavy objects at me.

Cheers,
Jes

^ permalink raw reply

* Re: [PATCH v2 00/12] Partial Parity Log for MD RAID 5
From: NeilBrown @ 2016-12-07  0:32 UTC (permalink / raw)
  To: Artur Paszkiewicz, shli; +Cc: linux-raid
In-Reply-To: <20161205153113.7268-1-artur.paszkiewicz@intel.com>

[-- Attachment #1: Type: text/plain, Size: 1768 bytes --]

On Tue, Dec 06 2016, Artur Paszkiewicz wrote:

> This series of patches implements the Partial Parity Log for RAID5 arrays. The
> purpose of this feature is closing the RAID Write Hole. It is a solution
> alternative to the existing raid5-cache, but the implementation is based on it
> and reuses some of the code by introducing support for interchangeable
> policies. This allows decoupling policy from mechanism and not adding more
> boilerplate code in raid5.c.
>
> The issue addressed by PPL is, that on a dirty shutdown, parity for a
> particular stripe may be inconsistent with data on other member disks. In
> degraded state, there is no way to recalculate parity, because one of the disks
> is missing. PPL addresses this issue and allows recalculating the parity. It
> stores only enough data needed for recovering from RWH and is not a true
> journal, like the raid5-cache implementation. It does not protect from losing
> in-flight data.
>
> PPL is a distributed log - data is stored on all RAID member drives in the
> metadata area. It does not need a dedicated journaling drive. Performance is
> reduced by up to 30%-40% but it scales with the number of drives in the array
> and the journaling drive does not become a bottleneck.

I would expect to see as description of what a PPL actually is and how
it works here... but there is none.

The change-log for patch 06 has a tiny bit more information which is
just enough to be able to start trying to understand the code, but it
isn't much.
And none of this description gets into the code, or into the
Documentation/.  This makes it hard to review and hard to maintain.

Remember: if you want people to review you code, it is in your interest
to make it easy.  That means give lots of details.

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox