Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [PATCH 3/7] imsm: PPL support
From: Artur Paszkiewicz @ 2016-11-24 12:29 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20161124122952.16529-1-artur.paszkiewicz@intel.com>

Enable creating and assembling IMSM raid5 arrays with PPL.

Write the IMSM MPB location of each device to the newly added rdev
'sb_start' sysfs attribute, and write 'journal_ppl' to the 'state'
attribute of every active member.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 mdadm.h       |  1 +
 super-intel.c | 33 +++++++++++++++++++++++++++++++++
 sysfs.c       |  4 ++++
 3 files changed, 38 insertions(+)

diff --git a/mdadm.h b/mdadm.h
index 4eabf59..5600341 100755
--- a/mdadm.h
+++ b/mdadm.h
@@ -252,6 +252,7 @@ struct mdinfo {
 	unsigned long long	custom_array_size; /* size for non-default sized
 						    * arrays (in sectors)
 						    */
+	unsigned long long	sb_start;
 #define NO_RESHAPE		0
 #define VOLUME_RESHAPE		1
 #define CONTAINER_RESHAPE	2
diff --git a/super-intel.c b/super-intel.c
index df09272..79a3d78 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -1261,6 +1261,15 @@ static void print_imsm_dev(struct intel_super *super,
 	}
 	printf("\n");
 	printf("    Dirty State : %s\n", dev->vol.dirty ? "dirty" : "clean");
+	printf("     RWH Policy : ");
+	if (dev->rwh_policy == RWH_OFF)
+		printf("off\n");
+	else if (dev->rwh_policy == RWH_DISTRIBUTED)
+		printf("PPL distributed\n");
+	else if (dev->rwh_policy == RWH_JOURNALING_DRIVE)
+		printf("PPL journaling drive\n");
+	else
+		printf("<unknown:%d>\n", dev->rwh_policy);
 }
 
 static void print_imsm_disk(struct imsm_disk *disk, int index, __u32 reserved)
@@ -3043,6 +3052,15 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
 			}
 		}
 	}
+
+	if (info->array.level == 5) {
+		if (dev->rwh_policy == RWH_OFF)
+			info->rwh_policy = RWH_POLICY_OFF;
+		else if (dev->rwh_policy == RWH_DISTRIBUTED)
+			info->rwh_policy = RWH_POLICY_PPL;
+		else
+			info->rwh_policy = RWH_POLICY_UNKNOWN;
+	}
 }
 
 static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev,
@@ -3177,6 +3195,9 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *
 
 		disk = &super->disks->disk;
 		info->data_offset = total_blocks(&super->disks->disk) - reserved;
+		/* mpb anchor sector - see store_imsm_mpb() */
+		info->sb_start = total_blocks(&super->disks->disk) -
+				 ((2 * super->sector_size) >> 9);
 		info->component_size = reserved;
 		info->disk.state  = is_configured(disk) ? (1 << MD_DISK_ACTIVE) : 0;
 		/* we don't change info->disk.raid_disk here because
@@ -5034,6 +5055,17 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
 	}
 	mpb->num_raid_devs++;
 
+	if (s->rwh_policy == UnSet || s->rwh_policy == RWH_POLICY_OFF) {
+		dev->rwh_policy = RWH_OFF;
+	} else if (s->rwh_policy == RWH_POLICY_PPL) {
+		dev->rwh_policy = RWH_DISTRIBUTED;
+	} else {
+		free(dev);
+		free(dv);
+		pr_err("imsm supports only PPL RWH Policy\n");
+		return 0;
+	}
+
 	dv->dev = dev;
 	dv->index = super->current_vol;
 	dv->next = super->devlist;
@@ -11061,6 +11093,7 @@ struct superswitch super_imsm = {
 	.container_content = container_content_imsm,
 	.validate_container = validate_container_imsm,
 
+	.supports_ppl	= 1,
 	.external	= 1,
 	.name = "imsm",
 
diff --git a/sysfs.c b/sysfs.c
index 4772d77..b4437a3 100644
--- a/sysfs.c
+++ b/sysfs.c
@@ -732,7 +732,11 @@ int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume)
 			rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk);
 		if (resume)
 			sysfs_set_num(sra, sd, "recovery_start", sd->recovery_start);
+		if (sra->rwh_policy == RWH_POLICY_PPL &&
+		    (sd->recovery_start == MaxSector || !resume))
+			sysfs_set_str(sra, sd, "state", "journal_ppl");
 	}
+	sysfs_set_num(sra, sd, "sb_start", sd->sb_start);
 	return rv;
 }
 
-- 
2.10.1


^ permalink raw reply related

* [PATCH 2/7] Generic support for --rwh-policy and PPL
From: Artur Paszkiewicz @ 2016-11-24 12:29 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20161124122952.16529-1-artur.paszkiewicz@intel.com>

Add a new parameter to mdadm: --rwh-policy=. It can be used to create a
raid5 array using PPL. Add the necessary plumbing to pass this option to
metadata handlers. The write journal functionality is treated as a
different RWH policy, which is implicitly selected when using
--write-journal.

Show the currently enabled RWH policy type in the output from
mdadm --detail. The value is retrieved from the array's sysfs attribute
'rwh_policy'.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 Create.c      |  7 ++++++-
 Detail.c      | 18 ++++++++++++++++--
 Kill.c        |  2 +-
 ReadMe.c      |  1 +
 maps.c        |  7 +++++++
 mdadm.c       | 35 ++++++++++++++++++++++++++++++++---
 mdadm.h       | 15 +++++++++++++--
 super-ddf.c   |  4 ++--
 super-intel.c | 12 ++++++------
 super0.c      |  6 +++---
 super1.c      |  4 ++--
 sysfs.c       | 11 +++++++++++
 12 files changed, 100 insertions(+), 22 deletions(-)

diff --git a/Create.c b/Create.c
index 1594a39..52e7e2b 100644
--- a/Create.c
+++ b/Create.c
@@ -594,6 +594,11 @@ int Create(struct supertype *st, char *mddev,
 		return 1;
 	}
 
+	if (s->rwh_policy == RWH_POLICY_PPL && !st->ss->supports_ppl) {
+		pr_err("%s metadata does not support PPL\n", st->ss->name);
+		return 1;
+	}
+
 	/* We need to create the device */
 	map_lock(&map);
 	mdfd = create_mddev(mddev, name, c->autof, LOCAL, chosen_name);
@@ -720,7 +725,7 @@ int Create(struct supertype *st, char *mddev,
 				name += 2;
 		}
 	}
-	if (!st->ss->init_super(st, &info.array, s->size, name, c->homehost, uuid,
+	if (!st->ss->init_super(st, &info.array, s, name, c->homehost, uuid,
 				data_offset))
 		goto abort_locked;
 
diff --git a/Detail.c b/Detail.c
index 925e479..d79503d 100644
--- a/Detail.c
+++ b/Detail.c
@@ -504,15 +504,29 @@ int Detail(char *dev, struct context *c)
 		case 10:
 		case 6:
 			if (array.chunk_size)
-				printf("     Chunk Size : %dK\n\n",
+				printf("     Chunk Size : %dK\n",
 				       array.chunk_size/1024);
 			break;
 		case -1:
-			printf("       Rounding : %dK\n\n", array.chunk_size/1024);
+			printf("       Rounding : %dK\n", array.chunk_size/1024);
 			break;
 		default: break;
 		}
 
+		if (array.level == 4 || array.level == 5 || array.level == 6) {
+			struct mdinfo *mdi = sysfs_read(fd, NULL,
+							GET_RWH_POLICY);
+			if (mdi) {
+				char *rwh_policy = map_num(rwh_policies,
+							   mdi->rwh_policy);
+				sysfs_free(mdi);
+				if (rwh_policy)
+					printf("     RWH Policy : %s\n",
+					       rwh_policy);
+			}
+		}
+		printf("\n");
+
 		if (e && e->percent >= 0) {
 			static char *sync_action[] = {
 				"Rebuild", "Resync",
diff --git a/Kill.c b/Kill.c
index f2fdb85..ff52561 100644
--- a/Kill.c
+++ b/Kill.c
@@ -63,7 +63,7 @@ int Kill(char *dev, struct supertype *st, int force, int verbose, int noexcl)
 	rv = st->ss->load_super(st, fd, dev);
 	if (rv == 0 || (force && rv >= 2)) {
 		st->ss->free_super(st);
-		st->ss->init_super(st, NULL, 0, "", NULL, NULL,
+		st->ss->init_super(st, NULL, NULL, "", NULL, NULL,
 				   INVALID_SECTORS);
 		if (st->ss->store_super(st, fd)) {
 			if (verbose >= 0)
diff --git a/ReadMe.c b/ReadMe.c
index d3fcb61..11165a8 100644
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -143,6 +143,7 @@ struct option long_options[] = {
     {"nodes",1, 0, Nodes}, /* also for --assemble */
     {"home-cluster",1, 0, ClusterName},
     {"write-journal",1, 0, WriteJournal},
+    {"rwh-policy",1, 0, RwhPolicy},
 
     /* For assemble */
     {"uuid",      1, 0, 'u'},
diff --git a/maps.c b/maps.c
index 64f1df2..2e8dd9e 100644
--- a/maps.c
+++ b/maps.c
@@ -129,6 +129,13 @@ mapping_t faultylayout[] = {
 	{ NULL, 0}
 };
 
+mapping_t rwh_policies[] = {
+	{ "off", RWH_POLICY_OFF},
+	{ "journal", RWH_POLICY_JOURNAL},
+	{ "ppl", RWH_POLICY_PPL},
+	{ NULL, 0}
+};
+
 char *map_num(mapping_t *map, int num)
 {
 	while (map->name) {
diff --git a/mdadm.c b/mdadm.c
index cca0933..9ecdce6 100644
--- a/mdadm.c
+++ b/mdadm.c
@@ -78,6 +78,7 @@ int main(int argc, char *argv[])
 		.level		= UnSet,
 		.layout		= UnSet,
 		.bitmap_chunk	= UnSet,
+		.rwh_policy	= UnSet,
 	};
 
 	char sys_hostname[256];
@@ -1198,6 +1199,13 @@ int main(int argc, char *argv[])
 
 			s.journaldisks = 1;
 			continue;
+		case O(CREATE, RwhPolicy):
+			s.rwh_policy = map_name(rwh_policies, optarg);
+			if (s.rwh_policy == UnSet) {
+				pr_err("Invalid RWH policy: %s\n", optarg);
+				exit(2);
+			}
+			continue;
 		}
 		/* We have now processed all the valid options. Anything else is
 		 * an error
@@ -1225,9 +1233,30 @@ int main(int argc, char *argv[])
 		exit(0);
 	}
 
-	if (s.journaldisks && (s.level < 4 || s.level > 6)) {
-		pr_err("--write-journal is only supported for RAID level 4/5/6.\n");
-		exit(2);
+	if (s.journaldisks) {
+		if (s.level < 4 || s.level > 6) {
+			pr_err("--write-journal is only supported for RAID level 4/5/6.\n");
+			exit(2);
+		}
+		if (s.rwh_policy == UnSet) {
+			s.rwh_policy = RWH_POLICY_JOURNAL;
+		} else if (s.rwh_policy != RWH_POLICY_JOURNAL) {
+			pr_err("--write-journal is not supported with RWH policy: %s\n",
+			       map_num(rwh_policies, s.rwh_policy));
+			exit(2);
+		}
+	}
+
+	if (s.rwh_policy != UnSet) {
+		if (s.level < 4 || s.level > 6) {
+			pr_err("--rwh-policy is only supported for RAID level 4/5/6.\n");
+			exit(2);
+		}
+		if (s.rwh_policy == RWH_POLICY_JOURNAL && !s.journaldisks) {
+			pr_err("--write-journal is required for RWH policy: %s\n",
+			       map_num(rwh_policies, s.rwh_policy));
+			exit(2);
+		}
 	}
 
 	if (!mode && devs_found) {
diff --git a/mdadm.h b/mdadm.h
index 240ab7f..4eabf59 100755
--- a/mdadm.h
+++ b/mdadm.h
@@ -268,6 +268,13 @@ struct mdinfo {
 	int journal_device_required;
 	int journal_clean;
 
+	enum {
+		RWH_POLICY_UNKNOWN,
+		RWH_POLICY_OFF,
+		RWH_POLICY_JOURNAL,
+		RWH_POLICY_PPL,
+	} rwh_policy;
+
 	/* During reshape we can sometimes change the data_offset to avoid
 	 * over-writing still-valid data.  We need to know if there is space.
 	 * So getinfo_super will fill in space_before and space_after in sectors.
@@ -409,6 +416,7 @@ enum special_options {
 	ClusterName,
 	ClusterConfirm,
 	WriteJournal,
+	RwhPolicy,
 };
 
 enum prefix_standard {
@@ -506,6 +514,7 @@ struct shape {
 	int	assume_clean;
 	int	write_behind;
 	unsigned long long size;
+	int	rwh_policy;
 };
 
 /* List of device names - wildcards expanded */
@@ -596,6 +605,7 @@ enum sysfs_read_flags {
 	GET_STATE	= (1 << 23),
 	GET_ERROR	= (1 << 24),
 	GET_ARRAY_STATE = (1 << 25),
+	GET_RWH_POLICY	= (1 << 26),
 };
 
 /* If fd >= 0, get the array it is open on,
@@ -679,7 +689,7 @@ extern int restore_stripes(int *dest, unsigned long long *offsets,
 
 extern char *map_num(mapping_t *map, int num);
 extern int map_name(mapping_t *map, char *name);
-extern mapping_t r5layout[], r6layout[], pers[], modes[], faultylayout[];
+extern mapping_t r5layout[], r6layout[], pers[], modes[], faultylayout[], rwh_policies[];
 
 extern char *map_dev_preferred(int major, int minor, int create,
 			       char *prefer);
@@ -839,7 +849,7 @@ extern struct superswitch {
 	 * metadata.
 	 */
 	int (*init_super)(struct supertype *st, mdu_array_info_t *info,
-			  unsigned long long size, char *name,
+			  struct shape *s, char *name,
 			  char *homehost, int *uuid,
 			  unsigned long long data_offset);
 
@@ -1035,6 +1045,7 @@ extern struct superswitch {
 	/* validate container after assemble */
 	int (*validate_container)(struct mdinfo *info);
 
+	int supports_ppl;
 	int swapuuid; /* true if uuid is bigending rather than hostendian */
 	int external;
 	const char *name; /* canonical metadata name */
diff --git a/super-ddf.c b/super-ddf.c
index 1707ad1..18e1e77 100644
--- a/super-ddf.c
+++ b/super-ddf.c
@@ -2290,7 +2290,7 @@ static unsigned int find_vde_by_guid(const struct ddf_super *ddf,
 
 static int init_super_ddf(struct supertype *st,
 			  mdu_array_info_t *info,
-			  unsigned long long size, char *name, char *homehost,
+			  struct shape *s, char *name, char *homehost,
 			  int *uuid, unsigned long long data_offset)
 {
 	/* This is primarily called by Create when creating a new array.
@@ -2328,7 +2328,7 @@ static int init_super_ddf(struct supertype *st,
 	struct virtual_disk *vd;
 
 	if (st->sb)
-		return init_super_ddf_bvd(st, info, size, name, homehost, uuid,
+		return init_super_ddf_bvd(st, info, s->size, name, homehost, uuid,
 					  data_offset);
 
 	if (posix_memalign((void**)&ddf, 512, sizeof(*ddf)) != 0) {
diff --git a/super-intel.c b/super-intel.c
index 69f6201..df09272 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -4887,7 +4887,7 @@ static int check_name(struct intel_super *super, char *name, int quiet)
 }
 
 static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
-				  unsigned long long size, char *name,
+				  struct shape *s, char *name,
 				  char *homehost, int *uuid,
 				  long long data_offset)
 {
@@ -4981,7 +4981,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
 	strncpy((char *) dev->volume, name, MAX_RAID_SERIAL_LEN);
 	array_blocks = calc_array_size(info->level, info->raid_disks,
 					       info->layout, info->chunk_size,
-					       size * 2);
+					       s->size * 2);
 	/* round array size down to closest MB */
 	array_blocks = (array_blocks >> SECT_PER_MB_SHIFT) << SECT_PER_MB_SHIFT;
 
@@ -4995,7 +4995,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
 	vol->curr_migr_unit = 0;
 	map = get_imsm_map(dev, MAP_0);
 	set_pba_of_lba0(map, super->create_offset);
-	set_blocks_per_member(map, info_to_blocks_per_member(info, size));
+	set_blocks_per_member(map, info_to_blocks_per_member(info, s->size));
 	map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info));
 	map->failed_disk_num = ~0;
 	if (info->level > 0)
@@ -5023,7 +5023,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
 		map->num_domains = 1;
 
 	/* info->size is only int so use the 'size' parameter instead */
-	num_data_stripes = (size * 2) / info_to_blocks_per_strip(info);
+	num_data_stripes = (s->size * 2) / info_to_blocks_per_strip(info);
 	num_data_stripes /= map->num_domains;
 	set_num_data_stripes(map, num_data_stripes);
 
@@ -5045,7 +5045,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
 }
 
 static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
-			   unsigned long long size, char *name,
+		           struct shape *s, char *name,
 			   char *homehost, int *uuid,
 			   unsigned long long data_offset)
 {
@@ -5068,7 +5068,7 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
 	}
 
 	if (st->sb)
-		return init_super_imsm_volume(st, info, size, name, homehost, uuid,
+		return init_super_imsm_volume(st, info, s, name, homehost, uuid,
 					      data_offset);
 
 	if (info)
diff --git a/super0.c b/super0.c
index 55ebd8b..151e52a 100644
--- a/super0.c
+++ b/super0.c
@@ -721,7 +721,7 @@ static int update_super0(struct supertype *st, struct mdinfo *info,
  * We use the first 8 bytes (64bits) of the sha1 of the host name
  */
 static int init_super0(struct supertype *st, mdu_array_info_t *info,
-		       unsigned long long size, char *ignored_name,
+		       struct shape *s, char *ignored_name,
 		       char *homehost, int *uuid,
 		       unsigned long long data_offset)
 {
@@ -760,8 +760,8 @@ static int init_super0(struct supertype *st, mdu_array_info_t *info,
 	sb->gvalid_words = 0; /* ignored */
 	sb->ctime = time(0);
 	sb->level = info->level;
-	sb->size = size;
-	if (size != (unsigned long long)sb->size)
+	sb->size = s->size;
+	if (s->size != (unsigned long long)sb->size)
 		return 0;
 	sb->nr_disks = info->nr_disks;
 	sb->raid_disks = info->raid_disks;
diff --git a/super1.c b/super1.c
index d323439..8a98ac2 100644
--- a/super1.c
+++ b/super1.c
@@ -1388,7 +1388,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
 }
 
 static int init_super1(struct supertype *st, mdu_array_info_t *info,
-		       unsigned long long size, char *name, char *homehost,
+		       struct shape *s, char *name, char *homehost,
 		       int *uuid, unsigned long long data_offset)
 {
 	struct mdp_superblock_1 *sb;
@@ -1441,7 +1441,7 @@ static int init_super1(struct supertype *st, mdu_array_info_t *info,
 	sb->ctime = __cpu_to_le64((unsigned long long)time(0));
 	sb->level = __cpu_to_le32(info->level);
 	sb->layout = __cpu_to_le32(info->layout);
-	sb->size = __cpu_to_le64(size*2ULL);
+	sb->size = __cpu_to_le64(s->size*2ULL);
 	sb->chunksize = __cpu_to_le32(info->chunk_size>>9);
 	sb->raid_disks = __cpu_to_le32(info->raid_disks);
 
diff --git a/sysfs.c b/sysfs.c
index f8a9f0b..4772d77 100644
--- a/sysfs.c
+++ b/sysfs.c
@@ -240,6 +240,17 @@ struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options)
 	} else
 		sra->sysfs_array_state[0] = 0;
 
+	if (options & GET_RWH_POLICY) {
+		strcpy(base, "rwh_policy");
+		if (load_sys(fname, buf, sizeof(buf))) {
+			sra->rwh_policy = RWH_POLICY_UNKNOWN;
+		} else {
+			sra->rwh_policy = map_name(rwh_policies, buf);
+			if (sra->rwh_policy == UnSet)
+				sra->rwh_policy = RWH_POLICY_UNKNOWN;
+		}
+	}
+
 	if (! (options & GET_DEVS))
 		return sra;
 
-- 
2.10.1


^ permalink raw reply related

* [PATCH 1/7] imsm: metadata changes for PPL
From: Artur Paszkiewicz @ 2016-11-24 12:29 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20161124122952.16529-1-artur.paszkiewicz@intel.com>

Updates for the IMSM metadata format, including PPL header structures.

Extend the imsm_vol 'dirty' field with a third value, which is required
to enable PPL recovery in the Windows and UEFI drivers.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 super-intel.c | 48 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/super-intel.c b/super-intel.c
index 5740088..69f6201 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -104,8 +104,11 @@ struct imsm_disk {
 	__u32 status;			 /* 0xF0 - 0xF3 */
 	__u32 owner_cfg_num; /* which config 0,1,2... owns this disk */
 	__u32 total_blocks_hi;		 /* 0xF4 - 0xF5 total blocks hi */
-#define	IMSM_DISK_FILLERS	3
-	__u32 filler[IMSM_DISK_FILLERS]; /* 0xF5 - 0x107 MPB_DISK_FILLERS for future expansion */
+	__u8 is_journal_disk;
+	__u8 filler1; /* MPB_DISK_FILLERS - reserved for future expansion */
+	__u16 filler2;
+#define	IMSM_DISK_FILLERS	2
+	__u32 filler[IMSM_DISK_FILLERS];
 };
 
 /* map selector for map managment
@@ -154,6 +157,9 @@ struct imsm_vol {
 #define MIGR_STATE_CHANGE 4
 #define MIGR_REPAIR 5
 	__u8  migr_type;	/* Initializing, Rebuilding, ... */
+#define RAIDVOL_CLEAN          0
+#define RAIDVOL_DIRTY          1
+#define RAIDVOL_DSRECORD_VALID 2
 	__u8  dirty;
 	__u8  fs_state;		/* fast-sync state for CnG (0xff == disabled) */
 	__u16 verify_errors;	/* number of mismatches */
@@ -189,7 +195,30 @@ struct imsm_dev {
 	__u16 cache_policy;
 	__u8  cng_state;
 	__u8  cng_sub_state;
-#define IMSM_DEV_FILLERS 10
+	__u16 my_vol_raid_dev_num; /* Used in Unique volume Id for this RaidDev */
+
+	/* NVM_EN */
+	__u8 nv_cache_mode;
+	union {
+		__u8 nv_cache_flags;
+		struct {
+		    __u8 dirty:1; /* 1 - cache is dirty, 0 - clean */
+		    __u8 nvc_health:2;
+		    __u8 expansion_bytes:5;
+		} nvCache;
+	};
+
+	/* Unique Volume Id of the NvCache Volume associated with this volume */
+	__u32 nvc_vol_orig_family_num;
+	__u16 nvc_vol_raid_dev_num;
+
+#define RWH_OFF 0
+#define RWH_DISTRIBUTED 1
+#define RWH_JOURNALING_DRIVE 2
+	__u8  rwh_policy;              /* Raid Write Hole Policy */
+	__u8  filler1;
+
+#define IMSM_DEV_FILLERS 7
 	__u32 filler[IMSM_DEV_FILLERS];
 	struct imsm_vol vol;
 } __attribute__ ((packed));
@@ -7565,12 +7594,15 @@ mark_checkpoint:
 
 skip_mark_checkpoint:
 	/* mark dirty / clean */
-	if (dev->vol.dirty != !consistent) {
+	if ((dev->vol.dirty & RAIDVOL_DIRTY) != !consistent) {
 		dprintf("imsm: mark '%s'\n", consistent ? "clean" : "dirty");
-		if (consistent)
-			dev->vol.dirty = 0;
-		else
-			dev->vol.dirty = 1;
+		if (consistent) {
+			dev->vol.dirty = RAIDVOL_CLEAN;
+		} else {
+			dev->vol.dirty = RAIDVOL_DIRTY;
+			if (dev->rwh_policy)
+				dev->vol.dirty |= RAIDVOL_DSRECORD_VALID;
+		}
 		super->updates_pending++;
 	}
 
-- 
2.10.1


^ permalink raw reply related

* [PATCH 0/7] mdadm support for Partial Parity Log
From: Artur Paszkiewicz @ 2016-11-24 12:29 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid, Artur Paszkiewicz

This is the mdadm part of the Partial Parity Log (PPL) functionality. It adds
a new parameter to mdadm, --rwh-policy, to allow selecting the RAID write hole
(RWH) policy for an array. Other changes include displaying the RWH policy in
the output from --detail and --examine.

As with the kernel patches sent earlier, all of this is currently targeted and
tested with IMSM and native MD v1.1 and v1.2 metadata arrays.

Artur Paszkiewicz (6):
  imsm: metadata changes for PPL
  Generic support for --rwh-policy and PPL
  imsm: PPL support
  super1: PPL support
  Allow changing the RWH policy for a running array
  Man page changes for --rwh-policy

Pawel Baldysiak (1):
  imsm: allow to assemble with PPL even if dirty degraded

 Assemble.c    |   4 +-
 Create.c      |  26 ++++++++--
 Detail.c      |  18 ++++++-
 Grow.c        |  15 +++++-
 Kill.c        |   2 +-
 Manage.c      |  79 ++++++++++++++++++++++++++++
 ReadMe.c      |   1 +
 maps.c        |   7 +++
 mdadm.8.in    |  28 ++++++++++
 mdadm.c       |  44 ++++++++++++++--
 mdadm.h       |  21 ++++++--
 super-ddf.c   |   6 +--
 super-gpt.c   |   2 +-
 super-intel.c | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++++------
 super-mbr.c   |   2 +-
 super0.c      |   8 +--
 super1.c      |  92 +++++++++++++++++++++++++++------
 sysfs.c       |  15 ++++++
 18 files changed, 475 insertions(+), 56 deletions(-)

-- 
2.10.1


^ permalink raw reply

* [PATCH 12/12] raid5-ppl: runtime PPL enabling or disabling
From: Artur Paszkiewicz @ 2016-11-24 12:28 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20161124122847.16456-1-artur.paszkiewicz@intel.com>

Introduce a sysfs attribute to get or set the current RWH policy. The
raid5_reset_cache function is used to free the stripe cache and allocate
it again. This is needed to allocate or free the ppl_pages for the
stripes in the stripe cache.

When enabling the log at runtime it is necessary to overwrite the PPL
header to avoid recovering from stale PPL data if the log had been used
previously with this array.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/raid5-ppl.c |  38 ++++++++++++++---
 drivers/md/raid5.c     | 109 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 142 insertions(+), 5 deletions(-)

diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 7ce3ce5..0967b6d 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -715,6 +715,27 @@ static int ppl_load(struct r5l_log *log)
 	return ret;
 }
 
+static int ppl_invalidate(struct r5l_log *log)
+{
+	struct ppl_conf *ppl_conf = log->private;
+	int i;
+
+	for (i = 0; i < ppl_conf->count; i++) {
+		struct r5l_log *log_child = ppl_conf->child_logs[i];
+		int ret;
+
+		/* Missing drive */
+		if (!log_child)
+			continue;
+
+		ret = ppl_write_empty_header(log_child);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 #define IMSM_MPB_SIG "Intel Raid ISM Cfg Sig. "
 #define IMSM_MPB_ORIG_FAMILY_NUM_OFFSET 64
 
@@ -946,11 +967,18 @@ static int __ppl_init_log(struct r5l_log *log, struct r5conf *conf)
 		ppl_conf->child_logs[i] = log_child;
 	}
 
-	ret = ppl_load(log);
-	if (!ret && mddev->recovery_cp == 0 && !mddev->degraded)
-		mddev->recovery_cp = MaxSector;
-	else if (ret < 0)
-		goto err;
+	if (mddev->pers) {
+		dbg("Array running - invalidate PPL\n");
+		ret = ppl_invalidate(log);
+		if (ret)
+			goto err;
+	} else {
+		ret = ppl_load(log);
+		if (!ret && mddev->recovery_cp == 0 && !mddev->degraded)
+			mddev->recovery_cp = MaxSector;
+		else if (ret < 0)
+			goto err;
+	}
 
 	rcu_assign_pointer(conf->log, log);
 	set_bit(MD_HAS_PPL, &mddev->flags);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7a1a564..bd06cfb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6413,6 +6413,114 @@ raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
 				raid5_show_group_thread_cnt,
 				raid5_store_group_thread_cnt);
 
+static ssize_t
+raid5_show_rwh_policy(struct mddev *mddev, char *page)
+{
+	struct r5conf *conf;
+	int ret = 0;
+	spin_lock(&mddev->lock);
+	conf = mddev->private;
+	if (conf) {
+		const char *policy = NULL;
+		if (!conf->log)
+			policy = "off";
+		else if (conf->log->rwh_policy == RWH_POLICY_JOURNAL)
+			policy = "journal";
+		else if (conf->log->rwh_policy == RWH_POLICY_PPL)
+			policy = "ppl";
+		if (policy)
+			ret = sprintf(page, "%s\n", policy);
+	}
+	spin_unlock(&mddev->lock);
+	return ret;
+}
+
+static void raid5_reset_cache(struct mddev *mddev)
+{
+	struct r5conf *conf = mddev->private;
+
+	mutex_lock(&conf->cache_size_mutex);
+	while (conf->max_nr_stripes &&
+		       drop_one_stripe(conf))
+			;
+
+	while (conf->min_nr_stripes > conf->max_nr_stripes)
+		if (!grow_one_stripe(conf, GFP_KERNEL))
+			break;
+	mutex_unlock(&conf->cache_size_mutex);
+}
+
+static ssize_t
+raid5_store_rwh_policy(struct mddev *mddev, const char *page, size_t len)
+{
+	struct r5conf *conf;
+	int err;
+	int new_policy, current_policy;
+
+	if (len >= PAGE_SIZE)
+		return -EINVAL;
+
+	err = mddev_lock(mddev);
+	if (err)
+		return err;
+	conf = mddev->private;
+	if (!conf) {
+		err = -ENODEV;
+		goto out;
+	}
+
+	if (conf->log)
+		current_policy = conf->log->rwh_policy;
+	else
+		current_policy = RWH_POLICY_OFF;
+
+	if (strncmp(page, "off", 3) == 0) {
+		new_policy = RWH_POLICY_OFF;
+	} else if (strncmp(page, "journal", 7) == 0) {
+		new_policy = RWH_POLICY_JOURNAL;
+	} else if (strncmp(page, "ppl", 3) == 0) {
+		new_policy = RWH_POLICY_PPL;
+	} else {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (new_policy == current_policy)
+		goto out;
+
+	if (current_policy == RWH_POLICY_PPL && new_policy == RWH_POLICY_OFF) {
+		struct r5l_log *log;
+		mddev_suspend(mddev);
+		log = conf->log;
+		conf->log = NULL;
+		synchronize_rcu();
+		r5l_exit_log(log);
+		raid5_reset_cache(mddev);
+		mddev_resume(mddev);
+	} else if (current_policy == RWH_POLICY_OFF &&
+		   new_policy == RWH_POLICY_PPL) {
+		mddev_suspend(mddev);
+		err = r5l_init_log(conf, NULL, new_policy);
+		if (!err)
+			raid5_reset_cache(mddev);
+		mddev_resume(mddev);
+	} else {
+		err = -EINVAL;
+		goto out;
+	}
+
+	md_update_sb(mddev, 1);
+out:
+	mddev_unlock(mddev);
+
+	return err ?: len;
+}
+
+static struct md_sysfs_entry
+raid5_rwh_policy = __ATTR(rwh_policy, S_IRUGO | S_IWUSR,
+				raid5_show_rwh_policy,
+				raid5_store_rwh_policy);
+
 static struct attribute *raid5_attrs[] =  {
 	&raid5_stripecache_size.attr,
 	&raid5_stripecache_active.attr,
@@ -6421,6 +6529,7 @@ static struct attribute *raid5_attrs[] =  {
 	&raid5_skip_copy.attr,
 	&raid5_rmw_level.attr,
 	&r5c_journal_mode.attr,
+	&raid5_rwh_policy.attr,
 	NULL,
 };
 static struct attribute_group raid5_attrs_group = {
-- 
2.10.1


^ permalink raw reply related

* [PATCH 11/12] raid5-ppl: support disk add/remove with distributed PPL
From: Artur Paszkiewicz @ 2016-11-24 12:28 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20161124122847.16456-1-artur.paszkiewicz@intel.com>

Add a function to modify the log by adding or removing an rdev when a
drive fails or is added as a spare.

Adding a drive to the log is as simple as initializing and adding a new
child log. Removing a drive is more complicated because it requires
stopping the child log and freeing all of its resources. In order to do
that, we busy wait for any submitted log bios to complete and then
manually finish and free the io_units. No new log requests will happen
at this point. A new list is added to struct r5l_io_unit to have access
to stripes that have been written to the log but are not completely
processed yet.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/md.c          |  3 +-
 drivers/md/raid5-cache.c | 13 ++++++-
 drivers/md/raid5-cache.h |  3 ++
 drivers/md/raid5-ppl.c   | 89 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c       | 20 +++++++++++
 5 files changed, 126 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7049833..279e303 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8334,7 +8334,8 @@ static int remove_and_add_spares(struct mddev *mddev,
 		    !test_bit(Blocked, &rdev->flags) &&
 		    ((test_bit(RemoveSynchronized, &rdev->flags) ||
 		     (!test_bit(In_sync, &rdev->flags) &&
-		      !test_bit(Journal, &rdev->flags))) &&
+		      !test_bit(Journal, &rdev->flags) &&
+		      !test_bit(JournalPpl, &rdev->flags))) &&
 		    atomic_read(&rdev->nr_pending)==0)) {
 			if (mddev->pers->hot_remove_disk(
 				    mddev, rdev) == 0) {
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index cb764db..fbd8148 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -345,7 +345,7 @@ void r5l_io_run_stripes(struct r5l_io_unit *io)
 	struct stripe_head *sh, *next;
 
 	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
-		list_del_init(&sh->log_list);
+		list_move_tail(&sh->log_list, &io->stripe_finished_list);
 
 		r5c_finish_cache_stripe(sh);
 
@@ -553,6 +553,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 	io->log = log;
 	INIT_LIST_HEAD(&io->log_sibling);
 	INIT_LIST_HEAD(&io->stripe_list);
+	INIT_LIST_HEAD(&io->stripe_finished_list);
 	bio_list_init(&io->flush_barriers);
 	io->state = IO_UNIT_RUNNING;
 
@@ -2529,6 +2530,16 @@ void r5l_exit_log(struct r5l_log *log)
 	kfree(log);
 }
 
+/*
+ * operation: 0 - remove rdev from log, 1 - add rdev to log
+ */
+int r5l_modify_log(struct r5l_log *log, struct md_rdev *rdev, int operation)
+{
+	if (log && log->policy->modify_log)
+		return log->policy->modify_log(log, rdev, operation);
+	return 0;
+}
+
 struct r5l_policy r5l_journal = {
 	.init_log = __r5l_init_log,
 	.exit_log = __r5l_exit_log,
diff --git a/drivers/md/raid5-cache.h b/drivers/md/raid5-cache.h
index 40bd5ce..8f466c6 100644
--- a/drivers/md/raid5-cache.h
+++ b/drivers/md/raid5-cache.h
@@ -111,6 +111,7 @@ struct r5l_io_unit {
 	sector_t log_end;	/* where the io_unit ends */
 	struct list_head log_sibling; /* log->running_ios */
 	struct list_head stripe_list; /* stripes added to the io_unit */
+	struct list_head stripe_finished_list; /* stripes written to log */
 
 	int state;
 	bool need_split_bio;
@@ -141,6 +142,7 @@ enum r5l_io_unit_state {
 struct r5l_policy {
 	int (*init_log)(struct r5l_log *log, struct r5conf *conf);
 	void (*exit_log)(struct r5l_log *log);
+	int (*modify_log)(struct r5l_log *log, struct md_rdev *rdev, int op);
 	int (*write_stripe)(struct r5l_log *log, struct stripe_head *sh);
 	void (*write_stripe_run)(struct r5l_log *log);
 	void (*flush_stripe_to_raid)(struct r5l_log *log);
@@ -151,6 +153,7 @@ struct r5l_policy {
 
 extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev, int policy_type);
 extern void r5l_exit_log(struct r5l_log *log);
+extern int r5l_modify_log(struct r5l_log *log, struct md_rdev *rdev, int operation);
 extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh);
 extern void r5l_write_stripe_run(struct r5l_log *log);
 extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 1a00b2d..7ce3ce5 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -108,6 +108,7 @@ static struct r5l_io_unit *ppl_new_iounit(struct r5l_log *log,
 	io->log = log;
 	INIT_LIST_HEAD(&io->log_sibling);
 	INIT_LIST_HEAD(&io->stripe_list);
+	INIT_LIST_HEAD(&io->stripe_finished_list);
 	io->state = IO_UNIT_RUNNING;
 
 	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
@@ -960,6 +961,93 @@ static int __ppl_init_log(struct r5l_log *log, struct r5conf *conf)
 	return ret;
 }
 
+static void ppl_log_stop(struct r5l_log *log)
+{
+	struct r5l_io_unit *io, *next;
+	unsigned long flags;
+	bool wait;
+
+	/* wait for in flight ios to complete */
+	do {
+		wait = false;
+		spin_lock_irqsave(&log->io_list_lock, flags);
+		list_for_each_entry(io, &log->running_ios, log_sibling) {
+			if (io->state == IO_UNIT_IO_START) {
+				wait = true;
+				break;
+			}
+		}
+		if (!wait)
+			wait = !list_empty(&log->flushing_ios);
+		spin_unlock_irqrestore(&log->io_list_lock, flags);
+	} while (wait);
+
+	/* clean up iounits */
+	spin_lock_irqsave(&log->io_list_lock, flags);
+
+	list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
+		list_move_tail(&io->log_sibling, &log->finished_ios);
+		bio_put(io->current_bio);
+		mempool_free(io->meta_page, log->meta_pool);
+	}
+	list_splice_tail_init(&log->io_end_ios, &log->finished_ios);
+
+	list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
+		struct stripe_head *sh;
+		list_for_each_entry(sh, &io->stripe_list, log_list) {
+			clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
+			sh->log_io = NULL;
+		}
+		r5l_io_run_stripes(io);
+		list_for_each_entry(sh, &io->stripe_finished_list, log_list) {
+			sh->log_io = NULL;
+		}
+		list_del(&io->log_sibling);
+		mempool_free(io, log->io_pool);
+	}
+	r5l_run_no_mem_stripe(log);
+
+	spin_unlock_irqrestore(&log->io_list_lock, flags);
+}
+
+static int __ppl_modify_log(struct r5l_log *log, struct md_rdev *rdev, int op)
+{
+	struct r5l_log *log_child;
+	struct ppl_conf *ppl_conf = log->private;
+
+	if (!rdev)
+		return -EINVAL;
+
+	dbg("rdev->raid_disk: %d op: %d\n", rdev->raid_disk, op);
+
+	if (rdev->raid_disk < 0)
+		return 0;
+
+	if (rdev->raid_disk >= ppl_conf->count)
+		return -ENODEV;
+
+	if (op == 0) {
+		log_child = ppl_conf->child_logs[rdev->raid_disk];
+		if (!log_child)
+			return 0;
+		ppl_conf->child_logs[rdev->raid_disk] = NULL;
+		ppl_log_stop(log_child);
+		ppl_exit_log_child(log_child);
+	} else if (op == 1) {
+		int ret = ppl_init_log_child(log, rdev, &log_child);
+		if (ret)
+			return ret;
+		ret = ppl_write_empty_header(log_child);
+		if (ret)
+			return ret;
+		ppl_conf->child_logs[rdev->raid_disk] = log_child;
+	} else {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int __ppl_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 {
 	struct ppl_conf *ppl_conf = log->private;
@@ -997,6 +1085,7 @@ static void __ppl_flush_stripe_to_raid(struct r5l_log *log)
 struct r5l_policy r5l_ppl = {
 	.init_log = __ppl_init_log,
 	.exit_log = __ppl_exit_log,
+	.modify_log = __ppl_modify_log,
 	.write_stripe = __ppl_write_stripe,
 	.write_stripe_run = __ppl_write_stripe_run,
 	.flush_stripe_to_raid = __ppl_flush_stripe_to_raid,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 61179fa..7a1a564 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7403,6 +7403,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			*rdevp = rdev;
 		}
 	}
+	if (test_bit(JournalPpl, &rdev->flags) && conf->log) {
+		int ret;
+		if (conf->log->rwh_policy != RWH_POLICY_PPL)
+			return -EINVAL;
+		ret = r5l_modify_log(conf->log, rdev, 0);
+		if (ret)
+			return ret;
+		if (p->replacement) {
+			ret = r5l_modify_log(conf->log, p->replacement, 1);
+			if (ret)
+				return ret;
+		}
+	}
 	if (p->replacement) {
 		/* We must have just cleared 'rdev' */
 		p->rdev = p->replacement;
@@ -7495,6 +7508,13 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		}
 	}
 out:
+	if (conf->log && !test_bit(Replacement, &rdev->flags) &&
+	    conf->log->rwh_policy == RWH_POLICY_PPL) {
+		int ret = r5l_modify_log(conf->log, rdev, 1);
+		if (ret)
+			return ret;
+	}
+
 	print_raid5_conf(conf);
 	return err;
 }
-- 
2.10.1


^ permalink raw reply related

* [PATCH 10/12] raid5-ppl: recovery from dirty shutdown using PPL
From: Artur Paszkiewicz @ 2016-11-24 12:28 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20161124122847.16456-1-artur.paszkiewicz@intel.com>

The recovery algorithm recalculates parity for every dirty stripe by
xor-ing the partial parity and data from each updated data member disk.
To verify PPL correctness a CRC is used for the PPL header. Each header
entry also contains a CRC for its partial parity data. If the header is
valid, recovery is performed for each entry until an invalid entry is
found. If the array is not degraded and recovery using PPL fully
succeeds, there is no need to resync the array because data and parity
will be consistent, so in this case resync will be disabled.

For compatibility with IMSM implementations on other systems, we can't
assume that the block size is always 4K. Writes generated by MD raid5
don't have this issue, but in other environments a write can be as small
as a single 512-byte sector. The recovery code takes this into account,
as well as the logical sector size of the underlying drives.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/raid5-ppl.c | 347 +++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c     |   5 +-
 2 files changed, 351 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 2cc4ee9..1a00b2d 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -16,6 +16,7 @@
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/crc32c.h>
+#include <linux/async_tx.h>
 #include <linux/module.h>
 #include <linux/raid/md_p.h>
 #include "md.h"
@@ -373,6 +374,346 @@ static void __ppl_stripe_write_finished(struct r5l_io_unit *io)
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
 }
 
+static void ppl_xor(int size, struct page *page1, struct page *page2,
+		    struct page *page_result)
+{
+	struct async_submit_ctl submit;
+	struct dma_async_tx_descriptor *tx;
+	struct page *xor_srcs[] = { page1, page2 };
+
+	init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST,
+			  NULL, NULL, NULL, NULL);
+	tx = async_xor(page_result, xor_srcs, 0, 2, size, &submit);
+
+	async_tx_quiesce(&tx);
+}
+
+static int ppl_recover_entry(struct r5l_log *log, struct ppl_header_entry *e,
+			     sector_t ppl_sector)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
+
+	int block_size = queue_logical_block_size(mddev->queue);
+	struct page *pages;
+	struct page *page1;
+	struct page *page2;
+	sector_t r_sector_first = e->data_sector * (block_size >> 9);
+	sector_t r_sector_last = r_sector_first + (e->data_size >> 9) - 1;
+	int strip_sectors = conf->chunk_sectors;
+	int i;
+	int ret = 0;
+
+	if (e->pp_size > 0 && (e->pp_size >> 9) < strip_sectors) {
+		if (e->data_size > e->pp_size)
+			r_sector_last = r_sector_first +
+				(e->data_size / e->pp_size) * strip_sectors - 1;
+		strip_sectors = e->pp_size >> 9;
+	}
+
+	pages = alloc_pages(GFP_KERNEL, 1);
+	if (!pages)
+		return -ENOMEM;
+	page1 = pages;
+	page2 = pages + 1;
+
+	dbg("array sector first %llu, last %llu\n",
+	    (unsigned long long)r_sector_first,
+	    (unsigned long long)r_sector_last);
+
+	/* if start and end is 4k aligned, use a 4k block */
+	if (block_size == 512 &&
+			r_sector_first % (PAGE_SIZE >> 9) == 0 &&
+			(r_sector_last + 1) % (PAGE_SIZE >> 9) == 0)
+		block_size = PAGE_SIZE;
+
+	/* iterate through blocks in strip */
+	for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
+		bool update_parity = false;
+		sector_t parity_sector;
+		struct md_rdev *parity_rdev;
+		struct stripe_head sh;
+		int disk;
+
+		dbg("  iter %d start\n", i);
+		memset(page_address(page1), 0, PAGE_SIZE);
+
+		/* iterate through data member disks */
+		for (disk = 0; disk < (conf->raid_disks - conf->max_degraded);
+				disk++) {
+			int dd_idx;
+			struct md_rdev *rdev;
+			sector_t sector;
+			sector_t r_sector = r_sector_first + i +
+					    (disk * conf->chunk_sectors);
+
+			dbg("    data member disk %d start\n", disk);
+			if (r_sector > r_sector_last) {
+				dbg("    array sector %llu doesn't need parity update\n",
+				    (unsigned long long)r_sector);
+				continue;
+			}
+
+			update_parity = true;
+
+			/* map raid sector to member disk */
+			sector = raid5_compute_sector(conf, r_sector, 0, &dd_idx, NULL);
+			dbg("    processing array sector %llu => data mem disk %d, sector %llu\n",
+			    (unsigned long long)r_sector, dd_idx,
+			    (unsigned long long)sector);
+
+			rdev = conf->disks[dd_idx].rdev;
+			if (!rdev) {
+				dbg("    data member disk %d missing\n", dd_idx);
+				update_parity = false;
+				break;
+			}
+
+			dbg("    reading data member disk %s sector %llu\n",
+			    rdev->bdev->bd_disk->disk_name,
+			    (unsigned long long)sector);
+			if (!sync_page_io(rdev, sector, block_size, page2,
+					REQ_OP_READ, 0, false)) {
+				md_error(mddev, rdev);
+				dbg("    read failed!\n");
+				ret = -EIO;
+				goto out;
+			}
+
+			ppl_xor(block_size, page1, page2, page1);
+		}
+
+		if (!update_parity)
+			continue;
+
+		if (e->pp_size > 0) {
+			dbg("  reading pp disk sector %llu\n",
+			    (unsigned long long)(ppl_sector + i));
+			if (!sync_page_io(log->rdev,
+					ppl_sector - log->rdev->data_offset + i,
+					block_size, page2, REQ_OP_READ, 0,
+					false)) {
+				dbg("  read failed!\n");
+				md_error(mddev, log->rdev);
+				ret = -EIO;
+				goto out;
+			}
+
+			ppl_xor(block_size, page1, page2, page1);
+		}
+
+		/* map raid sector to parity disk */
+		parity_sector = raid5_compute_sector(conf, r_sector_first + i,
+				0, &disk, &sh);
+		BUG_ON(sh.pd_idx != e->parity_disk);
+		parity_rdev = conf->disks[sh.pd_idx].rdev;
+
+		BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
+		dbg("  write parity at sector %llu, parity disk %s\n",
+		    (unsigned long long)parity_sector,
+		    parity_rdev->bdev->bd_disk->disk_name);
+		if (!sync_page_io(parity_rdev, parity_sector, block_size,
+				page1, REQ_OP_WRITE, 0, false)) {
+			dbg("  parity write error!\n");
+			md_error(mddev, parity_rdev);
+			ret = -EIO;
+			goto out;
+		}
+	}
+
+out:
+	__free_pages(pages, 1);
+	return ret;
+}
+
+static int ppl_recover(struct r5l_log *log, struct ppl_header *pplhdr)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	sector_t ppl_sector = log->rdev->ppl.sector + (PPL_HEADER_SIZE >> 9);
+	struct page *page;
+	int i;
+	int ret = 0;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	/* iterate through all PPL entries saved */
+	for (i = 0; i < pplhdr->entries_count; i++) {
+		struct ppl_header_entry *e = &pplhdr->entries[i];
+		u32 size = le32_to_cpu(e->pp_size);
+		sector_t sector = ppl_sector;
+		int ppl_entry_sectors = size >> 9;
+		u32 crc, crc_stored;
+
+		dbg("disk: %d, entry: %d, ppl_sector: %llu ppl_size: %u\n",
+		    log->rdev->raid_disk, i, (unsigned long long)ppl_sector,
+		    size);
+
+		crc = ~0;
+		crc_stored = le32_to_cpu(e->checksum);
+
+		while (size) {
+			int s = size > PAGE_SIZE ? PAGE_SIZE : size;
+
+			if (!sync_page_io(log->rdev,
+					sector - log->rdev->data_offset,
+					s, page, REQ_OP_READ, 0, false)) {
+				md_error(mddev, log->rdev);
+				ret = -EIO;
+				goto out;
+			}
+
+			crc = crc32c_le(crc, page_address(page), s);
+
+			size -= s;
+			sector += s >> 9;
+		}
+
+		crc = ~crc;
+
+		if (crc != crc_stored) {
+			dbg("ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n",
+			    crc_stored, crc);
+			ret++;
+		} else {
+			int ret2;
+			e->data_sector = le64_to_cpu(e->data_sector);
+			e->pp_size = le32_to_cpu(e->pp_size);
+			e->data_size = le32_to_cpu(e->data_size);
+
+			ret2 = ppl_recover_entry(log, e, ppl_sector);
+			if (ret2) {
+				ret = ret2;
+				goto out;
+			}
+		}
+
+		ppl_sector += ppl_entry_sectors;
+	}
+out:
+	__free_page(page);
+	return ret;
+}
+
+static int ppl_write_empty_header(struct r5l_log *log)
+{
+	struct page *page;
+	struct ppl_header *pplhdr;
+	int ret = 0;
+
+	dbg("disk: %d ppl_sector: %llu\n",
+	    log->rdev->raid_disk, (unsigned long long)log->rdev->ppl.sector);
+
+	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!page)
+		return -ENOMEM;
+
+	pplhdr = page_address(page);
+	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
+	pplhdr->signature = cpu_to_le32(log->uuid_checksum);
+	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));
+
+	if (!sync_page_io(log->rdev, log->rdev->ppl.sector -
+			  log->rdev->data_offset, PPL_HEADER_SIZE, page,
+			  REQ_OP_WRITE, 0, false)) {
+		md_error(log->rdev->mddev, log->rdev);
+		ret = -EIO;
+	}
+
+	__free_page(page);
+	return ret;
+}
+
+static int ppl_load_distributed(struct r5l_log *log)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	struct page *page;
+	struct ppl_header *pplhdr;
+	u32 crc, crc_stored;
+	int ret = 0;
+
+	dbg("disk: %d\n", log->rdev->raid_disk);
+
+	/* read PPL header */
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	if (!sync_page_io(log->rdev,
+			  log->rdev->ppl.sector - log->rdev->data_offset,
+			  PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
+		md_error(mddev, log->rdev);
+		ret = -EIO;
+		goto out;
+	}
+	pplhdr = page_address(page);
+
+	/* check header validity */
+	crc_stored = le32_to_cpu(pplhdr->checksum);
+	pplhdr->checksum = 0;
+	crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE);
+
+	if (crc_stored != crc) {
+		dbg("ppl header crc does not match: stored: 0x%x calculated: 0x%x\n",
+		    crc_stored, crc);
+		ret = 1;
+		goto out;
+	}
+
+	pplhdr->signature = le32_to_cpu(pplhdr->signature);
+	pplhdr->generation = le64_to_cpu(pplhdr->generation);
+	pplhdr->entries_count = le32_to_cpu(pplhdr->entries_count);
+
+	if (pplhdr->signature != log->uuid_checksum) {
+		dbg("ppl header signature does not match: stored: 0x%x configured: 0x%x\n",
+		    pplhdr->signature, log->uuid_checksum);
+		ret = 1;
+		goto out;
+	}
+
+	if (mddev->recovery_cp != MaxSector)
+		ret = ppl_recover(log, pplhdr);
+out:
+	__free_page(page);
+
+	if (ret >= 0) {
+		int ret2 = ppl_write_empty_header(log);
+		if (ret2)
+			ret = ret2;
+	}
+
+	dbg("return: %d\n", ret);
+	return ret;
+}
+
+static int ppl_load(struct r5l_log *log)
+{
+	struct ppl_conf *ppl_conf = log->private;
+	int ret = 0;
+	int i;
+
+	for (i = 0; i < ppl_conf->count; i++) {
+		struct r5l_log *log_child = ppl_conf->child_logs[i];
+		int ret2;
+
+		/* Missing drive */
+		if (!log_child)
+			continue;
+
+		ret2 = ppl_load_distributed(log_child);
+		if (ret2 < 0) {
+			ret = ret2;
+			break;
+		}
+
+		ret += ret2;
+	}
+
+	dbg("return: %d\n", ret);
+	return ret;
+}
+
 #define IMSM_MPB_SIG "Intel Raid ISM Cfg Sig. "
 #define IMSM_MPB_ORIG_FAMILY_NUM_OFFSET 64
 
@@ -604,6 +945,12 @@ static int __ppl_init_log(struct r5l_log *log, struct r5conf *conf)
 		ppl_conf->child_logs[i] = log_child;
 	}
 
+	ret = ppl_load(log);
+	if (!ret && mddev->recovery_cp == 0 && !mddev->degraded)
+		mddev->recovery_cp = MaxSector;
+	else if (ret < 0)
+		goto err;
+
 	rcu_assign_pointer(conf->log, log);
 	set_bit(MD_HAS_PPL, &mddev->flags);
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ba4fff9..61179fa 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7106,7 +7106,10 @@ static int raid5_run(struct mddev *mddev)
 
 	if (mddev->degraded > dirty_parity_disks &&
 	    mddev->recovery_cp != MaxSector) {
-		if (mddev->ok_start_degraded)
+		if (rwh_policy)
+			pr_warn("md/raid:%s: starting dirty degraded array with journal.\n",
+				mdname(mddev));
+		else if (mddev->ok_start_degraded)
 			pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
 				mdname(mddev));
 		else {
-- 
2.10.1


^ permalink raw reply related

* [PATCH 09/12] raid5-ppl: read PPL signature from IMSM metadata
From: Artur Paszkiewicz @ 2016-11-24 12:28 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20161124122847.16456-1-artur.paszkiewicz@intel.com>

The PPL signature is used to determine if the stored PPL is valid for a
given array. With IMSM, the PPL signature should match the
orig_family_num field of the superblock. To avoid passing this value
from userspace, it can be read from the IMSM MPB when initializing the
log.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/raid5-ppl.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 78 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 2132823..2cc4ee9 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -373,6 +373,75 @@ static void __ppl_stripe_write_finished(struct r5l_io_unit *io)
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
 }
 
+#define IMSM_MPB_SIG "Intel Raid ISM Cfg Sig. "
+#define IMSM_MPB_ORIG_FAMILY_NUM_OFFSET 64
+
+static int ppl_find_signature_imsm(struct mddev *mddev, u32 *signature)
+{
+	struct md_rdev *rdev;
+	char *buf;
+	int ret = 0;
+	u32 orig_family_num = 0;
+	struct page *page;
+	struct mddev *container;
+
+	container = mddev_find_container(mddev);
+	if (!container || strncmp(container->metadata_type, "imsm", 4)) {
+		pr_err("Container metadata type is not imsm\n");
+		return -EINVAL;
+	}
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	buf = page_address(page);
+
+	rdev_for_each(rdev, container) {
+		u32 tmp;
+		struct md_rdev *rdev2;
+		bool found = false;
+
+		/* only use rdevs that are both in container and mddev */
+		rdev_for_each(rdev2, mddev)
+			if (rdev2->bdev == rdev->bdev) {
+				found = true;
+				break;
+			}
+
+		if (!found)
+			continue;
+
+		if (!sync_page_io(rdev, 0,
+				queue_logical_block_size(rdev->bdev->bd_queue),
+				page, REQ_OP_READ, 0, true)) {
+			ret = -EIO;
+			goto out;
+		}
+
+		if (strncmp(buf, IMSM_MPB_SIG, strlen(IMSM_MPB_SIG)) != 0) {
+			dbg("imsm mpb signature does not match\n");
+			ret = 1;
+			goto out;
+		}
+
+		tmp = le32_to_cpu(*(u32 *)(buf + IMSM_MPB_ORIG_FAMILY_NUM_OFFSET));
+
+		if (orig_family_num && orig_family_num != tmp) {
+			dbg("orig_family_num is not the same on all disks\n");
+			ret = 1;
+			goto out;
+		}
+
+		orig_family_num = tmp;
+	}
+
+	*signature = orig_family_num;
+out:
+	__free_page(page);
+	return ret;
+}
+
 static void ppl_exit_log_child(struct r5l_log *log)
 {
 	clear_bit(JournalPpl, &log->rdev->flags);
@@ -467,9 +536,17 @@ static int __ppl_init_log(struct r5l_log *log, struct r5conf *conf)
 		return -ENOMEM;
 	log->private = ppl_conf;
 
-	if (!mddev->external)
+	if (mddev->external) {
+		ret = ppl_find_signature_imsm(mddev, &log->uuid_checksum);
+		if (ret) {
+			pr_err("Failed to read imsm signature\n");
+			ret = -EINVAL;
+			goto err;
+		}
+	} else {
 		log->uuid_checksum = crc32c_le(~0, mddev->uuid,
 					       sizeof(mddev->uuid));
+	}
 
 	if (mddev->bitmap) {
 		pr_err("PPL is not compatible with bitmap\n");
-- 
2.10.1


^ permalink raw reply related

* [PATCH 08/12] md: expose rdev->sb_start as sysfs attribute
From: Artur Paszkiewicz @ 2016-11-24 12:28 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20161124122847.16456-1-artur.paszkiewicz@intel.com>

This is meant for external metadata arrays, to pass the location of the
superblock to the kernel.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/md.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5a14f8e..7049833 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3204,6 +3204,27 @@ static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
 
+static ssize_t
+sb_start_show(struct md_rdev *rdev, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)rdev->sb_start);
+}
+
+static ssize_t
+sb_start_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+	unsigned long long sb_start;
+	if (kstrtoull(buf, 10, &sb_start) < 0)
+		return -EINVAL;
+	if (!rdev->mddev->external)
+		return -EBUSY;
+	rdev->sb_start = sb_start;
+	return len;
+}
+
+static struct rdev_sysfs_entry rdev_sb_start =
+__ATTR(sb_start, S_IRUGO|S_IWUSR, sb_start_show, sb_start_store);
+
 static struct attribute *rdev_default_attrs[] = {
 	&rdev_state.attr,
 	&rdev_errors.attr,
@@ -3214,6 +3235,7 @@ static struct attribute *rdev_default_attrs[] = {
 	&rdev_recovery_start.attr,
 	&rdev_bad_blocks.attr,
 	&rdev_unack_bad_blocks.attr,
+	&rdev_sb_start.attr,
 	NULL,
 };
 static ssize_t
-- 
2.10.1


^ permalink raw reply related

* [PATCH 07/12] md: mddev_find_container helper function
From: Artur Paszkiewicz @ 2016-11-24 12:28 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20161124122847.16456-1-artur.paszkiewicz@intel.com>

This allows finding the parent container of a subarray for external
metadata arrays.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/md.c | 37 +++++++++++++++++++++++++++++++++++++
 drivers/md/md.h |  3 +++
 2 files changed, 40 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7028d54..5a14f8e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -595,6 +595,43 @@ static struct mddev *mddev_find(dev_t unit)
 	goto retry;
 }
 
+struct mddev *mddev_find_container(struct mddev *subarray)
+{
+	struct mddev *mddev, *ret = NULL;
+	char *container_name;
+	int len = 0;
+	int i;
+
+	if (!subarray->external)
+		return NULL;
+
+	container_name = subarray->metadata_type + 1;
+	i = 0;
+	while (container_name[i]) {
+		if (container_name[i] == '/') {
+			len = i;
+			break;
+		}
+		i++;
+	}
+	if (len == 0)
+		return NULL;
+
+	spin_lock(&all_mddevs_lock);
+	list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
+		char *name = mdname(mddev);
+		if (strlen(name) == len &&
+				strncmp(name, container_name, len) == 0) {
+			ret = mddev;
+			break;
+		}
+	}
+	spin_unlock(&all_mddevs_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mddev_find_container);
+
 static struct attribute_group md_redundancy_group;
 
 void mddev_unlock(struct mddev *mddev)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d1e56f8..ca0b68f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -711,4 +711,7 @@ static inline int mddev_is_clustered(struct mddev *mddev)
 {
 	return mddev->cluster_info && mddev->bitmap_info.nodes > 1;
 }
+
+extern struct mddev *mddev_find_container(struct mddev *subarray);
+
 #endif /* _MD_MD_H */
-- 
2.10.1


^ permalink raw reply related

* [PATCH 06/12] raid5-ppl: calculate partial parity
From: Artur Paszkiewicz @ 2016-11-24 12:28 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20161124122847.16456-1-artur.paszkiewicz@intel.com>

Partial parity is calculated as follows:

- reconstruct-write case:
  xor data from all disks in a stripe that are not being updated

- read-modify-write case:
  xor the old parity with the old data from all disks being updated in
  a stripe

Implement it using the async_tx API and integrate into raid_run_ops().
It must be called when we still have access to old data, so do it when
STRIPE_OP_BIODRAIN is set, but before ops_run_prexor5(). The result is
stored into sh->ppl_page.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/raid5.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7d9c253..ba4fff9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1939,6 +1939,49 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
 			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
 }
 
+static struct dma_async_tx_descriptor *
+ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
+		       struct dma_async_tx_descriptor *tx)
+{
+	int disks = sh->disks;
+	struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
+	int count = 0, pd_idx = sh->pd_idx, i;
+	struct async_submit_ctl submit;
+
+	if (test_bit(STRIPE_FULL_WRITE, &sh->state))
+		return tx;
+
+	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+		for (i = disks; i--;) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_bit(R5_Wantdrain, &dev->flags)
+					|| i == pd_idx)
+				xor_srcs[count++] = dev->page;
+		}
+	} else if (sh->reconstruct_state == reconstruct_state_drain_run) {
+		for (i = disks; i--;) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_bit(R5_UPTODATE, &dev->flags))
+				xor_srcs[count++] = dev->page;
+		}
+	} else {
+		return tx;
+	}
+
+	init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, tx,
+			NULL, sh, flex_array_get(percpu->scribble, 0)
+				+ sizeof(struct page *) * (sh->disks + 2));
+
+	if (count == 1)
+		tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE,
+				&submit);
+	else
+		tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE,
+				&submit);
+
+	return tx;
+}
+
 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 {
 	int overlap_clear = 0, i, disks = sh->disks;
@@ -1969,6 +2012,11 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 			async_tx_ack(tx);
 	}
 
+	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request) &&
+			test_bit(MD_HAS_PPL, &conf->mddev->flags) &&
+			conf->disks[sh->pd_idx].rdev)
+		tx = ops_run_partial_parity(sh, percpu, tx);
+
 	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
 		if (level < 6)
 			tx = ops_run_prexor5(sh, percpu, tx);
@@ -3025,6 +3073,33 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
 		goto overlap;
 
+	/*
+	 * PPL does not allow writes to different disks in the same stripe when
+	 * they are not next to each other. Not really an overlap, but
+	 * wait_for_overlap can be used to handle this.
+	 */
+	if (forwrite && test_bit(MD_HAS_PPL, &conf->mddev->flags)) {
+		int i;
+		int di = 0;
+		int first = -1, last = -1;
+		int count = 0;
+		for (i = 0; i < sh->disks; i++) {
+			if (i == sh->pd_idx)
+				continue;
+
+			if (sh->dev[i].towrite || i == dd_idx) {
+				count++;
+				if (first < 0)
+					first = di;
+				last = di;
+			}
+			di++;
+		}
+
+		if (last > first && (last - first > count - 1))
+			goto overlap;
+	}
+
 	if (!forwrite || previous)
 		clear_bit(STRIPE_BATCH_READY, &sh->state);
 
-- 
2.10.1


^ permalink raw reply related

* [PATCH 05/12] raid5-ppl: Partial Parity Log implementation
From: Artur Paszkiewicz @ 2016-11-24 12:28 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20161124122847.16456-1-artur.paszkiewicz@intel.com>

This implements the write logging functionality, using the policy logic
introduced in previous patches.

PPL is a distributed log - data is stored on all RAID member drives in
the metadata area. PPL is written to the parity disk of a particular
stripe. The distributed log is implemented using one r5l_log instance
per array member. These instances are grouped in the child_logs array in
struct ppl_conf, which is assigned to a common parent log. This parent
log serves as a proxy and is used in the raid5 personality code - it is
assigned as _the_ log in r5conf->log. The child logs are where all the
real work is done.

The PPL consists of a 4KB header (struct ppl_header), and at least 128KB
for partial parity data. It is stored right after the array data (for
IMSM) or in the bitmap area (super 1.1 and 1.2) and may be overwritten
on every array write request.

Attach a page for holding the partial parity data to each stripe_head.
Allocate it only if mddev has the MD_HAS_PPL flag set.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/raid5-cache.c |  12 +-
 drivers/md/raid5-cache.h |   6 +
 drivers/md/raid5-ppl.c   | 564 ++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/md/raid5.c       |  15 +-
 drivers/md/raid5.h       |   1 +
 5 files changed, 590 insertions(+), 8 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 9de122a..cb764db 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -119,8 +119,8 @@ static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
 	return log->device_size > used_size + size;
 }
 
-static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
-				    enum r5l_io_unit_state state)
+void __r5l_set_io_unit_state(struct r5l_io_unit *io,
+			     enum r5l_io_unit_state state)
 {
 	if (WARN_ON(io->state >= state))
 		return;
@@ -340,7 +340,7 @@ static void r5c_finish_cache_stripe(struct stripe_head *sh)
 	}
 }
 
-static void r5l_io_run_stripes(struct r5l_io_unit *io)
+void r5l_io_run_stripes(struct r5l_io_unit *io)
 {
 	struct stripe_head *sh, *next;
 
@@ -935,7 +935,7 @@ static sector_t r5l_reclaimable_space(struct r5l_log *log)
 				 r5c_calculate_new_cp(conf));
 }
 
-static void r5l_run_no_mem_stripe(struct r5l_log *log)
+void r5l_run_no_mem_stripe(struct r5l_log *log)
 {
 	struct stripe_head *sh;
 
@@ -1040,7 +1040,7 @@ static void r5l_log_flush_endio(struct bio *bio)
  * only write stripes of an io_unit to raid disks till the io_unit is the first
  * one whose data/parity is in log.
  */
-static void __r5l_flush_stripe_to_raid(struct r5l_log *log)
+void __r5l_flush_stripe_to_raid(struct r5l_log *log)
 {
 	bool do_flush;
 
@@ -1362,7 +1362,7 @@ bool r5l_log_disk_error(struct r5conf *conf)
 	if (!log)
 		ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
 	else
-		ret = test_bit(Faulty, &log->rdev->flags);
+		ret = log->rdev && test_bit(Faulty, &log->rdev->flags);
 	rcu_read_unlock();
 	return ret;
 }
diff --git a/drivers/md/raid5-cache.h b/drivers/md/raid5-cache.h
index 2e19453..40bd5ce 100644
--- a/drivers/md/raid5-cache.h
+++ b/drivers/md/raid5-cache.h
@@ -159,4 +159,10 @@ extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
 extern void r5l_quiesce(struct r5l_log *log, int state);
 extern bool r5l_log_disk_error(struct r5conf *conf);
 
+extern void __r5l_set_io_unit_state(struct r5l_io_unit *io,
+				    enum r5l_io_unit_state state);
+extern void r5l_io_run_stripes(struct r5l_io_unit *io);
+extern void r5l_run_no_mem_stripe(struct r5l_log *log);
+extern void __r5l_flush_stripe_to_raid(struct r5l_log *log);
+
 #endif /* _RAID5_CACHE_H */
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 263fad7..2132823 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -14,7 +14,569 @@
 
 #include <linux/kernel.h>
 #include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/crc32c.h>
+#include <linux/module.h>
+#include <linux/raid/md_p.h>
+#include "md.h"
 #include "raid5.h"
 #include "raid5-cache.h"
 
-struct r5l_policy r5l_ppl;
+static bool ppl_debug;
+module_param(ppl_debug, bool, 0644);
+MODULE_PARM_DESC(ppl_debug, "Debug mode for md raid5 PPL");
+
+#define dbg(format, args...)						\
+do {									\
+	if (ppl_debug)							\
+		printk(KERN_DEBUG"[%d] %s() "format,			\
+			current->pid, __func__, ##args);		\
+} while (0)
+
+struct ppl_conf {
+	int count;
+	struct r5l_log **child_logs;
+};
+
+struct ppl_header_entry {
+	__le64 data_sector;	/* Raid sector of the new data */
+	__le32 pp_size;		/* Length of partial parity */
+	__le32 data_size;	/* Length of data */
+	__u8 parity_disk;	/* Member disk containing parity */
+	__le32 checksum;	/* Checksum of this entry */
+} __packed;
+
+#define PPL_HEADER_SIZE PAGE_SIZE
+#define PPL_HDR_RESERVED 512
+#define PPL_HDR_ENTRY_SPACE \
+	(PPL_HEADER_SIZE - PPL_HDR_RESERVED - 3 * sizeof(u32) - sizeof(u64))
+#define PPL_HDR_MAX_ENTRIES \
+	(PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry))
+#define PPL_ENTRY_SPACE_IMSM (128 * 1024)
+
+struct ppl_header {
+	__u8 reserved[PPL_HDR_RESERVED];/* Reserved space */
+	__le32 signature;		/* Signature (family number of volume) */
+	__le64 generation;		/* Generation number of PP Header */
+	__le32 entries_count;		/* Number of entries in entry array */
+	__le32 checksum;		/* Checksum of PP Header */
+	struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES];
+} __packed;
+
+static void ppl_log_endio(struct bio *bio)
+{
+	struct r5l_io_unit *io = bio->bi_private;
+	struct r5l_log *log = io->log;
+	unsigned long flags;
+
+	dbg("io %p seq: %llu\n", io, io->seq);
+
+	if (bio->bi_error)
+		md_error(log->rdev->mddev, log->rdev);
+
+	bio_put(bio);
+	mempool_free(io->meta_page, log->meta_pool);
+
+	spin_lock_irqsave(&log->io_list_lock, flags);
+	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
+	if (log->need_cache_flush) {
+		list_move_tail(&io->log_sibling, &log->io_end_ios);
+	} else {
+		list_move_tail(&io->log_sibling, &log->finished_ios);
+		r5l_io_run_stripes(io);
+	}
+	spin_unlock_irqrestore(&log->io_list_lock, flags);
+
+	if (log->need_cache_flush)
+		md_wakeup_thread(log->rdev->mddev->thread);
+}
+
+static struct r5l_io_unit *ppl_new_iounit(struct r5l_log *log,
+					  struct stripe_head *sh)
+{
+	struct r5l_io_unit *io;
+	struct ppl_header *pplhdr;
+	struct r5conf *conf = log->rdev->mddev->private;
+	struct r5l_log *parent_log = conf->log;
+
+	io = mempool_alloc(log->io_pool, GFP_ATOMIC);
+	if (!io)
+		return NULL;
+
+	memset(io, 0, sizeof(*io));
+	io->log = log;
+	INIT_LIST_HEAD(&io->log_sibling);
+	INIT_LIST_HEAD(&io->stripe_list);
+	io->state = IO_UNIT_RUNNING;
+
+	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
+	pplhdr = page_address(io->meta_page);
+	clear_page(pplhdr);
+	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
+	pplhdr->signature = cpu_to_le32(log->uuid_checksum);
+
+	io->current_bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
+	bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, 0);
+
+	io->current_bio->bi_bdev = log->rdev->bdev;
+	io->current_bio->bi_iter.bi_sector = log->rdev->ppl.sector;
+	io->current_bio->bi_end_io = ppl_log_endio;
+	io->current_bio->bi_private = io;
+	bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
+
+	spin_lock(&parent_log->io_list_lock);
+	io->seq = parent_log->seq++;
+	spin_unlock(&parent_log->io_list_lock);
+	pplhdr->generation = cpu_to_le64(io->seq);
+
+	return io;
+}
+
+static int ppl_log_stripe(struct r5l_log *log, struct stripe_head *sh)
+{
+	struct r5l_io_unit *io;
+	struct ppl_header *pplhdr;
+	struct ppl_header_entry *pplhdr_entry = NULL;
+	int i;
+	sector_t data_sector;
+	unsigned long flags;
+	int data_disks = 0;
+	unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE;
+
+	dbg("<%llu>\n", (unsigned long long)sh->sector);
+
+	io = log->current_io;
+	if (!io) {
+		io = ppl_new_iounit(log, sh);
+		if (!io)
+			return -ENOMEM;
+		spin_lock_irqsave(&log->io_list_lock, flags);
+		list_add_tail(&io->log_sibling, &log->running_ios);
+		spin_unlock_irqrestore(&log->io_list_lock, flags);
+	} else if (io->meta_offset >= entry_space) {
+		/*
+		 * this io_unit is full - set meta_offset to -1 to
+		 * indicate that other units are waiting for this one
+		 */
+		io->meta_offset = -1;
+
+		dbg("add blocked io_unit by %p seq: %llu\n", io, io->seq);
+		io = ppl_new_iounit(log, sh);
+		if (!io) {
+			log->current_io->meta_offset = entry_space;
+			return -ENOMEM;
+		}
+		/*
+		 * reuse need_split_bio to mark that this io_unit is
+		 * blocked by an other
+		 */
+		io->need_split_bio = true;
+
+		spin_lock_irqsave(&log->io_list_lock, flags);
+		list_add_tail(&io->log_sibling, &log->running_ios);
+		spin_unlock_irqrestore(&log->io_list_lock, flags);
+	}
+
+	log->current_io = io;
+	io->meta_offset += PAGE_SIZE;
+
+	for (i = 0; i < sh->disks; i++) {
+		struct r5dev *dev = &sh->dev[i];
+		if (i != sh->pd_idx && test_bit(R5_LOCKED, &dev->flags)) {
+			if (!data_disks)
+				data_sector = dev->sector;
+			data_disks++;
+		}
+	}
+	BUG_ON(!data_disks);
+
+	dbg("io: %p seq: %llu data_sector: %llu data_disks: %d\n",
+	    io, io->seq, (unsigned long long)data_sector, data_disks);
+	pplhdr = page_address(io->meta_page);
+
+	if (pplhdr->entries_count > 0) {
+		/* check if we can merge with the previous entry */
+		struct ppl_header_entry *prev;
+		prev = &pplhdr->entries[pplhdr->entries_count-1];
+
+		if ((prev->data_sector + (prev->pp_size >> 9) == data_sector) &&
+		    (prev->data_size == prev->pp_size * data_disks) &&
+		    (data_sector >> ilog2(sh->raid_conf->chunk_sectors) ==
+		     prev->data_sector >> ilog2(sh->raid_conf->chunk_sectors)))
+			pplhdr_entry = prev;
+	}
+
+	if (pplhdr_entry) {
+		pplhdr_entry->data_size += PAGE_SIZE * data_disks;
+		pplhdr_entry->pp_size += PAGE_SIZE;
+	} else {
+		pplhdr_entry = &pplhdr->entries[pplhdr->entries_count++];
+		pplhdr_entry->data_sector = data_sector;
+		pplhdr_entry->data_size = PAGE_SIZE * data_disks;
+		pplhdr_entry->pp_size = PAGE_SIZE;
+		pplhdr_entry->parity_disk = sh->pd_idx;
+	}
+
+	BUG_ON(pplhdr->entries_count > PPL_HDR_MAX_ENTRIES);
+
+	if (test_bit(STRIPE_FULL_WRITE, &sh->state))
+		bio_add_page(io->current_bio, ZERO_PAGE(0), PAGE_SIZE, 0);
+	else
+		bio_add_page(io->current_bio, sh->ppl_page, PAGE_SIZE, 0);
+
+	list_add_tail(&sh->log_list, &io->stripe_list);
+	atomic_inc(&io->pending_stripe);
+	sh->log_io = io;
+
+	return 0;
+}
+
+static int ppl_write_stripe(struct r5l_log *log, struct stripe_head *sh)
+{
+	struct r5l_io_unit *io = sh->log_io;
+
+	if (io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
+	    test_bit(STRIPE_SYNCING, &sh->state) || !log || !log->rdev ||
+	    test_bit(Faulty, &log->rdev->flags)) {
+		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
+		return -EAGAIN;
+	}
+
+	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
+	clear_bit(STRIPE_DELAYED, &sh->state);
+	atomic_inc(&sh->count);
+
+	mutex_lock(&log->io_mutex);
+	if (ppl_log_stripe(log, sh)) {
+		spin_lock_irq(&log->io_list_lock);
+		list_add_tail(&sh->log_list, &log->no_mem_stripes);
+		spin_unlock_irq(&log->io_list_lock);
+	}
+	mutex_unlock(&log->io_mutex);
+
+	return 0;
+}
+
+static void ppl_submit_iounit(struct r5l_io_unit *io)
+{
+	struct mddev *mddev = io->log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
+	int chunk_pages = conf->chunk_sectors >> (PAGE_SHIFT - 9);
+	int block_size = queue_logical_block_size(mddev->queue);
+	struct ppl_header *pplhdr = page_address(io->meta_page);
+	struct bio *bio = io->current_bio;
+	int i;
+	int bvi = 1;
+
+	dbg("io %p seq: %llu\n", io, io->seq);
+
+	for (i = 0; i < pplhdr->entries_count; i++) {
+		struct ppl_header_entry *e = &pplhdr->entries[i];
+		u32 crc = ~0;
+		u32 pp_size;
+
+		if (e->pp_size >> 9 == conf->chunk_sectors &&
+				e->data_size == e->pp_size *
+				(conf->raid_disks - conf->max_degraded)) {
+			int x;
+
+			for (x = bvi; x < bio->bi_vcnt - chunk_pages; x++)
+				bio->bi_io_vec[x] = bio->bi_io_vec[x + chunk_pages];
+
+			bio->bi_vcnt -= chunk_pages;
+			bio->bi_iter.bi_size -= chunk_pages << PAGE_SHIFT;
+			e->pp_size = 0;
+		}
+
+		pp_size = e->pp_size;
+
+		while (pp_size) {
+			void *addr = page_address(bio->bi_io_vec[bvi].bv_page);
+			crc = crc32c_le(crc, addr, PAGE_SIZE);
+			pp_size -= PAGE_SIZE;
+			bvi++;
+		}
+
+		dbg("    entry: %d, data sector: %llu, PPL size: %u, data size %u\n",
+		    i, e->data_sector, e->pp_size, e->data_size);
+
+		e->data_sector = cpu_to_le64(e->data_sector >>
+				ilog2(block_size >> 9));
+		e->pp_size = cpu_to_le32(e->pp_size);
+		e->data_size = cpu_to_le32(e->data_size);
+		e->checksum = cpu_to_le32(~crc);
+	}
+	pplhdr->entries_count = cpu_to_le32(pplhdr->entries_count);
+	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));
+
+	dbg("submit_bio() size: %u sector: %llu dev: %s\n",
+	    bio->bi_iter.bi_size, (unsigned long long)bio->bi_iter.bi_sector,
+	    bio->bi_bdev->bd_disk->disk_name);
+	submit_bio(bio);
+}
+
+static void ppl_submit_current_io(struct r5l_log *log)
+{
+	struct r5l_io_unit *io, *io_submit = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&log->io_list_lock, flags);
+	list_for_each_entry(io, &log->running_ios, log_sibling) {
+		if (io->state >= IO_UNIT_IO_START)
+			break;
+
+		if (io->state == IO_UNIT_RUNNING && !io->need_split_bio) {
+			__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+
+			if (io == log->current_io) {
+				BUG_ON(io->meta_offset < 0);
+				log->current_io = NULL;
+			}
+
+			io_submit = io;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&log->io_list_lock, flags);
+
+	if (io_submit)
+		ppl_submit_iounit(io_submit);
+}
+
+static void ppl_write_stripe_run(struct r5l_log *log)
+{
+	mutex_lock(&log->io_mutex);
+	ppl_submit_current_io(log);
+	mutex_unlock(&log->io_mutex);
+}
+
+static void __ppl_stripe_write_finished(struct r5l_io_unit *io)
+{
+	struct r5l_log *log = io->log;
+	unsigned long flags;
+
+	dbg("io %p seq: %llu\n", io, io->seq);
+
+	spin_lock_irqsave(&log->io_list_lock, flags);
+
+	if (io->meta_offset < 0) {
+		struct r5l_io_unit *io_next = list_first_entry(&log->running_ios,
+				struct r5l_io_unit, log_sibling);
+		BUG_ON(!io_next->need_split_bio);
+		io_next->need_split_bio = false;
+	}
+
+	list_del(&io->log_sibling);
+	mempool_free(io, log->io_pool);
+	r5l_run_no_mem_stripe(log);
+
+	spin_unlock_irqrestore(&log->io_list_lock, flags);
+}
+
+static void ppl_exit_log_child(struct r5l_log *log)
+{
+	clear_bit(JournalPpl, &log->rdev->flags);
+	kfree(log);
+}
+
+static void __ppl_exit_log(struct r5l_log *log)
+{
+	struct ppl_conf *ppl_conf = log->private;
+
+	if (ppl_conf->child_logs) {
+		struct r5l_log *log_child;
+		int i;
+
+		for (i = 0; i < ppl_conf->count; i++) {
+			log_child = ppl_conf->child_logs[i];
+			if (!log_child)
+				continue;
+
+			clear_bit(MD_HAS_PPL, &log_child->rdev->mddev->flags);
+			ppl_exit_log_child(log_child);
+		}
+		kfree(ppl_conf->child_logs);
+	}
+	kfree(ppl_conf);
+
+	mempool_destroy(log->meta_pool);
+	if (log->bs)
+		bioset_free(log->bs);
+	mempool_destroy(log->io_pool);
+	kmem_cache_destroy(log->io_kc);
+}
+
+static int ppl_init_log_child(struct r5l_log *log_parent,
+			      struct md_rdev *rdev, struct r5l_log **log_child)
+{
+	struct r5l_log *log;
+	struct request_queue *q;
+
+	log = kzalloc(sizeof(struct r5l_log), GFP_KERNEL);
+	if (!log)
+		return -ENOMEM;
+
+	*log_child = log;
+	log->rdev = rdev;
+
+	mutex_init(&log->io_mutex);
+	spin_lock_init(&log->io_list_lock);
+	INIT_LIST_HEAD(&log->running_ios);
+	INIT_LIST_HEAD(&log->io_end_ios);
+	INIT_LIST_HEAD(&log->flushing_ios);
+	INIT_LIST_HEAD(&log->finished_ios);
+	INIT_LIST_HEAD(&log->no_mem_stripes);
+	bio_init(&log->flush_bio);
+
+	log->io_kc = log_parent->io_kc;
+	log->io_pool = log_parent->io_pool;
+	log->bs = log_parent->bs;
+	log->meta_pool = log_parent->meta_pool;
+	log->uuid_checksum = log_parent->uuid_checksum;
+
+	if (rdev->mddev->external) {
+		log->rdev->ppl.sector = log->rdev->data_offset +
+					log->rdev->sectors;
+		log->rdev->ppl.size = (PPL_HEADER_SIZE +
+				       PPL_ENTRY_SPACE_IMSM) << 9;
+	} else {
+		log->rdev->ppl.sector = log->rdev->sb_start +
+					log->rdev->ppl.offset;
+	}
+	log->policy = log_parent->policy;
+	q = bdev_get_queue(log->rdev->bdev);
+	log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
+
+	set_bit(JournalPpl, &rdev->flags);
+
+	return 0;
+}
+
+static int __ppl_init_log(struct r5l_log *log, struct r5conf *conf)
+{
+	struct ppl_conf *ppl_conf;
+	struct mddev *mddev = conf->mddev;
+	int ret;
+	int i;
+
+	if (PAGE_SIZE != 4096)
+		return -EINVAL;
+
+	ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
+	if (!ppl_conf)
+		return -ENOMEM;
+	log->private = ppl_conf;
+
+	if (!mddev->external)
+		log->uuid_checksum = crc32c_le(~0, mddev->uuid,
+					       sizeof(mddev->uuid));
+
+	if (mddev->bitmap) {
+		pr_err("PPL is not compatible with bitmap\n");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	spin_lock_init(&log->io_list_lock);
+
+	log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
+	if (!log->io_kc) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	log->io_pool = mempool_create_slab_pool(conf->raid_disks, log->io_kc);
+	if (!log->io_pool) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	log->bs = bioset_create(conf->raid_disks, 0);
+	if (!log->bs) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	log->meta_pool = mempool_create_page_pool(conf->raid_disks, 0);
+	if (!log->meta_pool) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	log->need_cache_flush = true;
+
+	ppl_conf->count = conf->raid_disks;
+	ppl_conf->child_logs = kzalloc(sizeof(struct r5l_log *) * ppl_conf->count,
+				       GFP_KERNEL);
+	if (!ppl_conf->child_logs) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	for (i = 0; i < ppl_conf->count; i++) {
+		struct r5l_log *log_child;
+		struct md_rdev *rdev = conf->disks[i].rdev;
+
+		if (!rdev)
+			continue;
+
+		ret = ppl_init_log_child(log, rdev, &log_child);
+		if (ret)
+			goto err;
+
+		ppl_conf->child_logs[i] = log_child;
+	}
+
+	rcu_assign_pointer(conf->log, log);
+	set_bit(MD_HAS_PPL, &mddev->flags);
+
+	return 0;
+err:
+	__ppl_exit_log(log);
+	return ret;
+}
+
+static int __ppl_write_stripe(struct r5l_log *log, struct stripe_head *sh)
+{
+	struct ppl_conf *ppl_conf = log->private;
+	struct r5l_log *log_child = ppl_conf->child_logs[sh->pd_idx];
+
+	return ppl_write_stripe(log_child, sh);
+}
+
+static void __ppl_write_stripe_run(struct r5l_log *log)
+{
+	struct ppl_conf *ppl_conf = log->private;
+	struct r5l_log *log_child;
+	int i;
+
+	for (i = 0; i < ppl_conf->count; i++) {
+		log_child = ppl_conf->child_logs[i];
+		if (log_child)
+			ppl_write_stripe_run(log_child);
+	}
+}
+
+static void __ppl_flush_stripe_to_raid(struct r5l_log *log)
+{
+	struct ppl_conf *ppl_conf = log->private;
+	struct r5l_log *log_child;
+	int i;
+
+	for (i = 0; i < ppl_conf->count; i++) {
+		log_child = ppl_conf->child_logs[i];
+		if (log_child)
+			__r5l_flush_stripe_to_raid(log_child);
+	}
+}
+
+struct r5l_policy r5l_ppl = {
+	.init_log = __ppl_init_log,
+	.exit_log = __ppl_exit_log,
+	.write_stripe = __ppl_write_stripe,
+	.write_stripe_run = __ppl_write_stripe_run,
+	.flush_stripe_to_raid = __ppl_flush_stripe_to_raid,
+	.stripe_write_finished = __ppl_stripe_write_finished,
+	.handle_flush_request = NULL,
+	.quiesce = NULL,
+};
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index fdf2503..7d9c253 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -464,6 +464,11 @@ static void shrink_buffers(struct stripe_head *sh)
 		sh->dev[i].page = NULL;
 		put_page(p);
 	}
+
+	if (sh->ppl_page) {
+		put_page(sh->ppl_page);
+		sh->ppl_page = NULL;
+	}
 }
 
 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
@@ -480,6 +485,13 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 		sh->dev[i].page = page;
 		sh->dev[i].orig_page = page;
 	}
+
+	if (test_bit(MD_HAS_PPL, &sh->raid_conf->mddev->flags)) {
+		sh->ppl_page = alloc_page(gfp);
+		if (!sh->ppl_page)
+			return 1;
+	}
+
 	return 0;
 }
 
@@ -875,7 +887,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 	might_sleep();
 
-	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+	if (!test_bit(STRIPE_R5C_CACHING, &sh->state) ||
+	    test_bit(MD_HAS_PPL, &conf->mddev->flags)) {
 		/* writing out phase */
 		if (r5l_write_stripe(conf->log, sh) == 0)
 			return;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 1851be8..9b6b912 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -228,6 +228,7 @@ struct stripe_head {
 	struct list_head	log_list;
 	sector_t		log_start; /* first meta block on the journal */
 	struct list_head	r5c; /* for r5c_cache->stripe_in_journal */
+	struct page		*ppl_page;
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
-- 
2.10.1


^ permalink raw reply related

* [PATCH 04/12] md: superblock changes for PPL
From: Artur Paszkiewicz @ 2016-11-24 12:28 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20161124122847.16456-1-artur.paszkiewicz@intel.com>

Include information about PPL location and size in mdp_superblock_1
and copy it to/from rdev. Because PPL is mutually exclusive with bitmap,
put it in place of 'bitmap_offset'. Add a new flag MD_FEATURE_PPL for
'feature_map', analogous to MD_FEATURE_BITMAP_OFFSET. Add MD_HAS_PPL
to mddev->flags to indicate that PPL is enabled on an array.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/md.c                | 14 ++++++++++++++
 drivers/md/md.h                |  8 ++++++++
 drivers/md/raid5.c             |  7 +++++++
 include/uapi/linux/raid/md_p.h | 18 ++++++++++++++----
 4 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4876687..7028d54 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1524,6 +1524,11 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
 	} else if (sb->bblog_offset != 0)
 		rdev->badblocks.shift = 0;
 
+	if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) {
+		rdev->ppl.offset = le16_to_cpu(sb->ppl.offset);
+		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
+	}
+
 	if (!refdev) {
 		ret = 1;
 	} else {
@@ -1636,6 +1641,9 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 
 		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
 			set_bit(MD_HAS_JOURNAL, &mddev->flags);
+
+		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL)
+			set_bit(MD_HAS_PPL, &mddev->flags);
 	} else if (mddev->pers == NULL) {
 		/* Insist of good event counter while assembling, except for
 		 * spares (which don't need an event count) */
@@ -1849,6 +1857,12 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
 		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
 
+	if (test_bit(MD_HAS_PPL, &mddev->flags)) {
+		sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
+		sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
+		sb->ppl.size = cpu_to_le16(rdev->ppl.size);
+	}
+
 	rdev_for_each(rdev2, mddev) {
 		i = rdev2->desc_nr;
 		if (test_bit(Faulty, &rdev2->flags))
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 2fc75ac..d1e56f8 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -122,6 +122,13 @@ struct md_rdev {
 					   * sysfs entry */
 
 	struct badblocks badblocks;
+
+	struct {
+		unsigned int offset; /* Offset from superblock to start of PPL.
+				      * Not used by external metadata. */
+		unsigned int size;   /* Size in sectors of the PPL space */
+		sector_t sector;     /* First sector of the PPL space */
+	} ppl;
 };
 enum flag_bits {
 	Faulty,			/* device is known to have a fault */
@@ -235,6 +242,7 @@ enum mddev_flags {
 				 * never cause the array to become failed.
 				 */
 	MD_NEED_REWRITE,	/* metadata write needs to be repeated */
+	MD_HAS_PPL,		/* The raid array has PPL feature set */
 };
 #define MD_UPDATE_SB_FLAGS (BIT(MD_CHANGE_DEVS) | \
 			    BIT(MD_CHANGE_CLEAN) | \
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 26e4045..fdf2503 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6795,6 +6795,10 @@ static int raid5_run(struct mddev *mddev)
 	rdev_for_each(rdev, mddev) {
 		long long diff;
 
+		if (test_bit(MD_HAS_PPL, &mddev->flags) &&
+		    test_bit(In_sync, &rdev->flags))
+			set_bit(JournalPpl, &rdev->flags);
+
 		if (test_bit(JournalPpl, &rdev->flags) &&
 		    test_bit(In_sync, &rdev->flags))
 			ppl_disks++;
@@ -6817,6 +6821,9 @@ static int raid5_run(struct mddev *mddev)
 			min_offset_diff = diff;
 	}
 
+	if (ppl_disks)
+		set_bit(MD_HAS_PPL, &mddev->flags);
+
 	if (mddev->reshape_position != MaxSector) {
 		/* Check that we can continue the reshape.
 		 * Difficulties arise if the stripe we would write to
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index 9930f3e..455caa8 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -242,10 +242,18 @@ struct mdp_superblock_1 {
 
 	__le32	chunksize;	/* in 512byte sectors */
 	__le32	raid_disks;
-	__le32	bitmap_offset;	/* sectors after start of superblock that bitmap starts
-				 * NOTE: signed, so bitmap can be before superblock
-				 * only meaningful of feature_map[0] is set.
-				 */
+	union {
+		__le32	bitmap_offset;	/* sectors after start of superblock that bitmap starts
+					 * NOTE: signed, so bitmap can be before superblock
+					 * only meaningful of feature_map[0] is set.
+					 */
+
+		/* only meaningful when feature_map[MD_FEATURE_PPL] is set */
+		struct {
+			__le16 offset; /* sectors after start of superblock that ppl starts */
+			__le16 size; /* PPL size (including header) in sectors */
+		} ppl;
+	};
 
 	/* These are only valid with feature bit '4' */
 	__le32	new_level;	/* new level we are reshaping to		*/
@@ -318,6 +326,7 @@ struct mdp_superblock_1 {
 					     */
 #define MD_FEATURE_CLUSTERED		256 /* clustered MD */
 #define	MD_FEATURE_JOURNAL		512 /* support write cache */
+#define	MD_FEATURE_PPL			1024 /* support PPL */
 #define	MD_FEATURE_ALL			(MD_FEATURE_BITMAP_OFFSET	\
 					|MD_FEATURE_RECOVERY_OFFSET	\
 					|MD_FEATURE_RESHAPE_ACTIVE	\
@@ -328,6 +337,7 @@ struct mdp_superblock_1 {
 					|MD_FEATURE_RECOVERY_BITMAP	\
 					|MD_FEATURE_CLUSTERED		\
 					|MD_FEATURE_JOURNAL		\
+					|MD_FEATURE_PPL			\
 					)
 
 struct r5l_payload_header {
-- 
2.10.1


^ permalink raw reply related

* [PATCH 03/12] raid5-cache: add a new policy
From: Artur Paszkiewicz @ 2016-11-24 12:28 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20161124122847.16456-1-artur.paszkiewicz@intel.com>

Add a source file for the new policy implementation and allow selecting
the policy based on the policy_type parameter in r5l_init_log().

Introduce a new rdev state flag, JournalPpl, settable by writing
'journal_ppl' to the rdev 'state' sysfs attribute, to allow enabling the
new policy from userspace.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/Makefile      |  2 +-
 drivers/md/md.c          |  5 +++++
 drivers/md/md.h          |  3 +++
 drivers/md/raid5-cache.c | 17 +++++++++++++++--
 drivers/md/raid5-cache.h |  9 ++++++++-
 drivers/md/raid5-ppl.c   | 20 ++++++++++++++++++++
 drivers/md/raid5.c       | 42 ++++++++++++++++++++++++++++++++++++------
 7 files changed, 88 insertions(+), 10 deletions(-)
 create mode 100644 drivers/md/raid5-ppl.c

diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 3cbda1a..4d48714 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -18,7 +18,7 @@ dm-cache-cleaner-y += dm-cache-policy-cleaner.o
 dm-era-y	+= dm-era-target.o
 dm-verity-y	+= dm-verity-target.o
 md-mod-y	+= md.o bitmap.o
-raid456-y	+= raid5.o raid5-cache.o
+raid456-y	+= raid5.o raid5-cache.o raid5-ppl.o
 
 # Note: link order is important.  All raid personalities
 # and must come before md.o, as they each initialise 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c7894fb..4876687 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2575,6 +2575,8 @@ state_show(struct md_rdev *rdev, char *page)
 		len += sprintf(page+len, "journal%s", sep);
 	if (test_bit(WriteMostly, &flags))
 		len += sprintf(page+len, "write_mostly%s", sep);
+	if (test_bit(JournalPpl, &flags))
+		len += sprintf(page+len, "journal_ppl%s", sep);
 	if (test_bit(Blocked, &flags) ||
 	    (rdev->badblocks.unacked_exist
 	     && !test_bit(Faulty, &flags)))
@@ -2753,6 +2755,9 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 	} else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
 		clear_bit(ExternalBbl, &rdev->flags);
 		err = 0;
+	} else if (cmd_match(buf, "journal_ppl")) {
+		set_bit(JournalPpl, &rdev->flags);
+		err = 0;
 	}
 	if (!err)
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 5c08f84..2fc75ac 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -172,6 +172,9 @@ enum flag_bits {
 				 * Usually, this device should be faster
 				 * than other devices in the array
 				 */
+	JournalPpl,		/* This device is used for raid5
+				 * Partial Parity Log.
+				 */
 	ClusterRemove,
 	RemoveSynchronized,	/* synchronize_rcu() was called after
 				 * this device was known to be faulty,
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index b10bd4c..9de122a 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2539,16 +2539,29 @@ struct r5l_policy r5l_journal = {
 	.handle_flush_request = __r5l_handle_flush_request,
 	.quiesce = __r5l_quiesce,
 };
+extern struct r5l_policy r5l_ppl;
 
-int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
+int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev, int policy_type)
 {
 	int ret;
 	struct r5l_log *log = kzalloc(sizeof(*log), GFP_KERNEL);
 	if (!log)
 		return -ENOMEM;
 
+	switch (policy_type) {
+	case RWH_POLICY_JOURNAL:
+		log->policy = &r5l_journal;
+		break;
+	case RWH_POLICY_PPL:
+		log->policy = &r5l_ppl;
+		break;
+	default:
+		kfree(log);
+		return -EINVAL;
+	}
+
 	log->rdev = rdev;
-	log->policy = &r5l_journal;
+	log->rwh_policy = policy_type;
 
 	ret = log->policy->init_log(log, conf);
 	if (ret)
diff --git a/drivers/md/raid5-cache.h b/drivers/md/raid5-cache.h
index c3028a1..2e19453 100644
--- a/drivers/md/raid5-cache.h
+++ b/drivers/md/raid5-cache.h
@@ -81,6 +81,13 @@ struct r5l_log {
 	struct work_struct deferred_io_work;
 
 	struct r5l_policy *policy;
+	enum {
+		RWH_POLICY_OFF,
+		RWH_POLICY_JOURNAL,
+		RWH_POLICY_PPL,
+	} rwh_policy;
+
+	void *private;
 };
 
 /*
@@ -142,7 +149,7 @@ struct r5l_policy {
 	void (*quiesce)(struct r5l_log *log, int state);
 };
 
-extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
+extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev, int policy_type);
 extern void r5l_exit_log(struct r5l_log *log);
 extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh);
 extern void r5l_write_stripe_run(struct r5l_log *log);
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
new file mode 100644
index 0000000..263fad7
--- /dev/null
+++ b/drivers/md/raid5-ppl.c
@@ -0,0 +1,20 @@
+/*
+ * Partial Parity Log for closing the RAID5 write hole
+ * Copyright (c) 2016, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include "raid5.h"
+#include "raid5-cache.h"
+
+struct r5l_policy r5l_ppl;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 58ee1d3..26e4045 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6779,8 +6779,10 @@ static int raid5_run(struct mddev *mddev)
 	struct r5conf *conf;
 	int working_disks = 0;
 	int dirty_parity_disks = 0;
+	int ppl_disks = 0;
 	struct md_rdev *rdev;
 	struct md_rdev *journal_dev = NULL;
+	int rwh_policy = RWH_POLICY_OFF;
 	sector_t reshape_offset = 0;
 	int i;
 	long long min_offset_diff = 0;
@@ -6793,6 +6795,10 @@ static int raid5_run(struct mddev *mddev)
 	rdev_for_each(rdev, mddev) {
 		long long diff;
 
+		if (test_bit(JournalPpl, &rdev->flags) &&
+		    test_bit(In_sync, &rdev->flags))
+			ppl_disks++;
+
 		if (test_bit(Journal, &rdev->flags)) {
 			journal_dev = rdev;
 			continue;
@@ -6983,6 +6989,22 @@ static int raid5_run(struct mddev *mddev)
 		goto abort;
 	}
 
+	if (ppl_disks) {
+		if (ppl_disks != working_disks) {
+			pr_err("md/raid:%s: distributed PPL must be enabled on all member devices - aborting\n",
+			       mdname(mddev));
+			goto abort;
+		}
+		rwh_policy = RWH_POLICY_PPL;
+	}
+
+	if (journal_dev) {
+		if (ppl_disks)
+			pr_warn("md/raid:%s: using journal device and PPL not allowed - ignoring PPL\n",
+				mdname(mddev));
+		rwh_policy = RWH_POLICY_JOURNAL;
+	}
+
 	/* device size must be a multiple of chunk size */
 	mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
 	mddev->resync_max_sectors = mddev->dev_sectors;
@@ -7108,12 +7130,17 @@ static int raid5_run(struct mddev *mddev)
 		blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
 	}
 
-	if (journal_dev) {
-		char b[BDEVNAME_SIZE];
+	if (rwh_policy) {
+		if (journal_dev) {
+			char b[BDEVNAME_SIZE];
 
-		pr_debug("md/raid:%s: using device %s as journal\n",
-			 mdname(mddev), bdevname(journal_dev->bdev, b));
-		if (r5l_init_log(conf, journal_dev))
+			pr_debug("md/raid:%s: using device %s as journal\n",
+				 mdname(mddev), bdevname(journal_dev->bdev, b));
+		} else if (rwh_policy == RWH_POLICY_PPL) {
+			pr_debug("md/raid:%s: enabling distributed PPL journal\n",
+				 mdname(mddev));
+		}
+		if (r5l_init_log(conf, journal_dev, rwh_policy))
 			goto abort;
 	}
 
@@ -7309,6 +7336,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 
 	if (test_bit(Journal, &rdev->flags)) {
 		char b[BDEVNAME_SIZE];
+		int ret;
 		if (conf->log)
 			return -EBUSY;
 
@@ -7317,7 +7345,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * The array is in readonly mode if journal is missing, so no
 		 * write requests running. We should be safe
 		 */
-		r5l_init_log(conf, rdev);
+		ret = r5l_init_log(conf, rdev, RWH_POLICY_JOURNAL);
+		if (ret)
+			return ret;
 		pr_debug("md/raid:%s: using device %s as journal\n",
 			 mdname(mddev), bdevname(rdev->bdev, b));
 		return 0;
-- 
2.10.1


^ permalink raw reply related

* [PATCH 02/12] raid5-cache: add policy logic
From: Artur Paszkiewicz @ 2016-11-24 12:28 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20161124122847.16456-1-artur.paszkiewicz@intel.com>

Add a struct r5l_policy and wrapper functions for non-static functions
from raid5-cache. This allows adding different policies for raid5
logging without changing the mechanism - calls from the raid5
personality stay the same.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/raid5-cache.c | 112 +++++++++++++++++++++++++++++++++++------------
 drivers/md/raid5-cache.h |  13 ++++++
 2 files changed, 98 insertions(+), 27 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index fc6f9fa..b10bd4c 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -727,7 +727,7 @@ static inline void r5l_add_no_space_stripe(struct r5l_log *log,
  * running in raid5d, where reclaim could wait for raid5d too (when it flushes
  * data from log to raid disks), so we shouldn't wait for reclaim here
  */
-int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
+static int __r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 {
 	struct r5conf *conf = sh->raid_conf;
 	int write_disks = 0;
@@ -737,8 +737,6 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 	int ret = 0;
 	bool wake_reclaim = false;
 
-	if (!log)
-		return -EAGAIN;
 	/* Don't support stripe batch */
 	if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
 	    test_bit(STRIPE_SYNCING, &sh->state)) {
@@ -825,19 +823,28 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 	return 0;
 }
 
-void r5l_write_stripe_run(struct r5l_log *log)
+int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
+{
+	if (log && log->policy->write_stripe)
+		return log->policy->write_stripe(log, sh);
+	return -EAGAIN;
+}
+
+static void __r5l_write_stripe_run(struct r5l_log *log)
 {
-	if (!log)
-		return;
 	mutex_lock(&log->io_mutex);
 	r5l_submit_current_io(log);
 	mutex_unlock(&log->io_mutex);
 }
 
-int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
+void r5l_write_stripe_run(struct r5l_log *log)
+{
+	if (log && log->policy->write_stripe_run)
+		log->policy->write_stripe_run(log);
+}
+
+static int __r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
 {
-	if (!log)
-		return -ENODEV;
 
 	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
 		/*
@@ -869,6 +876,13 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
 	return -EAGAIN;
 }
 
+int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
+{
+	if (log && log->policy->handle_flush_request)
+		return log->policy->handle_flush_request(log, bio);
+	return -ENODEV;
+}
+
 /* This will run after log space is reclaimed */
 static void r5l_run_no_space_stripes(struct r5l_log *log)
 {
@@ -990,8 +1004,9 @@ void r5l_stripe_write_finished(struct stripe_head *sh)
 	io = sh->log_io;
 	sh->log_io = NULL;
 
-	if (io && atomic_dec_and_test(&io->pending_stripe))
-		__r5l_stripe_write_finished(io);
+	if (io && atomic_dec_and_test(&io->pending_stripe) &&
+			io->log->policy->stripe_write_finished)
+		io->log->policy->stripe_write_finished(io);
 }
 
 static void r5l_log_flush_endio(struct bio *bio)
@@ -1025,11 +1040,11 @@ static void r5l_log_flush_endio(struct bio *bio)
  * only write stripes of an io_unit to raid disks till the io_unit is the first
  * one whose data/parity is in log.
  */
-void r5l_flush_stripe_to_raid(struct r5l_log *log)
+static void __r5l_flush_stripe_to_raid(struct r5l_log *log)
 {
 	bool do_flush;
 
-	if (!log || !log->need_cache_flush)
+	if (!log->need_cache_flush)
 		return;
 
 	spin_lock_irq(&log->io_list_lock);
@@ -1051,6 +1066,12 @@ void r5l_flush_stripe_to_raid(struct r5l_log *log)
 	submit_bio(&log->flush_bio);
 }
 
+void r5l_flush_stripe_to_raid(struct r5l_log *log)
+{
+	if (log && log->policy->flush_stripe_to_raid)
+		log->policy->flush_stripe_to_raid(log);
+}
+
 static void r5l_write_super(struct r5l_log *log, sector_t cp);
 static void r5l_write_super_and_discard_space(struct r5l_log *log,
 	sector_t end)
@@ -1307,10 +1328,10 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
 	md_wakeup_thread(log->reclaim_thread);
 }
 
-void r5l_quiesce(struct r5l_log *log, int state)
+static void __r5l_quiesce(struct r5l_log *log, int state)
 {
 	struct mddev *mddev;
-	if (!log || state == 2)
+	if (state == 2)
 		return;
 	if (state == 0)
 		kthread_unpark(log->reclaim_thread->tsk);
@@ -1324,6 +1345,12 @@ void r5l_quiesce(struct r5l_log *log, int state)
 	}
 }
 
+void r5l_quiesce(struct r5l_log *log, int state)
+{
+	if (log && log->policy->quiesce)
+		log->policy->quiesce(log, state);
+}
+
 bool r5l_log_disk_error(struct r5conf *conf)
 {
 	struct r5l_log *log;
@@ -2389,11 +2416,9 @@ static int r5l_load_log(struct r5l_log *log)
 	return ret;
 }
 
-int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
+static int __r5l_init_log(struct r5l_log *log, struct r5conf *conf)
 {
-	struct request_queue *q = bdev_get_queue(rdev->bdev);
-	struct r5l_log *log;
-
+	struct request_queue *q = bdev_get_queue(log->rdev->bdev);
 	if (PAGE_SIZE != 4096)
 		return -EINVAL;
 
@@ -2412,15 +2437,11 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 		return -EINVAL;
 	}
 
-	log = kzalloc(sizeof(*log), GFP_KERNEL);
-	if (!log)
-		return -ENOMEM;
-	log->rdev = rdev;
 
 	log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
 
-	log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
-				       sizeof(rdev->mddev->uuid));
+	log->uuid_checksum = crc32c_le(~0, log->rdev->mddev->uuid,
+				       sizeof(log->rdev->mddev->uuid));
 
 	mutex_init(&log->io_mutex);
 
@@ -2485,16 +2506,53 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 io_pool:
 	kmem_cache_destroy(log->io_kc);
 io_kc:
-	kfree(log);
 	return -EINVAL;
 }
 
-void r5l_exit_log(struct r5l_log *log)
+static void __r5l_exit_log(struct r5l_log *log)
 {
 	md_unregister_thread(&log->reclaim_thread);
 	mempool_destroy(log->meta_pool);
 	bioset_free(log->bs);
 	mempool_destroy(log->io_pool);
 	kmem_cache_destroy(log->io_kc);
+}
+
+void r5l_exit_log(struct r5l_log *log)
+{
+	if (!log)
+		return;
+
+	if (log->policy->exit_log)
+		log->policy->exit_log(log);
+
 	kfree(log);
 }
+
+struct r5l_policy r5l_journal = {
+	.init_log = __r5l_init_log,
+	.exit_log = __r5l_exit_log,
+	.write_stripe = __r5l_write_stripe,
+	.write_stripe_run = __r5l_write_stripe_run,
+	.flush_stripe_to_raid = __r5l_flush_stripe_to_raid,
+	.stripe_write_finished = __r5l_stripe_write_finished,
+	.handle_flush_request = __r5l_handle_flush_request,
+	.quiesce = __r5l_quiesce,
+};
+
+int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
+{
+	int ret;
+	struct r5l_log *log = kzalloc(sizeof(*log), GFP_KERNEL);
+	if (!log)
+		return -ENOMEM;
+
+	log->rdev = rdev;
+	log->policy = &r5l_journal;
+
+	ret = log->policy->init_log(log, conf);
+	if (ret)
+		kfree(log);
+
+	return ret;
+}
diff --git a/drivers/md/raid5-cache.h b/drivers/md/raid5-cache.h
index f3d0c56..c3028a1 100644
--- a/drivers/md/raid5-cache.h
+++ b/drivers/md/raid5-cache.h
@@ -79,6 +79,8 @@ struct r5l_log {
 
 	/* to submit async io_units, to fulfill ordering of flush */
 	struct work_struct deferred_io_work;
+
+	struct r5l_policy *policy;
 };
 
 /*
@@ -129,6 +131,17 @@ enum r5l_io_unit_state {
 	IO_UNIT_STRIPE_END = 3,	/* stripes data finished writing to raid */
 };
 
+struct r5l_policy {
+	int (*init_log)(struct r5l_log *log, struct r5conf *conf);
+	void (*exit_log)(struct r5l_log *log);
+	int (*write_stripe)(struct r5l_log *log, struct stripe_head *sh);
+	void (*write_stripe_run)(struct r5l_log *log);
+	void (*flush_stripe_to_raid)(struct r5l_log *log);
+	void (*stripe_write_finished)(struct r5l_io_unit *io);
+	int (*handle_flush_request)(struct r5l_log *log, struct bio *bio);
+	void (*quiesce)(struct r5l_log *log, int state);
+};
+
 extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
 extern void r5l_exit_log(struct r5l_log *log);
 extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh);
-- 
2.10.1


^ permalink raw reply related

* [PATCH 01/12] raid5-cache: move declarations to separate header
From: Artur Paszkiewicz @ 2016-11-24 12:28 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20161124122847.16456-1-artur.paszkiewicz@intel.com>

Next patches will be reusing raid5-cache structures and functions, so
put them in their own header.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/raid5-cache.c | 128 +-----------------------------------------
 drivers/md/raid5-cache.h | 142 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c       |   1 +
 drivers/md/raid5.h       |   9 ---
 4 files changed, 144 insertions(+), 136 deletions(-)
 create mode 100644 drivers/md/raid5-cache.h

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 5f817bd..fc6f9fa 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -23,6 +23,7 @@
 #include "md.h"
 #include "raid5.h"
 #include "bitmap.h"
+#include "raid5-cache.h"
 
 /*
  * metadata/data stored in disk with 4k size unit (a block) regardless
@@ -52,16 +53,6 @@
  */
 #define R5L_POOL_SIZE	4
 
-/*
- * r5c journal modes of the array: write-back or write-through.
- * write-through mode has identical behavior as existing log only
- * implementation.
- */
-enum r5c_journal_mode {
-	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
-	R5C_JOURNAL_MODE_WRITE_BACK = 1,
-};
-
 static char *r5c_journal_mode_str[] = {"write-through",
 				       "write-back"};
 /*
@@ -95,123 +86,6 @@ static char *r5c_journal_mode_str[] = {"write-through",
  *	- return IO for pending writes
  */
 
-struct r5l_log {
-	struct md_rdev *rdev;
-
-	u32 uuid_checksum;
-
-	sector_t device_size;		/* log device size, round to
-					 * BLOCK_SECTORS */
-	sector_t max_free_space;	/* reclaim run if free space is at
-					 * this size */
-
-	sector_t last_checkpoint;	/* log tail. where recovery scan
-					 * starts from */
-	u64 last_cp_seq;		/* log tail sequence */
-
-	sector_t log_start;		/* log head. where new data appends */
-	u64 seq;			/* log head sequence */
-
-	sector_t next_checkpoint;
-	u64 next_cp_seq;
-
-	struct mutex io_mutex;
-	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */
-
-	spinlock_t io_list_lock;
-	struct list_head running_ios;	/* io_units which are still running,
-					 * and have not yet been completely
-					 * written to the log */
-	struct list_head io_end_ios;	/* io_units which have been completely
-					 * written to the log but not yet written
-					 * to the RAID */
-	struct list_head flushing_ios;	/* io_units which are waiting for log
-					 * cache flush */
-	struct list_head finished_ios;	/* io_units which settle down in log disk */
-	struct bio flush_bio;
-
-	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */
-
-	struct kmem_cache *io_kc;
-	mempool_t *io_pool;
-	struct bio_set *bs;
-	mempool_t *meta_pool;
-
-	struct md_thread *reclaim_thread;
-	unsigned long reclaim_target;	/* number of space that need to be
-					 * reclaimed.  if it's 0, reclaim spaces
-					 * used by io_units which are in
-					 * IO_UNIT_STRIPE_END state (eg, reclaim
-					 * dones't wait for specific io_unit
-					 * switching to IO_UNIT_STRIPE_END
-					 * state) */
-	wait_queue_head_t iounit_wait;
-
-	struct list_head no_space_stripes; /* pending stripes, log has no space */
-	spinlock_t no_space_stripes_lock;
-
-	bool need_cache_flush;
-
-	/* for r5c_cache */
-	enum r5c_journal_mode r5c_journal_mode;
-
-	/* all stripes in r5cache, in the order of seq at sh->log_start */
-	struct list_head stripe_in_journal_list;
-
-	spinlock_t stripe_in_journal_lock;
-	atomic_t stripe_in_journal_count;
-
-	/* to submit async io_units, to fulfill ordering of flush */
-	struct work_struct deferred_io_work;
-};
-
-/*
- * an IO range starts from a meta data block and end at the next meta data
- * block. The io unit's the meta data block tracks data/parity followed it. io
- * unit is written to log disk with normal write, as we always flush log disk
- * first and then start move data to raid disks, there is no requirement to
- * write io unit with FLUSH/FUA
- */
-struct r5l_io_unit {
-	struct r5l_log *log;
-
-	struct page *meta_page;	/* store meta block */
-	int meta_offset;	/* current offset in meta_page */
-
-	struct bio *current_bio;/* current_bio accepting new data */
-
-	atomic_t pending_stripe;/* how many stripes not flushed to raid */
-	u64 seq;		/* seq number of the metablock */
-	sector_t log_start;	/* where the io_unit starts */
-	sector_t log_end;	/* where the io_unit ends */
-	struct list_head log_sibling; /* log->running_ios */
-	struct list_head stripe_list; /* stripes added to the io_unit */
-
-	int state;
-	bool need_split_bio;
-	struct bio *split_bio;
-
-	unsigned int has_flush:1;      /* include flush request */
-	unsigned int has_fua:1;        /* include fua request */
-	unsigned int has_null_flush:1; /* include empty flush request */
-	/*
-	 * io isn't sent yet, flush/fua request can only be submitted till it's
-	 * the first IO in running_ios list
-	 */
-	unsigned int io_deferred:1;
-
-	struct bio_list flush_barriers;   /* size == 0 flush bios */
-};
-
-/* r5l_io_unit state */
-enum r5l_io_unit_state {
-	IO_UNIT_RUNNING = 0,	/* accepting new IO */
-	IO_UNIT_IO_START = 1,	/* io_unit bio start writing to log,
-				 * don't accepting new bio */
-	IO_UNIT_IO_END = 2,	/* io_unit bio finish writing to log */
-	IO_UNIT_STRIPE_END = 3,	/* stripes data finished writing to raid */
-};
-
 bool r5c_is_writeback(struct r5l_log *log)
 {
 	return (log != NULL &&
diff --git a/drivers/md/raid5-cache.h b/drivers/md/raid5-cache.h
new file mode 100644
index 0000000..f3d0c56
--- /dev/null
+++ b/drivers/md/raid5-cache.h
@@ -0,0 +1,142 @@
+#ifndef _RAID5_CACHE_H
+#define _RAID5_CACHE_H
+
+/*
+ * r5c journal modes of the array: write-back or write-through.
+ * write-through mode has identical behavior as existing log only
+ * implementation.
+ */
+enum r5c_journal_mode {
+	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
+	R5C_JOURNAL_MODE_WRITE_BACK = 1,
+};
+
+struct r5l_log {
+	struct md_rdev *rdev;
+
+	u32 uuid_checksum;
+
+	sector_t device_size;		/* log device size, round to
+					 * BLOCK_SECTORS */
+	sector_t max_free_space;	/* reclaim run if free space is at
+					 * this size */
+
+	sector_t last_checkpoint;	/* log tail. where recovery scan
+					 * starts from */
+	u64 last_cp_seq;		/* log tail sequence */
+
+	sector_t log_start;		/* log head. where new data appends */
+	u64 seq;			/* log head sequence */
+
+	sector_t next_checkpoint;
+	u64 next_cp_seq;
+
+	struct mutex io_mutex;
+	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */
+
+	spinlock_t io_list_lock;
+	struct list_head running_ios;	/* io_units which are still running,
+					 * and have not yet been completely
+					 * written to the log */
+	struct list_head io_end_ios;	/* io_units which have been completely
+					 * written to the log but not yet written
+					 * to the RAID */
+	struct list_head flushing_ios;	/* io_units which are waiting for log
+					 * cache flush */
+	struct list_head finished_ios;	/* io_units which settle down in log disk */
+	struct bio flush_bio;
+
+	struct list_head no_mem_stripes;   /* pending stripes, -ENOMEM */
+
+	struct kmem_cache *io_kc;
+	mempool_t *io_pool;
+	struct bio_set *bs;
+	mempool_t *meta_pool;
+
+	struct md_thread *reclaim_thread;
+	unsigned long reclaim_target;	/* number of space that need to be
+					 * reclaimed.  if it's 0, reclaim spaces
+					 * used by io_units which are in
+					 * IO_UNIT_STRIPE_END state (eg, reclaim
+					 * dones't wait for specific io_unit
+					 * switching to IO_UNIT_STRIPE_END
+					 * state) */
+	wait_queue_head_t iounit_wait;
+
+	struct list_head no_space_stripes; /* pending stripes, log has no space */
+	spinlock_t no_space_stripes_lock;
+
+	bool need_cache_flush;
+
+	/* for r5c_cache */
+	enum r5c_journal_mode r5c_journal_mode;
+
+	/* all stripes in r5cache, in the order of seq at sh->log_start */
+	struct list_head stripe_in_journal_list;
+
+	spinlock_t stripe_in_journal_lock;
+	atomic_t stripe_in_journal_count;
+
+	/* to submit async io_units, to fulfill ordering of flush */
+	struct work_struct deferred_io_work;
+};
+
+/*
+ * an IO range starts from a meta data block and end at the next meta data
+ * block. The io unit's the meta data block tracks data/parity followed it. io
+ * unit is written to log disk with normal write, as we always flush log disk
+ * first and then start move data to raid disks, there is no requirement to
+ * write io unit with FLUSH/FUA
+ */
+struct r5l_io_unit {
+	struct r5l_log *log;
+
+	struct page *meta_page;	/* store meta block */
+	int meta_offset;	/* current offset in meta_page */
+
+	struct bio *current_bio;/* current_bio accepting new data */
+
+	atomic_t pending_stripe;/* how many stripes not flushed to raid */
+	u64 seq;		/* seq number of the metablock */
+	sector_t log_start;	/* where the io_unit starts */
+	sector_t log_end;	/* where the io_unit ends */
+	struct list_head log_sibling; /* log->running_ios */
+	struct list_head stripe_list; /* stripes added to the io_unit */
+
+	int state;
+	bool need_split_bio;
+
+	struct bio *split_bio;
+
+	unsigned int has_flush:1;      /* include flush request */
+	unsigned int has_fua:1;        /* include fua request */
+	unsigned int has_null_flush:1; /* include empty flush request */
+	/*
+	 * io isn't sent yet, flush/fua request can only be submitted till it's
+	 * the first IO in running_ios list
+	 */
+	unsigned int io_deferred:1;
+
+	struct bio_list flush_barriers;   /* size == 0 flush bios */
+};
+
+/* r5l_io_unit state */
+enum r5l_io_unit_state {
+	IO_UNIT_RUNNING = 0,	/* accepting new IO */
+	IO_UNIT_IO_START = 1,	/* io_unit bio start writing to log,
+				 * don't accepting new bio */
+	IO_UNIT_IO_END = 2,	/* io_unit bio finish writing to log */
+	IO_UNIT_STRIPE_END = 3,	/* stripes data finished writing to raid */
+};
+
+extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
+extern void r5l_exit_log(struct r5l_log *log);
+extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh);
+extern void r5l_write_stripe_run(struct r5l_log *log);
+extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
+extern void r5l_stripe_write_finished(struct stripe_head *sh);
+extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
+extern void r5l_quiesce(struct r5l_log *log, int state);
+extern bool r5l_log_disk_error(struct r5conf *conf);
+
+#endif /* _RAID5_CACHE_H */
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dbab8c7..58ee1d3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -61,6 +61,7 @@
 #include "raid5.h"
 #include "raid0.h"
 #include "bitmap.h"
+#include "raid5-cache.h"
 
 #define cpu_to_group(cpu) cpu_to_node(cpu)
 #define ANY_GROUP NUMA_NO_NODE
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index d13fe45..1851be8 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -748,15 +748,6 @@ extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
 extern struct stripe_head *
 raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
 			int previous, int noblock, int noquiesce);
-extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
-extern void r5l_exit_log(struct r5l_log *log);
-extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
-extern void r5l_write_stripe_run(struct r5l_log *log);
-extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
-extern void r5l_stripe_write_finished(struct stripe_head *sh);
-extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
-extern void r5l_quiesce(struct r5l_log *log, int state);
-extern bool r5l_log_disk_error(struct r5conf *conf);
 extern bool r5c_is_writeback(struct r5l_log *log);
 extern int
 r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
-- 
2.10.1


^ permalink raw reply related

* [PATCH 00/12] Partial Parity Log for MD RAID 5
From: Artur Paszkiewicz @ 2016-11-24 12:28 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Artur Paszkiewicz

This series of patches implements the Partial Parity Log for RAID5 arrays. The
purpose of this feature is closing the RAID Write Hole (RWH). It is an
alternative solution to the existing raid5-cache, but the implementation is
based on it and reuses some of the code by introducing support for
interchangeable policies. This allows decoupling policy from mechanism and not
adding more boilerplate code in raid5.c.

The issue addressed by PPL is that, on a dirty shutdown, parity for a
particular stripe may be inconsistent with data on other member disks. In
degraded state, there is no way to recalculate parity, because one of the disks
is missing. PPL addresses this issue and allows recalculating the parity. It
stores only enough data needed for recovering from RWH and is not a true
journal, like the raid5-cache implementation. It does not protect from losing
in-flight data.

PPL is a distributed log - data is stored on all RAID member drives in the
metadata area. It does not need a dedicated journaling drive. Performance is
reduced by up to 30%-40% but it scales with the number of drives in the array
and the journaling drive does not become a bottleneck.

This feature originated from Intel RSTe, which uses IMSM metadata. This
patchset implements PPL for external metadata (specifically IMSM) as well as
native MD v1.1 and v1.2 metadata.

Thanks,
Artur

Artur Paszkiewicz (12):
  raid5-cache: move declarations to separate header
  raid5-cache: add policy logic
  raid5-cache: add a new policy
  md: superblock changes for PPL
  raid5-ppl: Partial Parity Log implementation
  raid5-ppl: calculate partial parity
  md: mddev_find_container helper function
  md: expose rdev->sb_start as sysfs attribute
  raid5-ppl: read PPL signature from IMSM metadata
  raid5-ppl: recovery from dirty shutdown using PPL
  raid5-ppl: support disk add/remove with distributed PPL
  raid5-ppl: runtime PPL enabling or disabling

 drivers/md/Makefile            |    2 +-
 drivers/md/md.c                |   81 ++-
 drivers/md/md.h                |   14 +
 drivers/md/raid5-cache.c       |  276 +++++-----
 drivers/md/raid5-cache.h       |  171 ++++++
 drivers/md/raid5-ppl.c         | 1123 ++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c             |  274 +++++++++-
 drivers/md/raid5.h             |   10 +-
 include/uapi/linux/raid/md_p.h |   18 +-
 9 files changed, 1786 insertions(+), 183 deletions(-)
 create mode 100644 drivers/md/raid5-cache.h
 create mode 100644 drivers/md/raid5-ppl.c

-- 
2.10.1

^ permalink raw reply

* Re: Fwd: Re: mdadm I/O error with Ddf RAID
From: Arka Sharma @ 2016-11-24 11:29 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid
In-Reply-To: <CAPO=kN1kn4voZpzhD4spYY3+X=6tVWdZSbvD=41zJWr7J2pSSA@mail.gmail.com>

Thanks Neil for your assistance. We have root caused the issue. There
was a problem in setting PhysicalRefNo and Starting Block in Config
Record. Now the wrong LBA is not seen. Thanks for your response.

Regards,
Arka

On Tue, Nov 22, 2016 at 3:00 PM, Arka Sharma <arka.sw1988@gmail.com> wrote:
> I have observed that following block
> else if (!mddev->bitmap)
>                         j = mddev->recovery_cp;
> is getting executed in md_do_sync. I performed two tests. In case 1 I
> filled the entire 32 mb of physical disks with FF and then wrote the
> metadata. And in the following case we filled the 32 mb with zeros and
> then wrote the metadata. In both the cases we receive md/raid1:md126:
> not clean -- starting background reconstruction message from md when
> there is access to LBA 1000182866. However when I create raid 1 using
> mdadm and reboot the system there is no access to  LBA 1000182866.
> Also when I read that sector after creating raid 1 with mdadm we see
> this block contains FF. As we have confirmed that mdadm also writing
> the config data at 1000182610. Only in case of raid created through
> our application results access at that offset.
>
> Regards,
> Arka
>
> On Tue, Nov 22, 2016 at 5:24 AM, NeilBrown <neilb@suse.com> wrote:
>> On Tue, Nov 22 2016, Arka Sharma wrote:
>>
>>> ---------- Forwarded message ----------
>>> From: "Arka Sharma" <arka.sw1988@gmail.com>
>>> Date: 21 Nov 2016 12:57 p.m.
>>> Subject: Re: mdadm I/O error with Ddf RAID
>>> To: "NeilBrown" <neilb@suse.com>
>>> Cc: <linux-raid@vger.kernel.org>
>>>
>>> I have run mdadm --examine on both the component devices as well as on
>>> the container. This shows that one of the component disk is marked as
>>> offline and status is failed. When I run mdadm --detail on the RAID
>>> device it shows the component disk 0 state as removed. Since I am very
>>> much new to md and linux in general I am not able to fully root cause
>>> this issue. I have made couple of observation though, that before the
>>> invalid sector 18446744073709551615 is sent, the sector 1000182866 is
>>> accessed after which mdraid reports as not clean starts background
>>> reconstruction. I read the LBA 1000182866 and this block contains FF.
>>> So is md expecting something in the metadata we are not populating ?
>>> Please find the attached md127.txt which is the output of the mdadm
>>> --examine <container>, blk-core_diff.txt which contains the printk's
>>> and dmesg.txt, also DDF_Header0.txt and DDF_Header1.txt are the dump
>>> of ddf headers for both the disks.
>>
>> Thanks for providing more details.
>>
>> Sector 1000182866 is 256 sectors into the config section.
>> It starts reading the config section at 1000182610 and gets 256 sectors,
>> so it reads the rest from 1000182866 and then starts the array.
>>
>> My guess is that md is getting confused about resync and recovery.
>> It tries a resync, but as the array appears degraded, this code:
>>                 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
>>                         j = mddev->resync_min;
>>                 else if (!mddev->bitmap)
>>                         j = mddev->recovery_cp;
>>
>> in md_do_sync() sets 'j' to MaxSector, which is effectively "-1".  It
>> then starts resync from there and goes crazy.  You could put a printk in
>> there to confirm.
>>
>> I don't know why.  Something about the config makes mdadm think the
>> array is degraded.  I might try to find time to dig into it again later.
>>
>> NeilBrown

^ permalink raw reply

* [patch] md/r5cache: enable IRQs on error path
From: Dan Carpenter @ 2016-11-24 11:13 UTC (permalink / raw)
  To: Shaohua Li, Song Liu; +Cc: linux-raid, kernel-janitors

We need to re-enable the IRQs here before returning.

Fixes: a39f7afde358 ("md/r5cache: write-out phase and reclaim support")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 2a60ce4..6610134 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1029,7 +1029,7 @@ static sector_t r5c_calculate_new_cp(struct r5conf *conf)
 	spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
 	if (list_empty(&conf->log->stripe_in_journal_list)) {
 		/* all stripes flushed */
-		spin_unlock(&log->stripe_in_journal_lock);
+		spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
 		return log->next_checkpoint;
 	}
 	sh = list_first_entry(&conf->log->stripe_in_journal_list,

^ permalink raw reply related

* [PATCH 1/1] IMSM: Update num_data_stripes during migration
From: Pawel Baldysiak @ 2016-11-24  8:48 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid, Pawel Baldysiak, Maksymilian Kunt

This patch adds updating of num_data_stripes during reshape.
Previously this field, once set during creation, was never updated.
Also, the num_data_stripes value multiplied by the chunk size is now
used to set the proper component size for RAID5.

Signed-off-by: Pawel Baldysiak <pawel.baldysiak@intel.com>
Signed-off-by: Maksymilian Kunt <maksymilian.kunt@intel.com>
---
 super-intel.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 46 insertions(+), 7 deletions(-)

diff --git a/super-intel.c b/super-intel.c
index 5740088..3d21f31 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -910,14 +910,12 @@ static unsigned long long blocks_per_member(struct imsm_map *map)
 	return join_u32(map->blocks_per_member_lo, map->blocks_per_member_hi);
 }
 
-#ifndef MDASSEMBLE
 static unsigned long long num_data_stripes(struct imsm_map *map)
 {
 	if (map == NULL)
 		return 0;
 	return join_u32(map->num_data_stripes_lo, map->num_data_stripes_hi);
 }
-#endif
 
 static void set_total_blocks(struct imsm_disk *disk, unsigned long long n)
 {
@@ -2916,7 +2914,13 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
 	}
 
 	info->data_offset	  = pba_of_lba0(map_to_analyse);
-	info->component_size	  = blocks_per_member(map_to_analyse);
+
+	if (info->array.level == 5) {
+		info->component_size = num_data_stripes(map_to_analyse) *
+				       map_to_analyse->blocks_per_strip;
+	} else {
+		info->component_size = blocks_per_member(map_to_analyse);
+	}
 
 	info->component_size = imsm_component_size_aligment_check(
 							info->array.level,
@@ -7065,7 +7069,14 @@ static struct mdinfo *container_content_imsm(struct supertype *st, char *subarra
 
 			info_d->events = __le32_to_cpu(mpb->generation_num);
 			info_d->data_offset = pba_of_lba0(map);
-			info_d->component_size = blocks_per_member(map);
+
+			if (map->raid_level == 5) {
+				info_d->component_size =
+						num_data_stripes(map) *
+						map->blocks_per_strip;
+			} else {
+				info_d->component_size = blocks_per_member(map);
+			}
 		}
 		/* now that the disk list is up-to-date fixup recovery_start */
 		update_recovery_start(super, dev, this);
@@ -8271,9 +8282,23 @@ static int apply_reshape_migration_update(struct imsm_update_reshape_migration *
 
 			/* update chunk size
 			 */
-			if (u->new_chunksize > 0)
+			if (u->new_chunksize > 0) {
+				unsigned long long num_data_stripes;
+				int used_disks =
+					imsm_num_data_members(dev, MAP_0);
+
+				if (used_disks == 0)
+					return ret_val;
+
 				map->blocks_per_strip =
 					__cpu_to_le16(u->new_chunksize * 2);
+				num_data_stripes =
+					(join_u32(dev->size_low, dev->size_high)
+					/ used_disks);
+				num_data_stripes /= map->blocks_per_strip;
+				num_data_stripes /= map->num_domains;
+				set_num_data_stripes(map, num_data_stripes);
+			}
 
 			/* add disk
 			 */
@@ -8340,13 +8365,19 @@ static int apply_size_change_update(struct imsm_update_size_change *u,
 			struct imsm_map *map = get_imsm_map(dev, MAP_0);
 			int used_disks = imsm_num_data_members(dev, MAP_0);
 			unsigned long long blocks_per_member;
+			unsigned long long num_data_stripes;
 
 			/* calculate new size
 			 */
 			blocks_per_member = u->new_size / used_disks;
-			dprintf("(size: %llu, blocks per member: %llu)\n",
-				u->new_size, blocks_per_member);
+			num_data_stripes = blocks_per_member /
+					   map->blocks_per_strip;
+			num_data_stripes /= map->num_domains;
+			dprintf("(size: %llu, blocks per member: %llu, num_data_stipes: %llu)\n",
+				u->new_size, blocks_per_member,
+				num_data_stripes);
 			set_blocks_per_member(map, blocks_per_member);
+			set_num_data_stripes(map, num_data_stripes);
 			imsm_set_array_size(dev, u->new_size);
 
 			ret_val = 1;
@@ -8597,6 +8628,14 @@ static int apply_takeover_update(struct imsm_update_takeover *u,
 	map = get_imsm_map(dev, MAP_0);
 
 	if (u->direction == R10_TO_R0) {
+		unsigned long long num_data_stripes;
+
+		map->num_domains = 1;
+		num_data_stripes = blocks_per_member(map);
+		num_data_stripes /= map->blocks_per_strip;
+		num_data_stripes /= map->num_domains;
+		set_num_data_stripes(map, num_data_stripes);
+
 		/* Number of failed disks must be half of initial disk number */
 		if (imsm_count_failed(super, dev, MAP_0) !=
 				(map->num_members / 2))
-- 
2.9.3


^ permalink raw reply related

* Re: raid1 bitmap and multiple removed disks
From: Diego Guella @ 2016-11-24  8:46 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid
In-Reply-To: <87fumhbu41.fsf@notabene.neil.brown.name>

Hi Neil,


Il 24/11/2016 01:26, NeilBrown ha scritto:
> On Wed, Nov 23 2016, Diego Guella wrote:
>
>> (2nd attempt: the previous one didn't make it)
>> Hi,
>>
>> I am using linux raid1 for a double-purpose: redundancy and backup.
>>
>> I have a raid1 array of 5 disks, 3 of which are kept for backup purposes.
>> Let's call disks A, B, C, D, E.
>> Disks A and B are _always_ connected to the system.
>> Disks C, D, E are backup disks.
>> Here follows a description of how I use the backup disks.
>> This morning I connect disk C, and let it resync.
>> Tomorrow morning, I shut down the system, remove disk C and keep it away
>> as a daily backup.
>> I connect the next disk (D), then start up the system.
>> Linux raid1 recognizes the "old" disk and does not allow it to enter the
>> array (this is evidenced by system logs).
>> I then add disk D to the array, and let it resync.
> So this would be a full resync - right?
By "let it resync" I mean:
- mdadm /dev/md1 -a /dev/sdX
- (watch /proc/mdstat until it finishes)
I don't touch the raid1 until the resync finishes.

The first time disk D is added to the array (suppose it is a brand new 
disk), yes, it is a full resync (~20 hours).
BUT if D is not brand new, and it has already been part of this raid1 
"rotation", the resync is clearly not a full resync:
- mdadm says "re-adding /dev/sdX", although I told it "mdadm /dev/md1 -a 
/dev/sdX"
- watching /proc/mdstat (or better, looking at dmesg), the resync takes 
an hour or two, depending on how much the data has changed.


>> The next day, I connect the next disk (E), and so on, rotating them.
>> The "connect and disconnect" is always performed when the system is
>> powered off, although sometimes I hot-connect the disk with the system
>> already powered up.
>> The purpose of this is to have an emergency backup: I can disconnect ALL
>> disks from the system and connect only one of the daily backups, going
>> "back to the past"(TM).
>>
>> This array has a write-intent bitmap, in order to speed up the resync
>> (it is a 4TB array, and sometimes it needs nearly 20 hours to resync
>> without bitmaps due to system load).
>>
>> This worked flawlessly (for some years) until some days ago, when the
>> array suffered a strange inconsistency, and the filesystem nearly went
>> nuts in about 20 minutes of uptime. I will elaborate more on this
>> later.
> Did you ever test your backups?
Of course.
I tested this "raid1 backup system" back some years ago, with Debian 
Lenny, artificially destroying the / partition, to the point where the 
system would not boot. Then, I took one of the "backup" disks, threw it 
in as the only disk in the system, and powered up the system. All was 
working, effectively going "back to the past"(TM).

More recently, occasionally I needed to go "back to the past"(TM) to 
recover some accidentally-deleted files to a temporary flash drive, and 
I even needed to go "back to the past"(TM) because of a bad system 
update: I then zeroed out the superblocks of all the other devices, and 
resynced them to the backup, bringing up full redundancy from a backup.

The most recent "back to the past"(TM) was some days ago.
This is what I called "I will elaborate more on this later" in my 
previous mail:
- I changed the bitmap-chunk: disks A, B, C had a new bitmap-chunk while 
disks D, E (the backups) had the old bitmap-chunk (they were detached 
and offline).
- A, B, C completely resynced
- power down
- remove C; insert D
- power up
- mdadm /dev/md1 -a /dev/sdD
- kernel panic in 20 minutes

This episode was my fault: I *thought* the RAID1 was smart enough to 
recognize the different bitmap-chunks and adapt them, but I was wrong. 
The array resynced completely in some minutes (or at least, it *thought* 
it was resynced), and then probably the filesystem read some (old) block 
from disk D and boom!
I should have zeroed out the superblock of any device that didn't 'see' 
(read: was online) the bitmap-chunk change.

Moreover, since that episode spawned many doubts in my mind, I ran a 
checkarray 2 days ago on /dev/md1: the result was 0 mismatch_cnt.


>> Since that problem happened, some questions come to my mind:
>> What raid1 bitmaps allow me to do?
> - accelerate resync after a crash.
> - accelerate recovery when you remove a drive and re-add it.
>
>> Can they record _correctly_ the state of multiple removed disks, in
>> order to overwrite only out-of-sync chunks of multiple removed disks?
> All that is recorded is the set of regions which have been written to
> since the array was last in a non-degraded state.
Hmm... My array is a 5-devices array. This is because I have 5 
components total: 2 online and 3 backups (actually: 2 online, 1 
resyncing, and 2 backups).
That's needed (I performed tests many years ago) because if I set it 
(for example) as a 3-devices array, the bitmap was not working: every time 
I added a backup disk, raid1 performed a full resync (many many hours).

So: my array is _always_ in a degraded state (and it cannot _ever_ be 
non-degraded, at least as long as I leave it as a 5-devices array: I don't 
have enough SATA ports to connect every component).
Does this change anything?


>> In other words, am I allowed to do what I described above?
> If the recovery that happened when you swapped drives was not a full
> recovery, then probably not.
The recovery was full when the disk was brand new; after that the disk 
seems to become "known" to the array, and after the first full resync it 
performs a bitmap-driven resync.
Does this change anything?


>> If not, can I change something in my actions in order to have a daily
>> backup using raid1?
> I wrote something about this a few years ago...
>   http://permalink.gmane.org/gmane.linux.raid/35074
>
> or this thread
>    http://www.spinics.net/lists/raid/msg35532.html

OK, I read that thread. Thanks for pointing me to that.
_IF_ that's the only solution, I prefer to give up on bitmaps: I don't 
like the idea of the stacked raid1 arrays because it's not flexible 
enough for me.
With a single plain raid1 array I can grow the number of RAID devices in 
the future to an unknown number; while using a stacked one I need to 
know in advance how many devices will participate in the array.

However, from that same thread, Phil Turmel wrote:

> This is a problem.  MD only knows about two disks.  You have three.  When two disks are in place and sync'ed, the bitmaps will essentially stay cleared.
> When you swap to the other disk, its bitmap is also clear, for the same reason.  I'm sure mdadm notices the different event counts, but the clear bitmap would leave mdadm little or nothing to do to resync, as far as it knows.  But lots of writes have happened in the meantime, and they won't get copied to the freshly inserted drive.  Mdadm will read from both disks in parallel when there are parallel workloads, so one workload would get current data and the other would get stale data.
> If you perform a "check" pass after swapping and resyncing, I bet it finds many mismatches.  It definitely can't work as described.
> I'm not sure, but this might work if you could temporarily set it up as a triple mirror, so each disk has a unique slot/role.

In my case, MD knows about all disks: I have 5 disks, and /dev/md1 is a 
5-devices raid1 array.
Moreover, my array is _never_ non-degraded, and I even performed a 
checkarray which returned 0 mismatch_cnt.


I'm not trolling here, I just want to learn and understand what's 
happening, since I have relied on this behavior for _years_ now.

I can even perform some tests (non-destructive: this is a production 
system), and I may even be able to arrange some destructive tests at 
home if needed (I need to check how many spare disks I have).
This production system actually has 3 raid1 arrays set up in the same 
way (every drive has 3 partitions for these arrays): one for swap, one 
for /, and one for /home.
The / array is relatively small (about 13 GB), so I may even be able to 
dd many of them out, saving them in order to perform binary compares, 
and other things like that.


Please note:
I _never_ use "mdadm -f" or "mdadm -r". I _always_ power off the system 
when removing devices from the raid1.


Thanks for your reply,
Diego Guella


^ permalink raw reply

* Re: [RFC PATCH 1/2] RAID1: a new I/O barrier implementation to remove resync window
From: Guoqing Jiang @ 2016-11-24  7:34 UTC (permalink / raw)
  To: Coly Li, linux-raid; +Cc: Shaohua Li, Neil Brown, Johannes Thumshirn
In-Reply-To: <1479765241-15528-1-git-send-email-colyli@suse.de>

Hi Coly,

Please see below comments, just FYI.

On 11/22/2016 05:54 AM, Coly Li wrote:
>   - In raid1_make_request(), wait_barrier() is replaced by,
>     a) wait_read_barrier(): wait barrier in regular read I/O code path
>     b) wait_barrier(): wait barrier in regular write I/O code path
>     The difference is wait_read_barrier() only waits if array is frozen, I
>     am not able to combine them into one function, because they must receive
>     different data types in their arguments list.

Maybe it is possible to add a parameter to distinguish read and write, then
the two functions can be unified.

>   - align_to_barrier_unit_end() is called to make sure both regular and
>     resync I/O won't go across the border of a barrier unit size.
>   
> Open question:
>   - Need review from md clustering developer, I don't touch related code now.

I don't find problems with some tests so far.

>   static void reschedule_retry(struct r1bio *r1_bio)
> @@ -215,10 +214,15 @@ static void reschedule_retry(struct r1bi
>   	unsigned long flags;
>   	struct mddev *mddev = r1_bio->mddev;
>   	struct r1conf *conf = mddev->private;
> +	sector_t sector_nr;
> +	long idx;
> +
> +	sector_nr = r1_bio->sector;
> +	idx = get_barrier_bucket_idx(sector_nr);

Isn't "idx = get_barrier_bucket_idx(r1_bio->sector)" enough here?

>   @@ -255,19 +257,14 @@ static void call_bio_endio(struct r1bio
>   	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
>   		bio->bi_error = -EIO;
>   
> -	if (done) {
> +	if (done)
>   		bio_endio(bio);
> -		/*
> -		 * Wake up any possible resync thread that waits for the device
> -		 * to go idle.
> -		 */
> -		allow_barrier(conf, start_next_window, bi_sector);
> -	}
>   }
>   
>   static void raid_end_bio_io(struct r1bio *r1_bio)
>   {
>   	struct bio *bio = r1_bio->master_bio;
> +	struct r1conf *conf = r1_bio->mddev->private;
>   
>   	/* if nobody has done the final endio yet, do it now */
>   	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
> @@ -278,6 +275,12 @@ static void raid_end_bio_io(struct r1bio
>   
>   		call_bio_endio(r1_bio);
>   	}
> +
> +	/*
> +	 * Wake up any possible resync thread that waits for the device
> +	 * to go idle.
> +	 */
> +	allow_barrier(conf, r1_bio->sector);
>   	free_r1bio(r1_bio);
>   }

I am not sure the above changes are safe, since call_bio_endio is not only
called within raid_end_bio_io.

>   
> @@ -311,6 +314,7 @@ static int find_bio_disk(struct r1bio *r
>   	return mirror;
>   }
>   
> +/* bi_end_io callback for regular READ bio */

Not related to the patch itself, it would be better to make the similar
changes in other patches.

> -static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
> +/* A regular I/O should wait when,
> + * - The whole array is frozen (both READ and WRITE)
> + * - bio is WRITE and in same barrier bucket conf->barrier[idx] raised
> + */
> +static void _wait_barrier(struct r1conf *conf, long idx)
>   {
> -	bool wait = false;
> -
> -	if (conf->array_frozen || !bio)
> -		wait = true;
> -	else if (conf->barrier && bio_data_dir(bio) == WRITE) {
> -		if ((conf->mddev->curr_resync_completed
> -		     >= bio_end_sector(bio)) ||
> -		    (conf->next_resync + NEXT_NORMALIO_DISTANCE
> -		     <= bio->bi_iter.bi_sector))
> -			wait = false;
> -		else
> -			wait = true;
> +	spin_lock_irq(&conf->resync_lock);
> +	if (conf->array_frozen || conf->barrier[idx]) {
> +		conf->nr_waiting[idx]++;
> +		/* Wait for the barrier to drop. */
> +		wait_event_lock_irq(
> +			conf->wait_barrier,
> +			!conf->array_frozen && !conf->barrier[idx],
> +			conf->resync_lock);
> +		conf->nr_waiting[idx]--;
>   	}
>   
> -	return wait;
> +	conf->nr_pending[idx]++;
> +	spin_unlock_irq(&conf->resync_lock);
>   }
>   
> -static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
> +static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
>   {
> -	sector_t sector = 0;
> +	long idx = get_barrier_bucket_idx(sector_nr);
>   
>   	spin_lock_irq(&conf->resync_lock);
> -	if (need_to_wait_for_sync(conf, bio)) {
> -		conf->nr_waiting++;
> -		/* Wait for the barrier to drop.
> -		 * However if there are already pending
> -		 * requests (preventing the barrier from
> -		 * rising completely), and the
> -		 * per-process bio queue isn't empty,
> -		 * then don't wait, as we need to empty
> -		 * that queue to allow conf->start_next_window
> -		 * to increase.
> -		 */
> -		wait_event_lock_irq(conf->wait_barrier,
> -				    !conf->array_frozen &&
> -				    (!conf->barrier ||
> -				     ((conf->start_next_window <
> -				       conf->next_resync + RESYNC_SECTORS) &&
> -				      current->bio_list &&
> -				      !bio_list_empty(current->bio_list))),
> -				    conf->resync_lock);
> -		conf->nr_waiting--;
> -	}
> -
> -	if (bio && bio_data_dir(bio) == WRITE) {
> -		if (bio->bi_iter.bi_sector >= conf->next_resync) {
> -			if (conf->start_next_window == MaxSector)
> -				conf->start_next_window =
> -					conf->next_resync +
> -					NEXT_NORMALIO_DISTANCE;
> -
> -			if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
> -			    <= bio->bi_iter.bi_sector)
> -				conf->next_window_requests++;
> -			else
> -				conf->current_window_requests++;
> -			sector = conf->start_next_window;
> -		}
> +	if (conf->array_frozen) {
> +		conf->nr_waiting[idx]++;
> +		/* Wait for array to unfreeze */
> +		wait_event_lock_irq(
> +			conf->wait_barrier,
> +			!conf->array_frozen,
> +			conf->resync_lock);
> +		conf->nr_waiting[idx]--;
>   	}
> -
> -	conf->nr_pending++;
> +	conf->nr_pending[idx]++;
>   	spin_unlock_irq(&conf->resync_lock);
> -	return sector;
>   }
>   
> -static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
> -			  sector_t bi_sector)
> +static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
> +{
> +	long idx = get_barrier_bucket_idx(sector_nr);
> +
> +	_wait_barrier(conf, idx);
> +}
> +

I personally prefer to use one wait_barrier to cover both read and 
write, something like:

wait_barrier(struct r1conf *conf, long idx, int direction)

BTW: there are some unnecessary changes inside the patch, maybe you can 
remove them
or introduce other patches for them.

Regards,
Guoqing

^ permalink raw reply

* [PATCH v5] md/r5cache: handle alloc_page failure
From: Song Liu @ 2016-11-24  6:50 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuang521,
	liuzhengyuan, Song Liu

RMW of r5c write back cache uses an extra page to store old data for
prexor. handle_stripe_dirtying() allocates this page by calling
alloc_page(). However, alloc_page() may fail.

To handle alloc_page() failures, this patch adds an extra page to
disk_info. When alloc_page fails, handle_stripe() tries to use these
pages. When these pages are used by another stripe (R5C_EXTRA_PAGE_IN_USE),
the stripe is added to delayed_list.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 27 ++++++++++++++++-
 drivers/md/raid5.c       | 78 ++++++++++++++++++++++++++++++++++++++++--------
 drivers/md/raid5.h       |  6 ++++
 3 files changed, 98 insertions(+), 13 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 8cb79fc..818874d 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2334,15 +2334,40 @@ int r5c_try_caching_write(struct r5conf *conf,
  */
 void r5c_release_extra_page(struct stripe_head *sh)
 {
+	struct r5conf *conf = sh->raid_conf;
 	int i;
+	bool using_disk_info_extra_page;
+
+	using_disk_info_extra_page =
+		sh->dev[0].orig_page == conf->disks[0].extra_page;
 
 	for (i = sh->disks; i--; )
 		if (sh->dev[i].page != sh->dev[i].orig_page) {
 			struct page *p = sh->dev[i].orig_page;
 
 			sh->dev[i].orig_page = sh->dev[i].page;
-			put_page(p);
+			if (!using_disk_info_extra_page)
+				put_page(p);
 		}
+
+	if (using_disk_info_extra_page) {
+		clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state);
+		md_wakeup_thread(conf->mddev->thread);
+	}
+}
+
+void r5c_use_extra_page(struct stripe_head *sh)
+{
+	struct r5conf *conf = sh->raid_conf;
+	int i;
+	struct r5dev *dev;
+
+	for (i = sh->disks; i--; ) {
+		dev = &sh->dev[i];
+		if (dev->orig_page != dev->page)
+			put_page(dev->orig_page);
+		dev->orig_page = conf->disks[i].extra_page;
+	}
 }
 
 /*
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dbab8c7..db909b9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -876,6 +876,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
 		/* writing out phase */
+		if (s->waiting_extra_page)
+			return;
 		if (r5l_write_stripe(conf->log, sh) == 0)
 			return;
 	} else {  /* caching phase */
@@ -2007,6 +2009,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
 		INIT_LIST_HEAD(&sh->batch_list);
 		INIT_LIST_HEAD(&sh->lru);
 		INIT_LIST_HEAD(&sh->r5c);
+		INIT_LIST_HEAD(&sh->log_list);
 		atomic_set(&sh->count, 1);
 		sh->log_start = MaxSector;
 		for (i = 0; i < disks; i++) {
@@ -2253,10 +2256,24 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	 */
 	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
 	if (ndisks) {
-		for (i=0; i<conf->raid_disks; i++)
+		for (i = 0; i < conf->pool_size; i++)
 			ndisks[i] = conf->disks[i];
-		kfree(conf->disks);
-		conf->disks = ndisks;
+
+		for (i = conf->pool_size; i < newsize; i++) {
+			ndisks[i].extra_page = alloc_page(GFP_NOIO);
+			if (!ndisks[i].extra_page)
+				err = -ENOMEM;
+		}
+
+		if (err) {
+			for (i = conf->pool_size; i < newsize; i++)
+				if (ndisks[i].extra_page)
+					put_page(ndisks[i].extra_page);
+			kfree(ndisks);
+		} else {
+			kfree(conf->disks);
+			conf->disks = ndisks;
+		}
 	} else
 		err = -ENOMEM;
 
@@ -3580,10 +3597,10 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 		break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
 }
 
-static void handle_stripe_dirtying(struct r5conf *conf,
-				   struct stripe_head *sh,
-				   struct stripe_head_state *s,
-				   int disks)
+static int handle_stripe_dirtying(struct r5conf *conf,
+				  struct stripe_head *sh,
+				  struct stripe_head_state *s,
+				  int disks)
 {
 	int rmw = 0, rcw = 0, i;
 	sector_t recovery_cp = conf->mddev->recovery_cp;
@@ -3649,12 +3666,32 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			    dev->page == dev->orig_page &&
 			    !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
 				/* alloc page for prexor */
-				dev->orig_page = alloc_page(GFP_NOIO);
+				struct page *p = alloc_page(GFP_NOIO);
+
+				if (p) {
+					dev->orig_page = p;
+					continue;
+				}
 
-				/* will handle failure in a later patch*/
-				BUG_ON(!dev->orig_page);
+				/*
+				 * alloc_page() failed, try use
+				 * disk_info->extra_page
+				 */
+				if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
+						      &conf->cache_state)) {
+					r5c_use_extra_page(sh);
+					break;
+				}
+
+				/* extra_page in use, add to delayed_list */
+				set_bit(STRIPE_DELAYED, &sh->state);
+				s->waiting_extra_page = 1;
+				return -EAGAIN;
 			}
+		}
 
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
 			if ((dev->towrite ||
 			     i == sh->pd_idx || i == sh->qd_idx ||
 			     test_bit(R5_InJournal, &dev->flags)) &&
@@ -3730,6 +3767,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
 	     !test_bit(STRIPE_BIT_DELAY, &sh->state)))
 		schedule_reconstruction(sh, s, rcw == 0, 0);
+	return 0;
 }
 
 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
@@ -4545,8 +4583,12 @@ static void handle_stripe(struct stripe_head *sh)
 			if (ret == -EAGAIN ||
 			    /* stripe under reclaim: !caching && injournal */
 			    (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
-			     s.injournal > 0))
-				handle_stripe_dirtying(conf, sh, &s, disks);
+			     s.injournal > 0)) {
+				ret = handle_stripe_dirtying(conf, sh, &s,
+							     disks);
+				if (ret == -EAGAIN)
+					goto finish;
+			}
 		}
 	}
 
@@ -6458,6 +6500,8 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+	int i;
+
 	if (conf->log)
 		r5l_exit_log(conf->log);
 	if (conf->shrinker.nr_deferred)
@@ -6466,6 +6510,9 @@ static void free_conf(struct r5conf *conf)
 	free_thread_groups(conf);
 	shrink_stripes(conf);
 	raid5_free_percpu(conf);
+	for (i = 0; i < conf->pool_size; i++)
+		if (conf->disks[i].extra_page)
+			put_page(conf->disks[i].extra_page);
 	kfree(conf->disks);
 	kfree(conf->stripe_hashtbl);
 	kfree(conf);
@@ -6612,9 +6659,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 
 	conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
 			      GFP_KERNEL);
+
 	if (!conf->disks)
 		goto abort;
 
+	for (i = 0; i < max_disks; i++) {
+		conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
+		if (!conf->disks[i].extra_page)
+			goto abort;
+	}
+
 	conf->mddev = mddev;
 
 	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index d13fe45..ed8e136 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -276,6 +276,7 @@ struct stripe_head_state {
 	struct md_rdev *blocked_rdev;
 	int handle_bad_blocks;
 	int log_failed;
+	int waiting_extra_page;
 };
 
 /* Flags for struct r5dev.flags */
@@ -439,6 +440,7 @@ enum {
 
 struct disk_info {
 	struct md_rdev	*rdev, *replacement;
+	struct page	*extra_page; /* extra page to use in prexor */
 };
 
 /*
@@ -559,6 +561,9 @@ enum r5_cache_state {
 				 * only process stripes that are already
 				 * occupying the log
 				 */
+	R5C_EXTRA_PAGE_IN_USE,	/* a stripe is using disk_info.extra_page
+				 * for prexor
+				 */
 };
 
 struct r5conf {
@@ -765,6 +770,7 @@ extern void
 r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
 			    struct stripe_head_state *s);
 extern void r5c_release_extra_page(struct stripe_head *sh);
+extern void r5c_use_extra_page(struct stripe_head *sh);
 extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
 extern void r5c_handle_cached_data_endio(struct r5conf *conf,
 	struct stripe_head *sh, int disks, struct bio_list *return_bi);
-- 
2.9.3


^ permalink raw reply related

* Re: [PATCH v4] md/r5cache: handle alloc_page failure
From: Song Liu @ 2016-11-24  6:44 UTC (permalink / raw)
  To: NeilBrown
  Cc: linux-raid@vger.kernel.org, Shaohua Li, Kernel Team,
	dan.j.williams@intel.com, hch@infradead.org,
	liuzhengyuang521@gmail.com, liuzhengyuan@kylinos.cn
In-Reply-To: <87y4099z9i.fsf@notabene.neil.brown.name>


> On Nov 23, 2016, at 10:18 PM, NeilBrown <neilb@suse.com> wrote:
> 
> On Thu, Nov 24 2016, Song Liu wrote:
> 
>> +void r5c_use_extra_page(struct stripe_head *sh)
>> +{
>> +	struct r5conf *conf = sh->raid_conf;
>> +	int i;
>> +	struct r5dev *dev;
>> +	struct page *p;
>> +
>> +	for (i = sh->disks; i--; ) {
>> +		dev = &sh->dev[i];
>> +		if (dev->orig_page != dev->page) {
>> +			p = dev->orig_page;
>> +			dev->orig_page = dev->page;
>> 			put_page(p);
>> 		}
>> +		dev->orig_page = conf->disks[i].extra_page;
> 
> It seems a bit pointless to assign to dev->orig_page twice.
> Why not:
> 
>  if (dev->orig_page != dev->page)
>      put_page(dev->orig_page);
>  dev->orig_page = conf->......
> 
> ??
> 
>> @@ -2255,8 +2258,27 @@ static int resize_stripes(struct r5conf *conf, int newsize)
>> 	if (ndisks) {
>> 		for (i=0; i<conf->raid_disks; i++)
>> 			ndisks[i] = conf->disks[i];
>> -		kfree(conf->disks);
>> -		conf->disks = ndisks;
>> +
>> +		/* allocate extra_page for ndisks */
>> +		for (i = 0; i < newsize; i++) {
>> +			ndisks[i].extra_page = alloc_page(GFP_NOIO);
>> +			if (!ndisks[i].extra_page)
>> +				err = -ENOMEM;
>> +		}
>> +
>> +		if (err) {
>> +			/* if any error, free extra_page for ndisks */
>> +			for (i = 0; i < newsize; i++)
>> +				if (ndisks[i].extra_page)
>> +					put_page(ndisks[i].extra_page);
>> +			kfree(ndisks);
>> +		} else {
>> +			/* if no error, free extra_page for old disks */
>> +			for (i = 0; i < conf->previous_raid_disks; i++)
>> +				put_page(ndisks[i].extra_page);
>> +			kfree(conf->disks);
>> +			conf->disks = ndisks;
>> +		}
> 
> This looks a bit odd too.  We never reduce conf->pool_size, so we never
> need to free anything.
> 
> for (i = conf->pool_size; i < newsize; i++)
>     if ((ndisks[i].extra_page = alloc_page(GFP_NOIO)) == NULL)
>         err = -ENOMEM;
> for (i = conf->pool_size; err == -ENOMEM && i < newsize; i++)
>     if (ndisks[i].extra_page)
>         put_page(ndisks[i].extra_page);
> 
> 
> Maybe that is a little terse, but something like that would be better I
> think.
> Certainly you don't need to free ndisks.  If the allocation succeeds,
> just use the new array whether other allocations succeed or fail.
> 
>> @@ -6466,6 +6515,12 @@ static void free_conf(struct r5conf *conf)
>> 	free_thread_groups(conf);
>> 	shrink_stripes(conf);
>> 	raid5_free_percpu(conf);
>> +	for (i = 0; i < conf->raid_disks; i++)
>> +		if (conf->disks[i].extra_page) {
>> +			put_page(conf->disks[i].extra_page);
>> +			conf->disks[i].extra_page = NULL;
> 
> There is no point setting extra_page to NULL, as the whole array is
> freed on the next line.
> 
> 
> Apart from those few little things, it looks good.  Thanks.
> 
> NeilBrown

Thanks for this feedback! New patch coming.  

Song

^ permalink raw reply

* Re: [PATCH v4] md/r5cache: handle alloc_page failure
From: NeilBrown @ 2016-11-24  6:18 UTC (permalink / raw)
  To: linux-raid
  Cc: shli, kernel-team, dan.j.williams, hch, liuzhengyuang521,
	liuzhengyuan, Song Liu
In-Reply-To: <20161124051404.1969242-1-songliubraving@fb.com>

[-- Attachment #1: Type: text/plain, Size: 2517 bytes --]

On Thu, Nov 24 2016, Song Liu wrote:

> +void r5c_use_extra_page(struct stripe_head *sh)
> +{
> +	struct r5conf *conf = sh->raid_conf;
> +	int i;
> +	struct r5dev *dev;
> +	struct page *p;
> +
> +	for (i = sh->disks; i--; ) {
> +		dev = &sh->dev[i];
> +		if (dev->orig_page != dev->page) {
> +			p = dev->orig_page;
> +			dev->orig_page = dev->page;
>  			put_page(p);
>  		}
> +		dev->orig_page = conf->disks[i].extra_page;

It seems a bit pointless to assign to dev->orig_page twice.
Why not:

  if (dev->orig_page != dev->page)
      put_page(dev->orig_page);
  dev->orig_page = conf->......

??

> @@ -2255,8 +2258,27 @@ static int resize_stripes(struct r5conf *conf, int newsize)
>  	if (ndisks) {
>  		for (i=0; i<conf->raid_disks; i++)
>  			ndisks[i] = conf->disks[i];
> -		kfree(conf->disks);
> -		conf->disks = ndisks;
> +
> +		/* allocate extra_page for ndisks */
> +		for (i = 0; i < newsize; i++) {
> +			ndisks[i].extra_page = alloc_page(GFP_NOIO);
> +			if (!ndisks[i].extra_page)
> +				err = -ENOMEM;
> +		}
> +
> +		if (err) {
> +			/* if any error, free extra_page for ndisks */
> +			for (i = 0; i < newsize; i++)
> +				if (ndisks[i].extra_page)
> +					put_page(ndisks[i].extra_page);
> +			kfree(ndisks);
> +		} else {
> +			/* if no error, free extra_page for old disks */
> +			for (i = 0; i < conf->previous_raid_disks; i++)
> +				put_page(ndisks[i].extra_page);
> +			kfree(conf->disks);
> +			conf->disks = ndisks;
> +		}

This looks a bit odd too.  We never reduce conf->pool_size, so we never
need to free anything.

 for (i = conf->pool_size; i < newsize; i++)
     if ((ndisks[i].extra_page = alloc_page(GFP_NOIO)) == NULL)
         err = -ENOMEM;
 for (i = conf->pool_size; err == -ENOMEM && i < newsize; i++)
     if (ndisks[i].extra_page)
         put_page(ndisks[i].extra_page);


Maybe that is a little terse, but something like that would be better I
think.
Certainly you don't need to free ndisks.  If the allocation succeeds,
just use the new array whether other allocations succeed or fail.

> @@ -6466,6 +6515,12 @@ static void free_conf(struct r5conf *conf)
>  	free_thread_groups(conf);
>  	shrink_stripes(conf);
>  	raid5_free_percpu(conf);
> +	for (i = 0; i < conf->raid_disks; i++)
> +		if (conf->disks[i].extra_page) {
> +			put_page(conf->disks[i].extra_page);
> +			conf->disks[i].extra_page = NULL;

There is no point setting extra_page to NULL, as the whole array is
freed on the next line.


Apart from those few little things, it looks good.  Thanks.

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox