Linux RAID subsystem development

Linux RAID subsystem development
 help / color / mirror / Atom feed

* [PATCH v2 3/4] IMSM: Add support for 4Kn sector size drives
From: Pawel Baldysiak @ 2016-11-17 13:58 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid, Pawel Baldysiak, Tomasz Majchrzak
In-Reply-To: <1479391118-31600-1-git-send-email-pawel.baldysiak@intel.com>

This patch adds support for drives with 4Kn sector size
for IMSM metadata. Mixing member drives with 4kn and 512
is not allowed. Some offsets were aligned with sector size.
Internal metadata representation and all calculations
are still based on 512-byte sector sizes. This
implementation converts only sector based values
when reading/writing to drive, because they needs to be
stored in metadata according to accual member drive sector size.

Signed-off-by: Pawel Baldysiak <pawel.baldysiak@intel.com>
Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
---
 super-intel.c | 201 +++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 156 insertions(+), 45 deletions(-)

diff --git a/super-intel.c b/super-intel.c
index a1b5cfb..a1114ac 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -90,6 +90,7 @@
 #define IMSM_RESERVED_SECTORS 4096
 #define NUM_BLOCKS_DIRTY_STRIPE_REGION 2056
 #define SECT_PER_MB_SHIFT 11
+#define MAX_SECTOR_SIZE 4096
 
 /* Disk configuration info. */
 #define IMSM_MAX_DEVICES 255
@@ -318,14 +319,15 @@ static void set_migr_type(struct imsm_dev *dev, __u8 migr_type)
 	}
 }
 
-static unsigned int sector_count(__u32 bytes)
+static unsigned int sector_count(__u32 bytes, unsigned int sector_size)
 {
-	return ROUND_UP(bytes, 512) / 512;
+	return ROUND_UP(bytes, sector_size) / sector_size;
 }
 
-static unsigned int mpb_sectors(struct imsm_super *mpb)
+static unsigned int mpb_sectors(struct imsm_super *mpb,
+					unsigned int sector_size)
 {
-	return sector_count(__le32_to_cpu(mpb->mpb_size));
+	return sector_count(__le32_to_cpu(mpb->mpb_size), sector_size);
 }
 
 struct intel_dev {
@@ -915,12 +917,12 @@ static unsigned long long num_data_stripes(struct imsm_map *map)
 		return 0;
 	return join_u32(map->num_data_stripes_lo, map->num_data_stripes_hi);
 }
+#endif
 
 static void set_total_blocks(struct imsm_disk *disk, unsigned long long n)
 {
 	split_ull(n, &disk->total_blocks_lo, &disk->total_blocks_hi);
 }
-#endif
 
 static void set_pba_of_lba0(struct imsm_map *map, unsigned long long n)
 {
@@ -1122,6 +1124,8 @@ static unsigned long long min_acceptable_spare_size_imsm(struct supertype *st)
 
 static int is_gen_migration(struct imsm_dev *dev);
 
+#define IMSM_4K_DIV 8
+
 #ifndef MDASSEMBLE
 static __u64 blocks_per_migr_unit(struct intel_super *super,
 				  struct imsm_dev *dev);
@@ -1253,6 +1257,48 @@ static void print_imsm_disk(struct imsm_disk *disk, int index, __u32 reserved)
 	       human_size(sz * 512));
 }
 
+void convert_to_4k_imsm_disk(struct imsm_disk *disk)
+{
+	set_total_blocks(disk, (total_blocks(disk)/IMSM_4K_DIV));
+}
+
+void convert_to_4k(struct intel_super *super)
+{
+	struct imsm_super *mpb = super->anchor;
+	struct imsm_disk *disk;
+	int i;
+
+	for (i = 0; i < mpb->num_disks ; i++) {
+		disk = __get_imsm_disk(mpb, i);
+		/* disk */
+		convert_to_4k_imsm_disk(disk);
+	}
+	for (i = 0; i < mpb->num_raid_devs; i++) {
+		struct imsm_dev *dev = __get_imsm_dev(mpb, i);
+		struct imsm_map *map = get_imsm_map(dev, MAP_0);
+		/* dev */
+		split_ull((join_u32(dev->size_low, dev->size_high)/IMSM_4K_DIV),
+				 &dev->size_low, &dev->size_high);
+		dev->vol.curr_migr_unit /= IMSM_4K_DIV;
+
+		/* map0 */
+		set_blocks_per_member(map, blocks_per_member(map)/IMSM_4K_DIV);
+		map->blocks_per_strip /= IMSM_4K_DIV;
+		set_pba_of_lba0(map, pba_of_lba0(map)/IMSM_4K_DIV);
+
+		if (dev->vol.migr_state) {
+			/* map1 */
+			map = get_imsm_map(dev, MAP_1);
+			set_blocks_per_member(map,
+			    blocks_per_member(map)/IMSM_4K_DIV);
+			map->blocks_per_strip /= IMSM_4K_DIV;
+			set_pba_of_lba0(map, pba_of_lba0(map)/IMSM_4K_DIV);
+		}
+	}
+
+	mpb->check_sum = __gen_imsm_checksum(mpb);
+}
+
 void examine_migr_rec_imsm(struct intel_super *super)
 {
 	struct migr_record *migr_rec = super->migr_rec;
@@ -1310,6 +1356,45 @@ void examine_migr_rec_imsm(struct intel_super *super)
 	}
 }
 #endif /* MDASSEMBLE */
+
+void convert_from_4k(struct intel_super *super)
+{
+	struct imsm_super *mpb = super->anchor;
+	struct imsm_disk *disk;
+	int i;
+
+	for (i = 0; i < mpb->num_disks ; i++) {
+		disk = __get_imsm_disk(mpb, i);
+		/* disk */
+		set_total_blocks(disk, (total_blocks(disk)*IMSM_4K_DIV));
+	}
+
+	for (i = 0; i < mpb->num_raid_devs; i++) {
+		struct imsm_dev *dev = __get_imsm_dev(mpb, i);
+		struct imsm_map *map = get_imsm_map(dev, MAP_0);
+		/* dev */
+		split_ull((join_u32(dev->size_low, dev->size_high)*IMSM_4K_DIV),
+				 &dev->size_low, &dev->size_high);
+		dev->vol.curr_migr_unit *= IMSM_4K_DIV;
+
+		/* map0 */
+		set_blocks_per_member(map, blocks_per_member(map)*IMSM_4K_DIV);
+		map->blocks_per_strip *= IMSM_4K_DIV;
+		set_pba_of_lba0(map, pba_of_lba0(map)*IMSM_4K_DIV);
+
+		if (dev->vol.migr_state) {
+			/* map1 */
+			map = get_imsm_map(dev, MAP_1);
+			set_blocks_per_member(map,
+			    blocks_per_member(map)*IMSM_4K_DIV);
+			map->blocks_per_strip *= IMSM_4K_DIV;
+			set_pba_of_lba0(map, pba_of_lba0(map)*IMSM_4K_DIV);
+		}
+	}
+
+	mpb->check_sum = __gen_imsm_checksum(mpb);
+}
+
 /*******************************************************************************
  * function: imsm_check_attributes
  * Description: Function checks if features represented by attributes flags
@@ -1430,7 +1515,7 @@ static void examine_super_imsm(struct supertype *st, char *homehost)
 	sum = __le32_to_cpu(mpb->check_sum);
 	printf("       Checksum : %08x %s\n", sum,
 		__gen_imsm_checksum(mpb) == sum ? "correct" : "incorrect");
-	printf("    MPB Sectors : %d\n", mpb_sectors(mpb));
+	printf("    MPB Sectors : %d\n", mpb_sectors(mpb, super->sector_size));
 	printf("          Disks : %d\n", mpb->num_disks);
 	printf("   RAID Devices : %d\n", mpb->num_raid_devs);
 	print_imsm_disk(__get_imsm_disk(mpb, super->disks->index), super->disks->index, reserved);
@@ -1527,7 +1612,7 @@ static void export_examine_super_imsm(struct supertype *st)
 
 static int copy_metadata_imsm(struct supertype *st, int from, int to)
 {
-	/* The second last 512byte sector of the device contains
+	/* The second last sector of the device contains
 	 * the "struct imsm_super" metadata.
 	 * This contains mpb_size which is the size in bytes of the
 	 * extended metadata.  This is located immediately before
@@ -1540,7 +1625,9 @@ static int copy_metadata_imsm(struct supertype *st, int from, int to)
 	unsigned long long dsize, offset;
 	int sectors;
 	struct imsm_super *sb;
-	int written = 0;
+	struct intel_super *super = st->sb;
+	unsigned int sector_size = super->sector_size;
+	unsigned int written = 0;
 
 	if (posix_memalign(&buf, 4096, 4096) != 0)
 		return 1;
@@ -1548,21 +1635,21 @@ static int copy_metadata_imsm(struct supertype *st, int from, int to)
 	if (!get_dev_size(from, NULL, &dsize))
 		goto err;
 
-	if (lseek64(from, dsize-1024, 0) < 0)
+	if (lseek64(from, dsize-(2*sector_size), 0) < 0)
 		goto err;
-	if (read(from, buf, 512) != 512)
+	if (read(from, buf, sector_size) != sector_size)
 		goto err;
 	sb = buf;
 	if (strncmp((char*)sb->sig, MPB_SIGNATURE, MPB_SIG_LEN) != 0)
 		goto err;
 
-	sectors = mpb_sectors(sb) + 2;
-	offset = dsize - sectors * 512;
+	sectors = mpb_sectors(sb, sector_size) + 2;
+	offset = dsize - sectors * sector_size;
 	if (lseek64(from, offset, 0) < 0 ||
 	    lseek64(to, offset, 0) < 0)
 		goto err;
-	while (written < sectors * 512) {
-		int n = sectors*512 - written;
+	while (written < sectors * sector_size) {
+		int n = sectors*sector_size - written;
 		if (n > 4096)
 			n = 4096;
 		if (read(from, buf, n) != n)
@@ -2678,13 +2765,14 @@ int imsm_reshape_blocks_arrays_changes(struct intel_super *super)
 }
 static unsigned long long imsm_component_size_aligment_check(int level,
 					      int chunk_size,
+					      unsigned int sector_size,
 					      unsigned long long component_size)
 {
 	unsigned int component_size_alligment;
 
 	/* check component size aligment
 	*/
-	component_size_alligment = component_size % (chunk_size/512);
+	component_size_alligment = component_size % (chunk_size/sector_size);
 
 	dprintf("(Level: %i, chunk_size = %i, component_size = %llu), component_size_alligment = %u\n",
 		level, chunk_size, component_size,
@@ -2795,6 +2883,7 @@ static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info,
 	info->component_size = imsm_component_size_aligment_check(
 							info->array.level,
 							info->array.chunk_size,
+							super->sector_size,
 							info->component_size);
 
 	memset(info->uuid, 0, sizeof(info->uuid));
@@ -3615,8 +3704,9 @@ static int parse_raid_devices(struct intel_super *super)
 	if (__le32_to_cpu(mpb->mpb_size) + space_needed > super->len) {
 		void *buf;
 
-		len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + space_needed, 512);
-		if (posix_memalign(&buf, 512, len) != 0)
+		len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + space_needed,
+			      super->sector_size);
+		if (posix_memalign(&buf, MAX_SECTOR_SIZE, len) != 0)
 			return 1;
 
 		memcpy(buf, super->buf, super->len);
@@ -3689,31 +3779,32 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname)
 {
 	unsigned long long dsize;
 	unsigned long long sectors;
+	unsigned int sector_size = super->sector_size;
 	struct stat;
 	struct imsm_super *anchor;
 	__u32 check_sum;
 
 	get_dev_size(fd, NULL, &dsize);
-	if (dsize < 1024) {
+	if (dsize < 2*sector_size) {
 		if (devname)
 			pr_err("%s: device to small for imsm\n",
 			       devname);
 		return 1;
 	}
 
-	if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) {
+	if (lseek64(fd, dsize - (sector_size * 2), SEEK_SET) < 0) {
 		if (devname)
 			pr_err("Cannot seek to anchor block on %s: %s\n",
 			       devname, strerror(errno));
 		return 1;
 	}
 
-	if (posix_memalign((void**)&anchor, 512, 512) != 0) {
+	if (posix_memalign((void **)&anchor, sector_size, sector_size) != 0) {
 		if (devname)
 			pr_err("Failed to allocate imsm anchor buffer on %s\n", devname);
 		return 1;
 	}
-	if (read(fd, anchor, 512) != 512) {
+	if (read(fd, anchor, sector_size) != sector_size) {
 		if (devname)
 			pr_err("Cannot read anchor block on %s: %s\n",
 			       devname, strerror(errno));
@@ -3733,17 +3824,17 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname)
 
 	/* capability and hba must be updated with new super allocation */
 	find_intel_hba_capability(fd, super, devname);
-	super->len = ROUND_UP(anchor->mpb_size, 512);
-	if (posix_memalign(&super->buf, 512, super->len) != 0) {
+	super->len = ROUND_UP(anchor->mpb_size, sector_size);
+	if (posix_memalign(&super->buf, MAX_SECTOR_SIZE, super->len) != 0) {
 		if (devname)
 			pr_err("unable to allocate %zu byte mpb buffer\n",
 			       super->len);
 		free(anchor);
 		return 2;
 	}
-	memcpy(super->buf, anchor, 512);
+	memcpy(super->buf, anchor, sector_size);
 
-	sectors = mpb_sectors(anchor) - 1;
+	sectors = mpb_sectors(anchor, sector_size) - 1;
 	free(anchor);
 
 	if (posix_memalign(&super->migr_rec_buf, 512, MIGR_REC_BUF_SIZE) != 0) {
@@ -3768,14 +3859,15 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname)
 	}
 
 	/* read the extended mpb */
-	if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0) {
+	if (lseek64(fd, dsize - (sector_size * (2 + sectors)), SEEK_SET) < 0) {
 		if (devname)
 			pr_err("Cannot seek to extended mpb on %s: %s\n",
 			       devname, strerror(errno));
 		return 1;
 	}
 
-	if ((unsigned)read(fd, super->buf + 512, super->len - 512) != super->len - 512) {
+	if ((unsigned int)read(fd, super->buf + sector_size,
+		    super->len - sector_size) != super->len - sector_size) {
 		if (devname)
 			pr_err("Cannot read extended mpb on %s: %s\n",
 			       devname, strerror(errno));
@@ -3836,6 +3928,8 @@ load_and_parse_mpb(int fd, struct intel_super *super, char *devname, int keep_fd
 	err = load_imsm_mpb(fd, super, devname);
 	if (err)
 		return err;
+	if (super->sector_size == 4096)
+		convert_from_4k(super);
 	err = load_imsm_disk(fd, super, devname, keep_fd);
 	if (err)
 		return err;
@@ -4733,6 +4827,7 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
 	 * so st->sb is already set.
 	 */
 	struct intel_super *super = st->sb;
+	unsigned int sector_size = super->sector_size;
 	struct imsm_super *mpb = super->anchor;
 	struct intel_dev *dv;
 	struct imsm_dev *dev;
@@ -4754,9 +4849,9 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
 	size_new = disks_to_mpb_size(info->nr_disks);
 	if (size_new > size_old) {
 		void *mpb_new;
-		size_t size_round = ROUND_UP(size_new, 512);
+		size_t size_round = ROUND_UP(size_new, sector_size);
 
-		if (posix_memalign(&mpb_new, 512, size_round) != 0) {
+		if (posix_memalign(&mpb_new, sector_size, size_round) != 0) {
 			pr_err("could not allocate new mpb\n");
 			return 0;
 		}
@@ -4911,10 +5006,11 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
 	if (info)
 		mpb_size = disks_to_mpb_size(info->nr_disks);
 	else
-		mpb_size = 512;
+		mpb_size = MAX_SECTOR_SIZE;
 
 	super = alloc_super();
-	if (super && posix_memalign(&super->buf, 512, mpb_size) != 0) {
+	if (super &&
+	    posix_memalign(&super->buf, MAX_SECTOR_SIZE, mpb_size) != 0) {
 		free(super);
 		super = NULL;
 	}
@@ -5261,9 +5357,9 @@ static int remove_from_super_imsm(struct supertype *st, mdu_disk_info_t *dk)
 static int store_imsm_mpb(int fd, struct imsm_super *mpb);
 
 static union {
-	char buf[512];
+	char buf[MAX_SECTOR_SIZE];
 	struct imsm_super anchor;
-} spare_record __attribute__ ((aligned(512)));
+} spare_record __attribute__ ((aligned(MAX_SECTOR_SIZE)));
 
 /* spare records have their own family number and do not have any defined raid
  * devices
@@ -5294,6 +5390,9 @@ static int write_super_imsm_spares(struct intel_super *super, int doclose)
 		if (__le32_to_cpu(d->disk.total_blocks_hi) > 0)
 			spare->attributes |= MPB_ATTRIB_2TB_DISK;
 
+		if (super->sector_size == 4096)
+			convert_to_4k_imsm_disk(&spare->disk[0]);
+
 		sum = __gen_imsm_checksum(spare);
 		spare->family_num = __cpu_to_le32(sum);
 		spare->orig_family_num = 0;
@@ -5317,6 +5416,7 @@ static int write_super_imsm_spares(struct intel_super *super, int doclose)
 static int write_super_imsm(struct supertype *st, int doclose)
 {
 	struct intel_super *super = st->sb;
+	unsigned int sector_size = super->sector_size;
 	struct imsm_super *mpb = super->anchor;
 	struct dl *d;
 	__u32 generation;
@@ -5377,6 +5477,9 @@ static int write_super_imsm(struct supertype *st, int doclose)
 	if (clear_migration_record)
 		memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SIZE);
 
+	if (sector_size == 4096)
+		convert_to_4k(super);
+
 	/* write the mpb for disks that compose raid devices */
 	for (d = super->disks; d ; d = d->next) {
 		if (d->index < 0 || is_failed(&d->disk))
@@ -5500,6 +5603,8 @@ static int store_super_imsm(struct supertype *st, int fd)
 		return 1;
 
 #ifndef MDASSEMBLE
+	if (super->sector_size == 4096)
+		convert_to_4k(super);
 	return store_imsm_mpb(fd, mpb);
 #else
 	return 1;
@@ -7574,27 +7679,30 @@ static int store_imsm_mpb(int fd, struct imsm_super *mpb)
 	__u32 mpb_size = __le32_to_cpu(mpb->mpb_size);
 	unsigned long long dsize;
 	unsigned long long sectors;
+	unsigned int sector_size;
 
+	get_dev_sector_size(fd, NULL, &sector_size);
 	get_dev_size(fd, NULL, &dsize);
 
-	if (mpb_size > 512) {
+	if (mpb_size > sector_size) {
 		/* -1 to account for anchor */
-		sectors = mpb_sectors(mpb) - 1;
+		sectors = mpb_sectors(mpb, sector_size) - 1;
 
 		/* write the extended mpb to the sectors preceeding the anchor */
-		if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0)
+		if (lseek64(fd, dsize - (sector_size * (2 + sectors)),
+		   SEEK_SET) < 0)
 			return 1;
 
-		if ((unsigned long long)write(fd, buf + 512, 512 * sectors)
-		    != 512 * sectors)
+		if ((unsigned long long)write(fd, buf + sector_size,
+		   sector_size * sectors) != sector_size * sectors)
 			return 1;
 	}
 
 	/* first block is stored on second to last sector of the disk */
-	if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0)
+	if (lseek64(fd, dsize - (sector_size * 2), SEEK_SET) < 0)
 		return 1;
 
-	if (write(fd, buf, 512) != 512)
+	if (write(fd, buf, sector_size) != sector_size)
 		return 1;
 
 	return 0;
@@ -8830,6 +8938,7 @@ static int imsm_prepare_update(struct supertype *st,
 	 */
 	enum imsm_update_type type;
 	struct intel_super *super = st->sb;
+	unsigned int sector_size = super->sector_size;
 	struct imsm_super *mpb = super->anchor;
 	size_t buf_len;
 	size_t len = 0;
@@ -9066,12 +9175,13 @@ static int imsm_prepare_update(struct supertype *st,
 		 * if this allocation fails process_update will notice that
 		 * ->next_len is set and ->next_buf is NULL
 		 */
-		buf_len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + len, 512);
+		buf_len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + len,
+				  sector_size);
 		if (super->next_buf)
 			free(super->next_buf);
 
 		super->next_len = buf_len;
-		if (posix_memalign(&super->next_buf, 512, buf_len) == 0)
+		if (posix_memalign(&super->next_buf, sector_size, buf_len) == 0)
 			memset(super->next_buf, 0, buf_len);
 		else
 			super->next_buf = NULL;
@@ -9566,6 +9676,7 @@ int recover_backup_imsm(struct supertype *st, struct mdinfo *info)
 	int new_disks, i, err;
 	char *buf = NULL;
 	int retval = 1;
+	unsigned int sector_size = super->sector_size;
 	unsigned long curr_migr_unit = __le32_to_cpu(migr_rec->curr_migr_unit);
 	unsigned long num_migr_units = __le32_to_cpu(migr_rec->num_migr_units);
 	char buffer[20];
@@ -9602,7 +9713,7 @@ int recover_backup_imsm(struct supertype *st, struct mdinfo *info)
 			pba_of_lba0(map_dest)) * 512;
 
 	unit_len = __le32_to_cpu(migr_rec->dest_depth_per_unit) * 512;
-	if (posix_memalign((void **)&buf, 512, unit_len) != 0)
+	if (posix_memalign((void **)&buf, sector_size, unit_len) != 0)
 		goto abort;
 	targets = xcalloc(new_disks, sizeof(int));
 
@@ -10148,7 +10259,7 @@ enum imsm_reshape_type imsm_analyze_change(struct supertype *st,
 		 */
 		geo->size = imsm_component_size_aligment_check(
 				    get_imsm_raid_level(dev->vol.map),
-				    chunk * 1024,
+				    chunk * 1024, super->sector_size,
 				    geo->size * 2);
 		if (geo->size == 0) {
 			pr_err("Error. Size expansion is supported only (current size is %llu, requested size /rounded/ is 0).\n",
@@ -10182,7 +10293,7 @@ enum imsm_reshape_type imsm_analyze_change(struct supertype *st,
 			 */
 			max_size = imsm_component_size_aligment_check(
 					get_imsm_raid_level(dev->vol.map),
-					chunk * 1024,
+					chunk * 1024, super->sector_size,
 					max_size);
 		}
 		if (geo->size == MAX_SIZE) {
-- 
2.7.4


^ permalink raw reply related

* [PATCH v2 4/4] IMSM: 4Kn drives support - adapt general migration record
From: Pawel Baldysiak @ 2016-11-17 13:58 UTC (permalink / raw)
  To: jes.sorensen; +Cc: linux-raid, Pawel Baldysiak, Tomasz Majchrzak
In-Reply-To: <1479391118-31600-1-git-send-email-pawel.baldysiak@intel.com>

Convert general migration record for 4Kn drives prior to write and post
read. Calculate record location based on sector size, don't just assume
it's 512. Assure buffer address is aligned to 4096 so write operation
avoids caching.

Signed-off-by: Pawel Baldysiak <pawel.baldysiak@intel.com>
Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
---
 super-intel.c | 99 +++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 73 insertions(+), 26 deletions(-)

diff --git a/super-intel.c b/super-intel.c
index a1114ac..1f79eab 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -244,9 +244,9 @@ static char *map_state_str[] = { "normal", "uninitialized", "degraded", "failed"
 
 #define GEN_MIGR_AREA_SIZE 2048 /* General Migration Copy Area size in blocks */
 
-#define MIGR_REC_BUF_SIZE 512 /* size of migr_record i/o buffer */
-#define MIGR_REC_POSITION 512 /* migr_record position offset on disk,
-			       * MIGR_REC_BUF_SIZE <= MIGR_REC_POSITION
+#define MIGR_REC_BUF_SECTORS 1 /* size of migr_record i/o buffer in sectors */
+#define MIGR_REC_SECTOR_POSITION 1 /* migr_record position offset on disk,
+			       * MIGR_REC_BUF_SECTORS <= MIGR_REC_SECTOR_POS
 			       */
 
 #define UNIT_SRC_NORMAL     0   /* Source data for curr_migr_unit must
@@ -1257,6 +1257,19 @@ static void print_imsm_disk(struct imsm_disk *disk, int index, __u32 reserved)
 	       human_size(sz * 512));
 }
 
+void convert_to_4k_imsm_migr_rec(struct intel_super *super)
+{
+	struct migr_record *migr_rec = super->migr_rec;
+
+	migr_rec->blocks_per_unit /= IMSM_4K_DIV;
+	migr_rec->ckpt_area_pba /= IMSM_4K_DIV;
+	migr_rec->dest_1st_member_lba /= IMSM_4K_DIV;
+	migr_rec->dest_depth_per_unit /= IMSM_4K_DIV;
+	split_ull((join_u32(migr_rec->post_migr_vol_cap,
+		 migr_rec->post_migr_vol_cap_hi) / IMSM_4K_DIV),
+		 &migr_rec->post_migr_vol_cap, &migr_rec->post_migr_vol_cap_hi);
+}
+
 void convert_to_4k_imsm_disk(struct imsm_disk *disk)
 {
 	set_total_blocks(disk, (total_blocks(disk)/IMSM_4K_DIV));
@@ -1357,6 +1370,20 @@ void examine_migr_rec_imsm(struct intel_super *super)
 }
 #endif /* MDASSEMBLE */
 
+void convert_from_4k_imsm_migr_rec(struct intel_super *super)
+{
+	struct migr_record *migr_rec = super->migr_rec;
+
+	migr_rec->blocks_per_unit *= IMSM_4K_DIV;
+	migr_rec->ckpt_area_pba *= IMSM_4K_DIV;
+	migr_rec->dest_1st_member_lba *= IMSM_4K_DIV;
+	migr_rec->dest_depth_per_unit *= IMSM_4K_DIV;
+	split_ull((join_u32(migr_rec->post_migr_vol_cap,
+		 migr_rec->post_migr_vol_cap_hi) * IMSM_4K_DIV),
+		 &migr_rec->post_migr_vol_cap,
+		 &migr_rec->post_migr_vol_cap_hi);
+}
+
 void convert_from_4k(struct intel_super *super)
 {
 	struct imsm_super *mpb = super->anchor;
@@ -1629,7 +1656,7 @@ static int copy_metadata_imsm(struct supertype *st, int from, int to)
 	unsigned int sector_size = super->sector_size;
 	unsigned int written = 0;
 
-	if (posix_memalign(&buf, 4096, 4096) != 0)
+	if (posix_memalign(&buf, MAX_SECTOR_SIZE, MAX_SECTOR_SIZE) != 0)
 		return 1;
 
 	if (!get_dev_size(from, NULL, &dsize))
@@ -2499,21 +2526,26 @@ static int imsm_level_to_layout(int level)
 static int read_imsm_migr_rec(int fd, struct intel_super *super)
 {
 	int ret_val = -1;
+	unsigned int sector_size = super->sector_size;
 	unsigned long long dsize;
 
 	get_dev_size(fd, NULL, &dsize);
-	if (lseek64(fd, dsize - MIGR_REC_POSITION, SEEK_SET) < 0) {
+	if (lseek64(fd, dsize - (sector_size*MIGR_REC_SECTOR_POSITION),
+		   SEEK_SET) < 0) {
 		pr_err("Cannot seek to anchor block: %s\n",
 		       strerror(errno));
 		goto out;
 	}
-	if (read(fd, super->migr_rec_buf, MIGR_REC_BUF_SIZE) !=
-							    MIGR_REC_BUF_SIZE) {
+	if (read(fd, super->migr_rec_buf,
+	    MIGR_REC_BUF_SECTORS*sector_size) !=
+	    MIGR_REC_BUF_SECTORS*sector_size) {
 		pr_err("Cannot read migr record block: %s\n",
 		       strerror(errno));
 		goto out;
 	}
 	ret_val = 0;
+	if (sector_size == 4096)
+		convert_from_4k_imsm_migr_rec(super);
 
 out:
 	return ret_val;
@@ -2659,6 +2691,7 @@ static void imsm_update_metadata_locally(struct supertype *st,
 static int write_imsm_migr_rec(struct supertype *st)
 {
 	struct intel_super *super = st->sb;
+	unsigned int sector_size = super->sector_size;
 	unsigned long long dsize;
 	char nm[30];
 	int fd = -1;
@@ -2680,6 +2713,8 @@ static int write_imsm_migr_rec(struct supertype *st)
 
 	map = get_imsm_map(dev, MAP_0);
 
+	if (sector_size == 4096)
+		convert_to_4k_imsm_migr_rec(super);
 	for (sd = super->disks ; sd ; sd = sd->next) {
 		int slot = -1;
 
@@ -2697,13 +2732,15 @@ static int write_imsm_migr_rec(struct supertype *st)
 		if (fd < 0)
 			continue;
 		get_dev_size(fd, NULL, &dsize);
-		if (lseek64(fd, dsize - MIGR_REC_POSITION, SEEK_SET) < 0) {
+		if (lseek64(fd, dsize - (MIGR_REC_SECTOR_POSITION*sector_size),
+		    SEEK_SET) < 0) {
 			pr_err("Cannot seek to anchor block: %s\n",
 			       strerror(errno));
 			goto out;
 		}
-		if (write(fd, super->migr_rec_buf, MIGR_REC_BUF_SIZE) !=
-							    MIGR_REC_BUF_SIZE) {
+		if (write(fd, super->migr_rec_buf,
+		    MIGR_REC_BUF_SECTORS*sector_size) !=
+		    MIGR_REC_BUF_SECTORS*sector_size) {
 			pr_err("Cannot write migr record block: %s\n",
 			       strerror(errno));
 			goto out;
@@ -2711,9 +2748,10 @@ static int write_imsm_migr_rec(struct supertype *st)
 		close(fd);
 		fd = -1;
 	}
+	if (sector_size == 4096)
+		convert_from_4k_imsm_migr_rec(super);
 	/* update checkpoint information in metadata */
 	len = imsm_create_metadata_checkpoint_update(super, &u);
-
 	if (len <= 0) {
 		dprintf("imsm: Cannot prepare update\n");
 		goto out;
@@ -3837,7 +3875,8 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname)
 	sectors = mpb_sectors(anchor, sector_size) - 1;
 	free(anchor);
 
-	if (posix_memalign(&super->migr_rec_buf, 512, MIGR_REC_BUF_SIZE) != 0) {
+	if (posix_memalign(&super->migr_rec_buf, sector_size,
+	    MIGR_REC_BUF_SECTORS*sector_size) != 0) {
 		pr_err("could not allocate migr_rec buffer\n");
 		free(super->buf);
 		return 2;
@@ -4855,8 +4894,8 @@ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
 			pr_err("could not allocate new mpb\n");
 			return 0;
 		}
-		if (posix_memalign(&super->migr_rec_buf, 512,
-				   MIGR_REC_BUF_SIZE) != 0) {
+		if (posix_memalign(&super->migr_rec_buf, sector_size,
+				   MIGR_REC_BUF_SECTORS*sector_size) != 0) {
 			pr_err("could not allocate migr_rec buffer\n");
 			free(super->buf);
 			free(super);
@@ -5018,7 +5057,8 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
 		pr_err("could not allocate superblock\n");
 		return 0;
 	}
-	if (posix_memalign(&super->migr_rec_buf, 512, MIGR_REC_BUF_SIZE) != 0) {
+	if (posix_memalign(&super->migr_rec_buf, MAX_SECTOR_SIZE,
+	    MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE) != 0) {
 		pr_err("could not allocate migr_rec buffer\n");
 		free(super->buf);
 		free(super);
@@ -5296,10 +5336,12 @@ static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk,
 	}
 
 	/* clear migr_rec when adding disk to container */
-	memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SIZE);
-	if (lseek64(fd, size - MIGR_REC_POSITION, SEEK_SET) >= 0) {
+	memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SECTORS*super->sector_size);
+	if (lseek64(fd, size - MIGR_REC_SECTOR_POSITION*super->sector_size,
+	    SEEK_SET) >= 0) {
 		if (write(fd, super->migr_rec_buf,
-			MIGR_REC_BUF_SIZE) != MIGR_REC_BUF_SIZE)
+		    MIGR_REC_BUF_SECTORS*super->sector_size) !=
+		    MIGR_REC_BUF_SECTORS*super->sector_size)
 			perror("Write migr_rec failed");
 	}
 
@@ -5475,7 +5517,8 @@ static int write_super_imsm(struct supertype *st, int doclose)
 		super->clean_migration_record_by_mdmon = 0;
 	}
 	if (clear_migration_record)
-		memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SIZE);
+		memset(super->migr_rec_buf, 0,
+		    MIGR_REC_BUF_SECTORS*sector_size);
 
 	if (sector_size == 4096)
 		convert_to_4k(super);
@@ -5489,9 +5532,11 @@ static int write_super_imsm(struct supertype *st, int doclose)
 			unsigned long long dsize;
 
 			get_dev_size(d->fd, NULL, &dsize);
-			if (lseek64(d->fd, dsize - 512, SEEK_SET) >= 0) {
+			if (lseek64(d->fd, dsize - sector_size,
+			    SEEK_SET) >= 0) {
 				if (write(d->fd, super->migr_rec_buf,
-					MIGR_REC_BUF_SIZE) != MIGR_REC_BUF_SIZE)
+				    MIGR_REC_BUF_SECTORS*sector_size) !=
+				    MIGR_REC_BUF_SECTORS*sector_size)
 					perror("Write migr_rec failed");
 			}
 		}
@@ -10715,6 +10760,7 @@ static int imsm_manage_reshape(
 	int ret_val = 0;
 	struct intel_super *super = st->sb;
 	struct intel_dev *dv;
+	unsigned int sector_size = super->sector_size;
 	struct imsm_dev *dev = NULL;
 	struct imsm_map *map_src;
 	int migr_vol_qan = 0;
@@ -10792,8 +10838,8 @@ static int imsm_manage_reshape(
 	buf_size += __le32_to_cpu(migr_rec->dest_depth_per_unit) * 512;
 	/* add space for stripe aligment */
 	buf_size += old_data_stripe_length;
-	if (posix_memalign((void **)&buf, 4096, buf_size)) {
-		dprintf("imsm: Cannot allocate checpoint buffer\n");
+	if (posix_memalign((void **)&buf, MAX_SECTOR_SIZE, buf_size)) {
+		dprintf("imsm: Cannot allocate checkpoint buffer\n");
 		goto abort;
 	}
 
@@ -10909,17 +10955,18 @@ static int imsm_manage_reshape(
 	/* clear migr_rec on disks after successful migration */
 	struct dl *d;
 
-	memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SIZE);
+	memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SECTORS*sector_size);
 	for (d = super->disks; d; d = d->next) {
 		if (d->index < 0 || is_failed(&d->disk))
 			continue;
 		unsigned long long dsize;
 
 		get_dev_size(d->fd, NULL, &dsize);
-		if (lseek64(d->fd, dsize - MIGR_REC_POSITION,
+		if (lseek64(d->fd, dsize - MIGR_REC_SECTOR_POSITION*sector_size,
 			    SEEK_SET) >= 0) {
 			if (write(d->fd, super->migr_rec_buf,
-				MIGR_REC_BUF_SIZE) != MIGR_REC_BUF_SIZE)
+			    MIGR_REC_BUF_SECTORS*sector_size) !=
+			    MIGR_REC_BUF_SECTORS*sector_size)
 				perror("Write migr_rec failed");
 		}
 	}
-- 
2.7.4


^ permalink raw reply related

* Re: [PATCH v2 0/4] IMSM: Add support for 4Kn sector size drives
From: Jes Sorensen @ 2016-11-17 14:30 UTC (permalink / raw)
  To: Pawel Baldysiak; +Cc: linux-raid
In-Reply-To: <1479391118-31600-1-git-send-email-pawel.baldysiak@intel.com>

Pawel Baldysiak <pawel.baldysiak@intel.com> writes:
> This patch set adds support for IMSM with 4Kn sector size drives
> First patch adds the generic function for receiving sector size,
> rest are IMSM specific.
> Internal calculation are still based on 512-bytes sector,
> variables are converted during read/write from/to member drive.
> Mixing of devices with different sector size is not allowed.
>
> v2: Adjustments according to Jes' comments:
> Patch 1/4: Fixed typo in original and copy-pasted function ("\b"->"\n");
> Patch 2/4: Added missing "\n" at the end of error msg;
> Patch 3/4, 4/4: Switched hardcoded 4k to #define;
>
> Pawel Baldysiak (4):
>   Add function for getting member drive sector size
>   IMSM: Read and store device sector size
>   IMSM: Add support for 4Kn sector size drives
>   IMSM: 4Kn drives support - adapt general migration record
>
>  mdadm.h       |   1 +
>  super-intel.c | 323 +++++++++++++++++++++++++++++++++++++++++++++-------------
>  super1.c      |   3 +-
>  util.c        |  18 +++-
>  4 files changed, 271 insertions(+), 74 deletions(-)

Looks good - applied!

Thanks,
Jes

^ permalink raw reply

* Re: [PATCH] Increase buffer for sysfs path
From: Jes Sorensen @ 2016-11-17 14:44 UTC (permalink / raw)
  To: Tomasz Majchrzak; +Cc: linux-raid
In-Reply-To: <1477643750-4381-1-git-send-email-tomasz.majchrzak@intel.com>

Tomasz Majchrzak <tomasz.majchrzak@intel.com> writes:
> 'unacknowledged_bad_blocks' is a long name for sysfs property and it
> makes sysfs path over 50 characters long. Increase buffer to the double
> length of the longest path available in sysfs at the moment.
>
> Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
> ---
>  sysfs.c | 36 ++++++++++++++++++++----------------
>  1 file changed, 20 insertions(+), 16 deletions(-)

Applied!

Thanks,
Jes

>
> diff --git a/sysfs.c b/sysfs.c
> index c7a8e66..fc1f01e 100644
> --- a/sysfs.c
> +++ b/sysfs.c
> @@ -27,6 +27,8 @@
>  #include	<dirent.h>
>  #include	<ctype.h>
>  
> +#define MAX_SYSFS_PATH_LEN	120
> +
>  int load_sys(char *path, char *buf, int len)
>  {
>  	int fd = open(path, O_RDONLY);
> @@ -63,15 +65,15 @@ void sysfs_free(struct mdinfo *sra)
>  
>  int sysfs_open(char *devnm, char *devname, char *attr)
>  {
> -	char fname[50];
> +	char fname[MAX_SYSFS_PATH_LEN];
>  	int fd;
>  
> -	sprintf(fname, "/sys/block/%s/md/", devnm);
> +	snprintf(fname, MAX_SYSFS_PATH_LEN, "/sys/block/%s/md/", devnm);
>  	if (devname) {
> -		strcat(fname, devname);
> -		strcat(fname, "/");
> +		strncat(fname, devname, MAX_SYSFS_PATH_LEN - strlen(fname));
> +		strncat(fname, "/", MAX_SYSFS_PATH_LEN - strlen(fname));
>  	}
> -	strcat(fname, attr);
> +	strncat(fname, attr, MAX_SYSFS_PATH_LEN - strlen(fname));
>  	fd = open(fname, O_RDWR);
>  	if (fd < 0 && errno == EACCES)
>  		fd = open(fname, O_RDONLY);
> @@ -396,15 +398,17 @@ unsigned long long get_component_size(int fd)
>  	 * This returns in units of sectors.
>  	 */
>  	struct stat stb;
> -	char fname[50];
> +	char fname[MAX_SYSFS_PATH_LEN];
>  	int n;
>  	if (fstat(fd, &stb))
>  		return 0;
>  	if (major(stb.st_rdev) != (unsigned)get_mdp_major())
> -		sprintf(fname, "/sys/block/md%d/md/component_size",
> +		snprintf(fname, MAX_SYSFS_PATH_LEN,
> +			"/sys/block/md%d/md/component_size",
>  			(int)minor(stb.st_rdev));
>  	else
> -		sprintf(fname, "/sys/block/md_d%d/md/component_size",
> +		snprintf(fname, MAX_SYSFS_PATH_LEN,
> +			"/sys/block/md_d%d/md/component_size",
>  			(int)minor(stb.st_rdev)>>MdpMinorShift);
>  	fd = open(fname, O_RDONLY);
>  	if (fd < 0)
> @@ -420,11 +424,11 @@ unsigned long long get_component_size(int fd)
>  int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev,
>  		  char *name, char *val)
>  {
> -	char fname[50];
> +	char fname[MAX_SYSFS_PATH_LEN];
>  	unsigned int n;
>  	int fd;
>  
> -	sprintf(fname, "/sys/block/%s/md/%s/%s",
> +	snprintf(fname, MAX_SYSFS_PATH_LEN, "/sys/block/%s/md/%s/%s",
>  		sra->sys_name, dev?dev->sys_name:"", name);
>  	fd = open(fname, O_WRONLY);
>  	if (fd < 0)
> @@ -457,11 +461,11 @@ int sysfs_set_num_signed(struct mdinfo *sra, struct mdinfo *dev,
>  
>  int sysfs_uevent(struct mdinfo *sra, char *event)
>  {
> -	char fname[50];
> +	char fname[MAX_SYSFS_PATH_LEN];
>  	int n;
>  	int fd;
>  
> -	sprintf(fname, "/sys/block/%s/uevent",
> +	snprintf(fname, MAX_SYSFS_PATH_LEN, "/sys/block/%s/uevent",
>  		sra->sys_name);
>  	fd = open(fname, O_WRONLY);
>  	if (fd < 0)
> @@ -478,10 +482,10 @@ int sysfs_uevent(struct mdinfo *sra, char *event)
>  
>  int sysfs_attribute_available(struct mdinfo *sra, struct mdinfo *dev, char *name)
>  {
> -	char fname[50];
> +	char fname[MAX_SYSFS_PATH_LEN];
>  	struct stat st;
>  
> -	sprintf(fname, "/sys/block/%s/md/%s/%s",
> +	snprintf(fname, MAX_SYSFS_PATH_LEN, "/sys/block/%s/md/%s/%s",
>  		sra->sys_name, dev?dev->sys_name:"", name);
>  
>  	return stat(fname, &st) == 0;
> @@ -490,10 +494,10 @@ int sysfs_attribute_available(struct mdinfo *sra, struct mdinfo *dev, char *name
>  int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev,
>  		       char *name)
>  {
> -	char fname[50];
> +	char fname[MAX_SYSFS_PATH_LEN];
>  	int fd;
>  
> -	sprintf(fname, "/sys/block/%s/md/%s/%s",
> +	snprintf(fname, MAX_SYSFS_PATH_LEN, "/sys/block/%s/md/%s/%s",
>  		sra->sys_name, dev?dev->sys_name:"", name);
>  	fd = open(fname, O_RDWR);
>  	if (fd < 0)

^ permalink raw reply

* Re: [PATCH] Increase buffer for sysfs disk state
From: Jes Sorensen @ 2016-11-17 14:46 UTC (permalink / raw)
  To: Tomasz Majchrzak; +Cc: linux-raid
In-Reply-To: <1477560856-17310-1-git-send-email-tomasz.majchrzak@intel.com>

Tomasz Majchrzak <tomasz.majchrzak@intel.com> writes:
> Bad block support has incremented sysfs disk state reported by kernel
> ("external_bbl") so it became longer than 20 bytes. It causes reshape to
> fail as it reads truncated entry from sysfs.
>
> Increase buffer so it can accommodate the string including all state
> values currently implemented in kernel at the same time.
>
> Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
> ---
>  Grow.c        | 6 ++++--
>  monitor.c     | 4 ++--
>  super-intel.c | 5 +++--
>  3 files changed, 9 insertions(+), 6 deletions(-)

Applied!

Thanks,
Jes

^ permalink raw reply

* Re: Newly-created arrays don't auto-assemble - related to hostname change?
From: Andy Smith @ 2016-11-17 15:09 UTC (permalink / raw)
  To: linux-raid
In-Reply-To: <87lgwihc2v.fsf@notabene.neil.brown.name>

Hi Neil,

On Thu, Nov 17, 2016 at 05:09:28PM +1100, NeilBrown wrote:
> On Thu, Nov 17 2016, Andy Smith wrote:
> > After install the name of the server was changed from "tbd" to
> > "jfd". Another array was then created (md5), added to
> > /etc/mdadm/mdadm.conf and the initramfs was rebuilt
> > (update-initramfs -u).
> >
> > md5 does not auto-assemble. It can be started fine after boot with:
> >
> >     # mdadm --assemble /dev/md5
> >
> > or:
> >
> >     # mdadm --incremental /dev/sdc
> >     # mdadm --incremental /dev/sdd
> 
> This is almost exactly what udev does when the devices are discovered,
> so if it works here, it should work when udev does it.

Indeed. So confusing. :(

> My only guess is that maybe the "DEVICE /dev/sd*" line in the mdadm.conf
> is causing confusion.  udev might be using a different name, though that
> would be odd.
> 
> Can you try removing that line and see if it makes a difference?

I've now tried that and it hasn't made a difference.

I don't know anything about udev but I guess that assembly is
handled by /lib/udev/rules.d/64-md-raid-assembly.rules whose only
relevant ACTION lines are:

# remember you can limit what gets auto/incrementally assembled by
# mdadm.conf(5)'s 'AUTO' and selectively whitelist using 'ARRAY'
ACTION=="add|change", IMPORT{program}="/sbin/mdadm --incremental --export $tempnode --offroot ${DEVLINKS}"
ACTION=="add|change", ENV{MD_STARTED}=="*unsafe*", ENV{MD_FOREIGN}=="no", ENV{SYSTEMD_WANTS}+="mdadm-last-resort@$env{MD_DEVICE}.timer"

…but I can't work out why they wouldn't be working here.

Time for a Debian bug report?

Cheers,
Andy

^ permalink raw reply

* Re: [PATCH v6 04/11] md/r5cache: caching mode of r5cache
From: Song Liu @ 2016-11-17 17:25 UTC (permalink / raw)
  To: Shaohua Li
  Cc: Shaohua Li, linux-raid@vger.kernel.org, NeilBrown, Kernel Team,
	dan.j.williams@intel.com, hch@infradead.org,
	liuzhengyuang521@gmail.com, liuzhengyuan@kylinos.cn
In-Reply-To: <504BBD2B-D9C8-4D54-992D-6CF460015C32@fb.com>

> 
>> 
>>>>> @@ -1547,6 +1611,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
>>>>> 
>>>>> again:
>>>>> 			dev = &sh->dev[i];
>>>>> +			clear_bit(R5_InJournal, &dev->flags);
>>>> 
>>>> why clear the bit here? isn't this bit cleared when the stripe is initialized?
>>> 
>>> This is necessary when we rewrite a page that is already in journal. This means there is new data coming to 
>>> this stripe and dev, so we should not consider the page is secured in journal. 
>> 
>> This sounds suggest we should clear the bit after writeout is done.
> 
> That may also work. Let me confirm. 
> 

I think we still need clear R5_InJournal in ops_run_biodrain(). r5l_write_stripe() and r5l_log_stripe() uses 
R5_InJournal as a flag of "no need to write journal again". If we do not clear R5_InJournal in 
ops_run_biodrain(), newer data will not be written to journal. 

Thanks,
Song


^ permalink raw reply

* Re: [md PATCH 1/4] md: add block tracing for bio_remapping
From: Shaohua Li @ 2016-11-17 18:04 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid
In-Reply-To: <87twb6hdqq.fsf@notabene.neil.brown.name>

On Thu, Nov 17, 2016 at 04:33:33PM +1100, Neil Brown wrote:
> On Thu, Nov 17 2016, Shaohua Li wrote:
> 
> > On Mon, Nov 14, 2016 at 04:30:21PM +1100, Neil Brown wrote:
> >> The block tracing infrastructure (accessed with blktrace/blkparse)
> >> supports the tracing of mapping bios from one device to another.
> >> This is currently used when a bio in a partition is mapped to the
> >> whole device, when bios are mapped by dm, and for mapping in md/raid5.
> >> Other md personalities do not include this tracing yet, so add it.
> >> 
> >> When a read-error is detected we redirect the request to a different device.
> >> This could justifiably be seen as a new mapping for the originial bio,
> >> or a secondary mapping for the bio that errors.  This patch uses
> >> the second option.
> >> 
> >> When md is used under dm-raid, the mappings are not traced as we do
> >> not have access to the block device number of the parent.
> >
> > Looks the the original sector (the last parameter of trace_block_bio_remap)
> > isn't correct.
> > - in linear/raid0, bio_split already updated bio->bi_iter.bi_sector
> 
> Oh yes, of course.  in the common case 'split == bio' so when
> split->bi_iter.bi_sector  is adjusted, bio->.... is as well.
> I'll fix that, and also add calls to trace_block_split() as appropriate.
> 
> 
> > - in raid1/raid10, r1_bio->sector is updated before the bio is sent.
> 
> Here I really think my code is correct.  r1_bio->sector is always the
> address in the array of the request.  It is only set once for each
> r1_bio, and that is before the call to trace_block_io_remap().

Oh, you are right, sorry. I think moving the trace_remap right before we add
the bio to plug or pending list is better. It will make the code simpler. The
timing of the trace doesn't matter.

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot
From: Shaohua Li @ 2016-11-17 18:13 UTC (permalink / raw)
  To: NeilBrown; +Cc: Shaohua Li, linux-raid, songliubraving, Zhengyuan Liu
In-Reply-To: <87wpg2heg8.fsf@notabene.neil.brown.name>

On Thu, Nov 17, 2016 at 04:18:15PM +1100, Neil Brown wrote:
> On Thu, Nov 17 2016, Shaohua Li wrote:
> 
> > Currently raid5-cache update superblock in .quiesce. But since at
> > shutdown/reboot, .quiesce is called with reconfig mutex locked,
> > superblock isn't guaranteed to be called in reclaim thread (see
> > 8e018c21da3). This will make assemble do unnecessary journal recovery.
> > It doesn't corrupt data but is annoying.  This adds an extra hook to
> > guarantee journal is flushed to raid disks.  And since this hook is
> > called before superblock update, this will guarantee we have a uptodate
> > superblock in shutdown/reboot
> 
> Hi.
> I don't quite follow some of the reasoning here.
> In particular, the ->stop_writes() that you have implemented
> does almost exactly the same thing as r5l_quiesce(1).
> So why not simply call ->quiesce(mddev, 1) in __md_stop_writes()??
> You probably need to also call ->quiesce(mddev, 0) to keep things
> balanced.

reboot (md_notify_reboot) doesn't call .quiesce, maybe we should do though. And
in stop, we hold reconfig_mutex before calling .quiesce. And with commit
8e018c21da3, r5l_write_super_and_discard_space tries to hold the reconfig_mutex
before write super, which it can't hold, so superblock write is skipped. After
.quiesce we don't write superblock. To fix the shutdown case, we can add a
superblock write after .quiesce. But I think it's more generic to add a
->stop_writes since it will work for the reboot case.

> Also you have introduced a static mutex (which isn't my favourite sort
> of thing) without giving any explanation why in the changelog comment.
> So I cannot easily see if that addition is at all justified.

Now it's possible both the reclaim thread and the thread calling
__md_stop_writes run into r5l_do_reclaim. The mutex is trying to avoid races.
I'll add comments there.

Thanks,
Shaohua

> 
> Thanks,
> NeilBrown
> 
> >
> > Signed-off-by: Shaohua Li <shli@fb.com>
> > Cc: Zhengyuan Liu <liuzhengyuan@kylinos.cn>
> > Cc: NeilBrown <neilb@suse.com>
> > ---
> >  drivers/md/md.c          |  3 +++
> >  drivers/md/md.h          |  2 ++
> >  drivers/md/raid5-cache.c | 19 ++++++++++++++++++-
> >  drivers/md/raid5.c       |  3 +++
> >  drivers/md/raid5.h       |  1 +
> >  5 files changed, 27 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/md/md.c b/drivers/md/md.c
> > index 1f1c7f0..155dd42 100644
> > --- a/drivers/md/md.c
> > +++ b/drivers/md/md.c
> > @@ -5471,6 +5471,9 @@ static void __md_stop_writes(struct mddev *mddev)
> >  
> >  	del_timer_sync(&mddev->safemode_timer);
> >  
> > +	if (mddev->pers && mddev->pers->stop_writes)
> > +		mddev->pers->stop_writes(mddev);
> > +
> >  	bitmap_flush(mddev);
> >  
> >  	if (mddev->ro == 0 &&
> > diff --git a/drivers/md/md.h b/drivers/md/md.h
> > index af6b33c..8da6fe9 100644
> > --- a/drivers/md/md.h
> > +++ b/drivers/md/md.h
> > @@ -512,6 +512,8 @@ struct md_personality
> >  	/* congested implements bdi.congested_fn().
> >  	 * Will not be called while array is 'suspended' */
> >  	int (*congested)(struct mddev *mddev, int bits);
> > +	/* stop touching raid disks */
> > +	void (*stop_writes)(struct mddev *mddev);
> >  };
> >  
> >  struct md_sysfs_entry {
> > diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> > index 2e270e6..641dec8 100644
> > --- a/drivers/md/raid5-cache.c
> > +++ b/drivers/md/raid5-cache.c
> > @@ -738,11 +738,13 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
> >  
> >  static void r5l_do_reclaim(struct r5l_log *log)
> >  {
> > +	static DEFINE_MUTEX(lock);
> >  	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
> >  	sector_t reclaimable;
> >  	sector_t next_checkpoint;
> >  	u64 next_cp_seq;
> >  
> > +	mutex_lock(&lock);
> >  	spin_lock_irq(&log->io_list_lock);
> >  	/*
> >  	 * move proper io_unit to reclaim list. We should not change the order.
> > @@ -769,8 +771,10 @@ static void r5l_do_reclaim(struct r5l_log *log)
> >  	spin_unlock_irq(&log->io_list_lock);
> >  
> >  	BUG_ON(reclaimable < 0);
> > -	if (reclaimable == 0)
> > +	if (reclaimable == 0) {
> > +		mutex_unlock(&lock);
> >  		return;
> > +	}
> >  
> >  	/*
> >  	 * write_super will flush cache of each raid disk. We must write super
> > @@ -784,6 +788,8 @@ static void r5l_do_reclaim(struct r5l_log *log)
> >  	log->last_cp_seq = next_cp_seq;
> >  	mutex_unlock(&log->io_mutex);
> >  
> > +	mutex_unlock(&lock);
> > +
> >  	r5l_run_no_space_stripes(log);
> >  }
> >  
> > @@ -836,6 +842,17 @@ void r5l_quiesce(struct r5l_log *log, int state)
> >  	}
> >  }
> >  
> > +void r5l_stop_writes(struct mddev *mddev)
> > +{
> > +	struct r5conf *conf = mddev->private;
> > +	struct r5l_log *log = conf->log;
> > +
> > +	if (!log)
> > +		return;
> > +	r5l_wake_reclaim(log, -1L);
> > +	r5l_do_reclaim(log);
> > +}
> > +
> >  bool r5l_log_disk_error(struct r5conf *conf)
> >  {
> >  	struct r5l_log *log;
> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > index df88656..3d27338 100644
> > --- a/drivers/md/raid5.c
> > +++ b/drivers/md/raid5.c
> > @@ -7866,6 +7866,7 @@ static struct md_personality raid6_personality =
> >  	.quiesce	= raid5_quiesce,
> >  	.takeover	= raid6_takeover,
> >  	.congested	= raid5_congested,
> > +	.stop_writes	= r5l_stop_writes,
> >  };
> >  static struct md_personality raid5_personality =
> >  {
> > @@ -7889,6 +7890,7 @@ static struct md_personality raid5_personality =
> >  	.quiesce	= raid5_quiesce,
> >  	.takeover	= raid5_takeover,
> >  	.congested	= raid5_congested,
> > +	.stop_writes	= r5l_stop_writes,
> >  };
> >  
> >  static struct md_personality raid4_personality =
> > @@ -7913,6 +7915,7 @@ static struct md_personality raid4_personality =
> >  	.quiesce	= raid5_quiesce,
> >  	.takeover	= raid4_takeover,
> >  	.congested	= raid5_congested,
> > +	.stop_writes	= r5l_stop_writes,
> >  };
> >  
> >  static int __init raid5_init(void)
> > diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
> > index 57ec49f..0643cc0 100644
> > --- a/drivers/md/raid5.h
> > +++ b/drivers/md/raid5.h
> > @@ -633,4 +633,5 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh);
> >  extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
> >  extern void r5l_quiesce(struct r5l_log *log, int state);
> >  extern bool r5l_log_disk_error(struct r5conf *conf);
> > +extern void r5l_stop_writes(struct mddev *mddev);
> >  #endif
> > -- 
> > 2.9.3



^ permalink raw reply

* Re: [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot
From: Shaohua Li @ 2016-11-17 18:23 UTC (permalink / raw)
  To: Wols Lists
  Cc: NeilBrown, Shaohua Li, linux-raid, songliubraving, Zhengyuan Liu
In-Reply-To: <582D7C07.8060507@youngman.org.uk>

On Thu, Nov 17, 2016 at 09:44:39AM +0000, Wols Lists wrote:
> On 17/11/16 05:18, NeilBrown wrote:
> > On Thu, Nov 17 2016, Shaohua Li wrote:
> > 
> >> Currently raid5-cache update superblock in .quiesce. But since at
> >> shutdown/reboot, .quiesce is called with reconfig mutex locked,
> >> superblock isn't guaranteed to be called in reclaim thread (see
> >> 8e018c21da3). This will make assemble do unnecessary journal recovery.
> >> It doesn't corrupt data but is annoying.  This adds an extra hook to
> >> guarantee journal is flushed to raid disks.  And since this hook is
> >> called before superblock update, this will guarantee we have a uptodate
> >> superblock in shutdown/reboot
> > 
> > Hi.
> > I don't quite follow some of the reasoning here.
> > In particular, the ->stop_writes() that you have implemented
> > does almost exactly the same thing as r5l_quiesce(1).
> > So why not simply call ->quiesce(mddev, 1) in __md_stop_writes()??
> > You probably need to also call ->quiesce(mddev, 0) to keep things
> > balanced.
> > 
> > Also you have introduced a static mutex (which isn't my favourite sort
> > of thing) without giving any explanation why in the changelog comment.
> > So I cannot easily see if that addition is at all justified.
> > 
> > Thanks,
> > NeilBrown
> > 
> I need to be careful I don't ruffle any feathers here ...
> 
> But this is saying to me this is a nice feature that hasn't been
> properly spec'd and thought through. Don't get me wrong, I know that -
> in typical linux fashion - people have been adding things, and raid has
> "just growed" topsy fashion. So it's incredibly difficult to spec a new
> feature when you don't have a spec for the stuff you're building it on.
> 
> Anyways, what I'm saying is, it seems to me this caching stuff (it's a
> new feature, iirc) would be great for trying to write out a proper spec
> of what's meant to be going on. It'll roll over into spec'ing the stuff
> it relies on ...
> 
> And yes, I *AM* volunteering to do the work - as I said elsewhere, I
> want to put a load of kerneldoc into the raid source, and get to
> understand it all, but the downside is you'll get a lot of newbie-ish
> questions from me trying to get to grips with what's going on. I'm an
> experienced C programmer but kernel style is alien to me - you know the
> disconnect when you're reading something, you can read the words easily,
> but you can't decipher the meaning. That's how I feel reading the kernel
> source at the moment.
> 
> Are we up for it?

Yep, that makes sense. the journal (current write-through mode and upcoming
write-back mode) does deserve a description. I'll add something into
Documentation dir in kernel source.

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH 1/2] raid5-cache: update superblock at shutdown/reboot
From: Wols Lists @ 2016-11-17 19:11 UTC (permalink / raw)
  To: Shaohua Li
  Cc: NeilBrown, Shaohua Li, linux-raid, songliubraving, Zhengyuan Liu
In-Reply-To: <20161117182359.tyyqmrbngtqufcvf@kernel.org>

On 17/11/16 18:23, Shaohua Li wrote:
>> And yes, I *AM* volunteering to do the work - as I said elsewhere, I
>> > want to put a load of kerneldoc into the raid source, and get to
>> > understand it all, but the downside is you'll get a lot of newbie-ish
>> > questions from me trying to get to grips with what's going on. I'm an
>> > experienced C programmer but kernel style is alien to me - you know the
>> > disconnect when you're reading something, you can read the words easily,
>> > but you can't decipher the meaning. That's how I feel reading the kernel
>> > source at the moment.
>> > 
>> > Are we up for it?

> Yep, that makes sense. the journal (current write-through mode and upcoming
> write-back mode) does deserve a description. I'll add something into
> Documentation dir in kernel source.

From what I can make out ... :-)

The new kernel documentation system actually builds a load of stuff into
the Documentation/output directory from the kernel source. In other
words, it'll be far better edited into the source files. And/or put a
.rst file in the md directory.

I need to dig into this, and work out how it all fits together (I'm
having trouble running it on my main system at the moment, gentoo takes
forever to upgrade and the bits I need won't install ...)

https://www.kernel.org/doc/

When you've written it up, post it to the list, and we'll see about
getting it into the new documentation system rather than just dropping
it into the Documentation directory (I think the new system will put it
in Documentation/output/drivers/md, which is lot more sensible than just
jumbled in Documentation along with everything else.

Cheers,
Wol

^ permalink raw reply

* [PATCH] dm: Avoid sleeping while holding the dm_bufio lock
From: Douglas Anderson @ 2016-11-17 19:24 UTC (permalink / raw)
  To: Alasdair Kergon, Mike Snitzer
  Cc: David Rientjes, Dmitry Torokhov, Sonny Rao, linux,
	Douglas Anderson, dm-devel, shli, linux-raid, linux-kernel

We've seen in-field reports showing _lots_ (18 in one case, 41 in
another) of tasks all sitting there blocked on:

  mutex_lock+0x4c/0x68
  dm_bufio_shrink_count+0x38/0x78
  shrink_slab.part.54.constprop.65+0x100/0x464
  shrink_zone+0xa8/0x198

In the two cases analyzed, we see one task that looks like this:

  Workqueue: kverityd verity_prefetch_io

  __switch_to+0x9c/0xa8
  __schedule+0x440/0x6d8
  schedule+0x94/0xb4
  schedule_timeout+0x204/0x27c
  schedule_timeout_uninterruptible+0x44/0x50
  wait_iff_congested+0x9c/0x1f0
  shrink_inactive_list+0x3a0/0x4cc
  shrink_lruvec+0x418/0x5cc
  shrink_zone+0x88/0x198
  try_to_free_pages+0x51c/0x588
  __alloc_pages_nodemask+0x648/0xa88
  __get_free_pages+0x34/0x7c
  alloc_buffer+0xa4/0x144
  __bufio_new+0x84/0x278
  dm_bufio_prefetch+0x9c/0x154
  verity_prefetch_io+0xe8/0x10c
  process_one_work+0x240/0x424
  worker_thread+0x2fc/0x424
  kthread+0x10c/0x114

...and that looks to be the one holding the mutex.

The problem has been reproduced on fairly easily:
0. Be running Chrome OS w/ verity enabled on the root filesystem
1. Pick test patch: http://crosreview.com/412360
2. Install launchBalloons.sh and balloon.arm from
     http://crbug.com/468342
   ...that's just a memory stress test app.
3. On a 4GB rk3399 machine, run
     nice ./launchBalloons.sh 4 900 100000
   ...that tries to eat 4 * 900 MB of memory and keep accessing.
4. Login to the Chrome web browser and restore many tabs

With that, I've seen printouts like:
  DOUG: long bufio 90758 ms
...and stack trace always show's we're in dm_bufio_prefetch().

The problem is that we try to allocate memory with GFP_NOIO while
we're holding the dm_bufio lock.  Instead we should be using
GFP_NOWAIT.  Using GFP_NOIO can cause us to sleep while holding the
lock and that causes the above problems.

The current behavior explained by David Rientjes:

  It will still try reclaim initially because __GFP_WAIT (or
  __GFP_KSWAPD_RECLAIM) is set by GFP_NOIO.  This is the cause of
  contention on dm_bufio_lock() that the thread holds.  You want to
  pass GFP_NOWAIT instead of GFP_NOIO to alloc_buffer() when holding a
  mutex that can be contended by a concurrent slab shrinker (if
  count_objects didn't use a trylock, this pattern would trivially
  deadlock).

Suggested-by: David Rientjes <rientjes@google.com>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
---
Note that this change was developed and tested against the Chrome OS
4.4 kernel tree, not mainline.  Due to slight differences in verity
between mainline and Chrome OS it became too difficult to reproduce my
testing setup on mainline.  This patch still seems correct and
relevant to upstream, so I'm posting it.  If this is not acceptible to
you then please ignore this patch.

Also note that when I tested the Chrome OS 3.14 kernel tree I couldn't
reproduce the long delays described in the patch.  Presumably
something changed in either the kernel config or the memory management
code between the two kernel versions that made this crop up.  In a
similar vein, it is possible that problems described in this patch are
no longer reproducible upstream.  However, the arguments made in this
patch (that we don't want to block while holding the mutex) still
apply so I think the patch may still have merit.

 drivers/md/dm-bufio.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index b3ba142e59a4..3c767399cc59 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -827,7 +827,8 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
 	 * dm-bufio is resistant to allocation failures (it just keeps
 	 * one buffer reserved in cases all the allocations fail).
 	 * So set flags to not try too hard:
-	 *	GFP_NOIO: don't recurse into the I/O layer
+	 *	GFP_NOWAIT: don't wait; if we need to sleep we'll release our
+	 *		    mutex and wait ourselves.
 	 *	__GFP_NORETRY: don't retry and rather return failure
 	 *	__GFP_NOMEMALLOC: don't use emergency reserves
 	 *	__GFP_NOWARN: don't print a warning in case of failure
@@ -837,7 +838,8 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
 	 */
 	while (1) {
 		if (dm_bufio_cache_size_latch != 1) {
-			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+			b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY |
+					 __GFP_NOMEMALLOC | __GFP_NOWARN);
 			if (b)
 				return b;
 		}
-- 
2.8.0.rc3.226.g39d4020

^ permalink raw reply related

* Re: dm: Avoid sleeping while holding the dm_bufio lock
From: Mike Snitzer @ 2016-11-17 20:28 UTC (permalink / raw)
  To: Douglas Anderson
  Cc: Alasdair Kergon, David Rientjes, Dmitry Torokhov, Sonny Rao,
	linux, dm-devel, shli, linux-raid, linux-kernel
In-Reply-To: <1479410660-31408-1-git-send-email-dianders@chromium.org>

On Thu, Nov 17 2016 at  2:24pm -0500,
Douglas Anderson <dianders@chromium.org> wrote:

> We've seen in-field reports showing _lots_ (18 in one case, 41 in
> another) of tasks all sitting there blocked on:
> 
>   mutex_lock+0x4c/0x68
>   dm_bufio_shrink_count+0x38/0x78
>   shrink_slab.part.54.constprop.65+0x100/0x464
>   shrink_zone+0xa8/0x198
> 
> In the two cases analyzed, we see one task that looks like this:
> 
>   Workqueue: kverityd verity_prefetch_io
> 
>   __switch_to+0x9c/0xa8
>   __schedule+0x440/0x6d8
>   schedule+0x94/0xb4
>   schedule_timeout+0x204/0x27c
>   schedule_timeout_uninterruptible+0x44/0x50
>   wait_iff_congested+0x9c/0x1f0
>   shrink_inactive_list+0x3a0/0x4cc
>   shrink_lruvec+0x418/0x5cc
>   shrink_zone+0x88/0x198
>   try_to_free_pages+0x51c/0x588
>   __alloc_pages_nodemask+0x648/0xa88
>   __get_free_pages+0x34/0x7c
>   alloc_buffer+0xa4/0x144
>   __bufio_new+0x84/0x278
>   dm_bufio_prefetch+0x9c/0x154
>   verity_prefetch_io+0xe8/0x10c
>   process_one_work+0x240/0x424
>   worker_thread+0x2fc/0x424
>   kthread+0x10c/0x114
> 
> ...and that looks to be the one holding the mutex.
> 
> The problem has been reproduced on fairly easily:
> 0. Be running Chrome OS w/ verity enabled on the root filesystem
> 1. Pick test patch: http://crosreview.com/412360
> 2. Install launchBalloons.sh and balloon.arm from
>      http://crbug.com/468342
>    ...that's just a memory stress test app.
> 3. On a 4GB rk3399 machine, run
>      nice ./launchBalloons.sh 4 900 100000
>    ...that tries to eat 4 * 900 MB of memory and keep accessing.
> 4. Login to the Chrome web browser and restore many tabs
> 
> With that, I've seen printouts like:
>   DOUG: long bufio 90758 ms
> ...and stack trace always show's we're in dm_bufio_prefetch().
> 
> The problem is that we try to allocate memory with GFP_NOIO while
> we're holding the dm_bufio lock.  Instead we should be using
> GFP_NOWAIT.  Using GFP_NOIO can cause us to sleep while holding the
> lock and that causes the above problems.
> 
> The current behavior explained by David Rientjes:
> 
>   It will still try reclaim initially because __GFP_WAIT (or
>   __GFP_KSWAPD_RECLAIM) is set by GFP_NOIO.  This is the cause of
>   contention on dm_bufio_lock() that the thread holds.  You want to
>   pass GFP_NOWAIT instead of GFP_NOIO to alloc_buffer() when holding a
>   mutex that can be contended by a concurrent slab shrinker (if
>   count_objects didn't use a trylock, this pattern would trivially
>   deadlock).
> 
> Suggested-by: David Rientjes <rientjes@google.com>
> Signed-off-by: Douglas Anderson <dianders@chromium.org>
> ---
> Note that this change was developed and tested against the Chrome OS
> 4.4 kernel tree, not mainline.  Due to slight differences in verity
> between mainline and Chrome OS it became too difficult to reproduce my
> testing setup on mainline.  This patch still seems correct and
> relevant to upstream, so I'm posting it.  If this is not acceptible to
> you then please ignore this patch.
> 
> Also note that when I tested the Chrome OS 3.14 kernel tree I couldn't
> reproduce the long delays described in the patch.  Presumably
> something changed in either the kernel config or the memory management
> code between the two kernel versions that made this crop up.  In a
> similar vein, it is possible that problems described in this patch are
> no longer reproducible upstream.  However, the arguments made in this
> patch (that we don't want to block while holding the mutex) still
> apply so I think the patch may still have merit.
> 
>  drivers/md/dm-bufio.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
> index b3ba142e59a4..3c767399cc59 100644
> --- a/drivers/md/dm-bufio.c
> +++ b/drivers/md/dm-bufio.c
> @@ -827,7 +827,8 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
>  	 * dm-bufio is resistant to allocation failures (it just keeps
>  	 * one buffer reserved in cases all the allocations fail).
>  	 * So set flags to not try too hard:
> -	 *	GFP_NOIO: don't recurse into the I/O layer
> +	 *	GFP_NOWAIT: don't wait; if we need to sleep we'll release our
> +	 *		    mutex and wait ourselves.
>  	 *	__GFP_NORETRY: don't retry and rather return failure
>  	 *	__GFP_NOMEMALLOC: don't use emergency reserves
>  	 *	__GFP_NOWARN: don't print a warning in case of failure
> @@ -837,7 +838,8 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
>  	 */
>  	while (1) {
>  		if (dm_bufio_cache_size_latch != 1) {
> -			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
> +			b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY |
> +					 __GFP_NOMEMALLOC | __GFP_NOWARN);
>  			if (b)
>  				return b;
>  		}
> -- 
> 2.8.0.rc3.226.g39d4020
> 

I have one report of a very low-memory system hitting issues with bufio
(in the context of DM-thinp, due to bufio shrinker) but nothing
implicating alloc_buffer().

In any case, I'm fine with your patch given that we'll just retry.  BUT
spinning in __alloc_buffer_wait_no_callback() doesn't really change the
fact that you're starved for memory.  It just makes this less visible
right?  Meaning that you won't see hung task timeouts?  Or were you
seeing these tasks manifest this back-pressure through other means?

^ permalink raw reply

* Re:
From: Dennis Dataopslag @ 2016-11-17 20:33 UTC (permalink / raw)
  To: linux-raid
In-Reply-To: <CAGELip+rb=Ztv2K8jPdn608Li_RakEq9Gp5Jyof9eC_bO5QMrQ@mail.gmail.com>

CHeers for the reaction and sorry for my late response, I've been out
for business.

Trying to rebuild this RAID is definately worth it for me. The
learning experience alone already makes it worth.

I did read the wiki page and tried several steps that are on there but
it didn't seem to get me out of trouble.

I used this information from the drive, obviously didn't search for
any "hidden" settings:
" Magic : a92b4efc
        Version : 1.2
    Feature Map : 0x0
     Array UUID : 36fdeb4b:c5360009:0958ad1e:17da451b
           Name : TRD106:0  (local to host TRD106)
  Creation Time : Fri Oct 10 12:27:27 2014
     Raid Level : raid5
   Raid Devices : 4

 Avail Dev Size : 1948250112 (929.00 GiB 997.50 GB)
     Array Size : 5844750336 (2786.99 GiB 2992.51 GB)
    Data Offset : 2048 sectors
   Super Offset : 8 sectors
          State : clean
    Device UUID : b49e2752:d37dac6c:8764c52a:372277bd

    Update Time : Sat Nov  5 14:40:33 2016
       Checksum : d47a9ad4 - correct
         Events : 14934

         Layout : left-symmetric
     Chunk Size : 64K

   Device Role : Active device 0
   Array State : AAAA ('A' == active, '.' == missing)"

Anybody that can give me a little extra push?

On 06/11/16 21:00, Dennis Dataopslag wrote:
> Help wanted very much!

Quick response ...
>
> My setup:
> Thecus N5550 NAS with 5 1TB drives installed.
>
> MD0: RAID 5 config of 4 drives (SD[ABCD]2)
> MD10: RAID 1 config of all 5 drives (SD..1), system generated array
> MD50: RAID 1 config of 4 drives (SD[ABCD]3), system generated array
>
> 1 drive (SDE) set as global hot spare.
>
Bit late now, but you would probably have been better with raid-6.
>
> What happened:
> This weekend I thought it might be a good idea to do a SMART test for
> the drives in my NAS.
> I started the test on 1 drive and after it ran for a while I started
> the other ones.
> While the test was running drive 3 failed. I got a message the RAID
> was degraded and started rebuilding. (My assumption is that at this
> moment the global hot spare will automatically be added to the array)
>
> I stopped the SMART tests of all drives at this moment since it seemed
> logical to me the SMART test (or the outcomes) made the drive fail.
> In stopping the tests, drive 1 also failed!!
> I let it for a little but the admin interface kept telling me it was
> degraded, did not seem to take any actions to start rebuilding.

It can't - there's no spare drive to rebuild on, and there aren't enough
drives to build a working array.

> At this point I started googling and found I should remove and reseat
> the drives. This is also what I did but nothing seemd to happen.
> The turned up as new drives in the admin interface and I re-added them
> to the array, they were added as spares.
> Even after adding them the array didn't start rebuilding.
> I checked stat in mdadm and it told me clean FAILED opposed to the
> degraded in the admin interface.

Yup. You've only got two drives of a four-drive raid 5.

Where did you google? Did you read the linux raid wiki?

https://raid.wiki.kernel.org/index.php/Linux_Raid
>
> I rebooted the NAS since it didn't seem to be doing anything I might interrupt.
> after rebooting it seemed as if the entire array had disappeared!!
> I started looking for options in MDADM and tried every "normal"option
> to rebuild the array (--assemble --scan for example)
> Unfortunately I cannot produce a complete list since I cannot find how
> to get it from the logging.
>
> Finally I mdadm --create a new array with the original 4 drives with
> all the right settings. (Got them from 1 of the original volumes)

OUCH OUCH OUCH!

Are you sure you've got the right settings? A lot of "hidden" settings
have changed their values over the years. Do you know which mdadm was
used to create the array in the first place?

> The creation worked but after creation it doesn't seem to have a valid
> partition table. This is the point where I realized I probably fucked
> it up big-time and should call in the help squad!!!
> What I think went wrong is that I re-created an array with the
> original 4 drives from before the first failure but the hot-spare was
> already added?

Nope. You've probably used a newer version of mdadm. That's assuming the
array is still all the original drives. If some of them have been
replaced you've got a still messier problem.
>
> The most important data from the array is saved in an offline backup
> luckily but I would very much like it if there is any way I could
> restore the data from the array.
>
> Is there any way I could get it back online?

You're looking at a big forensic job. I've moved the relevant page to
the archaeology area - probably a bit too soon - but you need to read
the following page

https://raid.wiki.kernel.org/index.php/Reconstruction

Especially the bit about overlays. And wait for the experts to chime in
about how to do a hexdump and work out the values you need to pass to
mdadm to get the array back. It's a lot of work and you could be looking
at a week what with the delays as you wait for replies.

I think it's recoverable. Is it worth it?

Cheers,
Wol

On Sun, Nov 6, 2016 at 10:00 PM, Dennis Dataopslag
<dennisdataopslag@gmail.com> wrote:
> Help wanted very much!
>
> My setup:
> Thecus N5550 NAS with 5 1TB drives installed.
>
> MD0: RAID 5 config of 4 drives (SD[ABCD]2)
> MD10: RAID 1 config of all 5 drives (SD..1), system generated array
> MD50: RAID 1 config of 4 drives (SD[ABCD]3), system generated array
>
> 1 drive (SDE) set as global hot spare.
>
>
> What happened:
> This weekend I thought it might be a good idea to do a SMART test for
> the drives in my NAS.
> I started the test on 1 drive and after it ran for a while I started
> the other ones.
> While the test was running drive 3 failed. I got a message the RAID
> was degraded and started rebuilding. (My assumption is that at this
> moment the global hot spare will automatically be added to the array)
>
> I stopped the SMART tests of all drives at this moment since it seemed
> logical to me the SMART test (or the outcomes) made the drive fail.
> In stopping the tests, drive 1 also failed!!
> I let it for a little but the admin interface kept telling me it was
> degraded, did not seem to take any actions to start rebuilding.
> At this point I started googling and found I should remove and reseat
> the drives. This is also what I did but nothing seemd to happen.
> The turned up as new drives in the admin interface and I re-added them
> to the array, they were added as spares.
> Even after adding them the array didn't start rebuilding.
> I checked stat in mdadm and it told me clean FAILED opposed to the
> degraded in the admin interface.
>
> I rebooted the NAS since it didn't seem to be doing anything I might interrupt.
> after rebooting it seemed as if the entire array had disappeared!!
> I started looking for options in MDADM and tried every "normal"option
> to rebuild the array (--assemble --scan for example)
> Unfortunately I cannot produce a complete list since I cannot find how
> to get it from the logging.
>
> Finally I mdadm --create a new array with the original 4 drives with
> all the right settings. (Got them from 1 of the original volumes)
> The creation worked but after creation it doesn't seem to have a valid
> partition table. This is the point where I realized I probably fucked
> it up big-time and should call in the help squad!!!
> What I think went wrong is that I re-created an array with the
> original 4 drives from before the first failure but the hot-spare was
> already added?
>
> The most important data from the array is saved in an offline backup
> luckily but I would very much like it if there is any way I could
> restore the data from the array.
>
> Is there any way I could get it back online?

^ permalink raw reply

* Re: dm: Avoid sleeping while holding the dm_bufio lock
From: Doug Anderson @ 2016-11-17 20:44 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Alasdair Kergon, David Rientjes, Dmitry Torokhov, Sonny Rao,
	Guenter Roeck, dm-devel, Shaohua Li, linux-raid,
	linux-kernel@vger.kernel.org
In-Reply-To: <20161117202800.GA30170@redhat.com>

Hi,

On Thu, Nov 17, 2016 at 12:28 PM, Mike Snitzer <snitzer@redhat.com> wrote:
> On Thu, Nov 17 2016 at  2:24pm -0500,
> Douglas Anderson <dianders@chromium.org> wrote:
>
>> We've seen in-field reports showing _lots_ (18 in one case, 41 in
>> another) of tasks all sitting there blocked on:
>>
>>   mutex_lock+0x4c/0x68
>>   dm_bufio_shrink_count+0x38/0x78
>>   shrink_slab.part.54.constprop.65+0x100/0x464
>>   shrink_zone+0xa8/0x198
>>
>> In the two cases analyzed, we see one task that looks like this:
>>
>>   Workqueue: kverityd verity_prefetch_io
>>
>>   __switch_to+0x9c/0xa8
>>   __schedule+0x440/0x6d8
>>   schedule+0x94/0xb4
>>   schedule_timeout+0x204/0x27c
>>   schedule_timeout_uninterruptible+0x44/0x50
>>   wait_iff_congested+0x9c/0x1f0
>>   shrink_inactive_list+0x3a0/0x4cc
>>   shrink_lruvec+0x418/0x5cc
>>   shrink_zone+0x88/0x198
>>   try_to_free_pages+0x51c/0x588
>>   __alloc_pages_nodemask+0x648/0xa88
>>   __get_free_pages+0x34/0x7c
>>   alloc_buffer+0xa4/0x144
>>   __bufio_new+0x84/0x278
>>   dm_bufio_prefetch+0x9c/0x154
>>   verity_prefetch_io+0xe8/0x10c
>>   process_one_work+0x240/0x424
>>   worker_thread+0x2fc/0x424
>>   kthread+0x10c/0x114
>>
>> ...and that looks to be the one holding the mutex.
>>
>> The problem has been reproduced on fairly easily:
>> 0. Be running Chrome OS w/ verity enabled on the root filesystem
>> 1. Pick test patch: http://crosreview.com/412360
>> 2. Install launchBalloons.sh and balloon.arm from
>>      http://crbug.com/468342
>>    ...that's just a memory stress test app.
>> 3. On a 4GB rk3399 machine, run
>>      nice ./launchBalloons.sh 4 900 100000
>>    ...that tries to eat 4 * 900 MB of memory and keep accessing.
>> 4. Login to the Chrome web browser and restore many tabs
>>
>> With that, I've seen printouts like:
>>   DOUG: long bufio 90758 ms
>> ...and stack trace always show's we're in dm_bufio_prefetch().
>>
>> The problem is that we try to allocate memory with GFP_NOIO while
>> we're holding the dm_bufio lock.  Instead we should be using
>> GFP_NOWAIT.  Using GFP_NOIO can cause us to sleep while holding the
>> lock and that causes the above problems.
>>
>> The current behavior explained by David Rientjes:
>>
>>   It will still try reclaim initially because __GFP_WAIT (or
>>   __GFP_KSWAPD_RECLAIM) is set by GFP_NOIO.  This is the cause of
>>   contention on dm_bufio_lock() that the thread holds.  You want to
>>   pass GFP_NOWAIT instead of GFP_NOIO to alloc_buffer() when holding a
>>   mutex that can be contended by a concurrent slab shrinker (if
>>   count_objects didn't use a trylock, this pattern would trivially
>>   deadlock).
>>
>> Suggested-by: David Rientjes <rientjes@google.com>
>> Signed-off-by: Douglas Anderson <dianders@chromium.org>
>> ---
>> Note that this change was developed and tested against the Chrome OS
>> 4.4 kernel tree, not mainline.  Due to slight differences in verity
>> between mainline and Chrome OS it became too difficult to reproduce my
>> testing setup on mainline.  This patch still seems correct and
>> relevant to upstream, so I'm posting it.  If this is not acceptible to
>> you then please ignore this patch.
>>
>> Also note that when I tested the Chrome OS 3.14 kernel tree I couldn't
>> reproduce the long delays described in the patch.  Presumably
>> something changed in either the kernel config or the memory management
>> code between the two kernel versions that made this crop up.  In a
>> similar vein, it is possible that problems described in this patch are
>> no longer reproducible upstream.  However, the arguments made in this
>> patch (that we don't want to block while holding the mutex) still
>> apply so I think the patch may still have merit.
>>
>>  drivers/md/dm-bufio.c | 6 ++++--
>>  1 file changed, 4 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
>> index b3ba142e59a4..3c767399cc59 100644
>> --- a/drivers/md/dm-bufio.c
>> +++ b/drivers/md/dm-bufio.c
>> @@ -827,7 +827,8 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
>>        * dm-bufio is resistant to allocation failures (it just keeps
>>        * one buffer reserved in cases all the allocations fail).
>>        * So set flags to not try too hard:
>> -      *      GFP_NOIO: don't recurse into the I/O layer
>> +      *      GFP_NOWAIT: don't wait; if we need to sleep we'll release our
>> +      *                  mutex and wait ourselves.
>>        *      __GFP_NORETRY: don't retry and rather return failure
>>        *      __GFP_NOMEMALLOC: don't use emergency reserves
>>        *      __GFP_NOWARN: don't print a warning in case of failure
>> @@ -837,7 +838,8 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
>>        */
>>       while (1) {
>>               if (dm_bufio_cache_size_latch != 1) {
>> -                     b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
>> +                     b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY |
>> +                                      __GFP_NOMEMALLOC | __GFP_NOWARN);
>>                       if (b)
>>                               return b;
>>               }
>> --
>> 2.8.0.rc3.226.g39d4020
>>
>
> I have one report of a very low-memory system hitting issues with bufio
> (in the context of DM-thinp, due to bufio shrinker) but nothing
> implicating alloc_buffer().
>
> In any case, I'm fine with your patch given that we'll just retry.  BUT
> spinning in __alloc_buffer_wait_no_callback() doesn't really change the
> fact that you're starved for memory.  It just makes this less visible
> right?  Meaning that you won't see hung task timeouts?  Or were you
> seeing these tasks manifest this back-pressure through other means?

It actually significantly increases responsiveness of the system while
in this state, so it makes a real difference.  I believe it actually
changes behavior because it (at least) unblocks kswapd.  In the bug
report I analyzed, I saw:

   kswapd0         D ffffffc000204fd8     0    72      2 0x00000000
   Call trace:
   [<ffffffc000204fd8>] __switch_to+0x9c/0xa8
   [<ffffffc00090b794>] __schedule+0x440/0x6d8
   [<ffffffc00090bac0>] schedule+0x94/0xb4
   [<ffffffc00090be44>] schedule_preempt_disabled+0x28/0x44
   [<ffffffc00090d900>] __mutex_lock_slowpath+0x120/0x1ac
   [<ffffffc00090d9d8>] mutex_lock+0x4c/0x68
   [<ffffffc000708e7c>] dm_bufio_shrink_count+0x38/0x78
   [<ffffffc00030b268>] shrink_slab.part.54.constprop.65+0x100/0x464
   [<ffffffc00030dbd8>] shrink_zone+0xa8/0x198
   [<ffffffc00030e578>] balance_pgdat+0x328/0x508
   [<ffffffc00030eb7c>] kswapd+0x424/0x51c
   [<ffffffc00023f06c>] kthread+0x10c/0x114
   [<ffffffc000203dd0>] ret_from_fork+0x10/0x40

I'm not an expert, but I believe that blocking swapd isn't a super
great idea and that if we unblock it (like my patch will) then that
can help alleviate memory pressure.


-Doug

^ permalink raw reply

* Re: dm: Avoid sleeping while holding the dm_bufio lock
From: Mike Snitzer @ 2016-11-17 20:48 UTC (permalink / raw)
  To: Doug Anderson
  Cc: Shaohua Li, Dmitry Torokhov, linux-kernel@vger.kernel.org,
	linux-raid, dm-devel, Alasdair Kergon, David Rientjes, Sonny Rao,
	Guenter Roeck
In-Reply-To: <CAD=FV=VD9fyFFjGLi2ptyZTibNL70Wh7Fk43WCFgGX4cC6R1qA@mail.gmail.com>

On Thu, Nov 17 2016 at  3:44pm -0500,
Doug Anderson <dianders@chromium.org> wrote:

> Hi,
> 
> On Thu, Nov 17, 2016 at 12:28 PM, Mike Snitzer <snitzer@redhat.com> wrote:
> > On Thu, Nov 17 2016 at  2:24pm -0500,
> > Douglas Anderson <dianders@chromium.org> wrote:
> >
> >> We've seen in-field reports showing _lots_ (18 in one case, 41 in
> >> another) of tasks all sitting there blocked on:
> >>
> >>   mutex_lock+0x4c/0x68
> >>   dm_bufio_shrink_count+0x38/0x78
> >>   shrink_slab.part.54.constprop.65+0x100/0x464
> >>   shrink_zone+0xa8/0x198
> >>
> >> In the two cases analyzed, we see one task that looks like this:
> >>
> >>   Workqueue: kverityd verity_prefetch_io
> >>
> >>   __switch_to+0x9c/0xa8
> >>   __schedule+0x440/0x6d8
> >>   schedule+0x94/0xb4
> >>   schedule_timeout+0x204/0x27c
> >>   schedule_timeout_uninterruptible+0x44/0x50
> >>   wait_iff_congested+0x9c/0x1f0
> >>   shrink_inactive_list+0x3a0/0x4cc
> >>   shrink_lruvec+0x418/0x5cc
> >>   shrink_zone+0x88/0x198
> >>   try_to_free_pages+0x51c/0x588
> >>   __alloc_pages_nodemask+0x648/0xa88
> >>   __get_free_pages+0x34/0x7c
> >>   alloc_buffer+0xa4/0x144
> >>   __bufio_new+0x84/0x278
> >>   dm_bufio_prefetch+0x9c/0x154
> >>   verity_prefetch_io+0xe8/0x10c
> >>   process_one_work+0x240/0x424
> >>   worker_thread+0x2fc/0x424
> >>   kthread+0x10c/0x114
> >>
> >> ...and that looks to be the one holding the mutex.
> >>
> >> The problem has been reproduced on fairly easily:
> >> 0. Be running Chrome OS w/ verity enabled on the root filesystem
> >> 1. Pick test patch: http://crosreview.com/412360
> >> 2. Install launchBalloons.sh and balloon.arm from
> >>      http://crbug.com/468342
> >>    ...that's just a memory stress test app.
> >> 3. On a 4GB rk3399 machine, run
> >>      nice ./launchBalloons.sh 4 900 100000
> >>    ...that tries to eat 4 * 900 MB of memory and keep accessing.
> >> 4. Login to the Chrome web browser and restore many tabs
> >>
> >> With that, I've seen printouts like:
> >>   DOUG: long bufio 90758 ms
> >> ...and stack trace always show's we're in dm_bufio_prefetch().
> >>
> >> The problem is that we try to allocate memory with GFP_NOIO while
> >> we're holding the dm_bufio lock.  Instead we should be using
> >> GFP_NOWAIT.  Using GFP_NOIO can cause us to sleep while holding the
> >> lock and that causes the above problems.
> >>
> >> The current behavior explained by David Rientjes:
> >>
> >>   It will still try reclaim initially because __GFP_WAIT (or
> >>   __GFP_KSWAPD_RECLAIM) is set by GFP_NOIO.  This is the cause of
> >>   contention on dm_bufio_lock() that the thread holds.  You want to
> >>   pass GFP_NOWAIT instead of GFP_NOIO to alloc_buffer() when holding a
> >>   mutex that can be contended by a concurrent slab shrinker (if
> >>   count_objects didn't use a trylock, this pattern would trivially
> >>   deadlock).
> >>
> >> Suggested-by: David Rientjes <rientjes@google.com>
> >> Signed-off-by: Douglas Anderson <dianders@chromium.org>
> >> ---
> >> Note that this change was developed and tested against the Chrome OS
> >> 4.4 kernel tree, not mainline.  Due to slight differences in verity
> >> between mainline and Chrome OS it became too difficult to reproduce my
> >> testing setup on mainline.  This patch still seems correct and
> >> relevant to upstream, so I'm posting it.  If this is not acceptible to
> >> you then please ignore this patch.
> >>
> >> Also note that when I tested the Chrome OS 3.14 kernel tree I couldn't
> >> reproduce the long delays described in the patch.  Presumably
> >> something changed in either the kernel config or the memory management
> >> code between the two kernel versions that made this crop up.  In a
> >> similar vein, it is possible that problems described in this patch are
> >> no longer reproducible upstream.  However, the arguments made in this
> >> patch (that we don't want to block while holding the mutex) still
> >> apply so I think the patch may still have merit.
> >>
> >>  drivers/md/dm-bufio.c | 6 ++++--
> >>  1 file changed, 4 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
> >> index b3ba142e59a4..3c767399cc59 100644
> >> --- a/drivers/md/dm-bufio.c
> >> +++ b/drivers/md/dm-bufio.c
> >> @@ -827,7 +827,8 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
> >>        * dm-bufio is resistant to allocation failures (it just keeps
> >>        * one buffer reserved in cases all the allocations fail).
> >>        * So set flags to not try too hard:
> >> -      *      GFP_NOIO: don't recurse into the I/O layer
> >> +      *      GFP_NOWAIT: don't wait; if we need to sleep we'll release our
> >> +      *                  mutex and wait ourselves.
> >>        *      __GFP_NORETRY: don't retry and rather return failure
> >>        *      __GFP_NOMEMALLOC: don't use emergency reserves
> >>        *      __GFP_NOWARN: don't print a warning in case of failure
> >> @@ -837,7 +838,8 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
> >>        */
> >>       while (1) {
> >>               if (dm_bufio_cache_size_latch != 1) {
> >> -                     b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
> >> +                     b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY |
> >> +                                      __GFP_NOMEMALLOC | __GFP_NOWARN);
> >>                       if (b)
> >>                               return b;
> >>               }
> >> --
> >> 2.8.0.rc3.226.g39d4020
> >>
> >
> > I have one report of a very low-memory system hitting issues with bufio
> > (in the context of DM-thinp, due to bufio shrinker) but nothing
> > implicating alloc_buffer().
> >
> > In any case, I'm fine with your patch given that we'll just retry.  BUT
> > spinning in __alloc_buffer_wait_no_callback() doesn't really change the
> > fact that you're starved for memory.  It just makes this less visible
> > right?  Meaning that you won't see hung task timeouts?  Or were you
> > seeing these tasks manifest this back-pressure through other means?
> 
> It actually significantly increases responsiveness of the system while
> in this state, so it makes a real difference.  I believe it actually
> changes behavior because it (at least) unblocks kswapd.  In the bug
> report I analyzed, I saw:
> 
>    kswapd0         D ffffffc000204fd8     0    72      2 0x00000000
>    Call trace:
>    [<ffffffc000204fd8>] __switch_to+0x9c/0xa8
>    [<ffffffc00090b794>] __schedule+0x440/0x6d8
>    [<ffffffc00090bac0>] schedule+0x94/0xb4
>    [<ffffffc00090be44>] schedule_preempt_disabled+0x28/0x44
>    [<ffffffc00090d900>] __mutex_lock_slowpath+0x120/0x1ac
>    [<ffffffc00090d9d8>] mutex_lock+0x4c/0x68
>    [<ffffffc000708e7c>] dm_bufio_shrink_count+0x38/0x78
>    [<ffffffc00030b268>] shrink_slab.part.54.constprop.65+0x100/0x464
>    [<ffffffc00030dbd8>] shrink_zone+0xa8/0x198
>    [<ffffffc00030e578>] balance_pgdat+0x328/0x508
>    [<ffffffc00030eb7c>] kswapd+0x424/0x51c
>    [<ffffffc00023f06c>] kthread+0x10c/0x114
>    [<ffffffc000203dd0>] ret_from_fork+0x10/0x40
> 
> I'm not an expert, but I believe that blocking swapd isn't a super
> great idea and that if we unblock it (like my patch will) then that
> can help alleviate memory pressure.

OK, thanks for clarifying.  I'll get it staged for 4.10 this week.

^ permalink raw reply

* [PATCH] RFC: dm: avoid the mutex lock in dm_bufio_shrink_count()
From: Douglas Anderson @ 2016-11-17 21:16 UTC (permalink / raw)
  To: Alasdair Kergon, Mike Snitzer
  Cc: David Rientjes, Dmitry Torokhov, Sonny Rao, linux,
	Douglas Anderson, dm-devel, shli, linux-raid, linux-kernel

As reported in my recent patch ("dm: Avoid sleeping while holding the
dm_bufio lock"), we've seen in-field reports of lots of tasks sitting
blocked on:

  mutex_lock+0x4c/0x68
  dm_bufio_shrink_count+0x38/0x78
  shrink_slab.part.54.constprop.65+0x100/0x464
  shrink_zone+0xa8/0x198

Analysis of dm_bufio_shrink_count() shows that it's not much work to
avoid grabbing the mutex there.  Presumably if we can avoid grabbing
this mutex then other tasks (including those handling swap) may
sometimes return faster.

Note that it's likely that this won't be a huge savings since we'll
just need to grab the mutex if dm_bufio_shrink_scan() is called, but
it still seems like it would be a sane tradeoff.

This does slightly change the behavior of dm_bufio_shrink_count().  If
anyone was relying on dm_bufio_shrink_count() to return the total
count _after_ some in-progress operation finished then they'll no
longer get that behavior.  It seems unlikely anyone would be relying
on this behavior, though, because:
- Someone would have to be certain that the operation was already in
  progress _before_ dm_bufio_shrink_count() was called.  If the
  operation wasn't already in progress then we'd get the lock first.
- There aren't exactly lots of long-running operations where the
  dm_bufio lock is held the whole time.  Most functions using the
  dm_bufio lock grab and drop it almost constantly.  That means that
  getting the dm_bufio doesn't mean that any in-progress dm_bufio
  transactions are all done.
Maybe the above argument would be obvious to someone wise in the ways
of dm_bufio but it's a useful argument to make for those like me who
are trying to make a small fix without full comprehension of all of
dm_bufio's finer details.

Signed-off-by: Douglas Anderson <dianders@chromium.org>
---
It's a bit unclear if this is actually useful and I don't have any
test cases showing the benefit, but posting it in case someone else
has good test cases or thinks it's a clear win.

Same caveat that this development was done on Chrome OS Kernel 4.4

 drivers/md/dm-bufio.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index b3ba142e59a4..885ba5482d9f 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -89,6 +89,7 @@ struct dm_bufio_client {
 
 	struct list_head lru[LIST_SIZE];
 	unsigned long n_buffers[LIST_SIZE];
+	unsigned long n_all_buffers;
 
 	struct block_device *bdev;
 	unsigned block_size;
@@ -485,6 +486,7 @@ static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
 	struct dm_bufio_client *c = b->c;
 
 	c->n_buffers[dirty]++;
+	c->n_all_buffers++;
 	b->block = block;
 	b->list_mode = dirty;
 	list_add(&b->lru_list, &c->lru[dirty]);
@@ -502,6 +504,7 @@ static void __unlink_buffer(struct dm_buffer *b)
 	BUG_ON(!c->n_buffers[b->list_mode]);
 
 	c->n_buffers[b->list_mode]--;
+	c->n_all_buffers--;
 	__remove(b->c, b);
 	list_del(&b->lru_list);
 }
@@ -515,6 +518,7 @@ static void __relink_lru(struct dm_buffer *b, int dirty)
 
 	BUG_ON(!c->n_buffers[b->list_mode]);
 
+	/* NOTE: don't update n_all_buffers: -1 + 1 = 0 */
 	c->n_buffers[b->list_mode]--;
 	c->n_buffers[dirty]++;
 	b->list_mode = dirty;
@@ -1588,17 +1592,10 @@ static unsigned long
 dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct dm_bufio_client *c;
-	unsigned long count;
 
 	c = container_of(shrink, struct dm_bufio_client, shrinker);
-	if (sc->gfp_mask & __GFP_FS)
-		dm_bufio_lock(c);
-	else if (!dm_bufio_trylock(c))
-		return 0;
 
-	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
-	dm_bufio_unlock(c);
-	return count;
+	return c->n_all_buffers;
 }
 
 /*
-- 
2.8.0.rc3.226.g39d4020


^ permalink raw reply related

* Re: [PATCH] dm: Avoid sleeping while holding the dm_bufio lock
From: Guenter Roeck @ 2016-11-17 21:56 UTC (permalink / raw)
  To: Douglas Anderson
  Cc: Alasdair Kergon, Mike Snitzer, David Rientjes, Dmitry Torokhov,
	Sonny Rao, dm-devel, shli, linux-raid, linux-kernel
In-Reply-To: <1479410660-31408-1-git-send-email-dianders@chromium.org>

On Thu, Nov 17, 2016 at 11:24:20AM -0800, Douglas Anderson wrote:
> We've seen in-field reports showing _lots_ (18 in one case, 41 in
> another) of tasks all sitting there blocked on:
> 
>   mutex_lock+0x4c/0x68
>   dm_bufio_shrink_count+0x38/0x78
>   shrink_slab.part.54.constprop.65+0x100/0x464
>   shrink_zone+0xa8/0x198
> 
> In the two cases analyzed, we see one task that looks like this:
> 
>   Workqueue: kverityd verity_prefetch_io
> 
>   __switch_to+0x9c/0xa8
>   __schedule+0x440/0x6d8
>   schedule+0x94/0xb4
>   schedule_timeout+0x204/0x27c
>   schedule_timeout_uninterruptible+0x44/0x50
>   wait_iff_congested+0x9c/0x1f0
>   shrink_inactive_list+0x3a0/0x4cc
>   shrink_lruvec+0x418/0x5cc
>   shrink_zone+0x88/0x198
>   try_to_free_pages+0x51c/0x588
>   __alloc_pages_nodemask+0x648/0xa88
>   __get_free_pages+0x34/0x7c
>   alloc_buffer+0xa4/0x144
>   __bufio_new+0x84/0x278
>   dm_bufio_prefetch+0x9c/0x154
>   verity_prefetch_io+0xe8/0x10c
>   process_one_work+0x240/0x424
>   worker_thread+0x2fc/0x424
>   kthread+0x10c/0x114
> 
> ...and that looks to be the one holding the mutex.
> 
> The problem has been reproduced on fairly easily:
> 0. Be running Chrome OS w/ verity enabled on the root filesystem
> 1. Pick test patch: http://crosreview.com/412360
> 2. Install launchBalloons.sh and balloon.arm from
>      http://crbug.com/468342
>    ...that's just a memory stress test app.
> 3. On a 4GB rk3399 machine, run
>      nice ./launchBalloons.sh 4 900 100000
>    ...that tries to eat 4 * 900 MB of memory and keep accessing.
> 4. Login to the Chrome web browser and restore many tabs
> 
> With that, I've seen printouts like:
>   DOUG: long bufio 90758 ms
> ...and stack trace always show's we're in dm_bufio_prefetch().
> 
> The problem is that we try to allocate memory with GFP_NOIO while
> we're holding the dm_bufio lock.  Instead we should be using
> GFP_NOWAIT.  Using GFP_NOIO can cause us to sleep while holding the
> lock and that causes the above problems.
> 
> The current behavior explained by David Rientjes:
> 
>   It will still try reclaim initially because __GFP_WAIT (or
>   __GFP_KSWAPD_RECLAIM) is set by GFP_NOIO.  This is the cause of
>   contention on dm_bufio_lock() that the thread holds.  You want to
>   pass GFP_NOWAIT instead of GFP_NOIO to alloc_buffer() when holding a
>   mutex that can be contended by a concurrent slab shrinker (if
>   count_objects didn't use a trylock, this pattern would trivially
>   deadlock).
> 
> Suggested-by: David Rientjes <rientjes@google.com>
> Signed-off-by: Douglas Anderson <dianders@chromium.org>

Reviewed-by: Guenter Roeck <linux@roeck-us.net>

> ---
> Note that this change was developed and tested against the Chrome OS
> 4.4 kernel tree, not mainline.  Due to slight differences in verity
> between mainline and Chrome OS it became too difficult to reproduce my
> testing setup on mainline.  This patch still seems correct and
> relevant to upstream, so I'm posting it.  If this is not acceptible to
> you then please ignore this patch.
> 
> Also note that when I tested the Chrome OS 3.14 kernel tree I couldn't
> reproduce the long delays described in the patch.  Presumably
> something changed in either the kernel config or the memory management
> code between the two kernel versions that made this crop up.  In a
> similar vein, it is possible that problems described in this patch are
> no longer reproducible upstream.  However, the arguments made in this
> patch (that we don't want to block while holding the mutex) still
> apply so I think the patch may still have merit.
> 
>  drivers/md/dm-bufio.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
> index b3ba142e59a4..3c767399cc59 100644
> --- a/drivers/md/dm-bufio.c
> +++ b/drivers/md/dm-bufio.c
> @@ -827,7 +827,8 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
>  	 * dm-bufio is resistant to allocation failures (it just keeps
>  	 * one buffer reserved in cases all the allocations fail).
>  	 * So set flags to not try too hard:
> -	 *	GFP_NOIO: don't recurse into the I/O layer
> +	 *	GFP_NOWAIT: don't wait; if we need to sleep we'll release our
> +	 *		    mutex and wait ourselves.
>  	 *	__GFP_NORETRY: don't retry and rather return failure
>  	 *	__GFP_NOMEMALLOC: don't use emergency reserves
>  	 *	__GFP_NOWARN: don't print a warning in case of failure
> @@ -837,7 +838,8 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
>  	 */
>  	while (1) {
>  		if (dm_bufio_cache_size_latch != 1) {
> -			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
> +			b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY |
> +					 __GFP_NOMEMALLOC | __GFP_NOWARN);
>  			if (b)
>  				return b;
>  		}
> -- 
> 2.8.0.rc3.226.g39d4020
> 

^ permalink raw reply

* Re:
From: Wols Lists @ 2016-11-17 22:12 UTC (permalink / raw)
  To: Dennis Dataopslag, linux-raid
In-Reply-To: <CAGELipKk-J3yD7aoc0=w-JgMHaW6OGWhtmhjehGVV4kQ0a3g_Q@mail.gmail.com>

On 17/11/16 20:33, Dennis Dataopslag wrote:
> CHeers for the reaction and sorry for my late response, I've been out
> for business.
> 
> Trying to rebuild this RAID is definately worth it for me. The
> learning experience alone already makes it worth.
> 
> I did read the wiki page and tried several steps that are on there but
> it didn't seem to get me out of trouble.
> 
> I used this information from the drive, obviously didn't search for
> any "hidden" settings:
> " Magic : a92b4efc
>         Version : 1.2
>     Feature Map : 0x0
>      Array UUID : 36fdeb4b:c5360009:0958ad1e:17da451b
>            Name : TRD106:0  (local to host TRD106)
>   Creation Time : Fri Oct 10 12:27:27 2014
>      Raid Level : raid5
>    Raid Devices : 4
> 
>  Avail Dev Size : 1948250112 (929.00 GiB 997.50 GB)
>      Array Size : 5844750336 (2786.99 GiB 2992.51 GB)
>     Data Offset : 2048 sectors
>    Super Offset : 8 sectors
>           State : clean
>     Device UUID : b49e2752:d37dac6c:8764c52a:372277bd
> 
>     Update Time : Sat Nov  5 14:40:33 2016
>        Checksum : d47a9ad4 - correct
>          Events : 14934
> 
>          Layout : left-symmetric
>      Chunk Size : 64K
> 
>    Device Role : Active device 0
>    Array State : AAAA ('A' == active, '.' == missing)"
> 
> Anybody that can give me a little extra push?
> 
Others will be able to help better than me, but you might want to look
for the thread "RAID10 with 2 drives auto-assembled as RAID1".

This will give you some information about how to run hexdump and find
where your filesystems are on the array.

There's plenty of other threads with this sort of information, but this
will give you a starting point. If Phil Turmel sees this, he'll chime in
with better detail.

Cheers,
Wol


^ permalink raw reply

* Re: [PATCH] RFC: dm: avoid the mutex lock in dm_bufio_shrink_count()
From: David Rientjes @ 2016-11-17 22:20 UTC (permalink / raw)
  To: Douglas Anderson
  Cc: Alasdair Kergon, Mike Snitzer, Dmitry Torokhov, Sonny Rao, linux,
	dm-devel, shli, linux-raid, linux-kernel
In-Reply-To: <1479417398-2593-1-git-send-email-dianders@chromium.org>

On Thu, 17 Nov 2016, Douglas Anderson wrote:

> diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
> index b3ba142e59a4..885ba5482d9f 100644
> --- a/drivers/md/dm-bufio.c
> +++ b/drivers/md/dm-bufio.c
> @@ -89,6 +89,7 @@ struct dm_bufio_client {
>  
>  	struct list_head lru[LIST_SIZE];
>  	unsigned long n_buffers[LIST_SIZE];
> +	unsigned long n_all_buffers;
>  
>  	struct block_device *bdev;
>  	unsigned block_size;
> @@ -485,6 +486,7 @@ static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
>  	struct dm_bufio_client *c = b->c;
>  
>  	c->n_buffers[dirty]++;
> +	c->n_all_buffers++;
>  	b->block = block;
>  	b->list_mode = dirty;
>  	list_add(&b->lru_list, &c->lru[dirty]);
> @@ -502,6 +504,7 @@ static void __unlink_buffer(struct dm_buffer *b)
>  	BUG_ON(!c->n_buffers[b->list_mode]);
>  
>  	c->n_buffers[b->list_mode]--;
> +	c->n_all_buffers--;
>  	__remove(b->c, b);
>  	list_del(&b->lru_list);
>  }
> @@ -515,6 +518,7 @@ static void __relink_lru(struct dm_buffer *b, int dirty)
>  
>  	BUG_ON(!c->n_buffers[b->list_mode]);
>  
> +	/* NOTE: don't update n_all_buffers: -1 + 1 = 0 */
>  	c->n_buffers[b->list_mode]--;
>  	c->n_buffers[dirty]++;
>  	b->list_mode = dirty;
> @@ -1588,17 +1592,10 @@ static unsigned long
>  dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
>  {
>  	struct dm_bufio_client *c;
> -	unsigned long count;
>  
>  	c = container_of(shrink, struct dm_bufio_client, shrinker);
> -	if (sc->gfp_mask & __GFP_FS)
> -		dm_bufio_lock(c);
> -	else if (!dm_bufio_trylock(c))
> -		return 0;
>  
> -	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
> -	dm_bufio_unlock(c);
> -	return count;
> +	return c->n_all_buffers;
>  }
>  
>  /*

Would be better to just avoid taking the mutex at all and returning 
c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] with a comment that 
the estimate might be wrong, but the actual count may vary between 
->count_objects() and ->scan_objects() anyway, so we don't actually care?

^ permalink raw reply

* Re: Newly-created arrays don't auto-assemble - related to hostname change?
From: NeilBrown @ 2016-11-17 22:43 UTC (permalink / raw)
  To: Andy Smith, linux-raid
In-Reply-To: <20161117150954.GH21587@bitfolk.com>

[-- Attachment #1: Type: text/plain, Size: 2678 bytes --]

On Fri, Nov 18 2016, Andy Smith wrote:

> Hi Neil,
>
> On Thu, Nov 17, 2016 at 05:09:28PM +1100, NeilBrown wrote:
>> On Thu, Nov 17 2016, Andy Smith wrote:
>> > After install the name of the server was changed from "tbd" to
>> > "jfd". Another array was then created (md5), added to
>> > /etc/mdadm/mdadm.conf and the initramfs was rebuilt
>> > (update-initramfs -u).
>> >
>> > md5 does not auto-assemble. It can be started fine after boot with:
>> >
>> >     # mdadm --assemble /dev/md5
>> >
>> > or:
>> >
>> >     # mdadm --incremental /dev/sdc
>> >     # mdadm --incremental /dev/sdd
>> 
>> This is almost exactly what udev does when the devices are discovered,
>> so if it works here, it should work when udev does it.
>
> Indeed. So confusing. :(
>
>> My only guess is that maybe the "DEVICE /dev/sd*" line in the mdadm.conf
>> is causing confusion.  udev might be using a different name, though that
>> would be odd.
>> 
>> Can you try removing that line and see if it makes a difference?
>
> I've now tried that and it hasn't made a difference.
>
> I don't know anything about udev but I guess that assembly is
> handled by /lib/udev/rules.d/64-md-raid-assembly.rules whose only
> relevant ACTION lines are:
>
> # remember you can limit what gets auto/incrementally assembled by
> # mdadm.conf(5)'s 'AUTO' and selectively whitelist using 'ARRAY'
> ACTION=="add|change", IMPORT{program}="/sbin/mdadm --incremental --export $tempnode --offroot ${DEVLINKS}"
> ACTION=="add|change", ENV{MD_STARTED}=="*unsafe*", ENV{MD_FOREIGN}=="no", ENV{SYSTEMD_WANTS}+="mdadm-last-resort@$env{MD_DEVICE}.timer"
>
> …but I can't work out why they wouldn't be working here.
>
> Time for a Debian bug report?

Maybe, though as they are using *exactly* the upstream mdadm-udev files
it probably isn't something they've broken.
Something you could try, after boot and while the arrays are still not
assembled, is

  echo change > /sys/block/sdc/uevent
  echo change > /sys/block/sdd/uevent

That should cause udev to assemble the array.
If you had "udevadm monitor" running at the same time, you would even
see the events.

Alternately you could use "udevadm trigger" instead of the "echo"
commands. That effectively sends 'change' to all devices.

If that doesn't work, the looking over the udev logs, and possibly
turning on extra udev logging, might lead to an answer.

If it does work, then we need to work out why it doesn't work earlier in
boot.
/etc/init.d/udev runs "udevadm trigger --action=add" which should pick
up anything that the initrd missed.  Maybe adding some tracing around
that would help.

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]

^ permalink raw reply

* Re: Just a thought - linux raid wiki - raid 5 grows getting stuck at 0%
From: NeilBrown @ 2016-11-17 22:54 UTC (permalink / raw)
  To: Wols Lists; +Cc: linux-raid
In-Reply-To: <582D74F0.6080107@youngman.org.uk>

[-- Attachment #1: Type: text/plain, Size: 1502 bytes --]

On Thu, Nov 17 2016, Wols Lists wrote:

> On 17/11/16 06:18, NeilBrown wrote:
>>> > But I'm sure you can see from this that I'm updating this bit of the
>>> > wiki. Is there anything else I ought to know about bitmaps, so I can
>>> > document it? :-)
>
>> If only I could run "diff" between my brain and the wiki....
>
> :-)
>> 
>> Thanks for doing this!
>> 
>> NeilBrown
>
> I'll update the wiki in the next day or so.
>
> I'm a bit worried I'll spread myself too thin, but I've been following
> the threads on the recent patch sets, and I've also picked up on the new
> Sphinx kernel documentation stuff.
>
> So my aim now is, on top of generally updating the wiki, to link it in
> to the kernel doc, and update a lot of that doc using re-structured text
> in the md source files. That's not going to be a problem, I hope?
>
> I'm going to dig and find out more, but if anybody can point me to some
> kerneldoc code in the md source, and the corresponding links to the doc
> on the kernel.org website, that'll be wonderful. It'll just give me a
> leg up on the hardest part - actually getting started.

There is no kerneldoc documentation in md - just a few ad-hoc large
comments.
Even Documentation/md.txt isn't in very good shape.

I would be nice to have more documentation, but documentation can so
quickly become stale....

Thanks,
NeilBrown


>
> And if there isn't any at present, it'll just tell me it's needed even
> more :-)
>
> Cheers,
> Wol

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 800 bytes --]

^ permalink raw reply

* RE: Newly-created arrays don't auto-assemble - related to hostname change?
From: Peter Sangas @ 2016-11-17 23:22 UTC (permalink / raw)
  To: 'Andy Smith', linux-raid
In-Reply-To: <20161117035230.GG21587@bitfolk.com>

Andy, Your question as prompted me to think about the following:   I'm using Ubuntu 16 and have a running system with RAID1.   If I change the hostname of the system do I need to make any changes to /etc/mdadm/mdadm.conf file and if so how do I do that?

I see the host name is listed at the end of /etc/mdadm/mdadm.conf (name=hostname:0) for example.  

Thank you,
Pete


-----Original Message-----
From: Andy Smith [mailto:andy@strugglers.net] 
Sent: Wednesday, November 16, 2016 7:53 PM
To: linux-raid@vger.kernel.org
Subject: Newly-created arrays don't auto-assemble - related to hostname change?

Hi,

I feel I am missing something very simple here, as I usually don't have this issue, but here goes…

I've got a Debian jessie host on which I created four arrays during install (md{0,1,2,3}), using the Debian installer and partman. These auto-assemble fine.

After install the name of the server was changed from "tbd" to "jfd". Another array was then created (md5), added to /etc/mdadm/mdadm.conf and the initramfs was rebuilt (update-initramfs -u).

md5 does not auto-assemble. It can be started fine after boot with:

    # mdadm --assemble /dev/md5

or:

    # mdadm --incremental /dev/sdc
    # mdadm --incremental /dev/sdd

/etc/mdadm/mdadm.conf:

    DEVICE /dev/sd*
    CREATE owner=root group=disk mode=0660 auto=yes
    HOMEHOST <ignore>
    MAILADDR root
    ARRAY /dev/md/0  metadata=1.2 UUID=400bac1d:e2c5d6ef:fea3b8c8:bcb70f8f
    ARRAY /dev/md/1  metadata=1.2 UUID=e29c8b89:705f0116:d888f77e:2b6e32f5
    ARRAY /dev/md/2  metadata=1.2 UUID=039b3427:4be5157a:6e2d53bd:fe898803
    ARRAY /dev/md/3  metadata=1.2 UUID=30f745ce:7ed41b53:4df72181:7406ea1d
    ARRAY /dev/md/5  metadata=1.2 UUID=957030cf:c09f023d:ceaebb27:e546f095

I've unpacked the initramfs and looked at the mdadm.conf in there and it is identical.

Initially HOMEHOST was set to <system> (the default), but I noticed when looking at --detail that md5 has:

           Name : jfd:5  (local to host jfd)

whereas the others have:

           Name : tbd:0

…so I changed it to <ignore> to see if that would help. It didn't.

So, I'd really appreciate any hints as to what I've missed here!

Here follows --detail and --examine of md5 and its members, then the contents of /proc/mdstat after I have manually assembled md5.

$ sudo mdadm --detail /dev/md5
/dev/md5:
        Version : 1.2
  Creation Time : Thu Nov 17 02:35:15 2016
     Raid Level : raid10
     Array Size : 1875243008 (1788.37 GiB 1920.25 GB)
  Used Dev Size : 1875243008 (1788.37 GiB 1920.25 GB)
   Raid Devices : 2
  Total Devices : 2
    Persistence : Superblock is persistent

  Intent Bitmap : Internal

    Update Time : Thu Nov 17 02:35:15 2016
          State : clean
 Active Devices : 2
Working Devices : 2
 Failed Devices : 0
  Spare Devices : 0

         Layout : far=2
     Chunk Size : 512K

           Name : jfd:5  (local to host jfd)
           UUID : 957030cf:c09f023d:ceaebb27:e546f095
         Events : 0

    Number   Major   Minor   RaidDevice State
       0       8       48        0      active sync   /dev/sdd
       1       8       32        1      active sync   /dev/sdc

$ sudo mdadm --examine /dev/sd{c,d}
/dev/sdc:
          Magic : a92b4efc
        Version : 1.2
    Feature Map : 0x1
     Array UUID : 957030cf:c09f023d:ceaebb27:e546f095
           Name : jfd:5  (local to host jfd)
  Creation Time : Thu Nov 17 02:35:15 2016
     Raid Level : raid10
   Raid Devices : 2

 Avail Dev Size : 3750486704 (1788.37 GiB 1920.25 GB)
     Array Size : 1875243008 (1788.37 GiB 1920.25 GB)
  Used Dev Size : 3750486016 (1788.37 GiB 1920.25 GB)
    Data Offset : 262144 sectors
   Super Offset : 8 sectors
   Unused Space : before=262056 sectors, after=688 sectors
          State : clean
    Device UUID : 4ac82c29:2d109465:7fff9b22:8c411c1e

Internal Bitmap : 8 sectors from superblock
    Update Time : Thu Nov 17 02:35:15 2016
  Bad Block Log : 512 entries available at offset 72 sectors
       Checksum : 96d669f1 - correct
         Events : 0

         Layout : far=2
     Chunk Size : 512K

   Device Role : Active device 1
   Array State : AA ('A' == active, '.' == missing, 'R' == replacing)
/dev/sdd:
          Magic : a92b4efc
        Version : 1.2
    Feature Map : 0x1
     Array UUID : 957030cf:c09f023d:ceaebb27:e546f095
           Name : jfd:5  (local to host jfd)
  Creation Time : Thu Nov 17 02:35:15 2016
     Raid Level : raid10
   Raid Devices : 2
 Avail Dev Size : 3750486704 (1788.37 GiB 1920.25 GB)
     Array Size : 1875243008 (1788.37 GiB 1920.25 GB)
  Used Dev Size : 3750486016 (1788.37 GiB 1920.25 GB)
    Data Offset : 262144 sectors
   Super Offset : 8 sectors
   Unused Space : before=262056 sectors, after=688 sectors
          State : clean
    Device UUID : 3a067652:6e88afae:82722342:0036bae0

Internal Bitmap : 8 sectors from superblock
    Update Time : Thu Nov 17 02:35:15 2016
  Bad Block Log : 512 entries available at offset 72 sectors
       Checksum : eb608799 - correct
         Events : 0

         Layout : far=2
     Chunk Size : 512K

   Device Role : Active device 0
   Array State : AA ('A' == active, '.' == missing, 'R' == replacing)

$ cat /proc/mdstat
Personalities : [raid1] [raid10]
md5 : active (auto-read-only) raid10 sdd[0] sdc[1]
      1875243008 blocks super 1.2 512K chunks 2 far-copies [2/2] [UU]
      bitmap: 0/14 pages [0KB], 65536KB chunk

md3 : active raid10 sda5[0] sdb5[1]
      12199936 blocks super 1.2 512K chunks 2 far-copies [2/2] [UU]
      
md2 : active (auto-read-only) raid10 sda3[0] sdb3[1]
      975872 blocks super 1.2 512K chunks 2 far-copies [2/2] [UU]
      
md1 : active raid10 sda2[0] sdb2[1]
      1951744 blocks super 1.2 512K chunks 2 far-copies [2/2] [UU]
      
md0 : active raid1 sda1[0] sdb1[1]
      498368 blocks super 1.2 [2/2] [UU]
      
unused devices: <none>

Cheers,
Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@vger.kernel.org More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply

* [PATCH v7 00/10] raid5-cache: enabling cache features
From: Song Liu @ 2016-11-17 23:24 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuang521,
	liuzhengyuan, Song Liu

These are the 7th version of patches to enable write cache part of
raid5-cache. The journal part was released with kernel 4.4.

The caching part uses same disk format of raid456 journal, and provides
acceleration to writes. Write operations are committed (bio_endio) once
the data is secured in journal. Reconstruct and RMW are postponed to
writing-out phase, which is usually not on the critical path.

The changes are organized in 10 patches (details below).

Patch for chunk_aligned_read in earlier RFC is not included yet
(http://marc.info/?l=linux-raid&m=146432700719277). But we may still need
some optimizations later, especially for SSD raid devices.

Changes between v7 and v6 (http://marc.info/?l=linux-raid&m=147881079931937):
 1. Incorprate feedbacks;
 2. Modify representation of write-back state machine to use STRIPE_R5C_CACHING
    instead of STRIPE_R5C_WRITE_OUT. This reduces state trasitions in
    write-through mode;
 3. For prexor, allocate extra orig_page instead of page;
 4. Rename and refactor of some functions and variables;
 5. Remove path 11 (handle alloc_page failure). I will propose a simpler retry
    patch later.

Song Liu (10):
  md/r5cache: Check array size in r5l_init_log
  md/r5cache: move some code to raid5.h
  md/r5cache: State machine for raid5-cache write back mode
  md/r5cache: caching phase of r5cache
  md/r5cache: write-out phase and reclaim support
  md/r5cache: sysfs entry journal_mode
  md/r5cache: refactoring journal recovery code
  md/r5cache: r5cache recovery: part 1
  md/r5cache: r5cache recovery: part 2
  md/r5cache: handle SYNC and FUA

 drivers/md/raid5-cache.c | 1798 ++++++++++++++++++++++++++++++++++++++++------
 drivers/md/raid5.c       |  301 +++++---
 drivers/md/raid5.h       |  166 ++++-
 3 files changed, 1935 insertions(+), 330 deletions(-)

--
2.9.3

^ permalink raw reply

* [PATCH v7 01/10] md/r5cache: Check array size in r5l_init_log
From: Song Liu @ 2016-11-17 23:24 UTC (permalink / raw)
  To: linux-raid
  Cc: neilb, shli, kernel-team, dan.j.williams, hch, liuzhengyuang521,
	liuzhengyuan, Song Liu
In-Reply-To: <20161117232445.1798305-1-songliubraving@fb.com>

Currently, r5l_write_stripe checks meta size for each stripe write,
which is not necessary.

With this patch, r5l_init_log checks maximal meta size of the array,
which is (r5l_meta_block + raid_disks x r5l_payload_data_parity).
If this is too big to fit in one page, r5l_init_log aborts.

With current meta data, r5l_log support raid_disks up to 203.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index f73672b..33fc850 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -441,7 +441,6 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 {
 	int write_disks = 0;
 	int data_pages, parity_pages;
-	int meta_size;
 	int reserve;
 	int i;
 	int ret = 0;
@@ -473,15 +472,6 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 	parity_pages = 1 + !!(sh->qd_idx >= 0);
 	data_pages = write_disks - parity_pages;
 
-	meta_size =
-		((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
-		 * data_pages) +
-		sizeof(struct r5l_payload_data_parity) +
-		sizeof(__le32) * parity_pages;
-	/* Doesn't work with very big raid array */
-	if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
-		return -EINVAL;
-
 	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
 	/*
 	 * The stripe must enter state machine again to finish the write, so
@@ -1197,6 +1187,22 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 
 	if (PAGE_SIZE != 4096)
 		return -EINVAL;
+
+	/*
+	 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
+	 * raid_disks r5l_payload_data_parity.
+	 *
+	 * Write journal and cache does not work for very big array
+	 * (raid_disks > 203)
+	 */
+	if (sizeof(struct r5l_meta_block) +
+	    ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
+	     conf->raid_disks) > PAGE_SIZE) {
+		pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
+		       mdname(conf->mddev), conf->raid_disks);
+		return -EINVAL;
+	}
+
 	log = kzalloc(sizeof(*log), GFP_KERNEL);
 	if (!log)
 		return -ENOMEM;
-- 
2.9.3


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox