Linux RAID subsystem development
 help / color / mirror / Atom feed
* [PATCH v5 4/7] md: add sysfs entries for PPL
From: Artur Paszkiewicz @ 2017-03-09  9:00 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20170309090003.13298-1-artur.paszkiewicz@intel.com>

Add 'consistency_policy' attribute for array. It indicates how the array
maintains consistency in case of unexpected shutdown.

Add 'ppl_sector' and 'ppl_size' for rdev, which describe the location
and size of the PPL space on the device. They can't be changed for
active members if the array is started and PPL is enabled, so in the
setter functions only basic checks are performed. More checks are done
in ppl_validate_rdev() when starting the log.

These attributes are writable to allow enabling PPL for external
metadata arrays and (later) to enable/disable PPL for a running array.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 Documentation/admin-guide/md.rst |  32 ++++++++++-
 drivers/md/md.c                  | 115 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 144 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst
index 1e61bf50595c..84de718f24a4 100644
--- a/Documentation/admin-guide/md.rst
+++ b/Documentation/admin-guide/md.rst
@@ -276,14 +276,14 @@ All md devices contain:
      array creation it will default to 0, though starting the array as
      ``clean`` will set it much larger.
 
-   new_dev
+  new_dev
      This file can be written but not read.  The value written should
      be a block device number as major:minor.  e.g. 8:0
      This will cause that device to be attached to the array, if it is
      available.  It will then appear at md/dev-XXX (depending on the
      name of the device) and further configuration is then possible.
 
-   safe_mode_delay
+  safe_mode_delay
      When an md array has seen no write requests for a certain period
      of time, it will be marked as ``clean``.  When another write
      request arrives, the array is marked as ``dirty`` before the write
@@ -292,7 +292,7 @@ All md devices contain:
      period as a number of seconds.  The default is 200msec (0.200).
      Writing a value of 0 disables safemode.
 
-   array_state
+  array_state
      This file contains a single word which describes the current
      state of the array.  In many cases, the state can be set by
      writing the word for the desired state, however some states
@@ -401,7 +401,30 @@ All md devices contain:
      once the array becomes non-degraded, and this fact has been
      recorded in the metadata.
 
+  consistency_policy
+     This indicates how the array maintains consistency in case of unexpected
+     shutdown. It can be:
 
+     none
+       Array has no redundancy information, e.g. raid0, linear.
+
+     resync
+       Full resync is performed and all redundancy is regenerated when the
+       array is started after unclean shutdown.
+
+     bitmap
+       Resync assisted by a write-intent bitmap.
+
+     journal
+       For raid4/5/6, journal device is used to log transactions and replay
+       after unclean shutdown.
+
+     ppl
+       For raid5 only, Partial Parity Log is used to close the write hole and
+       eliminate resync.
+
+     The accepted values when writing to this file are ``ppl`` and ``resync``,
+     used to enable and disable PPL.
 
 
 As component devices are added to an md array, they appear in the ``md``
@@ -563,6 +586,9 @@ Each directory contains:
 	adds bad blocks without acknowledging them. This is largely
 	for testing.
 
+      ppl_sector, ppl_size
+        Location and size (in sectors) of the space used for Partial Parity Log
+        on this device.
 
 
 An active md device will also contain an entry for each active device
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 173550455c42..1df48f365b3c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3149,6 +3149,78 @@ static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
 
+static ssize_t
+ppl_sector_show(struct md_rdev *rdev, char *page)
+{
+	return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
+}
+
+static ssize_t
+ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+	unsigned long long sector;
+
+	if (kstrtoull(buf, 10, &sector) < 0)
+		return -EINVAL;
+	if (sector != (sector_t)sector)
+		return -EINVAL;
+
+	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
+	    rdev->raid_disk >= 0)
+		return -EBUSY;
+
+	if (rdev->mddev->persistent) {
+		if (rdev->mddev->major_version == 0)
+			return -EINVAL;
+		if ((sector > rdev->sb_start &&
+		     sector - rdev->sb_start > S16_MAX) ||
+		    (sector < rdev->sb_start &&
+		     rdev->sb_start - sector > -S16_MIN))
+			return -EINVAL;
+		rdev->ppl.offset = sector - rdev->sb_start;
+	} else if (!rdev->mddev->external) {
+		return -EBUSY;
+	}
+	rdev->ppl.sector = sector;
+	return len;
+}
+
+static struct rdev_sysfs_entry rdev_ppl_sector =
+__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
+
+static ssize_t
+ppl_size_show(struct md_rdev *rdev, char *page)
+{
+	return sprintf(page, "%u\n", rdev->ppl.size);
+}
+
+static ssize_t
+ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+	unsigned int size;
+
+	if (kstrtouint(buf, 10, &size) < 0)
+		return -EINVAL;
+
+	if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
+	    rdev->raid_disk >= 0)
+		return -EBUSY;
+
+	if (rdev->mddev->persistent) {
+		if (rdev->mddev->major_version == 0)
+			return -EINVAL;
+		if (size > U16_MAX)
+			return -EINVAL;
+	} else if (!rdev->mddev->external) {
+		return -EBUSY;
+	}
+	rdev->ppl.size = size;
+	return len;
+}
+
+static struct rdev_sysfs_entry rdev_ppl_size =
+__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
+
 static struct attribute *rdev_default_attrs[] = {
 	&rdev_state.attr,
 	&rdev_errors.attr,
@@ -3159,6 +3231,8 @@ static struct attribute *rdev_default_attrs[] = {
 	&rdev_recovery_start.attr,
 	&rdev_bad_blocks.attr,
 	&rdev_unack_bad_blocks.attr,
+	&rdev_ppl_sector.attr,
+	&rdev_ppl_size.attr,
 	NULL,
 };
 static ssize_t
@@ -4895,6 +4969,46 @@ static struct md_sysfs_entry md_array_size =
 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
        array_size_store);
 
+static ssize_t
+consistency_policy_show(struct mddev *mddev, char *page)
+{
+	int ret;
+
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+		ret = sprintf(page, "journal\n");
+	} else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
+		ret = sprintf(page, "ppl\n");
+	} else if (mddev->bitmap) {
+		ret = sprintf(page, "bitmap\n");
+	} else if (mddev->pers) {
+		if (mddev->pers->sync_request)
+			ret = sprintf(page, "resync\n");
+		else
+			ret = sprintf(page, "none\n");
+	} else {
+		ret = sprintf(page, "unknown\n");
+	}
+
+	return ret;
+}
+
+static ssize_t
+consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
+{
+	if (mddev->pers) {
+		return -EBUSY;
+	} else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
+		set_bit(MD_HAS_PPL, &mddev->flags);
+		return len;
+	} else {
+		return -EINVAL;
+	}
+}
+
+static struct md_sysfs_entry md_consistency_policy =
+__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
+       consistency_policy_store);
+
 static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
 	&md_layout.attr,
@@ -4910,6 +5024,7 @@ static struct attribute *md_default_attrs[] = {
 	&md_reshape_direction.attr,
 	&md_array_size.attr,
 	&max_corr_read_errors.attr,
+	&md_consistency_policy.attr,
 	NULL,
 };
 
-- 
2.11.0


^ permalink raw reply related

* [PATCH v5 5/7] raid5-ppl: load and recover the log
From: Artur Paszkiewicz @ 2017-03-09  9:00 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20170309090003.13298-1-artur.paszkiewicz@intel.com>

Load the log from each disk when starting the array and recover if the
array is dirty.

The initial empty PPL is written by mdadm. When loading the log we
verify the header checksum and signature. For external metadata arrays
the signature is verified in userspace, so here we read it from the
header, verifying only if it matches on all disks, and use it later when
writing PPL.

In addition to the header checksum, each header entry also contains a
checksum of its partial parity data. If the header is valid, recovery is
performed for each entry until an invalid entry is found. If the array
is not degraded and recovery using PPL fully succeeds, there is no need
to resync the array because data and parity will be consistent, so in
this case resync will be disabled.

Due to compatibility with IMSM implementations on other systems, we
can't assume that the recovery data block size is always 4K. Writes
generated by MD raid5 don't have this issue, but when recovering PPL
written in other environments it is possible to have entries with
512-byte sector granularity. The recovery code takes this into account
and also the logical sector size of the underlying drives.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/raid5-ppl.c | 497 +++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c     |   5 +-
 2 files changed, 501 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 92783586743d..548d1028a3ce 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -103,6 +103,10 @@ struct ppl_conf {
 	mempool_t *io_pool;
 	struct bio_set *bs;
 	mempool_t *meta_pool;
+
+	/* used only for recovery */
+	int recovered_entries;
+	int mismatch_count;
 };
 
 struct ppl_log {
@@ -514,6 +518,482 @@ void ppl_stripe_write_finished(struct stripe_head *sh)
 		ppl_io_unit_finished(io);
 }
 
+static void ppl_xor(int size, struct page *page1, struct page *page2,
+		    struct page *page_result)
+{
+	struct async_submit_ctl submit;
+	struct dma_async_tx_descriptor *tx;
+	struct page *xor_srcs[] = { page1, page2 };
+
+	init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST,
+			  NULL, NULL, NULL, NULL);
+	tx = async_xor(page_result, xor_srcs, 0, 2, size, &submit);
+
+	async_tx_quiesce(&tx);
+}
+
+/*
+ * PPL recovery strategy: xor partial parity and data from all modified data
+ * disks within a stripe and write the result as the new stripe parity. If all
+ * stripe data disks are modified (full stripe write), no partial parity is
+ * available, so just xor the data disks.
+ *
+ * Recovery of a PPL entry shall occur only if all modified data disks are
+ * available and read from all of them succeeds.
+ *
+ * A PPL entry applies to a stripe, partial parity size for an entry is at most
+ * the size of the chunk. Examples of possible cases for a single entry:
+ *
+ * case 0: single data disk write:
+ *   data0    data1    data2     ppl        parity
+ * +--------+--------+--------+           +--------------------+
+ * | ------ | ------ | ------ | +----+    | (no change)        |
+ * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
+ * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
+ * | ------ | ------ | ------ | +----+    | (no change)        |
+ * +--------+--------+--------+           +--------------------+
+ * pp_size = data_size
+ *
+ * case 1: more than one data disk write:
+ *   data0    data1    data2     ppl        parity
+ * +--------+--------+--------+           +--------------------+
+ * | ------ | ------ | ------ | +----+    | (no change)        |
+ * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
+ * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
+ * | ------ | ------ | ------ | +----+    | (no change)        |
+ * +--------+--------+--------+           +--------------------+
+ * pp_size = data_size / modified_data_disks
+ *
+ * case 2: write to all data disks (also full stripe write):
+ *   data0    data1    data2                parity
+ * +--------+--------+--------+           +--------------------+
+ * | ------ | ------ | ------ |           | (no change)        |
+ * | -data- | -data- | -data- | --------> | xor all data       |
+ * | ------ | ------ | ------ | --------> | (no change)        |
+ * | ------ | ------ | ------ |           | (no change)        |
+ * +--------+--------+--------+           +--------------------+
+ * pp_size = 0
+ *
+ * The following cases are possible only in other implementations. The recovery
+ * code can handle them, but they are not generated at runtime because they can
+ * be reduced to cases 0, 1 and 2:
+ *
+ * case 3:
+ *   data0    data1    data2     ppl        parity
+ * +--------+--------+--------+ +----+    +--------------------+
+ * | ------ | -data- | -data- | | pp |    | data1 ^ data2 ^ pp |
+ * | ------ | -data- | -data- | | pp | -> | data1 ^ data2 ^ pp |
+ * | -data- | -data- | -data- | | -- | -> | xor all data       |
+ * | -data- | -data- | ------ | | pp |    | data0 ^ data1 ^ pp |
+ * +--------+--------+--------+ +----+    +--------------------+
+ * pp_size = chunk_size
+ *
+ * case 4:
+ *   data0    data1    data2     ppl        parity
+ * +--------+--------+--------+ +----+    +--------------------+
+ * | ------ | -data- | ------ | | pp |    | data1 ^ pp         |
+ * | ------ | ------ | ------ | | -- | -> | (no change)        |
+ * | ------ | ------ | ------ | | -- | -> | (no change)        |
+ * | -data- | ------ | ------ | | pp |    | data0 ^ pp         |
+ * +--------+--------+--------+ +----+    +--------------------+
+ * pp_size = chunk_size
+ */
+static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
+			     sector_t ppl_sector)
+{
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct mddev *mddev = ppl_conf->mddev;
+	struct r5conf *conf = mddev->private;
+	int block_size = ppl_conf->block_size;
+	struct page *page1;
+	struct page *page2;
+	sector_t r_sector_first;
+	sector_t r_sector_last;
+	int strip_sectors;
+	int data_disks;
+	int i;
+	int ret = 0;
+	char b[BDEVNAME_SIZE];
+	unsigned int pp_size = le32_to_cpu(e->pp_size);
+	unsigned int data_size = le32_to_cpu(e->data_size);
+
+	page1 = alloc_page(GFP_KERNEL);
+	page2 = alloc_page(GFP_KERNEL);
+
+	if (!page1 || !page2) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	r_sector_first = le64_to_cpu(e->data_sector) * (block_size >> 9);
+
+	if ((pp_size >> 9) < conf->chunk_sectors) {
+		if (pp_size > 0) {
+			data_disks = data_size / pp_size;
+			strip_sectors = pp_size >> 9;
+		} else {
+			data_disks = conf->raid_disks - conf->max_degraded;
+			strip_sectors = (data_size >> 9) / data_disks;
+		}
+		r_sector_last = r_sector_first +
+				(data_disks - 1) * conf->chunk_sectors +
+				strip_sectors;
+	} else {
+		data_disks = conf->raid_disks - conf->max_degraded;
+		strip_sectors = conf->chunk_sectors;
+		r_sector_last = r_sector_first + (data_size >> 9);
+	}
+
+	pr_debug("%s: array sector first: %llu last: %llu\n", __func__,
+		 (unsigned long long)r_sector_first,
+		 (unsigned long long)r_sector_last);
+
+	/* if start and end is 4k aligned, use a 4k block */
+	if (block_size == 512 &&
+	    (r_sector_first & (STRIPE_SECTORS - 1)) == 0 &&
+	    (r_sector_last & (STRIPE_SECTORS - 1)) == 0)
+		block_size = STRIPE_SIZE;
+
+	/* iterate through blocks in strip */
+	for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
+		bool update_parity = false;
+		sector_t parity_sector;
+		struct md_rdev *parity_rdev;
+		struct stripe_head sh;
+		int disk;
+		int indent = 0;
+
+		pr_debug("%s:%*s iter %d start\n", __func__, indent, "", i);
+		indent += 2;
+
+		memset(page_address(page1), 0, PAGE_SIZE);
+
+		/* iterate through data member disks */
+		for (disk = 0; disk < data_disks; disk++) {
+			int dd_idx;
+			struct md_rdev *rdev;
+			sector_t sector;
+			sector_t r_sector = r_sector_first + i +
+					    (disk * conf->chunk_sectors);
+
+			pr_debug("%s:%*s data member disk %d start\n",
+				 __func__, indent, "", disk);
+			indent += 2;
+
+			if (r_sector >= r_sector_last) {
+				pr_debug("%s:%*s array sector %llu doesn't need parity update\n",
+					 __func__, indent, "",
+					 (unsigned long long)r_sector);
+				indent -= 2;
+				continue;
+			}
+
+			update_parity = true;
+
+			/* map raid sector to member disk */
+			sector = raid5_compute_sector(conf, r_sector, 0,
+						      &dd_idx, NULL);
+			pr_debug("%s:%*s processing array sector %llu => data member disk %d, sector %llu\n",
+				 __func__, indent, "",
+				 (unsigned long long)r_sector, dd_idx,
+				 (unsigned long long)sector);
+
+			rdev = conf->disks[dd_idx].rdev;
+			if (!rdev) {
+				pr_debug("%s:%*s data member disk %d missing\n",
+					 __func__, indent, "", dd_idx);
+				update_parity = false;
+				break;
+			}
+
+			pr_debug("%s:%*s reading data member disk %s sector %llu\n",
+				 __func__, indent, "", bdevname(rdev->bdev, b),
+				 (unsigned long long)sector);
+			if (!sync_page_io(rdev, sector, block_size, page2,
+					REQ_OP_READ, 0, false)) {
+				md_error(mddev, rdev);
+				pr_debug("%s:%*s read failed!\n", __func__,
+					 indent, "");
+				ret = -EIO;
+				goto out;
+			}
+
+			ppl_xor(block_size, page1, page2, page1);
+
+			indent -= 2;
+		}
+
+		if (!update_parity)
+			continue;
+
+		if (pp_size > 0) {
+			pr_debug("%s:%*s reading pp disk sector %llu\n",
+				 __func__, indent, "",
+				 (unsigned long long)(ppl_sector + i));
+			if (!sync_page_io(log->rdev,
+					ppl_sector - log->rdev->data_offset + i,
+					block_size, page2, REQ_OP_READ, 0,
+					false)) {
+				pr_debug("%s:%*s read failed!\n", __func__,
+					 indent, "");
+				md_error(mddev, log->rdev);
+				ret = -EIO;
+				goto out;
+			}
+
+			ppl_xor(block_size, page1, page2, page1);
+		}
+
+		/* map raid sector to parity disk */
+		parity_sector = raid5_compute_sector(conf, r_sector_first + i,
+				0, &disk, &sh);
+		BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
+		parity_rdev = conf->disks[sh.pd_idx].rdev;
+
+		BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
+		pr_debug("%s:%*s write parity at sector %llu, disk %s\n",
+			 __func__, indent, "",
+			 (unsigned long long)parity_sector,
+			 bdevname(parity_rdev->bdev, b));
+		if (!sync_page_io(parity_rdev, parity_sector, block_size,
+				page1, REQ_OP_WRITE, 0, false)) {
+			pr_debug("%s:%*s parity write error!\n", __func__,
+				 indent, "");
+			md_error(mddev, parity_rdev);
+			ret = -EIO;
+			goto out;
+		}
+	}
+out:
+	if (page1)
+		__free_page(page1);
+	if (page2)
+		__free_page(page2);
+	return ret;
+}
+
+static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr)
+{
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct md_rdev *rdev = log->rdev;
+	struct mddev *mddev = rdev->mddev;
+	sector_t ppl_sector = rdev->ppl.sector + (PPL_HEADER_SIZE >> 9);
+	struct page *page;
+	int i;
+	int ret = 0;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	/* iterate through all PPL entries saved */
+	for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) {
+		struct ppl_header_entry *e = &pplhdr->entries[i];
+		u32 pp_size = le32_to_cpu(e->pp_size);
+		sector_t sector = ppl_sector;
+		int ppl_entry_sectors = pp_size >> 9;
+		u32 crc, crc_stored;
+
+		pr_debug("%s: disk: %d entry: %d ppl_sector: %llu pp_size: %u\n",
+			 __func__, rdev->raid_disk, i,
+			 (unsigned long long)ppl_sector, pp_size);
+
+		crc = ~0;
+		crc_stored = le32_to_cpu(e->checksum);
+
+		/* read parial parity for this entry and calculate its checksum */
+		while (pp_size) {
+			int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size;
+
+			if (!sync_page_io(rdev, sector - rdev->data_offset,
+					s, page, REQ_OP_READ, 0, false)) {
+				md_error(mddev, rdev);
+				ret = -EIO;
+				goto out;
+			}
+
+			crc = crc32c_le(crc, page_address(page), s);
+
+			pp_size -= s;
+			sector += s >> 9;
+		}
+
+		crc = ~crc;
+
+		if (crc != crc_stored) {
+			/*
+			 * Don't recover this entry if the checksum does not
+			 * match, but keep going and try to recover other
+			 * entries.
+			 */
+			pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n",
+				 __func__, crc_stored, crc);
+			ppl_conf->mismatch_count++;
+		} else {
+			ret = ppl_recover_entry(log, e, ppl_sector);
+			if (ret)
+				goto out;
+			ppl_conf->recovered_entries++;
+		}
+
+		ppl_sector += ppl_entry_sectors;
+	}
+
+	/* flush the disk cache after recovery if necessary */
+	if (test_bit(QUEUE_FLAG_WC, &bdev_get_queue(rdev->bdev)->queue_flags)) {
+		struct bio *bio = bio_alloc_bioset(GFP_KERNEL, 0, ppl_conf->bs);
+
+		bio->bi_bdev = rdev->bdev;
+		bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+		ret = submit_bio_wait(bio);
+		bio_put(bio);
+	}
+out:
+	__free_page(page);
+	return ret;
+}
+
+static int ppl_write_empty_header(struct ppl_log *log)
+{
+	struct page *page;
+	struct ppl_header *pplhdr;
+	struct md_rdev *rdev = log->rdev;
+	int ret = 0;
+
+	pr_debug("%s: disk: %d ppl_sector: %llu\n", __func__,
+		 rdev->raid_disk, (unsigned long long)rdev->ppl.sector);
+
+	page = alloc_page(GFP_NOIO | __GFP_ZERO);
+	if (!page)
+		return -ENOMEM;
+
+	pplhdr = page_address(page);
+	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
+	pplhdr->signature = cpu_to_le32(log->ppl_conf->signature);
+	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));
+
+	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
+			  PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_FUA, 0,
+			  false)) {
+		md_error(rdev->mddev, rdev);
+		ret = -EIO;
+	}
+
+	__free_page(page);
+	return ret;
+}
+
+static int ppl_load_distributed(struct ppl_log *log)
+{
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct md_rdev *rdev = log->rdev;
+	struct mddev *mddev = rdev->mddev;
+	struct page *page;
+	struct ppl_header *pplhdr;
+	u32 crc, crc_stored;
+	u32 signature;
+	int ret = 0;
+
+	pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk);
+
+	/* read PPL header */
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
+			  PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
+		md_error(mddev, rdev);
+		ret = -EIO;
+		goto out;
+	}
+	pplhdr = page_address(page);
+
+	/* check header validity */
+	crc_stored = le32_to_cpu(pplhdr->checksum);
+	pplhdr->checksum = 0;
+	crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE);
+
+	if (crc_stored != crc) {
+		pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x\n",
+			 __func__, crc_stored, crc);
+		ppl_conf->mismatch_count++;
+		goto out;
+	}
+
+	signature = le32_to_cpu(pplhdr->signature);
+
+	if (mddev->external) {
+		/*
+		 * For external metadata the header signature is set and
+		 * validated in userspace.
+		 */
+		ppl_conf->signature = signature;
+	} else if (ppl_conf->signature != signature) {
+		pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x\n",
+			 __func__, signature, ppl_conf->signature);
+		ppl_conf->mismatch_count++;
+		goto out;
+	}
+
+	/* attempt to recover from log if we are starting a dirty array */
+	if (!mddev->pers && mddev->recovery_cp != MaxSector)
+		ret = ppl_recover(log, pplhdr);
+out:
+	/* write empty header if we are starting the array */
+	if (!ret && !mddev->pers)
+		ret = ppl_write_empty_header(log);
+
+	__free_page(page);
+
+	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
+		 __func__, ret, ppl_conf->mismatch_count,
+		 ppl_conf->recovered_entries);
+	return ret;
+}
+
+static int ppl_load(struct ppl_conf *ppl_conf)
+{
+	int ret = 0;
+	u32 signature = 0;
+	bool signature_set = false;
+	int i;
+
+	for (i = 0; i < ppl_conf->count; i++) {
+		struct ppl_log *log = &ppl_conf->child_logs[i];
+
+		/* skip missing drive */
+		if (!log->rdev)
+			continue;
+
+		ret = ppl_load_distributed(log);
+		if (ret)
+			break;
+
+		/*
+		 * For external metadata we can't check if the signature is
+		 * correct on a single drive, but we can check if it is the same
+		 * on all drives.
+		 */
+		if (ppl_conf->mddev->external) {
+			if (!signature_set) {
+				signature = ppl_conf->signature;
+				signature_set = true;
+			} else if (signature != ppl_conf->signature) {
+				pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n",
+					mdname(ppl_conf->mddev));
+				ret = -EINVAL;
+				break;
+			}
+		}
+	}
+
+	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
+		 __func__, ret, ppl_conf->mismatch_count,
+		 ppl_conf->recovered_entries);
+	return ret;
+}
+
 static void __ppl_exit_log(struct ppl_conf *ppl_conf)
 {
 	clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);
@@ -694,6 +1174,23 @@ int ppl_init_log(struct r5conf *conf)
 		pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
 			mdname(mddev));
 
+	/* load and possibly recover the logs from the member disks */
+	ret = ppl_load(ppl_conf);
+
+	if (ret) {
+		goto err;
+	} else if (!mddev->pers &&
+		   mddev->recovery_cp == 0 && !mddev->degraded &&
+		   ppl_conf->recovered_entries > 0 &&
+		   ppl_conf->mismatch_count == 0) {
+		/*
+		 * If we are starting a dirty array and the recovery succeeds
+		 * without any issues, set the array as clean.
+		 */
+		mddev->recovery_cp = MaxSector;
+		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+	}
+
 	conf->log_private = ppl_conf;
 
 	return 0;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4ca8e555ae5c..f032af4d0421 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7236,7 +7236,10 @@ static int raid5_run(struct mddev *mddev)
 
 	if (mddev->degraded > dirty_parity_disks &&
 	    mddev->recovery_cp != MaxSector) {
-		if (mddev->ok_start_degraded)
+		if (test_bit(MD_HAS_PPL, &mddev->flags))
+			pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
+				mdname(mddev));
+		else if (mddev->ok_start_degraded)
 			pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
 				mdname(mddev));
 		else {
-- 
2.11.0


^ permalink raw reply related

* [PATCH v5 6/7] raid5-ppl: support disk hot add/remove with PPL
From: Artur Paszkiewicz @ 2017-03-09  9:00 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20170309090003.13298-1-artur.paszkiewicz@intel.com>

Add a function to modify the log by removing an rdev when a drive fails
or adding when a spare/replacement is activated as a raid member.

Removing a disk just clears the child log rdev pointer. No new stripes
will be accepted for this child log in ppl_write_stripe() and running io
units will be processed without writing PPL to the device.

Adding a disk sets the child log rdev pointer and writes an empty PPL
header.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/raid5-log.h |  9 +++++++++
 drivers/md/raid5-ppl.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 drivers/md/raid5.c     | 12 +++++++++++-
 3 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index a67fb58513b9..4f5a0f4e0b1f 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -39,6 +39,7 @@ extern void ppl_exit_log(struct r5conf *conf);
 extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
 extern void ppl_write_stripe_run(struct r5conf *conf);
 extern void ppl_stripe_write_finished(struct stripe_head *sh);
+extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
 
 static inline bool raid5_has_ppl(struct r5conf *conf)
 {
@@ -102,4 +103,12 @@ static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev)
 	return 0;
 }
 
+static inline int log_modify(struct r5conf *conf, struct md_rdev *rdev, bool add)
+{
+	if (raid5_has_ppl(conf))
+		return ppl_modify_log(conf, rdev, add);
+
+	return 0;
+}
+
 #endif
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 548d1028a3ce..b5fcf55edee9 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -400,6 +400,13 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
 	struct stripe_head *sh;
 	int i;
 
+	bio->bi_private = io;
+
+	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
+		ppl_log_endio(bio);
+		return;
+	}
+
 	for (i = 0; i < io->entries_count; i++) {
 		struct ppl_header_entry *e = &pplhdr->entries[i];
 
@@ -415,7 +422,6 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
 	pplhdr->entries_count = cpu_to_le32(io->entries_count);
 	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));
 
-	bio->bi_private = io;
 	bio->bi_end_io = ppl_log_endio;
 	bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
 	bio->bi_bdev = log->rdev->bdev;
@@ -1198,3 +1204,40 @@ int ppl_init_log(struct r5conf *conf)
 	__ppl_exit_log(ppl_conf);
 	return ret;
 }
+
+int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add)
+{
+	struct ppl_conf *ppl_conf = conf->log_private;
+	struct ppl_log *log;
+	int ret = 0;
+	char b[BDEVNAME_SIZE];
+
+	if (!rdev)
+		return -EINVAL;
+
+	pr_debug("%s: disk: %d operation: %s dev: %s\n",
+		 __func__, rdev->raid_disk, add ? "add" : "remove",
+		 bdevname(rdev->bdev, b));
+
+	if (rdev->raid_disk < 0)
+		return 0;
+
+	if (rdev->raid_disk >= ppl_conf->count)
+		return -ENODEV;
+
+	log = &ppl_conf->child_logs[rdev->raid_disk];
+
+	mutex_lock(&log->io_mutex);
+	if (add) {
+		ret = ppl_validate_rdev(rdev);
+		if (!ret) {
+			log->rdev = rdev;
+			ret = ppl_write_empty_header(log);
+		}
+	} else {
+		log->rdev = NULL;
+	}
+	mutex_unlock(&log->io_mutex);
+
+	return ret;
+}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f032af4d0421..8b8f172d6807 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7527,6 +7527,11 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			*rdevp = rdev;
 		}
 	}
+	if (!err) {
+		err = log_modify(conf, rdev, false);
+		if (err)
+			goto abort;
+	}
 	if (p->replacement) {
 		/* We must have just cleared 'rdev' */
 		p->rdev = p->replacement;
@@ -7536,6 +7541,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			   */
 		p->replacement = NULL;
 		clear_bit(WantReplacement, &rdev->flags);
+
+		if (!err)
+			err = log_modify(conf, p->rdev, true);
 	} else
 		/* We might have just removed the Replacement as faulty-
 		 * clear the bit just in case
@@ -7592,10 +7600,12 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		if (p->rdev == NULL) {
 			clear_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = disk;
-			err = 0;
 			if (rdev->saved_raid_disk != disk)
 				conf->fullsync = 1;
 			rcu_assign_pointer(p->rdev, rdev);
+
+			err = log_modify(conf, rdev, true);
+
 			goto out;
 		}
 	}
-- 
2.11.0


^ permalink raw reply related

* [PATCH v5 7/7] raid5-ppl: runtime PPL enabling or disabling
From: Artur Paszkiewicz @ 2017-03-09  9:00 UTC (permalink / raw)
  To: shli; +Cc: linux-raid, Artur Paszkiewicz
In-Reply-To: <20170309090003.13298-1-artur.paszkiewicz@intel.com>

Allow writing to 'consistency_policy' attribute when the array is
active. Add a new function 'change_consistency_policy' to the
md_personality operations structure to handle the change in the
personality code. Values "ppl" and "resync" are accepted and
turn PPL on and off respectively.

When enabling PPL its location and size should first be set using
'ppl_sector' and 'ppl_size' attributes and a valid PPL header should be
written at this location on each member device.

Enabling or disabling PPL is performed under a suspended array.  The
raid5_reset_stripe_cache function frees the stripe cache and allocates
it again in order to allocate or free the ppl_pages for the stripes in
the stripe cache.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/md.c        | 12 +++++++++---
 drivers/md/md.h        |  2 ++
 drivers/md/raid5-ppl.c |  4 ++++
 drivers/md/raid5.c     | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1df48f365b3c..8ef2100e2788 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4995,14 +4995,20 @@ consistency_policy_show(struct mddev *mddev, char *page)
 static ssize_t
 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
 {
+	int err = 0;
+
 	if (mddev->pers) {
-		return -EBUSY;
+		if (mddev->pers->change_consistency_policy)
+			err = mddev->pers->change_consistency_policy(mddev, buf);
+		else
+			err = -EBUSY;
 	} else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
 		set_bit(MD_HAS_PPL, &mddev->flags);
-		return len;
 	} else {
-		return -EINVAL;
+		err = -EINVAL;
 	}
+
+	return err ? err : len;
 }
 
 static struct md_sysfs_entry md_consistency_policy =
diff --git a/drivers/md/md.h b/drivers/md/md.h
index a7b2f16452c4..e0940064c3ec 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -545,6 +545,8 @@ struct md_personality
 	/* congested implements bdi.congested_fn().
 	 * Will not be called while array is 'suspended' */
 	int (*congested)(struct mddev *mddev, int bits);
+	/* Changes the consistency policy of an active array. */
+	int (*change_consistency_policy)(struct mddev *mddev, const char *buf);
 };
 
 struct md_sysfs_entry {
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index b5fcf55edee9..ca1ef644351a 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -1195,6 +1195,10 @@ int ppl_init_log(struct r5conf *conf)
 		 */
 		mddev->recovery_cp = MaxSector;
 		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+	} else if (mddev->pers && ppl_conf->mismatch_count > 0) {
+		/* no mismatch allowed when enabling PPL for a running array */
+		ret = -EINVAL;
+		goto err;
 	}
 
 	conf->log_private = ppl_conf;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8b8f172d6807..7a6a6184e2a7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -8213,6 +8213,58 @@ static void *raid6_takeover(struct mddev *mddev)
 	return setup_conf(mddev);
 }
 
+static void raid5_reset_stripe_cache(struct mddev *mddev)
+{
+	struct r5conf *conf = mddev->private;
+
+	mutex_lock(&conf->cache_size_mutex);
+	while (conf->max_nr_stripes &&
+	       drop_one_stripe(conf))
+		;
+	while (conf->min_nr_stripes > conf->max_nr_stripes &&
+	       grow_one_stripe(conf, GFP_KERNEL))
+		;
+	mutex_unlock(&conf->cache_size_mutex);
+}
+
+static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
+{
+	struct r5conf *conf;
+	int err;
+
+	err = mddev_lock(mddev);
+	if (err)
+		return err;
+	conf = mddev->private;
+	if (!conf) {
+		mddev_unlock(mddev);
+		return -ENODEV;
+	}
+
+	if (strncmp(buf, "ppl", 3) == 0 && !raid5_has_ppl(conf)) {
+		mddev_suspend(mddev);
+		set_bit(MD_HAS_PPL, &mddev->flags);
+		err = log_init(conf, NULL);
+		if (!err)
+			raid5_reset_stripe_cache(mddev);
+		mddev_resume(mddev);
+	} else if (strncmp(buf, "resync", 6) == 0 && raid5_has_ppl(conf)) {
+		mddev_suspend(mddev);
+		log_exit(conf);
+		raid5_reset_stripe_cache(mddev);
+		mddev_resume(mddev);
+	} else {
+		err = -EINVAL;
+	}
+
+	if (!err)
+		md_update_sb(mddev, 1);
+
+	mddev_unlock(mddev);
+
+	return err;
+}
+
 static struct md_personality raid6_personality =
 {
 	.name		= "raid6",
@@ -8258,6 +8310,7 @@ static struct md_personality raid5_personality =
 	.quiesce	= raid5_quiesce,
 	.takeover	= raid5_takeover,
 	.congested	= raid5_congested,
+	.change_consistency_policy = raid5_change_consistency_policy,
 };
 
 static struct md_personality raid4_personality =
-- 
2.11.0


^ permalink raw reply related

* Re: Auto replace disk
From: Gandalf Corvotempesta @ 2017-03-09  9:07 UTC (permalink / raw)
  To: Brad Campbell; +Cc: Wols Lists, linux-raid
In-Reply-To: <87713f17-e078-5d90-07b4-fc1dbb963496@fnarfbargle.com>

2017-03-09 2:31 GMT+01:00 Brad Campbell <lists2009@fnarfbargle.com>:
> In general a good number of "help me my RAID is dead" requests that hit this
> list are due to not performing routine array or drive scrubs. So one drive
> dies, and one of the others has a previously unknown bad sector. When you
> put the new drive in, during the rebuild the bad sector is hit and the whole
> array comes tumbling down.
>
> Doing a proactive replacement reduces the possibility of this occurring.
> Having said that, if your disk is dead then there's no other option anyway.
> Regular array scrubs go a long way to mitigating this risk, but it does
> happen frequently enough that you need to be warned against it.

OK I misunderstood
You are right, scrubs are needed (and at least on debian, are
scheduled automatically) and I'm scrubbing my d
arrays monthly (don't know if it's enough) in addition to weekly smart long test

I was referring to just removing a disks
I thought that removing a disk was a bad idea and that MD wasn't able
to support that properly bringing the whole raid down

Also, I always use 3way mirrors or raid6.
I never use a single redundancy raid.

^ permalink raw reply

* Re: interesting case of a hung 'recovery'
From: Jack Wang @ 2017-03-09  9:13 UTC (permalink / raw)
  To: Eyal Lebedinsky; +Cc: linux-raid@vger.kernel.org
In-Reply-To: <61a297da-de29-e471-7270-166b33ce18b9@eyal.emu.id.au>

2017-03-09 8:39 GMT+01:00 Eyal Lebedinsky <eyal@eyal.emu.id.au>:
> Bump.
>
> On 18/02/17 23:14, Eyal Lebedinsky wrote:
>>
>> I should start by saying that this is an old fedora 19 system
>>
>> Executive summary: after '--add'ing a new member a 'recovery' starts but
>> 'sync_max' is not reset.
>>
>> $ uname -a
>> Linux e7.eyal.emu.id.au 3.14.27-100.fc19.x86_64 #1 SMP Wed Dec 17 19:36:34
>> UTC 2014 x86_64 x86_64 x86_64 GNU/Linux
>
>
> $ sudo mdadm --version
> mdadm - v4.0 - 2017-01-09
>
>> so the issue may have been fixed since.
>>
>> I had a disk fail in a raid6. After some 'pending' sectors were logged I
>> decided to do a 'check'
>> around that location (set sync_min/max and echo 'check'). Sure enough it
>> elicited disk errors,
>> but the disk did not recover and it was kicked out of the array. Moreover
>> it became unresponsive.
>> It needed a power cycle so I shutdown and rebooted the machine.
>>
>> Not one to give up easily I tried the check again, with the same result.
>> It was time to '--remove' this array member. I then '--add'ed a new disk
>> which started a recovery.
>>
>> A few hours later I noticed that it slowed down. A lot. It actually did
>> not progress at all for
>> a few hours (I was away from the machine).
>>
>> As I was staring at the screen (for a long while) I realised that it
>> stopped at 55.5%, and this
>> number is exactly where the original 'check' failed (I still do not
>> understand why with my bad
>> memory I remembered this number).
>>
>> I checked 'sync_completed' and it was proper.
>> I then examined 'sync_max' and it was wrong - it had the location where
>> the very early 'check'
>> failed earlier in the day. It was the same sector where it is now paused
>> at - looks related.
>>
>> I decided to take a (small) risk and do
>>     # echo 'max' >/sys/block/md127/md/sync_max
>> at which point the recovery moved on. It should be finished in about 5
>> hours.
>>
>> I do not think that it is correct for 'sync_max' to not be set to 'max'
>> when a new member is
>> added - it surely requires a full recovery.
>>
>> I really hope (and expect) that this was actually fixed, but this note may
>> help others facing
>> same predicament.
>>
>> cheers
>>
>
> --
> Eyal Lebedinsky (eyal@eyal.emu.id.au)

You'd better offer attach much detailed information, then people can help.

eg:
https://raid.wiki.kernel.org/index.php/Asking_for_help

For the problem you reported, better offer also kernel dmesg, output
of blocking tasks via "echo w >  /proc/sysrq-trigger" maybe also
"echo t > /proc/sysrq-trigger"

Cheers,
Jack

^ permalink raw reply

* RE: [PATCH 22/29] drivers, scsi: convert iscsi_task.refcount from atomic_t to refcount_t
From: Reshetova, Elena @ 2017-03-09  9:26 UTC (permalink / raw)
  To: Johannes Thumshirn, Chris Leech
  Cc: peterz@infradead.org, linux-pci@vger.kernel.org,
	target-devel@vger.kernel.org,
	linux1394-devel@lists.sourceforge.net, devel@driverdev.osuosl.org,
	linux-s390@vger.kernel.org, linux-scsi@vger.kernel.org,
	linux-serial@vger.kernel.org, fcoe-devel@open-fcoe.org,
	xen-devel@lists.xenproject.org, open-iscsi@googlegroups.com,
	linux-media@vger.kernel.org, Kees Cook,
	linux-raid@vger.kernel.org, linux-bcache@vger.kernel.org
In-Reply-To: <5a1f7860-b650-9fe7-fafb-1f0c7cae00e7@suse.de>


> On 03/09/2017 08:18 AM, Reshetova, Elena wrote:
> >> On Mon, Mar 06, 2017 at 04:21:09PM +0200, Elena Reshetova wrote:
> >>> refcount_t type and corresponding API should be
> >>> used instead of atomic_t when the variable is used as
> >>> a reference counter. This allows to avoid accidental
> >>> refcounter overflows that might lead to use-after-free
> >>> situations.
> >>>
> >>> Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
> >>> Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
> >>> Signed-off-by: Kees Cook <keescook@chromium.org>
> >>> Signed-off-by: David Windsor <dwindsor@gmail.com>
> >>
> >> This looks OK to me.
> >>
> >> Acked-by: Chris Leech <cleech@redhat.com>
> >
> > Thank you for review! Do you have a tree that can take this change?
> 
> Hi Elena,
> 
> iscsi like fcoe should go via the SCSI tree.

Thanks Johannes! Should I resend with "Acked-by" added in order for it to be picked up? 

Best Regards,
Elena.


> 
> Byte,
> 	Johannes
> 
> --
> Johannes Thumshirn                                          Storage
> jthumshirn@suse.de                                +49 911 74053 689
> SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
> GF: Felix Imendörffer, Jane Smithard, Graham Norton
> HRB 21284 (AG Nürnberg)
> Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

^ permalink raw reply

* Re: [PATCH 22/29] drivers, scsi: convert iscsi_task.refcount from atomic_t to refcount_t
From: Johannes Thumshirn @ 2017-03-09  9:32 UTC (permalink / raw)
  To: Reshetova, Elena, Chris Leech
  Cc: gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r@public.gmane.org,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	xen-devel-GuqFBffKawtpuQazS67q72D2FQJk+8+b@public.gmane.org,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux1394-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org,
	linux-bcache-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-raid-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-media-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	devel-tBiZLqfeLfOHmIFyCCdPziST3g8Odh+X@public.gmane.org,
	linux-pci-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-s390-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	fcoe-devel-s9riP+hp16TNLxjTenLetw@public.gmane.org,
	linux-scsi-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	open-iscsi-/JYPxA39Uh5TLH3MbocFFw@public.gmane.org,
	devel-gWbeCf7V1WCQmaza687I9g
In-Reply-To: <2236FBA76BA1254E88B949DDB74E612B41C56ABF-kPTMFJFq+rFP9JyJpTNKArfspsVTdybXVpNB7YpNyf8@public.gmane.org>

On 03/09/2017 10:26 AM, Reshetova, Elena wrote:
> 
>> On 03/09/2017 08:18 AM, Reshetova, Elena wrote:
>>>> On Mon, Mar 06, 2017 at 04:21:09PM +0200, Elena Reshetova wrote:
>>>>> refcount_t type and corresponding API should be
>>>>> used instead of atomic_t when the variable is used as
>>>>> a reference counter. This allows to avoid accidental
>>>>> refcounter overflows that might lead to use-after-free
>>>>> situations.
>>>>>
>>>>> Signed-off-by: Elena Reshetova <elena.reshetova-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
>>>>> Signed-off-by: Hans Liljestrand <ishkamiel-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>>>>> Signed-off-by: Kees Cook <keescook-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
>>>>> Signed-off-by: David Windsor <dwindsor-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
>>>>
>>>> This looks OK to me.
>>>>
>>>> Acked-by: Chris Leech <cleech-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
>>>
>>> Thank you for review! Do you have a tree that can take this change?
>>
>> Hi Elena,
>>
>> iscsi like fcoe should go via the SCSI tree.
> 
> Thanks Johannes! Should I resend with "Acked-by" added in order for it to be picked up? 

Yes I think this would be a good way to go.

Byte,
	Johannes
-- 
Johannes Thumshirn                                          Storage
jthumshirn-l3A5Bk7waGM@public.gmane.org                                +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

-- 
You received this message because you are subscribed to the Google Groups "open-iscsi" group.
To unsubscribe from this group and stop receiving emails from it, send an email to open-iscsi+unsubscribe-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
To post to this group, send email to open-iscsi-/JYPxA39Uh5TLH3MbocFF+G/Ez6ZCGd0@public.gmane.org
Visit this group at https://groups.google.com/group/open-iscsi.
For more options, visit https://groups.google.com/d/optout.

^ permalink raw reply

* Re: interesting case of a hung 'recovery'
From: Eyal Lebedinsky @ 2017-03-09 11:00 UTC (permalink / raw)
  To: linux-raid@vger.kernel.org
In-Reply-To: <CA+res+SYTaCd77Nvdvn-gSe2qcJu5nXMdnV32nL0MKs9NOzqYg@mail.gmail.com>

On 09/03/17 20:13, Jack Wang wrote:
> 2017-03-09 8:39 GMT+01:00 Eyal Lebedinsky <eyal@eyal.emu.id.au>:
>> Bump.
>>
>> On 18/02/17 23:14, Eyal Lebedinsky wrote:
>>>
>>> I should start by saying that this is an old fedora 19 system
>>>
>>> Executive summary: after '--add'ing a new member a 'recovery' starts but
>>> 'sync_max' is not reset.
>>>
>>> $ uname -a
>>> Linux e7.eyal.emu.id.au 3.14.27-100.fc19.x86_64 #1 SMP Wed Dec 17 19:36:34
>>> UTC 2014 x86_64 x86_64 x86_64 GNU/Linux
>>
>>
>> $ sudo mdadm --version
>> mdadm - v4.0 - 2017-01-09
>>
>>> so the issue may have been fixed since.
>>>
>>> I had a disk fail in a raid6. After some 'pending' sectors were logged I
>>> decided to do a 'check'
>>> around that location (set sync_min/max and echo 'check'). Sure enough it
>>> elicited disk errors,
>>> but the disk did not recover and it was kicked out of the array. Moreover
>>> it became unresponsive.
>>> It needed a power cycle so I shutdown and rebooted the machine.
>>>
>>> Not one to give up easily I tried the check again, with the same result.
>>> It was time to '--remove' this array member. I then '--add'ed a new disk
>>> which started a recovery.
>>>
>>> A few hours later I noticed that it slowed down. A lot. It actually did
>>> not progress at all for
>>> a few hours (I was away from the machine).
>>>
>>> As I was staring at the screen (for a long while) I realised that it
>>> stopped at 55.5%, and this
>>> number is exactly where the original 'check' failed (I still do not
>>> understand why with my bad
>>> memory I remembered this number).
>>>
>>> I checked 'sync_completed' and it was proper.
>>> I then examined 'sync_max' and it was wrong - it had the location where
>>> the very early 'check'
>>> failed earlier in the day. It was the same sector where it is now paused
>>> at - looks related.
>>>
>>> I decided to take a (small) risk and do
>>>     # echo 'max' >/sys/block/md127/md/sync_max
>>> at which point the recovery moved on. It should be finished in about 5
>>> hours.
>>>
>>> I do not think that it is correct for 'sync_max' to not be set to 'max'
>>> when a new member is
>>> added - it surely requires a full recovery.
>>>
>>> I really hope (and expect) that this was actually fixed, but this note may
>>> help others facing
>>> same predicament.
>>>
>>> cheers
>>>
>>
>> --
>> Eyal Lebedinsky (eyal@eyal.emu.id.au)
>
> You'd better offer attach much detailed information, then people can help.

I know, but this is a live system and I cannot experiment with it.

There were no unusual messages (see below) at all. The 'recovery' simply stopped. This is
the usual behaviour when one sets sync_min/max and starts a (for example) 'check'.
As I described above, I found that after the --add 'sync_max' was NOT set to 'max' and
not to the full RAID size but was left at the value where the earlier 'check' failed.
It failed when the disk completely disappeared.

I guess the original 'check' was left pending, and when a replacement disk was added
to the RAID it was automatically recovered (as it should) but the original 'check'
somehow left something behind that should have been reset for the recovery.

> eg:
> https://raid.wiki.kernel.org/index.php/Asking_for_help
>
> For the problem you reported, better offer also kernel dmesg, output
> of blocking tasks via "echo w >  /proc/sysrq-trigger" maybe also
> "echo t > /proc/sysrq-trigger"

There was no blocked task, just that the recovery stopped progressing when it
reached sync_max, as it should. The problem is that sync_max had the wrong value,
the full (newly added) disk was supposed to be synced.

> Cheers,

In case it helps, here is all the info I have of the event:

### I have a script that checks the RAID in a small region. I had an earlier error in
### on the disk and to check it my script effectively did:
	echo 4336657408 >sys/block/md127/md/sync_min
	echo 4339803136 >sys/block/md127/md/sync_max
	echo check      >sys/block/md127/md/sync_action
### messages:
Feb 18 13:46:31 e7 kernel: [  976.688691] md: data-check of RAID array md127
Feb 18 13:46:31 e7 kernel: [  976.693254] md: minimum _guaranteed_  speed: 150000 KB/sec/disk.
Feb 18 13:46:31 e7 kernel: [  976.699479] md: using maximum available idle IO bandwidth (but not more than 200000 KB/sec) for data-check.
Feb 18 13:46:31 e7 kernel: [  976.709420] md: using 128k window, over a total of 3906885120k.
Feb 18 13:46:31 e7 kernel: [  976.715457] md: resuming data-check of md127 from checkpoint.

### the failure of the 'check'
... many i/o errors then sdf completely disappeared ... errors at sectors 4337414{000,040,168}
Feb 18 13:47:08 e7 kernel: [ 1014.334781] md: super_written gets error=-5, uptodate=0
Feb 18 13:47:08 e7 kernel: [ 1014.340024] md/raid:md127: Disk failure on sdf1, disabling device.
Feb 18 13:47:08 e7 kernel: [ 1014.340024] md/raid:md127: Operation continuing on 6 devices.
Feb 18 13:47:08 e7 kernel: [ 1014.417307] md: md127: data-check interrupted.

### a new disk (sdj) is plugged in and partitioned.
$ sudo mdadm /dev/md127 --add /dev/sdj1
$ cat /proc/mdstat
Personalities : [raid6] [raid5] [raid4]
md127 : active raid6 sdj1[11] sdf1[7](F) sdi1[8] sde1[9] sdh1[12] sdc1[0] sdg1[13] sdd1[10]
       19534425600 blocks super 1.2 level 6, 512k chunk, algorithm 2 [7/6] [UUU_UUU]
       [>....................]  recovery =  0.7% (29805572/3906885120) finish=509.2min speed=126880K/sec
       bitmap: 7/30 pages [28KB], 65536KB chunk

### messages:
Feb 18 14:23:10 e7 kernel: [ 3177.183250] md: bind<sdj1>
Feb 18 14:23:10 e7 kernel: [ 3177.255529] md: recovery of RAID array md127
Feb 18 14:23:10 e7 kernel: [ 3177.259894] md: minimum _guaranteed_  speed: 150000 KB/sec/disk.
Feb 18 14:23:10 e7 kernel: [ 3177.265994] md: using maximum available idle IO bandwidth (but not more than 200000 KB/sec) for recovery.
Feb 18 14:23:10 e7 kernel: [ 3177.275736] md: using 128k window, over a total of 3906885120k.

### the point when the recovery paused:
2017-02-18 20:02:48        [===========>.........]  recovery = 55.4% (2166229192/3906885120) finish=372.8min speed=77803K/sec
2017-02-18 20:02:58        [===========>.........]  recovery = 55.4% (2167083344/3906885120) finish=366.2min speed=79159K/sec
2017-02-18 20:03:08        [===========>.........]  recovery = 55.4% (2167819876/3906885120) finish=374.8min speed=77316K/sec
2017-02-18 20:03:18        [===========>.........]  recovery = 55.5% (2168520428/3906885120) finish=375.4min speed=77157K/sec
2017-02-18 20:03:28        [===========>.........]  recovery = 55.5% (2168590848/3906885120) finish=489.4min speed=59194K/sec
2017-02-18 20:03:38        [===========>.........]  recovery = 55.5% (2168590848/3906885120) finish=608.7min speed=47588K/sec
2017-02-18 20:03:48        [===========>.........]  recovery = 55.5% (2168590848/3906885120) finish=728.1min speed=39786K/sec
2017-02-18 20:03:58        [===========>.........]  recovery = 55.5% (2168590848/3906885120) finish=847.5min speed=34182K/sec
...
2017-02-18 22:36:44 0       [===========>.........]  recovery = 55.5% (2168590848/3906885120) finish=110261.8min speed=262K/sec
2017-02-18 22:36:54 0       [===========>.........]  recovery = 55.5% (2168590848/3906885120) finish=110381.2min speed=262K/sec
2017-02-18 22:37:04 0       [===========>.........]  recovery = 55.5% (2168590848/3906885120) finish=110500.6min speed=262K/sec
2017-02-18 22:37:14 0       [===========>.........]  recovery = 55.5% (2168590848/3906885120) finish=110619.9min speed=261K/sec

# echo 'max' >/sys/block/md127/md/sync_max
### recovery now moves on:
2017-02-18 22:37:24 0       [===========>.........]  recovery = 55.5% (2168938500/3906885120) finish=117500.2min speed=246K/sec
2017-02-18 22:37:34 0       [===========>.........]  recovery = 55.5% (2169997568/3906885120) finish=105201.7min speed=275K/sec
2017-02-18 22:37:44 0       [===========>.........]  recovery = 55.5% (2171066120/3906885120) finish=90962.0min speed=318K/sec
2017-02-18 22:37:54 0       [===========>.........]  recovery = 55.5% (2172125192/3906885120) finish=269.9min speed=107101K/sec
2017-02-18 22:38:04 0       [===========>.........]  recovery = 55.6% (2173114372/3906885120) finish=272.1min speed=106165K/sec
2017-02-18 22:38:14 0       [===========>.........]  recovery = 55.6% (2174004224/3906885120) finish=287.3min speed=100492K/sec

### and it completed over six hours later:
Feb 19 04:49:16 e7 kernel: [55167.633100] md: md127: recovery done.

cheers

-- 
Eyal Lebedinsky (eyal@eyal.emu.id.au)

^ permalink raw reply

* Re: [PATCH 08/29] drivers, md: convert mddev.active from atomic_t to refcount_t
From: Shaohua Li @ 2017-03-09 17:11 UTC (permalink / raw)
  To: gregkh@linuxfoundation.org; +Cc: Reshetova, Elena, linux-raid@vger.kernel.org
In-Reply-To: <20170308101914.GB15198@kroah.com>

On Wed, Mar 08, 2017 at 11:19:14AM +0100, gregkh@linuxfoundation.org wrote:
> On Wed, Mar 08, 2017 at 09:42:09AM +0000, Reshetova, Elena wrote:
> > > On Mon, Mar 06, 2017 at 04:20:55PM +0200, Elena Reshetova wrote:
> > > > refcount_t type and corresponding API should be
> > > > used instead of atomic_t when the variable is used as
> > > > a reference counter. This allows to avoid accidental
> > > > refcounter overflows that might lead to use-after-free
> > > > situations.
> > > 
> > > Looks good. Let me know how do you want to route the patch to upstream.
> > 
> > Greg, you previously mentioned that driver's conversions can go via your tree. Does this still apply?
> > Or should I be asking maintainers to merge these patches via their trees? 
> 
> You should ask them to take them through their trees, if they have them.
> I'll be glad to scoop up all of the remaining ones that get missed, or
> for subsystems that do not have trees.

ok, applied, thanks!

^ permalink raw reply

* Re: [PATCH 10/29] drivers, md: convert stripe_head.count from atomic_t to refcount_t
From: Shaohua Li @ 2017-03-09 17:18 UTC (permalink / raw)
  To: Reshetova, Elena
  Cc: gregkh@linuxfoundation.org, linux-raid@vger.kernel.org,
	Hans Liljestrand, Kees Cook, David Windsor
In-Reply-To: <2236FBA76BA1254E88B949DDB74E612B41C56050@IRSMSX102.ger.corp.intel.com>

On Wed, Mar 08, 2017 at 09:39:30AM +0000, Reshetova, Elena wrote:
> > On Mon, Mar 06, 2017 at 04:20:57PM +0200, Elena Reshetova wrote:
> > > refcount_t type and corresponding API should be
> > > used instead of atomic_t when the variable is used as
> > > a reference counter. This allows to avoid accidental
> > > refcounter overflows that might lead to use-after-free
> > > situations.
> > >
> > > Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
> > > Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
> > > Signed-off-by: Kees Cook <keescook@chromium.org>
> > > Signed-off-by: David Windsor <dwindsor@gmail.com>
> > > ---
> > >  drivers/md/raid5-cache.c |  8 +++---
> > >  drivers/md/raid5.c       | 66 ++++++++++++++++++++++++------------------------
> > >  drivers/md/raid5.h       |  3 ++-
> > >  3 files changed, 39 insertions(+), 38 deletions(-)
> > >
> > > diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> > > index 3f307be..6c05e12 100644
> > > --- a/drivers/md/raid5-cache.c
> > > +++ b/drivers/md/raid5-cache.c
> > 
> > snip
> > >  	       sh->check_state, sh->reconstruct_state);
> > >
> > >  	analyse_stripe(sh, &s);
> > > @@ -4924,7 +4924,7 @@ static void activate_bit_delay(struct r5conf *conf,
> > >  		struct stripe_head *sh = list_entry(head.next, struct
> > stripe_head, lru);
> > >  		int hash;
> > >  		list_del_init(&sh->lru);
> > > -		atomic_inc(&sh->count);
> > > +		refcount_inc(&sh->count);
> > >  		hash = sh->hash_lock_index;
> > >  		__release_stripe(conf, sh,
> > &temp_inactive_list[hash]);
> > >  	}
> > > @@ -5240,7 +5240,7 @@ static struct stripe_head *__get_priority_stripe(struct
> > r5conf *conf, int group)
> > >  		sh->group = NULL;
> > >  	}
> > >  	list_del_init(&sh->lru);
> > > -	BUG_ON(atomic_inc_return(&sh->count) != 1);
> > > +	BUG_ON(refcount_inc_not_zero(&sh->count));
> > 
> > This changes the behavior. refcount_inc_not_zero doesn't inc if original value is 0
> 
> Hm.. So, you want to inc here in any case and BUG if the end result differs from 1. 
> So essentially you want to only increment here from zero to one under normal conditions... This is a challenge for refcount_t and against the design.
> Is it ok just to maybe do this here:
> 
> -	BUG_ON(atomic_inc_return(&sh->count) != 1);
> +	BUG_ON(refcount_read(&sh->count) != 0);
> +	refcount_set((&sh->count, 1);

this looks ok


> Do we have an issue with locking in this case? Or maybe it is then better to leave this one to be atomic_t without protection since it isn't a real refcounter as it turns out. 

There is no lock issue, the count should be 0 in the list. It's a refcounter actually, so good to do the convert.

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH V3] md: move bitmap_destroy before __md_stop
From: Shaohua Li @ 2017-03-09 18:24 UTC (permalink / raw)
  To: Guoqing Jiang; +Cc: linux-raid, neilb, shli
In-Reply-To: <1488940292-8555-1-git-send-email-gqjiang@suse.com>

On Wed, Mar 08, 2017 at 10:31:32AM +0800, Guoqing Jiang wrote:
> Since we have switched to sync way to handle METADATA_UPDATED
> msg for md-cluster, then process_metadata_update is depended
> on mddev->thread->wqueue.
> 
> With the new change, clustered raid could possible hang if
> array received a METADATA_UPDATED msg after array unregistered
> mddev->thread, so we need to stop clustered raid (bitmap_destroy
> 	 -> bitmap_free -> md_cluster_stop) earlier than unregister
> thread (mddev_detach -> md_unregister_thread).
> 
> And this change should be safe for non-clustered raid since
> all writes are stopped before the destroy. Also in md_run,
> we activate the personality (pers->run()) before activating
> the bitmap (bitmap_create()). So it is pleasingly symmetric
> to stop the bitmap (bitmap_destroy()) before stopping the
> personality (__md_stop() calls pers->free()).
> 
> But we don't want to break the codes for waiting behind IO as
> Shaohua mentioned, so move those codes from mddev_detach to
> bitmap_destroy. Since we already check bitmap at the beginning
> of bitmap_destroy, just wait for behind_writes to be zero if
> it existed.
> 
> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> ---
> This version move waiting behind IO codes into bitmap_destroy
> so we can safely call bitmap_destroy before __md_stop now.
> 
>  drivers/md/bitmap.c |  9 +++++++++
>  drivers/md/md.c     | 13 ++-----------
>  2 files changed, 11 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
> index b6fa55a3cff8..89a35bc092dd 100644
> --- a/drivers/md/bitmap.c
> +++ b/drivers/md/bitmap.c
> @@ -1771,6 +1771,15 @@ void bitmap_destroy(struct mddev *mddev)
>  	if (!bitmap) /* there was no bitmap */
>  		return;
>  
> +	/* wait for behind writes to complete */
> +	if (atomic_read(&bitmap->behind_writes) > 0) {
> +		printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n",
> +		       mdname(mddev));
> +		/* need to kick something here to make sure I/O goes? */
> +		wait_event(bitmap->behind_wait,
> +			   atomic_read(&bitmap->behind_writes) == 0);
> +	}
> +
>  	mutex_lock(&mddev->bitmap_info.mutex);
>  	spin_lock(&mddev->lock);
>  	mddev->bitmap = NULL; /* disconnect from the md device */
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 79a99a1c9ce7..b63ab4f33892 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -5534,15 +5534,6 @@ EXPORT_SYMBOL_GPL(md_stop_writes);
>  
>  static void mddev_detach(struct mddev *mddev)
>  {
> -	struct bitmap *bitmap = mddev->bitmap;
> -	/* wait for behind writes to complete */
> -	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
> -		pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
> -			 mdname(mddev));
> -		/* need to kick something here to make sure I/O goes? */
> -		wait_event(bitmap->behind_wait,
> -			   atomic_read(&bitmap->behind_writes) == 0);
> -	}

I think it's ok to add this part into bitmap_destroy, as we need to call
bitmap_destroy before mddev_detach. Look at the usage of mddev_detach, at in
one place (level_store()), we wait for the IO without bitmap_destroy. I think
we should keep this part code in mddev_detach. Maybe create a small function,
let both mddev_detach and bitmap_destroy call it.

>  	if (mddev->pers && mddev->pers->quiesce) {
>  		mddev->pers->quiesce(mddev, 1);
>  		mddev->pers->quiesce(mddev, 0);
> @@ -5574,8 +5565,8 @@ void md_stop(struct mddev *mddev)
>  	/* stop the array and free an attached data structures.
>  	 * This is called from dm-raid
>  	 */
> -	__md_stop(mddev);
>  	bitmap_destroy(mddev);
> +	__md_stop(mddev);

since we now always do bitmap_destroy and follow __md_stop, maybe move
bitmap_destroy to very begining of __md_stop

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH v3] md/r5cache: improve recovery with read ahead page pool
From: Shaohua Li @ 2017-03-09 18:48 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, shli, neilb, kernel-team, dan.j.williams, hch
In-Reply-To: <20170308004917.3902687-1-songliubraving@fb.com>

On Tue, Mar 07, 2017 at 04:49:17PM -0800, Song Liu wrote:
> In r5cache recovery, the journal device is scanned page by page.
> Currently, we use sync_page_io() to read journal device. This is
> not efficient when we have to recovery many stripes from the journal.
> 
> To improve the speed of recovery, this patch introduces a read ahead
> page pool (ra_pool) to recovery_ctx. With ra_pool, multiple consecutive
> pages are read in one IO. Then the recovery code read the journal from
> ra_pool.
> 
> With ra_pool, r5l_recovery_ctx has become much bigger. Therefore,
> r5l_recovery_log() is refactored so r5l_recovery_ctx is not using
> stack space.

applied, thanks!

> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>  drivers/md/raid5-cache.c | 223 +++++++++++++++++++++++++++++++++++++----------
>  1 file changed, 177 insertions(+), 46 deletions(-)
> 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 3f307be..0d744d5 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -30,6 +30,7 @@
>   * underneath hardware sector size. only works with PAGE_SIZE == 4096
>   */
>  #define BLOCK_SECTORS (8)
> +#define BLOCK_SECTOR_SHIFT (3)
>  
>  /*
>   * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
> @@ -1552,6 +1553,8 @@ bool r5l_log_disk_error(struct r5conf *conf)
>  	return ret;
>  }
>  
> +#define R5L_RECOVERY_PAGE_POOL_SIZE 256
> +
>  struct r5l_recovery_ctx {
>  	struct page *meta_page;		/* current meta */
>  	sector_t meta_total_blocks;	/* total size of current meta and data */
> @@ -1560,18 +1563,131 @@ struct r5l_recovery_ctx {
>  	int data_parity_stripes;	/* number of data_parity stripes */
>  	int data_only_stripes;		/* number of data_only stripes */
>  	struct list_head cached_list;
> +
> +	/*
> +	 * read ahead page pool (ra_pool)
> +	 * in recovery, log is read sequentially. It is not efficient to
> +	 * read every page with sync_page_io(). The read ahead page pool
> +	 * reads multiple pages with one IO, so further log read can
> +	 * just copy data from the pool.
> +	 */
> +	struct page *ra_pool[R5L_RECOVERY_PAGE_POOL_SIZE];
> +	sector_t pool_offset;	/* offset of first page in the pool */
> +	int total_pages;	/* total allocated pages */
> +	int valid_pages;	/* pages with valid data */
> +	struct bio *ra_bio;	/* bio to do the read ahead */
>  };
>  
> +static int r5l_recovery_allocate_ra_pool(struct r5l_log *log,
> +					    struct r5l_recovery_ctx *ctx)
> +{
> +	struct page *page;
> +
> +	ctx->ra_bio = bio_alloc_bioset(GFP_KERNEL, BIO_MAX_PAGES, log->bs);
> +	if (!ctx->ra_bio)
> +		return -ENOMEM;
> +
> +	ctx->valid_pages = 0;
> +	ctx->total_pages = 0;
> +	while (ctx->total_pages < R5L_RECOVERY_PAGE_POOL_SIZE) {
> +		page = alloc_page(GFP_KERNEL);
> +
> +		if (!page)
> +			break;
> +		ctx->ra_pool[ctx->total_pages] = page;
> +		ctx->total_pages += 1;
> +	}
> +
> +	if (ctx->total_pages == 0) {
> +		bio_put(ctx->ra_bio);
> +		return -ENOMEM;
> +	}
> +
> +	ctx->pool_offset = 0;
> +	return 0;
> +}
> +
> +static void r5l_recovery_free_ra_pool(struct r5l_log *log,
> +					struct r5l_recovery_ctx *ctx)
> +{
> +	int i;
> +
> +	for (i = 0; i < ctx->total_pages; ++i)
> +		put_page(ctx->ra_pool[i]);
> +	bio_put(ctx->ra_bio);
> +}
> +
> +/*
> + * fetch ctx->valid_pages pages from offset
> + * In normal cases, ctx->valid_pages == ctx->total_pages after the call.
> + * However, if the offset is close to the end of the journal device,
> + * ctx->valid_pages could be smaller than ctx->total_pages
> + */
> +static int r5l_recovery_fetch_ra_pool(struct r5l_log *log,
> +				      struct r5l_recovery_ctx *ctx,
> +				      sector_t offset)
> +{
> +	bio_reset(ctx->ra_bio);
> +	ctx->ra_bio->bi_bdev = log->rdev->bdev;
> +	bio_set_op_attrs(ctx->ra_bio, REQ_OP_READ, 0);
> +	ctx->ra_bio->bi_iter.bi_sector = log->rdev->data_offset + offset;
> +
> +	ctx->valid_pages = 0;
> +	ctx->pool_offset = offset;
> +
> +	while (ctx->valid_pages < ctx->total_pages) {
> +		bio_add_page(ctx->ra_bio,
> +			     ctx->ra_pool[ctx->valid_pages], PAGE_SIZE, 0);
> +		ctx->valid_pages += 1;
> +
> +		offset = r5l_ring_add(log, offset, BLOCK_SECTORS);
> +
> +		if (offset == 0)  /* reached end of the device */
> +			break;
> +	}
> +
> +	return submit_bio_wait(ctx->ra_bio);
> +}
> +
> +/*
> + * try read a page from the read ahead page pool, if the page is not in the
> + * pool, call r5l_recovery_fetch_ra_pool
> + */
> +static int r5l_recovery_read_page(struct r5l_log *log,
> +				  struct r5l_recovery_ctx *ctx,
> +				  struct page *page,
> +				  sector_t offset)
> +{
> +	int ret;
> +
> +	if (offset < ctx->pool_offset ||
> +	    offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS) {
> +		ret = r5l_recovery_fetch_ra_pool(log, ctx, offset);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	BUG_ON(offset < ctx->pool_offset ||
> +	       offset >= ctx->pool_offset + ctx->valid_pages * BLOCK_SECTORS);
> +
> +	memcpy(page_address(page),
> +	       page_address(ctx->ra_pool[(offset - ctx->pool_offset) >>
> +					 BLOCK_SECTOR_SHIFT]),
> +	       PAGE_SIZE);
> +	return 0;
> +}
> +
>  static int r5l_recovery_read_meta_block(struct r5l_log *log,
>  					struct r5l_recovery_ctx *ctx)
>  {
>  	struct page *page = ctx->meta_page;
>  	struct r5l_meta_block *mb;
>  	u32 crc, stored_crc;
> +	int ret;
>  
> -	if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
> -			  false))
> -		return -EIO;
> +	ret = r5l_recovery_read_page(log, ctx, page, ctx->pos);
> +	if (ret != 0)
> +		return ret;
>  
>  	mb = page_address(page);
>  	stored_crc = le32_to_cpu(mb->checksum);
> @@ -1653,8 +1769,7 @@ static void r5l_recovery_load_data(struct r5l_log *log,
>  	raid5_compute_sector(conf,
>  			     le64_to_cpu(payload->location), 0,
>  			     &dd_idx, sh);
> -	sync_page_io(log->rdev, log_offset, PAGE_SIZE,
> -		     sh->dev[dd_idx].page, REQ_OP_READ, 0, false);
> +	r5l_recovery_read_page(log, ctx, sh->dev[dd_idx].page, log_offset);
>  	sh->dev[dd_idx].log_checksum =
>  		le32_to_cpu(payload->checksum[0]);
>  	ctx->meta_total_blocks += BLOCK_SECTORS;
> @@ -1673,17 +1788,15 @@ static void r5l_recovery_load_parity(struct r5l_log *log,
>  	struct r5conf *conf = mddev->private;
>  
>  	ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
> -	sync_page_io(log->rdev, log_offset, PAGE_SIZE,
> -		     sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
> +	r5l_recovery_read_page(log, ctx, sh->dev[sh->pd_idx].page, log_offset);
>  	sh->dev[sh->pd_idx].log_checksum =
>  		le32_to_cpu(payload->checksum[0]);
>  	set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
>  
>  	if (sh->qd_idx >= 0) {
> -		sync_page_io(log->rdev,
> -			     r5l_ring_add(log, log_offset, BLOCK_SECTORS),
> -			     PAGE_SIZE, sh->dev[sh->qd_idx].page,
> -			     REQ_OP_READ, 0, false);
> +		r5l_recovery_read_page(
> +			log, ctx, sh->dev[sh->qd_idx].page,
> +			r5l_ring_add(log, log_offset, BLOCK_SECTORS));
>  		sh->dev[sh->qd_idx].log_checksum =
>  			le32_to_cpu(payload->checksum[1]);
>  		set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
> @@ -1814,14 +1927,15 @@ r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
>  
>  /* if matches return 0; otherwise return -EINVAL */
>  static int
> -r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page,
> +r5l_recovery_verify_data_checksum(struct r5l_log *log,
> +				  struct r5l_recovery_ctx *ctx,
> +				  struct page *page,
>  				  sector_t log_offset, __le32 log_checksum)
>  {
>  	void *addr;
>  	u32 checksum;
>  
> -	sync_page_io(log->rdev, log_offset, PAGE_SIZE,
> -		     page, REQ_OP_READ, 0, false);
> +	r5l_recovery_read_page(log, ctx, page, log_offset);
>  	addr = kmap_atomic(page);
>  	checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
>  	kunmap_atomic(addr);
> @@ -1853,17 +1967,17 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
>  
>  		if (payload->header.type == R5LOG_PAYLOAD_DATA) {
>  			if (r5l_recovery_verify_data_checksum(
> -				    log, page, log_offset,
> +				    log, ctx, page, log_offset,
>  				    payload->checksum[0]) < 0)
>  				goto mismatch;
>  		} else if (payload->header.type == R5LOG_PAYLOAD_PARITY) {
>  			if (r5l_recovery_verify_data_checksum(
> -				    log, page, log_offset,
> +				    log, ctx, page, log_offset,
>  				    payload->checksum[0]) < 0)
>  				goto mismatch;
>  			if (conf->max_degraded == 2 && /* q for RAID 6 */
>  			    r5l_recovery_verify_data_checksum(
> -				    log, page,
> +				    log, ctx, page,
>  				    r5l_ring_add(log, log_offset,
>  						 BLOCK_SECTORS),
>  				    payload->checksum[1]) < 0)
> @@ -2241,55 +2355,72 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
>  static int r5l_recovery_log(struct r5l_log *log)
>  {
>  	struct mddev *mddev = log->rdev->mddev;
> -	struct r5l_recovery_ctx ctx;
> +	struct r5l_recovery_ctx *ctx;
>  	int ret;
>  	sector_t pos;
>  
> -	ctx.pos = log->last_checkpoint;
> -	ctx.seq = log->last_cp_seq;
> -	ctx.meta_page = alloc_page(GFP_KERNEL);
> -	ctx.data_only_stripes = 0;
> -	ctx.data_parity_stripes = 0;
> -	INIT_LIST_HEAD(&ctx.cached_list);
> -
> -	if (!ctx.meta_page)
> +	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
> +	if (!ctx)
>  		return -ENOMEM;
>  
> -	ret = r5c_recovery_flush_log(log, &ctx);
> -	__free_page(ctx.meta_page);
> +	ctx->pos = log->last_checkpoint;
> +	ctx->seq = log->last_cp_seq;
> +	ctx->data_only_stripes = 0;
> +	ctx->data_parity_stripes = 0;
> +	INIT_LIST_HEAD(&ctx->cached_list);
> +	ctx->meta_page = alloc_page(GFP_KERNEL);
>  
> -	if (ret)
> -		return ret;
> +	if (!ctx->meta_page) {
> +		ret =  -ENOMEM;
> +		goto meta_page;
> +	}
> +
> +	if (r5l_recovery_allocate_ra_pool(log, ctx) != 0) {
> +		ret = -ENOMEM;
> +		goto ra_pool;
> +	}
>  
> -	pos = ctx.pos;
> -	ctx.seq += 10000;
> +	ret = r5c_recovery_flush_log(log, ctx);
>  
> +	if (ret)
> +		goto error;
>  
> -	if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
> +	pos = ctx->pos;
> +	ctx->seq += 10000;
> +
> +	if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0))
>  		pr_debug("md/raid:%s: starting from clean shutdown\n",
>  			 mdname(mddev));
>  	else
>  		pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
> -			 mdname(mddev), ctx.data_only_stripes,
> -			 ctx.data_parity_stripes);
> -
> -	if (ctx.data_only_stripes == 0) {
> -		log->next_checkpoint = ctx.pos;
> -		r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
> -		ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
> -	} else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
> +			 mdname(mddev), ctx->data_only_stripes,
> +			 ctx->data_parity_stripes);
> +
> +	if (ctx->data_only_stripes == 0) {
> +		log->next_checkpoint = ctx->pos;
> +		r5l_log_write_empty_meta_block(log, ctx->pos, ctx->seq++);
> +		ctx->pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
> +	} else if (r5c_recovery_rewrite_data_only_stripes(log, ctx)) {
>  		pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
>  		       mdname(mddev));
> -		return -EIO;
> +		ret =  -EIO;
> +		goto error;
>  	}
>  
> -	log->log_start = ctx.pos;
> -	log->seq = ctx.seq;
> +	log->log_start = ctx->pos;
> +	log->seq = ctx->seq;
>  	log->last_checkpoint = pos;
>  	r5l_write_super(log, pos);
>  
> -	r5c_recovery_flush_data_only_stripes(log, &ctx);
> -	return 0;
> +	r5c_recovery_flush_data_only_stripes(log, ctx);
> +	ret = 0;
> +error:
> +	r5l_recovery_free_ra_pool(log, ctx);
> +ra_pool:
> +	__free_page(ctx->meta_page);
> +meta_page:
> +	kfree(ctx);
> +	return ret;
>  }
>  
>  static void r5l_write_super(struct r5l_log *log, sector_t cp)
> -- 
> 2.9.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-raid" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* mdadm add spare fails with "add new device failed ... Invalid argument" for RAID5
From: Tim Small @ 2017-03-09 20:17 UTC (permalink / raw)
  To: Linux-RAID

Hello,

I attempted to add in a new partition as a spare to a RAID5 array.  This
had previously worked for a different md (a smaller one, using another
set of partitions on the same set of drives, but otherwise created at
the same time and very similar), but it fails on the larger md array as
below:

# ~tim/compile/mdadm/mdadm --manage /dev/md2 --add /dev/sdf2
mdadm: add new device failed for /dev/sdf2 as 4: Invalid argument

The kernel says:

md: sdf2 does not have a valid v1.2 superblock, not importing!

I've tried with various mdadm versions (v3.1.4 to current git), and also
with Ubuntu  4.4.x and 4.8.x kernels.

I've attached mdadm --examine -v output:

Existing member examined with mdadm from git@github.com:neilbrown/mdadm.git

/dev/sdd2:
          Magic : a92b4efc
        Version : 1.2
    Feature Map : 0x9
     Array UUID : ad7ef7fa:e78344ea:a8778f06:abf07bf5
           Name : magic:2  (local to host magic)
  Creation Time : Wed Jul 15 13:43:06 2015
     Raid Level : raid5
   Raid Devices : 3

 Avail Dev Size : 3885793456 (1852.89 GiB 1989.53 GB)
     Array Size : 3885793280 (3705.78 GiB 3979.05 GB)
  Used Dev Size : 3885793280 (1852.89 GiB 1989.53 GB)
    Data Offset : 262144 sectors
   Super Offset : 8 sectors
   Unused Space : before=262056 sectors, after=176 sectors
          State : clean
    Device UUID : 55004cc7:b2e691de:c612612a:675ea2f3

Internal Bitmap : 8 sectors from superblock
    Update Time : Thu Mar  9 15:14:29 2017
  Bad Block Log : 512 entries available at offset 72 sectors - bad
blocks present.
       Checksum : 35698943 - correct
         Events : 568847

         Layout : left-symmetric
     Chunk Size : 512K

   Device Role : Active device 1
   Array State : AAA ('A' == active, '.' == missing, 'R' == replacing)



 diff vs. failed-to-add member:


--- /tmp/existing-member-examined-with-git-mdadm-d2	2017-03-09
15:18:32.993620717 +0000
+++ /tmp/mdadm-3-1-4-examined-with-git-mdadm-f2	2017-03-09
15:18:37.905679120 +0000
@@ -1,4 +1,4 @@
-/dev/sdd2:
+/dev/sdf2:
           Magic : a92b4efc
         Version : 1.2
     Feature Map : 0x9
@@ -8,23 +8,23 @@
      Raid Level : raid5
    Raid Devices : 3

- Avail Dev Size : 3885793456 (1852.89 GiB 1989.53 GB)
+ Avail Dev Size : 3886053552 (1853.01 GiB 1989.66 GB)
      Array Size : 3885793280 (3705.78 GiB 3979.05 GB)
   Used Dev Size : 3885793280 (1852.89 GiB 1989.53 GB)
-    Data Offset : 262144 sectors
+    Data Offset : 2048 sectors
    Super Offset : 8 sectors
-   Unused Space : before=262056 sectors, after=176 sectors
+   Unused Space : before=1960 sectors, after=260272 sectors
           State : active
-    Device UUID : 55004cc7:b2e691de:c612612a:675ea2f3
+    Device UUID : 876152cb:5f1646b5:39c32e17:9b55d3dd

 Internal Bitmap : 8 sectors from superblock
-    Update Time : Thu Mar  9 15:18:32 2017
+    Update Time : Thu Mar  9 15:15:02 2017
   Bad Block Log : 512 entries available at offset 72 sectors - bad
blocks present.
-       Checksum : 35698a37 - correct
-         Events : 568848
+       Checksum : e719c0b6 - correct
+         Events : 0

          Layout : left-symmetric
      Chunk Size : 512K

-   Device Role : Active device 1
+   Device Role : spare
    Array State : AAA ('A' == active, '.' == missing, 'R' == replacing)


and vs. same newly created spare, but when the add was attempted with
the a current git mdadm:


--- /tmp/existing-member-examined-with-git-mdadm-d2	2017-03-09
15:18:32.993620717 +0000
+++ /tmp/git-f2	2017-03-09 15:14:22.226655671 +0000
@@ -1,4 +1,4 @@
-/dev/sdd2:
+/dev/sdf2:
           Magic : a92b4efc
         Version : 1.2
     Feature Map : 0x9
@@ -13,18 +13,18 @@
   Used Dev Size : 3885793280 (1852.89 GiB 1989.53 GB)
     Data Offset : 262144 sectors
    Super Offset : 8 sectors
-   Unused Space : before=262056 sectors, after=176 sectors
-          State : active
-    Device UUID : 55004cc7:b2e691de:c612612a:675ea2f3
+   Unused Space : before=262064 sectors, after=176 sectors
+          State : clean
+    Device UUID : 3b1be0c7:82e8b8df:2d84b34b:9212fa71

 Internal Bitmap : 8 sectors from superblock
-    Update Time : Thu Mar  9 15:18:32 2017
-  Bad Block Log : 512 entries available at offset 72 sectors - bad
blocks present.
-       Checksum : 35698a37 - correct
-         Events : 568848
+    Update Time : Thu Mar  9 15:13:58 2017
+  Bad Block Log : 512 entries available at offset 16 sectors - bad
blocks present.
+       Checksum : d6c5ca00 - correct
+         Events : 0

          Layout : left-symmetric
      Chunk Size : 512K

-   Device Role : Active device 1
+   Device Role : spare
    Array State : AAA ('A' == active, '.' == missing, 'R' == replacing)


Any hints or tips welcome...

Tim.

^ permalink raw reply

* Re: [PATCH 2/2] md/r5cache: generate R5LOG_PAYLOAD_FLUSH
From: Shaohua Li @ 2017-03-09 21:53 UTC (permalink / raw)
  To: Song Liu; +Cc: linux-raid, shli, neilb, kernel-team, dan.j.williams, hch
In-Reply-To: <20170308014422.4019149-2-songliubraving@fb.com>

On Tue, Mar 07, 2017 at 05:44:22PM -0800, Song Liu wrote:
> In r5c_finish_stripe_write_out(), R5LOG_PAYLOAD_FLUSH is append to
> log->current_io.
> 
> Appending R5LOG_PAYLOAD_FLUSH in quiesce needs extra writes to
> journal. To simplify the logic, we just skip R5LOG_PAYLOAD_FLUSH in
> quiesce.
> 
> Even R5LOG_PAYLOAD_FLUSH supports multiple stripes per payload.
> However, current implementation is one stripe per R5LOG_PAYLOAD_FLUSH,
> which is simpler.

much simpiler than I expected :)
 
> Signed-off-by: Song Liu <songliubraving@fb.com>
> ---
>  drivers/md/raid5-cache.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 59 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index e69f922..fd0bfea 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -590,7 +590,21 @@ static void r5l_log_endio(struct bio *bio)
>  	mempool_free(io->meta_page, log->meta_pool);
>  
>  	spin_lock_irqsave(&log->io_list_lock, flags);
> -	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
> +
> +	if (list_empty(&io->stripe_list))
> +		/*
> +		 * this io_unit only has R5LOG_PAYLOAD_FLUSH, set
> +		 * to IO_UNIT_STRIPE_END
> +		 */
> +		__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
> +	else
> +		/*
> +		 * io_unit with R5LOG_PAYLOAD_FLUSH and also DATA/PARITY
> +		 * set to IO_UNIT_IO_END and wait for all stripes get
> +		 * handled.
> +		 */
> +		__r5l_set_io_unit_state(io, IO_UNIT_IO_END);

This part along with r5l_do_reclaim change looks strange. Could we call
__r5l_stripe_write_finished here? It makes more sense as we also free io unit
and do other things. The r5l_do_reclaim part looks quite hackish.

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH v5 3/7] raid5-ppl: Partial Parity Log write logging implementation
From: Shaohua Li @ 2017-03-09 23:24 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: shli, linux-raid
In-Reply-To: <20170309090003.13298-4-artur.paszkiewicz@intel.com>

On Thu, Mar 09, 2017 at 09:59:59AM +0100, Artur Paszkiewicz wrote:
> Implement the calculation of partial parity for a stripe and PPL write
> logging functionality. The description of PPL is added to the
> documentation. More details can be found in the comments in raid5-ppl.c.
> 
> Attach a page for holding the partial parity data to stripe_head.
> Allocate it only if mddev has the MD_HAS_PPL flag set.
> 
> Partial parity is the xor of not modified data chunks of a stripe and is
> calculated as follows:
> 
> - reconstruct-write case:
>   xor data from all not updated disks in a stripe
> 
> - read-modify-write case:
>   xor old data and parity from all updated disks in a stripe
> 
> Implement it using the async_tx API and integrate into raid_run_ops().
> It must be called when we still have access to old data, so do it when
> STRIPE_OP_BIODRAIN is set, but before ops_run_prexor5(). The result is
> stored into sh->ppl_page.
> 
> Partial parity is not meaningful for full stripe write and is not stored
> in the log or used for recovery, so don't attempt to calculate it when
> stripe has STRIPE_FULL_WRITE.
> 
> Put the PPL metadata structures to md_p.h because userspace tools
> (mdadm) will also need to read/write PPL.
> 
> Warn about using PPL with enabled disk volatile write-back cache for
> now. It can be removed once disk cache flushing before writing PPL is
> implemented.
> 
> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>

... snip ...

> +struct dma_async_tx_descriptor *
> +ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
> +		       struct dma_async_tx_descriptor *tx)
> +{
> +	int disks = sh->disks;
> +	struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
> +	int count = 0, pd_idx = sh->pd_idx, i;
> +	struct async_submit_ctl submit;
> +
> +	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
> +
> +	/*
> +	 * Partial parity is the XOR of stripe data chunks that are not changed
> +	 * during the write request. Depending on available data
> +	 * (read-modify-write vs. reconstruct-write case) we calculate it
> +	 * differently.
> +	 */
> +	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
> +		/* rmw: xor old data and parity from updated disks */
> +		for (i = disks; i--;) {
> +			struct r5dev *dev = &sh->dev[i];
> +			if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx)
> +				xor_srcs[count++] = dev->page;
> +		}
> +	} else if (sh->reconstruct_state == reconstruct_state_drain_run) {
> +		/* rcw: xor data from all not updated disks */
> +		for (i = disks; i--;) {
> +			struct r5dev *dev = &sh->dev[i];
> +			if (test_bit(R5_UPTODATE, &dev->flags))
> +				xor_srcs[count++] = dev->page;
> +		}
> +	} else {
> +		return tx;
> +	}
> +
> +	init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, tx, NULL, sh,
> +			  flex_array_get(percpu->scribble, 0)
> +			  + sizeof(struct page *) * (sh->disks + 2));

Since this should be done before biodrain, should this add ASYNC_TX_FENCE flag?

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH v5 5/7] raid5-ppl: load and recover the log
From: Shaohua Li @ 2017-03-09 23:30 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: shli, linux-raid
In-Reply-To: <20170309090003.13298-6-artur.paszkiewicz@intel.com>

On Thu, Mar 09, 2017 at 10:00:01AM +0100, Artur Paszkiewicz wrote:
> Load the log from each disk when starting the array and recover if the
> array is dirty.
> 
> The initial empty PPL is written by mdadm. When loading the log we
> verify the header checksum and signature. For external metadata arrays
> the signature is verified in userspace, so here we read it from the
> header, verifying only if it matches on all disks, and use it later when
> writing PPL.
> 
> In addition to the header checksum, each header entry also contains a
> checksum of its partial parity data. If the header is valid, recovery is
> performed for each entry until an invalid entry is found. If the array
> is not degraded and recovery using PPL fully succeeds, there is no need
> to resync the array because data and parity will be consistent, so in
> this case resync will be disabled.
> 
> Due to compatibility with IMSM implementations on other systems, we
> can't assume that the recovery data block size is always 4K. Writes
> generated by MD raid5 don't have this issue, but when recovering PPL
> written in other environments it is possible to have entries with
> 512-byte sector granularity. The recovery code takes this into account
> and also the logical sector size of the underlying drives.
> 
> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
> ---
>  drivers/md/raid5-ppl.c | 497 +++++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/md/raid5.c     |   5 +-
>  2 files changed, 501 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
> index 92783586743d..548d1028a3ce 100644
> --- a/drivers/md/raid5-ppl.c
> +++ b/drivers/md/raid5-ppl.c
> @@ -103,6 +103,10 @@ struct ppl_conf {
>  	mempool_t *io_pool;
>  	struct bio_set *bs;
>  	mempool_t *meta_pool;
> +
> +	/* used only for recovery */
> +	int recovered_entries;
> +	int mismatch_count;
>  };
>  
>  struct ppl_log {
> @@ -514,6 +518,482 @@ void ppl_stripe_write_finished(struct stripe_head *sh)
>  		ppl_io_unit_finished(io);
>  }
>  
> +static void ppl_xor(int size, struct page *page1, struct page *page2,
> +		    struct page *page_result)
> +{

I'd remove the page_result parameter, it should always be page1. And this will
make it clear why we need ASYNC_TX_XOR_DROP_DST.

> +	struct async_submit_ctl submit;
> +	struct dma_async_tx_descriptor *tx;
> +	struct page *xor_srcs[] = { page1, page2 };
> +
> +	init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST,
> +			  NULL, NULL, NULL, NULL);
> +	tx = async_xor(page_result, xor_srcs, 0, 2, size, &submit);
> +
> +	async_tx_quiesce(&tx);
> +}

...
> +			ret = ppl_recover_entry(log, e, ppl_sector);
> +			if (ret)
> +				goto out;
> +			ppl_conf->recovered_entries++;
> +		}
> +
> +		ppl_sector += ppl_entry_sectors;
> +	}
> +
> +	/* flush the disk cache after recovery if necessary */
> +	if (test_bit(QUEUE_FLAG_WC, &bdev_get_queue(rdev->bdev)->queue_flags)) {

The block layer will handle this, so you don't need to check

> +		struct bio *bio = bio_alloc_bioset(GFP_KERNEL, 0, ppl_conf->bs);
> +
> +		bio->bi_bdev = rdev->bdev;
> +		bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
> +		ret = submit_bio_wait(bio);
> +		bio_put(bio);

please use blkdev_issue_flush() instead.

> +	}
> +out:
> +	__free_page(page);
> +	return ret;
> +}
> +
...
> +static int ppl_load(struct ppl_conf *ppl_conf)
> +{
> +	int ret = 0;
> +	u32 signature = 0;
> +	bool signature_set = false;
> +	int i;
> +
> +	for (i = 0; i < ppl_conf->count; i++) {
> +		struct ppl_log *log = &ppl_conf->child_logs[i];
> +
> +		/* skip missing drive */
> +		if (!log->rdev)
> +			continue;
> +
> +		ret = ppl_load_distributed(log);
> +		if (ret)
> +			break;

Not sure about the strategy here. But if one disk fails, why don't we continue
do the recovery from other disks? This way we can at least recovery more data.

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH v5 0/7] Partial Parity Log for MD RAID 5
From: Shaohua Li @ 2017-03-09 23:32 UTC (permalink / raw)
  To: Artur Paszkiewicz; +Cc: shli, linux-raid
In-Reply-To: <20170309090003.13298-1-artur.paszkiewicz@intel.com>

On Thu, Mar 09, 2017 at 09:59:56AM +0100, Artur Paszkiewicz wrote:
> This series of patches implements the Partial Parity Log for RAID5 arrays. The
> purpose of this feature is closing the RAID 5 Write Hole. It is a solution
> alternative to the existing raid5-cache, but the logging workflow and much of
> the implementation is based on it.
> 
> The main differences compared to raid5-cache is that PPL is a distributed log -
> it is stored on array member drives in the metadata area and does not require a
> dedicated journaling drive. Write performance is reduced by up to 30%-40% but
> it scales with the number of drives in the array and the journaling drive does
> not become a bottleneck or a single point of failure. PPL does not protect from
> losing in-flight data, only from silent data corruption. More details about how
> the log works can be found in patches 3 and 5.
> 
> This feature originated from Intel RSTe, which uses IMSM metadata. PPL for IMSM
> is going to be included in RSTe implementations starting with upcoming Xeon
> platforms and Intel will continue supporting and maintaining it. This patchset
> implements PPL for external metadata (specifically IMSM) as well as native MD
> v1.x metadata.
> 
> Changes in mdadm are also required to make this fully usable. Patches for mdadm
> will be sent later.

Looks good, I'll apply this series into next. There are some small issues I
replied separately. Please send a separate patch to fix them if they make
sense, I'll integrate them locally.

Thanks,
Shaohua

^ permalink raw reply

* [PATCH v2] md/r5cache: generate R5LOG_PAYLOAD_FLUSH
From: Song Liu @ 2017-03-10  0:24 UTC (permalink / raw)
  To: linux-raid; +Cc: shli, neilb, kernel-team, dan.j.williams, hch, Song Liu

In r5c_finish_stripe_write_out(), R5LOG_PAYLOAD_FLUSH is append to
log->current_io.

Appending R5LOG_PAYLOAD_FLUSH in quiesce needs extra writes to
journal. To simplify the logic, we just skip R5LOG_PAYLOAD_FLUSH in
quiesce.

Even R5LOG_PAYLOAD_FLUSH supports multiple stripes per payload.
However, current implementation is one stripe per R5LOG_PAYLOAD_FLUSH,
which is simpler.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 48 ++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index e69f922..f148c68 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -588,10 +588,11 @@ static void r5l_log_endio(struct bio *bio)
 
 	bio_put(bio);
 	mempool_free(io->meta_page, log->meta_pool);
+	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
 
 	spin_lock_irqsave(&log->io_list_lock, flags);
-	__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
-	if (log->need_cache_flush)
+
+	if (log->need_cache_flush && !list_empty(&io->stripe_list))
 		r5l_move_to_end_ios(log);
 	else
 		r5l_log_run_stripes(log);
@@ -619,9 +620,11 @@ static void r5l_log_endio(struct bio *bio)
 			bio_endio(bi);
 			atomic_dec(&io->pending_stripe);
 		}
-		if (atomic_read(&io->pending_stripe) == 0)
-			__r5l_stripe_write_finished(io);
 	}
+
+	/* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */
+	if (list_empty(&io->stripe_list))
+		__r5l_stripe_write_finished(io);
 }
 
 static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
@@ -843,6 +846,41 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
 	r5_reserve_log_entry(log, io);
 }
 
+static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
+	struct r5l_io_unit *io;
+	struct r5l_payload_flush *payload;
+	int meta_size;
+
+	/*
+	 * payload_flush requires extra writes to the journal.
+	 * To avoid handling the extra IO in quiesce, just skip
+	 * flush_payload
+	 */
+	if (conf->quiesce)
+		return;
+
+	mutex_lock(&log->io_mutex);
+	meta_size = sizeof(struct r5l_payload_flush) + sizeof(__le64);
+
+	if (r5l_get_meta(log, meta_size)) {
+		mutex_unlock(&log->io_mutex);
+		return;
+	}
+
+	/* current implementation is one stripe per flush payload */
+	io = log->current_io;
+	payload = page_address(io->meta_page) + io->meta_offset;
+	payload->header.type = cpu_to_le16(R5LOG_PAYLOAD_FLUSH);
+	payload->header.flags = cpu_to_le16(0);
+	payload->size = cpu_to_le32(sizeof(__le64));
+	payload->flush_stripes[0] = cpu_to_le64(sect);
+	io->meta_offset += meta_size;
+	mutex_unlock(&log->io_mutex);
+}
+
 static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 			   int data_pages, int parity_pages)
 {
@@ -2784,6 +2822,8 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
 		atomic_dec(&conf->r5c_flushing_full_stripes);
 		atomic_dec(&conf->r5c_cached_full_stripes);
 	}
+
+	r5l_append_flush_payload(log, sh->sector);
 }
 
 int
-- 
2.9.3


^ permalink raw reply related

* Re: [PATCH V3] md: move bitmap_destroy before __md_stop
From: NeilBrown @ 2017-03-10  0:51 UTC (permalink / raw)
  To: Shaohua Li, Guoqing Jiang; +Cc: linux-raid, shli
In-Reply-To: <20170309182443.c3th4di2v3kct7eu@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 4288 bytes --]

On Thu, Mar 09 2017, Shaohua Li wrote:

> On Wed, Mar 08, 2017 at 10:31:32AM +0800, Guoqing Jiang wrote:
>> Since we have switched to sync way to handle METADATA_UPDATED
>> msg for md-cluster, then process_metadata_update is depended
>> on mddev->thread->wqueue.
>> 
>> With the new change, clustered raid could possible hang if
>> array received a METADATA_UPDATED msg after array unregistered
>> mddev->thread, so we need to stop clustered raid (bitmap_destroy
>> 	 -> bitmap_free -> md_cluster_stop) earlier than unregister
>> thread (mddev_detach -> md_unregister_thread).
>> 
>> And this change should be safe for non-clustered raid since
>> all writes are stopped before the destroy. Also in md_run,
>> we activate the personality (pers->run()) before activating
>> the bitmap (bitmap_create()). So it is pleasingly symmetric
>> to stop the bitmap (bitmap_destroy()) before stopping the
>> personality (__md_stop() calls pers->free()).
>> 
>> But we don't want to break the codes for waiting behind IO as
>> Shaohua mentioned, so move those codes from mddev_detach to
>> bitmap_destroy. Since we already check bitmap at the beginning
>> of bitmap_destroy, just wait for behind_writes to be zero if
>> it existed.
>> 
>> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
>> ---
>> This version move waiting behind IO codes into bitmap_destroy
>> so we can safely call bitmap_destroy before __md_stop now.
>> 
>>  drivers/md/bitmap.c |  9 +++++++++
>>  drivers/md/md.c     | 13 ++-----------
>>  2 files changed, 11 insertions(+), 11 deletions(-)
>> 
>> diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
>> index b6fa55a3cff8..89a35bc092dd 100644
>> --- a/drivers/md/bitmap.c
>> +++ b/drivers/md/bitmap.c
>> @@ -1771,6 +1771,15 @@ void bitmap_destroy(struct mddev *mddev)
>>  	if (!bitmap) /* there was no bitmap */
>>  		return;
>>  
>> +	/* wait for behind writes to complete */
>> +	if (atomic_read(&bitmap->behind_writes) > 0) {
>> +		printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n",
>> +		       mdname(mddev));
>> +		/* need to kick something here to make sure I/O goes? */
>> +		wait_event(bitmap->behind_wait,
>> +			   atomic_read(&bitmap->behind_writes) == 0);
>> +	}
>> +
>>  	mutex_lock(&mddev->bitmap_info.mutex);
>>  	spin_lock(&mddev->lock);
>>  	mddev->bitmap = NULL; /* disconnect from the md device */
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index 79a99a1c9ce7..b63ab4f33892 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -5534,15 +5534,6 @@ EXPORT_SYMBOL_GPL(md_stop_writes);
>>  
>>  static void mddev_detach(struct mddev *mddev)
>>  {
>> -	struct bitmap *bitmap = mddev->bitmap;
>> -	/* wait for behind writes to complete */
>> -	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
>> -		pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
>> -			 mdname(mddev));
>> -		/* need to kick something here to make sure I/O goes? */
>> -		wait_event(bitmap->behind_wait,
>> -			   atomic_read(&bitmap->behind_writes) == 0);
>> -	}
>
> I think it's ok to add this part into bitmap_destroy, as we need to call
> bitmap_destroy before mddev_detach. Look at the usage of mddev_detach, at in
> one place (level_store()), we wait for the IO without bitmap_destroy. I think
> we should keep this part code in mddev_detach. Maybe create a small function,
> let both mddev_detach and bitmap_destroy call it.

I don't think level_store() needs to explicitly wait for behind io.
It calls mddev_suspend(), which calls the ->quiesce function in the
personality, which is responsible for waiting for all pending IO,
including behind.  raid1.c does this correctly.


>
>>  	if (mddev->pers && mddev->pers->quiesce) {
>>  		mddev->pers->quiesce(mddev, 1);
>>  		mddev->pers->quiesce(mddev, 0);
>> @@ -5574,8 +5565,8 @@ void md_stop(struct mddev *mddev)
>>  	/* stop the array and free an attached data structures.
>>  	 * This is called from dm-raid
>>  	 */
>> -	__md_stop(mddev);
>>  	bitmap_destroy(mddev);
>> +	__md_stop(mddev);
>
> since we now always do bitmap_destroy and follow __md_stop, maybe move
> bitmap_destroy to very begining of __md_stop

That sounds like a good idea.

Thanks,
NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* Re: [PATCH V3] md: move bitmap_destroy before __md_stop
From: Shaohua Li @ 2017-03-10  1:06 UTC (permalink / raw)
  To: NeilBrown; +Cc: Guoqing Jiang, linux-raid, shli
In-Reply-To: <87k27ygdj6.fsf@notabene.neil.brown.name>

On Fri, Mar 10, 2017 at 11:51:09AM +1100, Neil Brown wrote:
> On Thu, Mar 09 2017, Shaohua Li wrote:
> 
> > On Wed, Mar 08, 2017 at 10:31:32AM +0800, Guoqing Jiang wrote:
> >> Since we have switched to sync way to handle METADATA_UPDATED
> >> msg for md-cluster, then process_metadata_update is depended
> >> on mddev->thread->wqueue.
> >> 
> >> With the new change, clustered raid could possible hang if
> >> array received a METADATA_UPDATED msg after array unregistered
> >> mddev->thread, so we need to stop clustered raid (bitmap_destroy
> >> 	 -> bitmap_free -> md_cluster_stop) earlier than unregister
> >> thread (mddev_detach -> md_unregister_thread).
> >> 
> >> And this change should be safe for non-clustered raid since
> >> all writes are stopped before the destroy. Also in md_run,
> >> we activate the personality (pers->run()) before activating
> >> the bitmap (bitmap_create()). So it is pleasingly symmetric
> >> to stop the bitmap (bitmap_destroy()) before stopping the
> >> personality (__md_stop() calls pers->free()).
> >> 
> >> But we don't want to break the codes for waiting behind IO as
> >> Shaohua mentioned, so move those codes from mddev_detach to
> >> bitmap_destroy. Since we already check bitmap at the beginning
> >> of bitmap_destroy, just wait for behind_writes to be zero if
> >> it existed.
> >> 
> >> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
> >> ---
> >> This version move waiting behind IO codes into bitmap_destroy
> >> so we can safely call bitmap_destroy before __md_stop now.
> >> 
> >>  drivers/md/bitmap.c |  9 +++++++++
> >>  drivers/md/md.c     | 13 ++-----------
> >>  2 files changed, 11 insertions(+), 11 deletions(-)
> >> 
> >> diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
> >> index b6fa55a3cff8..89a35bc092dd 100644
> >> --- a/drivers/md/bitmap.c
> >> +++ b/drivers/md/bitmap.c
> >> @@ -1771,6 +1771,15 @@ void bitmap_destroy(struct mddev *mddev)
> >>  	if (!bitmap) /* there was no bitmap */
> >>  		return;
> >>  
> >> +	/* wait for behind writes to complete */
> >> +	if (atomic_read(&bitmap->behind_writes) > 0) {
> >> +		printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n",
> >> +		       mdname(mddev));
> >> +		/* need to kick something here to make sure I/O goes? */
> >> +		wait_event(bitmap->behind_wait,
> >> +			   atomic_read(&bitmap->behind_writes) == 0);
> >> +	}
> >> +
> >>  	mutex_lock(&mddev->bitmap_info.mutex);
> >>  	spin_lock(&mddev->lock);
> >>  	mddev->bitmap = NULL; /* disconnect from the md device */
> >> diff --git a/drivers/md/md.c b/drivers/md/md.c
> >> index 79a99a1c9ce7..b63ab4f33892 100644
> >> --- a/drivers/md/md.c
> >> +++ b/drivers/md/md.c
> >> @@ -5534,15 +5534,6 @@ EXPORT_SYMBOL_GPL(md_stop_writes);
> >>  
> >>  static void mddev_detach(struct mddev *mddev)
> >>  {
> >> -	struct bitmap *bitmap = mddev->bitmap;
> >> -	/* wait for behind writes to complete */
> >> -	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
> >> -		pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
> >> -			 mdname(mddev));
> >> -		/* need to kick something here to make sure I/O goes? */
> >> -		wait_event(bitmap->behind_wait,
> >> -			   atomic_read(&bitmap->behind_writes) == 0);
> >> -	}
> >
> > I think it's ok to add this part into bitmap_destroy, as we need to call
> > bitmap_destroy before mddev_detach. Look at the usage of mddev_detach, at in
> > one place (level_store()), we wait for the IO without bitmap_destroy. I think
> > we should keep this part code in mddev_detach. Maybe create a small function,
> > let both mddev_detach and bitmap_destroy call it.
> 
> I don't think level_store() needs to explicitly wait for behind io.
> It calls mddev_suspend(), which calls the ->quiesce function in the
> personality, which is responsible for waiting for all pending IO,
> including behind.  raid1.c does this correctly.

Can you elaborate Where >quiesce waits for behind IO in raid1? It's not
obvious. It really should though.

Thanks,
Shaohua

^ permalink raw reply

* Re: [PATCH V3] md: move bitmap_destroy before __md_stop
From: NeilBrown @ 2017-03-10  2:12 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Guoqing Jiang, linux-raid, shli
In-Reply-To: <20170310010604.lpyhnmnlcy5w3xmi@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 1452 bytes --]

On Thu, Mar 09 2017, Shaohua Li wrote:

> On Fri, Mar 10, 2017 at 11:51:09AM +1100, Neil Brown wrote:
>> On Thu, Mar 09 2017, Shaohua Li wrote:
....
>> > I think it's ok to add this part into bitmap_destroy, as we need to call
>> > bitmap_destroy before mddev_detach. Look at the usage of mddev_detach, at in
>> > one place (level_store()), we wait for the IO without bitmap_destroy. I think
>> > we should keep this part code in mddev_detach. Maybe create a small function,
>> > let both mddev_detach and bitmap_destroy call it.
>> 
>> I don't think level_store() needs to explicitly wait for behind io.
>> It calls mddev_suspend(), which calls the ->quiesce function in the
>> personality, which is responsible for waiting for all pending IO,
>> including behind.  raid1.c does this correctly.
>
> Can you elaborate Where >quiesce waits for behind IO in raid1? It's not
> obvious. It really should though.

Ahh - you are correct.
allow_barrier() is called by call_bio_endio(), which can be called
before behind-writes complete.

My bi_phys_segments series actually fixed that, by counting every write
request in nr_pending so that nr_pending wouldn't drop to zero until all
the writes had completed.

We should probably come up with a simple fix for -stable.

Unfortunately it doesn't look at all easy.
I'll revisit my bi_phys_segments series next week and try to sort out
the best way to fix this issue.

NeilBrown

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* Re: [PATCH V3] md: move bitmap_destroy before __md_stop
From: Guoqing Jiang @ 2017-03-10  2:51 UTC (permalink / raw)
  To: Shaohua Li; +Cc: linux-raid, neilb, shli
In-Reply-To: <20170309182443.c3th4di2v3kct7eu@kernel.org>



On 03/10/2017 02:24 AM, Shaohua Li wrote:
> On Wed, Mar 08, 2017 at 10:31:32AM +0800, Guoqing Jiang wrote:
>> Since we have switched to sync way to handle METADATA_UPDATED
>> msg for md-cluster, then process_metadata_update is depended
>> on mddev->thread->wqueue.
>>
>> With the new change, clustered raid could possible hang if
>> array received a METADATA_UPDATED msg after array unregistered
>> mddev->thread, so we need to stop clustered raid (bitmap_destroy
>> 	 -> bitmap_free -> md_cluster_stop) earlier than unregister
>> thread (mddev_detach -> md_unregister_thread).
>>
>> And this change should be safe for non-clustered raid since
>> all writes are stopped before the destroy. Also in md_run,
>> we activate the personality (pers->run()) before activating
>> the bitmap (bitmap_create()). So it is pleasingly symmetric
>> to stop the bitmap (bitmap_destroy()) before stopping the
>> personality (__md_stop() calls pers->free()).
>>
>> But we don't want to break the codes for waiting behind IO as
>> Shaohua mentioned, so move those codes from mddev_detach to
>> bitmap_destroy. Since we already check bitmap at the beginning
>> of bitmap_destroy, just wait for behind_writes to be zero if
>> it existed.
>>
>> Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
>> ---
>> This version move waiting behind IO codes into bitmap_destroy
>> so we can safely call bitmap_destroy before __md_stop now.
>>
>>   drivers/md/bitmap.c |  9 +++++++++
>>   drivers/md/md.c     | 13 ++-----------
>>   2 files changed, 11 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
>> index b6fa55a3cff8..89a35bc092dd 100644
>> --- a/drivers/md/bitmap.c
>> +++ b/drivers/md/bitmap.c
>> @@ -1771,6 +1771,15 @@ void bitmap_destroy(struct mddev *mddev)
>>   	if (!bitmap) /* there was no bitmap */
>>   		return;
>>   
>> +	/* wait for behind writes to complete */
>> +	if (atomic_read(&bitmap->behind_writes) > 0) {
>> +		printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n",
>> +		       mdname(mddev));
>> +		/* need to kick something here to make sure I/O goes? */
>> +		wait_event(bitmap->behind_wait,
>> +			   atomic_read(&bitmap->behind_writes) == 0);
>> +	}
>> +
>>   	mutex_lock(&mddev->bitmap_info.mutex);
>>   	spin_lock(&mddev->lock);
>>   	mddev->bitmap = NULL; /* disconnect from the md device */
>> diff --git a/drivers/md/md.c b/drivers/md/md.c
>> index 79a99a1c9ce7..b63ab4f33892 100644
>> --- a/drivers/md/md.c
>> +++ b/drivers/md/md.c
>> @@ -5534,15 +5534,6 @@ EXPORT_SYMBOL_GPL(md_stop_writes);
>>   
>>   static void mddev_detach(struct mddev *mddev)
>>   {
>> -	struct bitmap *bitmap = mddev->bitmap;
>> -	/* wait for behind writes to complete */
>> -	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
>> -		pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
>> -			 mdname(mddev));
>> -		/* need to kick something here to make sure I/O goes? */
>> -		wait_event(bitmap->behind_wait,
>> -			   atomic_read(&bitmap->behind_writes) == 0);
>> -	}
> I think it's ok to add this part into bitmap_destroy, as we need to call
> bitmap_destroy before mddev_detach. Look at the usage of mddev_detach, at in
> one place (level_store()), we wait for the IO without bitmap_destroy. I think
> we should keep this part code in mddev_detach. Maybe create a small function,
> let both mddev_detach and bitmap_destroy call it.

Thanks, I will add bitmap_wait_behind_writes for it, then call the func
in both mddev_detach and bitmap_destroy.

>
>>   	if (mddev->pers && mddev->pers->quiesce) {
>>   		mddev->pers->quiesce(mddev, 1);
>>   		mddev->pers->quiesce(mddev, 0);
>> @@ -5574,8 +5565,8 @@ void md_stop(struct mddev *mddev)
>>   	/* stop the array and free an attached data structures.
>>   	 * This is called from dm-raid
>>   	 */
>> -	__md_stop(mddev);
>>   	bitmap_destroy(mddev);
>> +	__md_stop(mddev);
> since we now always do bitmap_destroy and follow __md_stop, maybe move
> bitmap_destroy to very begining of __md_stop

Ok, will do it in next version.

Thanks,
Guoqing

^ permalink raw reply

* [PATCH] md: fix super_offset endianness in super_1_rdev_size_change
From: Jason Yan @ 2017-03-10  3:27 UTC (permalink / raw)
  To: shli, linux-raid; +Cc: miaoxie, zhaohongjiang, Jason Yan

The sb->super_offset should be big-endian, but the rdev->sb_start is in
host byte order, so fix this by adding cpu_to_le64.

Signed-off-by: Jason Yan <yanaijie@huawei.com>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 548d1b8..b6516f8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1887,7 +1887,7 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 	}
 	sb = page_address(rdev->sb_page);
 	sb->data_size = cpu_to_le64(num_sectors);
-	sb->super_offset = rdev->sb_start;
+	sb->super_offset = cpu_to_le64(rdev->sb_start);
 	sb->sb_csum = calc_sb_1_csum(sb);
 	do {
 		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
-- 
2.5.0


^ permalink raw reply related

* [PATCH] md: fix incorrect use of lexx_to_cpu in does_sb_need_changing
From: Jason Yan @ 2017-03-10  3:49 UTC (permalink / raw)
  To: shli, linux-raid; +Cc: miaoxie, zhaohongjiang, Jason Yan

The sb->layout is of type __le32, so we shoud use le32_to_cpu.

Signed-off-by: Jason Yan <yanaijie@huawei.com>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index e0d4d13..ea86a89 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2287,7 +2287,7 @@ static bool does_sb_need_changing(struct mddev *mddev)
 	/* Check if any mddev parameters have changed */
 	if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
 	    (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
-	    (mddev->layout != le64_to_cpu(sb->layout)) ||
+	    (mddev->layout != le32_to_cpu(sb->layout)) ||
 	    (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
 	    (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
 		return true;
-- 
2.5.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox