From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
To: shli@kernel.org
Cc: linux-raid@vger.kernel.org
Subject: [PATCH v2 10/12] raid5-ppl: recovery from dirty shutdown using PPL
Date: Mon, 5 Dec 2016 16:31:11 +0100
Message-ID: <20161205153113.7268-11-artur.paszkiewicz@intel.com>
In-Reply-To: <20161205153113.7268-1-artur.paszkiewicz@intel.com>
The recovery algorithm recalculates parity for every dirty stripe by
XORing the partial parity with the data read from each updated data
member disk. To verify PPL correctness, a CRC is used for the PPL
header, and each header entry also contains a CRC for its partial
parity data. If the header is valid, recovery is performed for each
entry until an invalid entry is found. If the array is not degraded and
recovery using PPL fully succeeds, data and parity will be consistent,
so there is no need to resync the array and resync is disabled.
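
As a minimal illustration of that computation, here is a stand-alone
sketch of rebuilding one parity block in plain C. The function name,
the byte-wise XOR loop and the caller-supplied buffers are assumptions
for the example; the driver itself reads these buffers from the member
disks and offloads the XOR through the async_tx API:

#include <string.h>

/*
 * Sketch only: rebuild one parity block from the logged partial parity
 * and the data blocks that the interrupted write was updating.
 */
static void recover_parity_block(unsigned char *parity,
				 const unsigned char *partial_parity,
				 const unsigned char **data, int ndata,
				 size_t block_size)
{
	size_t i;
	int d;

	/* start from the partial parity saved before the dirty shutdown */
	memcpy(parity, partial_parity, block_size);

	/* fold in each data block the interrupted write modified */
	for (d = 0; d < ndata; d++)
		for (i = 0; i < block_size; i++)
			parity[i] ^= data[d][i];
}
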
For compatibility with IMSM implementations on other systems, we can't
assume that the block size is always 4K. Writes generated by MD raid5
don't have this issue, but in other environments writes as small as a
single 512-byte sector are possible. The recovery code takes this into
account, as well as the logical sector size of the underlying drives.
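
For example, recovery only promotes the I/O size to a full page when a
512-byte-sector entry happens to be 4 KiB aligned at both ends. This
stand-alone sketch of that check uses assumed constants and a
hypothetical helper name; the driver queries the logical block size
from the request queue at runtime:

#define SECTOR_SHIFT		9	/* 512-byte sectors */
#define EXAMPLE_PAGE_SIZE	4096	/* assumed page size */

/* Pick the largest block size recovery can use for one PPL entry. */
static int recovery_block_size(int logical_block_size,
			       unsigned long long first_sector,
			       unsigned long long last_sector)
{
	unsigned long long sectors_per_page =
		EXAMPLE_PAGE_SIZE >> SECTOR_SHIFT;

	/* use a full page only if both ends are page aligned */
	if (logical_block_size == 512 &&
	    first_sector % sectors_per_page == 0 &&
	    (last_sector + 1) % sectors_per_page == 0)
		return EXAMPLE_PAGE_SIZE;

	return logical_block_size;
}
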
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
 drivers/md/raid5-ppl.c | 347 +++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c     |   5 +-
 2 files changed, 351 insertions(+), 1 deletion(-)
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 9e46497..17e9803 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -16,6 +16,7 @@
 #include <linux/blkdev.h>
 #include <linux/slab.h>
 #include <linux/crc32c.h>
+#include <linux/async_tx.h>
 #include <linux/module.h>
 #include <linux/raid/md_p.h>
 #include "md.h"
@@ -403,6 +404,346 @@ static void __ppl_stripe_write_finished(struct r5l_io_unit *io)
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
 }
+static void ppl_xor(int size, struct page *page1, struct page *page2,
+		    struct page *page_result)
+{
+	struct async_submit_ctl submit;
+	struct dma_async_tx_descriptor *tx;
+	struct page *xor_srcs[] = { page1, page2 };
+
+	init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST,
+			  NULL, NULL, NULL, NULL);
+	tx = async_xor(page_result, xor_srcs, 0, 2, size, &submit);
+
+	async_tx_quiesce(&tx);
+}
+
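+/*
+ * Recover a single PPL entry: for each block-sized range of the strip,
+ * read the corresponding blocks back from the updated data member disks,
+ * XOR them together with the logged partial parity (if any), and write
+ * the result to the parity disk.
+ */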
+static int ppl_recover_entry(struct r5l_log *log, struct ppl_header_entry *e,
+			     sector_t ppl_sector)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
+
+	int block_size = queue_logical_block_size(mddev->queue);
+	struct page *pages;
+	struct page *page1;
+	struct page *page2;
+	sector_t r_sector_first = e->data_sector * (block_size >> 9);
+	sector_t r_sector_last = r_sector_first + (e->data_size >> 9) - 1;
+	int strip_sectors = conf->chunk_sectors;
+	int i;
+	int ret = 0;
+
+	if (e->pp_size > 0 && (e->pp_size >> 9) < strip_sectors) {
+		if (e->data_size > e->pp_size)
+			r_sector_last = r_sector_first +
+				(e->data_size / e->pp_size) * strip_sectors - 1;
+		strip_sectors = e->pp_size >> 9;
+	}
+
+	pages = alloc_pages(GFP_KERNEL, 1);
+	if (!pages)
+		return -ENOMEM;
+	page1 = pages;
+	page2 = pages + 1;
+
+	dbg("array sector first %llu, last %llu\n",
+	    (unsigned long long)r_sector_first,
+	    (unsigned long long)r_sector_last);
+
+	/* if start and end is 4k aligned, use a 4k block */
+	if (block_size == 512 &&
+	    r_sector_first % (PAGE_SIZE >> 9) == 0 &&
+	    (r_sector_last + 1) % (PAGE_SIZE >> 9) == 0)
+		block_size = PAGE_SIZE;
+
+	/* iterate through blocks in strip */
+	for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
+		bool update_parity = false;
+		sector_t parity_sector;
+		struct md_rdev *parity_rdev;
+		struct stripe_head sh;
+		int disk;
+
+		dbg(" iter %d start\n", i);
+		memset(page_address(page1), 0, PAGE_SIZE);
+
+		/* iterate through data member disks */
+		for (disk = 0; disk < (conf->raid_disks - conf->max_degraded);
+		     disk++) {
+			int dd_idx;
+			struct md_rdev *rdev;
+			sector_t sector;
+			sector_t r_sector = r_sector_first + i +
+					(disk * conf->chunk_sectors);
+
+			dbg(" data member disk %d start\n", disk);
+			if (r_sector > r_sector_last) {
+				dbg(" array sector %llu doesn't need parity update\n",
+				    (unsigned long long)r_sector);
+				continue;
+			}
+
+			update_parity = true;
+
+			/* map raid sector to member disk */
+			sector = raid5_compute_sector(conf, r_sector, 0, &dd_idx, NULL);
+			dbg(" processing array sector %llu => data mem disk %d, sector %llu\n",
+			    (unsigned long long)r_sector, dd_idx,
+			    (unsigned long long)sector);
+
+			rdev = conf->disks[dd_idx].rdev;
+			if (!rdev) {
+				dbg(" data member disk %d missing\n", dd_idx);
+				update_parity = false;
+				break;
+			}
+
+			dbg(" reading data member disk %s sector %llu\n",
+			    rdev->bdev->bd_disk->disk_name,
+			    (unsigned long long)sector);
+			if (!sync_page_io(rdev, sector, block_size, page2,
+					  REQ_OP_READ, 0, false)) {
+				md_error(mddev, rdev);
+				dbg(" read failed!\n");
+				ret = -EIO;
+				goto out;
+			}
+
+			ppl_xor(block_size, page1, page2, page1);
+		}
+
+		if (!update_parity)
+			continue;
+
+		if (e->pp_size > 0) {
+			dbg(" reading pp disk sector %llu\n",
+			    (unsigned long long)(ppl_sector + i));
+			if (!sync_page_io(log->rdev,
+					  ppl_sector - log->rdev->data_offset + i,
+					  block_size, page2, REQ_OP_READ, 0,
+					  false)) {
+				dbg(" read failed!\n");
+				md_error(mddev, log->rdev);
+				ret = -EIO;
+				goto out;
+			}
+
+			ppl_xor(block_size, page1, page2, page1);
+		}
+
+		/* map raid sector to parity disk */
+		parity_sector = raid5_compute_sector(conf, r_sector_first + i,
+						     0, &disk, &sh);
+		BUG_ON(sh.pd_idx != e->parity_disk);
+		parity_rdev = conf->disks[sh.pd_idx].rdev;
+
+		BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
+		dbg(" write parity at sector %llu, parity disk %s\n",
+		    (unsigned long long)parity_sector,
+		    parity_rdev->bdev->bd_disk->disk_name);
+		if (!sync_page_io(parity_rdev, parity_sector, block_size,
+				  page1, REQ_OP_WRITE, 0, false)) {
+			dbg(" parity write error!\n");
+			md_error(mddev, parity_rdev);
+			ret = -EIO;
+			goto out;
+		}
+	}
+
+out:
+	__free_pages(pages, 1);
+	return ret;
+}
+
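+/*
+ * Process all entries of a valid PPL header: verify each entry's partial
+ * parity checksum and rebuild parity for the entries that pass. Returns
+ * the number of entries with a mismatched checksum, or a negative errno
+ * on I/O or allocation failure.
+ */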
+static int ppl_recover(struct r5l_log *log, struct ppl_header *pplhdr)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	sector_t ppl_sector = log->rdev->ppl.sector + (PPL_HEADER_SIZE >> 9);
+	struct page *page;
+	int i;
+	int ret = 0;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	/* iterate through all PPL entries saved */
+	for (i = 0; i < pplhdr->entries_count; i++) {
+		struct ppl_header_entry *e = &pplhdr->entries[i];
+		u32 size = le32_to_cpu(e->pp_size);
+		sector_t sector = ppl_sector;
+		int ppl_entry_sectors = size >> 9;
+		u32 crc, crc_stored;
+
+		dbg("disk: %d, entry: %d, ppl_sector: %llu ppl_size: %u\n",
+		    log->rdev->raid_disk, i, (unsigned long long)ppl_sector,
+		    size);
+
+		crc = ~0;
+		crc_stored = le32_to_cpu(e->checksum);
+
+		while (size) {
+			int s = size > PAGE_SIZE ? PAGE_SIZE : size;
+
+			if (!sync_page_io(log->rdev,
+					  sector - log->rdev->data_offset,
+					  s, page, REQ_OP_READ, 0, false)) {
+				md_error(mddev, log->rdev);
+				ret = -EIO;
+				goto out;
+			}
+
+			crc = crc32c_le(crc, page_address(page), s);
+
+			size -= s;
+			sector += s >> 9;
+		}
+
+		crc = ~crc;
+
+		if (crc != crc_stored) {
+			dbg("ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n",
+			    crc_stored, crc);
+			ret++;
+		} else {
+			int ret2;
+			e->data_sector = le64_to_cpu(e->data_sector);
+			e->pp_size = le32_to_cpu(e->pp_size);
+			e->data_size = le32_to_cpu(e->data_size);
+
+			ret2 = ppl_recover_entry(log, e, ppl_sector);
+			if (ret2) {
+				ret = ret2;
+				goto out;
+			}
+		}
+
+		ppl_sector += ppl_entry_sectors;
+	}
+out:
+	__free_page(page);
+	return ret;
+}
+
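+/*
+ * Overwrite the PPL header with an empty one (no entries) so that the
+ * just-processed entries are not replayed again after the next dirty
+ * shutdown.
+ */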
+static int ppl_write_empty_header(struct r5l_log *log)
+{
+	struct page *page;
+	struct ppl_header *pplhdr;
+	int ret = 0;
+
+	dbg("disk: %d ppl_sector: %llu\n",
+	    log->rdev->raid_disk, (unsigned long long)log->rdev->ppl.sector);
+
+	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!page)
+		return -ENOMEM;
+
+	pplhdr = page_address(page);
+	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
+	pplhdr->signature = cpu_to_le32(log->uuid_checksum);
+	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));
+
+	if (!sync_page_io(log->rdev, log->rdev->ppl.sector -
+			  log->rdev->data_offset, PPL_HEADER_SIZE, page,
+			  REQ_OP_WRITE, 0, false)) {
+		md_error(log->rdev->mddev, log->rdev);
+		ret = -EIO;
+	}
+
+	__free_page(page);
+	return ret;
+}
+
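+/*
+ * Load the PPL of one member disk: read the header, validate its
+ * checksum and signature, run recovery if the array is dirty, and
+ * finish by writing an empty header back.
+ */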
+static int ppl_load_distributed(struct r5l_log *log)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	struct page *page;
+	struct ppl_header *pplhdr;
+	u32 crc, crc_stored;
+	int ret = 0;
+
+	dbg("disk: %d\n", log->rdev->raid_disk);
+
+	/* read PPL header */
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	if (!sync_page_io(log->rdev,
+			  log->rdev->ppl.sector - log->rdev->data_offset,
+			  PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
+		md_error(mddev, log->rdev);
+		ret = -EIO;
+		goto out;
+	}
+	pplhdr = page_address(page);
+
+	/* check header validity */
+	crc_stored = le32_to_cpu(pplhdr->checksum);
+	pplhdr->checksum = 0;
+	crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE);
+
+	if (crc_stored != crc) {
+		dbg("ppl header crc does not match: stored: 0x%x calculated: 0x%x\n",
+		    crc_stored, crc);
+		ret = 1;
+		goto out;
+	}
+
+	pplhdr->signature = le32_to_cpu(pplhdr->signature);
+	pplhdr->generation = le64_to_cpu(pplhdr->generation);
+	pplhdr->entries_count = le32_to_cpu(pplhdr->entries_count);
+
+	if (pplhdr->signature != log->uuid_checksum) {
+		dbg("ppl header signature does not match: stored: 0x%x configured: 0x%x\n",
+		    pplhdr->signature, log->uuid_checksum);
+		ret = 1;
+		goto out;
+	}
+
+	if (mddev->recovery_cp != MaxSector)
+		ret = ppl_recover(log, pplhdr);
+out:
+	__free_page(page);
+
+	if (ret >= 0) {
+		int ret2 = ppl_write_empty_header(log);
+		if (ret2)
+			ret = ret2;
+	}
+
+	dbg("return: %d\n", ret);
+	return ret;
+}
+
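+/* Load and recover PPL on every member disk; missing drives are skipped. */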
+static int ppl_load(struct r5l_log *log)
+{
+	struct ppl_conf *ppl_conf = log->private;
+	int ret = 0;
+	int i;
+
+	for (i = 0; i < ppl_conf->count; i++) {
+		struct r5l_log *log_child = ppl_conf->child_logs[i];
+		int ret2;
+
+		/* Missing drive */
+		if (!log_child)
+			continue;
+
+		ret2 = ppl_load_distributed(log_child);
+		if (ret2 < 0) {
+			ret = ret2;
+			break;
+		}
+
+		ret += ret2;
+	}
+
+	dbg("return: %d\n", ret);
+	return ret;
+}
+
 #define IMSM_MPB_SIG "Intel Raid ISM Cfg Sig. "
 #define IMSM_MPB_ORIG_FAMILY_NUM_OFFSET 64
@@ -634,6 +975,12 @@ static int __ppl_init_log(struct r5l_log *log, struct r5conf *conf)
 		ppl_conf->child_logs[i] = log_child;
 	}
+	ret = ppl_load(log);
+	if (!ret && mddev->recovery_cp == 0 && !mddev->degraded)
+		mddev->recovery_cp = MaxSector;
+	else if (ret < 0)
+		goto err;
+
 	rcu_assign_pointer(conf->log, log);
 	set_bit(MD_HAS_PPL, &mddev->flags);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2169de5..ed340c3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7160,7 +7160,10 @@ static int raid5_run(struct mddev *mddev)
 	if (mddev->degraded > dirty_parity_disks &&
 	    mddev->recovery_cp != MaxSector) {
-		if (mddev->ok_start_degraded)
+		if (rwh_policy)
+			pr_warn("md/raid:%s: starting dirty degraded array with journal.\n",
+				mdname(mddev));
+		else if (mddev->ok_start_degraded)
 			pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
 				mdname(mddev));
 		else {
--
2.10.1