From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
To: linux-raid@vger.kernel.org
Cc: shli@fb.com, neilb@suse.com, jes.sorensen@gmail.com,
Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Subject: [PATCH v3 4/9] raid5: calculate partial parity for a stripe
Date: Mon, 30 Jan 2017 19:59:48 +0100 [thread overview]
Message-ID: <20170130185953.30428-5-artur.paszkiewicz@intel.com> (raw)
In-Reply-To: <20170130185953.30428-1-artur.paszkiewicz@intel.com>
Attach a page for holding the partial parity data to stripe_head.
Allocate it only if mddev has the MD_HAS_PPL flag set.
Partial parity is the xor of not modified data chunks of a stripe and is
calculated as follows:
- reconstruct-write case:
xor data from all not updated disks in a stripe
- read-modify-write case:
xor old data and parity from all updated disks in a stripe
Implement it using the async_tx API and integrate into raid_run_ops().
It must be called when we still have access to old data, so do it when
STRIPE_OP_BIODRAIN is set, but before ops_run_prexor5(). The result is
stored into sh->ppl_page.
Partial parity is not meaningful for full stripe write and is not stored
in the log or used for recovery, so don't attempt to calculate it when
stripe has STRIPE_FULL_WRITE.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
drivers/md/raid5.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
drivers/md/raid5.h | 2 ++
2 files changed, 100 insertions(+)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d1cba941951e..e1e238da32ba 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -466,6 +466,11 @@ static void shrink_buffers(struct stripe_head *sh)
sh->dev[i].page = NULL;
put_page(p);
}
+
+ if (sh->ppl_page) {
+ put_page(sh->ppl_page);
+ sh->ppl_page = NULL;
+ }
}
static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
@@ -482,6 +487,13 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
sh->dev[i].page = page;
sh->dev[i].orig_page = page;
}
+
+ if (test_bit(MD_HAS_PPL, &sh->raid_conf->mddev->flags)) {
+ sh->ppl_page = alloc_page(gfp);
+ if (!sh->ppl_page)
+ return 1;
+ }
+
return 0;
}
@@ -1977,6 +1989,55 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
&sh->ops.zero_sum_result, percpu->spare_page, &submit);
}
+static struct dma_async_tx_descriptor *
+ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
+ struct dma_async_tx_descriptor *tx)
+{
+ int disks = sh->disks;
+ struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
+ int count = 0, pd_idx = sh->pd_idx, i;
+ struct async_submit_ctl submit;
+
+ pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+
+ /*
+ * Partial parity is the XOR of stripe data chunks that are not changed
+ * during the write request. Depending on available data
+ * (read-modify-write vs. reconstruct-write case) we calculate it
+ * differently.
+ */
+ if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+ /* rmw: xor old data and parity from updated disks */
+ for (i = disks; i--;) {
+ struct r5dev *dev = &sh->dev[i];
+ if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx)
+ xor_srcs[count++] = dev->page;
+ }
+ } else if (sh->reconstruct_state == reconstruct_state_drain_run) {
+ /* rcw: xor data from all not updated disks */
+ for (i = disks; i--;) {
+ struct r5dev *dev = &sh->dev[i];
+ if (test_bit(R5_UPTODATE, &dev->flags))
+ xor_srcs[count++] = dev->page;
+ }
+ } else {
+ return tx;
+ }
+
+ init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, tx, NULL, sh,
+ flex_array_get(percpu->scribble, 0)
+ + sizeof(struct page *) * (sh->disks + 2));
+
+ if (count == 1)
+ tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE,
+ &submit);
+ else
+ tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE,
+ &submit);
+
+ return tx;
+}
+
static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
int overlap_clear = 0, i, disks = sh->disks;
@@ -2007,6 +2068,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
async_tx_ack(tx);
}
+ if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
+ tx = ops_run_partial_parity(sh, percpu, tx);
+
if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
if (level < 6)
tx = ops_run_prexor5(sh, percpu, tx);
@@ -3058,6 +3122,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
s->locked++;
}
+ if (level == 5 && test_bit(MD_HAS_PPL, &conf->mddev->flags) &&
+ test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
+ !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
+ test_bit(R5_Insync, &sh->dev[pd_idx].flags))
+ set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
+
pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
__func__, (unsigned long long)sh->sector,
s->locked, s->ops_request);
@@ -3105,6 +3175,34 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
goto overlap;
+ if (forwrite && test_bit(MD_HAS_PPL, &conf->mddev->flags)) {
+ /*
+ * With PPL only writes to consecutive data chunks within a
+ * stripe are allowed. Not really an overlap, but
+ * wait_for_overlap can be used to handle this.
+ */
+ sector_t sector;
+ sector_t first = 0;
+ sector_t last = 0;
+ int count = 0;
+ int i;
+
+ for (i = 0; i < sh->disks; i++) {
+ if (i != sh->pd_idx &&
+ (i == dd_idx || sh->dev[i].towrite)) {
+ sector = sh->dev[i].sector;
+ if (count == 0 || sector < first)
+ first = sector;
+ if (sector > last)
+ last = sector;
+ count++;
+ }
+ }
+
+ if (first + conf->chunk_sectors * (count - 1) != last)
+ goto overlap;
+ }
+
if (!forwrite || previous)
clear_bit(STRIPE_BATCH_READY, &sh->state);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 0f64a58873de..88f1e52d9daf 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -228,6 +228,7 @@ struct stripe_head {
struct list_head log_list;
sector_t log_start; /* first meta block on the journal */
struct list_head r5c; /* for r5c_cache->stripe_in_journal */
+ struct page *ppl_page; /* partial parity of this stripe */
/**
* struct stripe_operations
* @target - STRIPE_OP_COMPUTE_BLK target
@@ -400,6 +401,7 @@ enum {
STRIPE_OP_BIODRAIN,
STRIPE_OP_RECONSTRUCT,
STRIPE_OP_CHECK,
+ STRIPE_OP_PARTIAL_PARITY,
};
/*
--
2.11.0
next prev parent reply other threads:[~2017-01-30 18:59 UTC|newest]
Thread overview: 25+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-01-30 18:59 [PATCH v3 0/9] Partial Parity Log for MD RAID 5 Artur Paszkiewicz
2017-01-30 18:59 ` [PATCH v3 1/9] raid5-cache: move declarations to separate header Artur Paszkiewicz
2017-01-30 18:59 ` [PATCH v3 2/9] raid5-cache: add policy logic Artur Paszkiewicz
2017-01-30 18:59 ` [PATCH v3 3/9] md: superblock changes for PPL Artur Paszkiewicz
2017-02-07 21:20 ` Shaohua Li
2017-02-08 11:58 ` Artur Paszkiewicz
2017-01-30 18:59 ` Artur Paszkiewicz [this message]
2017-02-07 21:25 ` [PATCH v3 4/9] raid5: calculate partial parity for a stripe Shaohua Li
2017-02-08 11:58 ` Artur Paszkiewicz
2017-01-30 18:59 ` [PATCH v3 5/9] raid5-ppl: Partial Parity Log write logging implementation Artur Paszkiewicz
2017-02-07 21:42 ` Shaohua Li
2017-02-08 11:58 ` Artur Paszkiewicz
2017-02-08 5:34 ` Shaohua Li
2017-02-09 15:35 ` Artur Paszkiewicz
2017-02-09 17:09 ` Shaohua Li
2017-02-09 16:06 ` Wols Lists
2017-02-09 17:13 ` Shaohua Li
2017-01-30 18:59 ` [PATCH v3 6/9] md: add sysfs entries for PPL Artur Paszkiewicz
2017-02-07 21:49 ` Shaohua Li
2017-02-08 11:58 ` Artur Paszkiewicz
2017-02-08 18:14 ` Shaohua Li
2017-01-30 18:59 ` [PATCH v3 7/9] raid5-ppl: load and recover the log Artur Paszkiewicz
2017-01-30 18:59 ` [PATCH v3 8/9] raid5-ppl: support disk hot add/remove with PPL Artur Paszkiewicz
2017-01-30 18:59 ` [PATCH v3 9/9] raid5-ppl: runtime PPL enabling or disabling Artur Paszkiewicz
2017-03-27 5:08 ` NeilBrown
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20170130185953.30428-5-artur.paszkiewicz@intel.com \
--to=artur.paszkiewicz@intel.com \
--cc=jes.sorensen@gmail.com \
--cc=linux-raid@vger.kernel.org \
--cc=neilb@suse.com \
--cc=shli@fb.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).