* [PATCH 1/3] ore: (trivial) reformat some code
2014-05-22 13:03 [PACHSET 0/3] ore: raid6 Boaz Harrosh
@ 2014-05-22 13:07 ` Boaz Harrosh
2014-05-22 13:08 ` [PATCH 2/3] ore: Remove redundant dev_order(), more cleanups Boaz Harrosh
2014-05-22 13:11 ` [PATCH 3/3] ore: Support for raid 6 Boaz Harrosh
2 siblings, 0 replies; 5+ messages in thread
From: Boaz Harrosh @ 2014-05-22 13:07 UTC (permalink / raw)
To: open-osd, NFS list, Daniel Gryniewicz, Elizabeth Ellenbogen Ziph,
Pathak, Santosh, linux-fsdevel
Cc: Tigran Mkrtchyan, Benny Halevy, Sachin bhamare
Rearrange some source lines. No functional change.
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
fs/exofs/ore.c | 19 ++++++++-----------
fs/exofs/ore_raid.c | 4 +---
2 files changed, 9 insertions(+), 14 deletions(-)
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index dae8846..92157b6 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -675,8 +675,8 @@ static int _prepare_for_striping(struct ore_io_state *ios)
si->cur_pg = si->unit_off / PAGE_SIZE;
while (length) {
- unsigned comp = dev - first_dev;
- struct ore_per_dev_state *per_dev = &ios->per_dev[comp];
+ struct ore_per_dev_state *per_dev =
+ &ios->per_dev[dev - first_dev];
unsigned cur_len, page_off = 0;
if (!per_dev->length) {
@@ -708,11 +708,9 @@ static int _prepare_for_striping(struct ore_io_state *ios)
if (unlikely(ret))
goto out;
- dev += mirrors_p1;
- dev = (dev % devs_in_group) + first_dev;
-
length -= cur_len;
+ dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
si->cur_comp = (si->cur_comp + 1) % group_width;
if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
if (!length && ios->sp2d) {
@@ -721,11 +719,6 @@ static int _prepare_for_striping(struct ore_io_state *ios)
*/
dev = si->par_dev;
}
- if (ios->sp2d)
- /* In writes cur_len just means if it's the
- * last one. See _ore_add_parity_unit.
- */
- cur_len = length;
per_dev = &ios->per_dev[dev - first_dev];
if (!per_dev->length) {
/* Only/always the parity unit of the first
@@ -736,7 +729,11 @@ static int _prepare_for_striping(struct ore_io_state *ios)
per_dev->offset = si->obj_offset - si->unit_off;
}
- ret = _ore_add_parity_unit(ios, si, per_dev, cur_len);
+ /* In writes cur_len just means if it's the
+ * last one. See _ore_add_parity_unit.
+ */
+ ret = _ore_add_parity_unit(ios, si, per_dev,
+ ios->sp2d ? length : cur_len);
if (unlikely(ret))
goto out;
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 4e2c032..af417d3 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -226,9 +226,7 @@ static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
init_async_submit(&_1ps->submit,
ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK,
- NULL,
- NULL, NULL,
- (addr_conv_t *)_1ps->scribble);
+ NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble);
/* TODO: raid6 */
_1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages,
--
1.9.0
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 2/3] ore: Remove redundant dev_order(), more cleanups
2014-05-22 13:03 [PACHSET 0/3] ore: raid6 Boaz Harrosh
2014-05-22 13:07 ` [PATCH 1/3] ore: (trivial) reformat some code Boaz Harrosh
@ 2014-05-22 13:08 ` Boaz Harrosh
2014-05-22 13:11 ` [PATCH 3/3] ore: Support for raid 6 Boaz Harrosh
2 siblings, 0 replies; 5+ messages in thread
From: Boaz Harrosh @ 2014-05-22 13:08 UTC (permalink / raw)
To: open-osd, NFS list, Daniel Gryniewicz, Elizabeth Ellenbogen Ziph,
Pathak, Santosh, linux-fsdevel
Cc: Tigran Mkrtchyan, Benny Halevy, Sachin bhamare
Two cleanups:
* si->cur_comp, si->cur_pg were always calculated after
the call to ore_calc_stripe_info() with the help of
_dev_order(...). But these are already calculated by
ore_calc_stripe_info() and can be just set there.
(This is left over from the time that si->cur_comp, si->cur_pg
were only used by raid code, but now the main loop manages
them anyway even though they are ultimately not used in
non-raid code)
* si->cur_comp - For the very last stripe case, was set inside
_ore_add_parity_unit(). This is not clear and will be wrong
for the coming raid6, so move this to the only caller. Now si->cur_comp
is only manipulated within _prepare_for_striping(), always next
to the manipulation of cur_dev.
Which is much easier to understand and follow.
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
fs/exofs/ore.c | 10 ++++++----
fs/exofs/ore_raid.c | 13 ++++---------
fs/exofs/ore_raid.h | 18 ------------------
3 files changed, 10 insertions(+), 31 deletions(-)
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 92157b6..0e2a835 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -545,17 +545,19 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
/* "H - (N * U)" is just "H % U" so it's bound to u32 */
u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
+ u32 first_dev = C - C % group_width;
div_u64_rem(file_offset, stripe_unit, &si->unit_off);
si->obj_offset = si->unit_off + (N * stripe_unit) +
(M * group_depth * stripe_unit);
+ si->cur_comp = C - first_dev;
+ si->cur_pg = si->unit_off / PAGE_SIZE;
if (parity) {
u32 LCMdP = lcm(group_width, parity) / parity;
/* R = N % LCMdP; */
u32 RxP = (N % LCMdP) * parity;
- u32 first_dev = C - C % group_width;
si->par_dev = (group_width + group_width - parity - RxP) %
group_width + first_dev;
@@ -670,9 +672,7 @@ static int _prepare_for_striping(struct ore_io_state *ios)
BUG_ON(length > si->length);
- dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev);
- si->cur_comp = dev_order;
- si->cur_pg = si->unit_off / PAGE_SIZE;
+ dev_order = si->cur_comp;
while (length) {
struct ore_per_dev_state *per_dev =
@@ -718,6 +718,8 @@ static int _prepare_for_striping(struct ore_io_state *ios)
* stripe. then operate on parity dev.
*/
dev = si->par_dev;
+ /* If last stripe operate on parity comp */
+ si->cur_comp = group_width - ios->layout->parity;
}
per_dev = &ios->per_dev[dev - first_dev];
if (!per_dev->length) {
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index af417d3..d58a952 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -402,9 +402,8 @@ static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset)
ore_calc_stripe_info(ios->layout, *offset, 0, &si);
- p = si.unit_off / PAGE_SIZE;
- c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
- ios->layout->mirrors_p1, si.par_dev, si.dev);
+ p = si.cur_pg;
+ c = si.cur_comp;
page = ios->sp2d->_1p_stripes[p].pages[c];
pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE);
@@ -532,9 +531,8 @@ static int _read_4_write_last_stripe(struct ore_io_state *ios)
goto read_it;
ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
- p = read_si.unit_off / PAGE_SIZE;
- c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
- ios->layout->mirrors_p1, read_si.par_dev, read_si.dev);
+ p = read_si.cur_pg;
+ c = read_si.cur_comp;
if (min_p == sp2d->pages_in_unit) {
/* Didn't do it yet */
@@ -638,9 +636,6 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
si->cur_pg = _sp2d_min_pg(sp2d);
num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
- if (!cur_len) /* If last stripe operate on parity comp */
- si->cur_comp = sp2d->data_devs;
-
if (!per_dev->length) {
per_dev->offset += si->cur_pg * PAGE_SIZE;
/* If first stripe, Read in all read4write pages
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
index 2ffd2c3..d365bda 100644
--- a/fs/exofs/ore_raid.h
+++ b/fs/exofs/ore_raid.h
@@ -31,24 +31,6 @@
#define ORE_DBGMSG2(M...) do {} while (0)
/* #define ORE_DBGMSG2 ORE_DBGMSG */
-/* Calculate the component order in a stripe. eg the logical data unit
- * address within the stripe of @dev given the @par_dev of this stripe.
- */
-static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1,
- unsigned par_dev, unsigned dev)
-{
- unsigned first_dev = dev - dev % devs_in_group;
-
- dev -= first_dev;
- par_dev -= first_dev;
-
- if (devs_in_group == par_dev) /* The raid 0 case */
- return dev / mirrors_p1;
- /* raid4/5/6 case */
- return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) /
- mirrors_p1;
-}
-
/* ios_raid.c stuff needed by ios.c */
int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
void _ore_free_raid_stuff(struct ore_io_state *ios);
--
1.9.0
^ permalink raw reply related [flat|nested] 5+ messages in thread
* [PATCH 3/3] ore: Support for raid 6
2014-05-22 13:03 [PACHSET 0/3] ore: raid6 Boaz Harrosh
2014-05-22 13:07 ` [PATCH 1/3] ore: (trivial) reformat some code Boaz Harrosh
2014-05-22 13:08 ` [PATCH 2/3] ore: Remove redundant dev_order(), more cleanups Boaz Harrosh
@ 2014-05-22 13:11 ` Boaz Harrosh
2 siblings, 0 replies; 5+ messages in thread
From: Boaz Harrosh @ 2014-05-22 13:11 UTC (permalink / raw)
To: open-osd, NFS list, Daniel Gryniewicz, Elizabeth Ellenbogen Ziph,
Pathak, Santosh, linux-fsdevel
Cc: Tigran Mkrtchyan, Benny Halevy, Sachin bhamare
This simple patch adds support for raid6 to the ORE.
Most operations and calculations were already written for the general
case. Only things left:
* call async_gen_syndrome() in the case of raid6
(NOTE that the raid6 math is the one supported by the Linux Kernel
see: crypto/async_tx/async_pq.c)
* call _ore_add_parity_unit() twice with only last call generating
the redundancy pages.
* Fix a couple of bugs in the old code
a. In reads when parity==2 it can happen that per_dev->length=0
but per_dev->offset was set and adjusted by _ore_add_sg_seg().
Don't let it be overwritten.
b. The whole 'cur_comp > starting_dev' thing to determine if:
"per_dev->offset is in the current stripe number or the
next one."
Was a complete raid5/4 accident. When parity==2 this is not
at all true usually. All we need to do is increment si->obj_offset
once we pass by the first parity device.
(This also greatly simplifies the code, amen)
c. Calculation of si->dev rotation can overflow when parity==2.
* Finally, enable raid6 in ore_verify_layout()
I want to deeply thank Daniel Gryniewicz, who first found all the
bugs in the old raid code, and inspired these patches:
Inspired-by Daniel Gryniewicz <dang@linuxbox.com>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
fs/exofs/Kconfig.ore | 2 ++
fs/exofs/ore.c | 75 ++++++++++++++++++++++++++++++++++++----------------
fs/exofs/ore_raid.c | 37 +++++++++++++++++---------
fs/exofs/ore_raid.h | 3 ++-
4 files changed, 80 insertions(+), 37 deletions(-)
diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore
index 1ca7fb7..2daf232 100644
--- a/fs/exofs/Kconfig.ore
+++ b/fs/exofs/Kconfig.ore
@@ -9,4 +9,6 @@ config ORE
tristate
depends on EXOFS_FS || PNFS_OBJLAYOUT
select ASYNC_XOR
+ select RAID6_PQ
+ select ASYNC_PQ
default SCSI_OSD_ULD
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 0e2a835..cfc0205 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -58,9 +58,12 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
layout->parity = 1;
break;
case PNFS_OSD_RAID_PQ:
+ layout->parity = 2;
+ break;
case PNFS_OSD_RAID_4:
default:
- ORE_ERR("Only RAID_0/5 for now\n");
+ ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n",
+ layout->raid_algorithm);
return -EINVAL;
}
if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
@@ -112,6 +115,8 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
layout->max_io_length /= stripe_length;
layout->max_io_length *= stripe_length;
}
+ ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length);
+
return 0;
}
EXPORT_SYMBOL(ore_verify_layout);
@@ -561,7 +566,8 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
si->par_dev = (group_width + group_width - parity - RxP) %
group_width + first_dev;
- si->dev = (group_width + C - RxP) % group_width + first_dev;
+ si->dev = (group_width + group_width + C - RxP) %
+ group_width + first_dev;
si->bytes_in_stripe = U;
si->first_stripe_start = M * S + G * T + N * U;
} else {
@@ -651,6 +657,43 @@ out: /* we fail the complete unit on an error eg don't advance
return ret;
}
+static int _add_parity_units(struct ore_io_state *ios,
+ struct ore_striping_info *si,
+ unsigned dev, unsigned first_dev,
+ unsigned mirrors_p1, unsigned devs_in_group,
+ unsigned cur_len)
+{
+ unsigned do_parity;
+ int ret = 0;
+
+ for (do_parity = ios->layout->parity; do_parity; --do_parity) {
+ struct ore_per_dev_state *per_dev;
+
+ per_dev = &ios->per_dev[dev - first_dev];
+ if (!per_dev->length && !per_dev->offset) {
+ /* Only/always the parity unit of the first
+ * stripe will be empty. So this is a chance to
+ * initialize the per_dev info.
+ */
+ per_dev->dev = dev;
+ per_dev->offset = si->obj_offset - si->unit_off;
+ }
+
+ ret = _ore_add_parity_unit(ios, si, per_dev, cur_len,
+ do_parity == 1);
+ if (unlikely(ret))
+ break;
+
+ if (do_parity != 1) {
+ dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
+ si->cur_comp = (si->cur_comp + 1) %
+ ios->layout->group_width;
+ }
+ }
+
+ return ret;
+}
+
static int _prepare_for_striping(struct ore_io_state *ios)
{
struct ore_striping_info *si = &ios->si;
@@ -660,7 +703,6 @@ static int _prepare_for_striping(struct ore_io_state *ios)
unsigned devs_in_group = group_width * mirrors_p1;
unsigned dev = si->dev;
unsigned first_dev = dev - (dev % devs_in_group);
- unsigned dev_order;
unsigned cur_pg = ios->pages_consumed;
u64 length = ios->length;
int ret = 0;
@@ -672,14 +714,13 @@ static int _prepare_for_striping(struct ore_io_state *ios)
BUG_ON(length > si->length);
- dev_order = si->cur_comp;
-
while (length) {
struct ore_per_dev_state *per_dev =
&ios->per_dev[dev - first_dev];
unsigned cur_len, page_off = 0;
- if (!per_dev->length) {
+ if (!per_dev->length && !per_dev->offset) {
+ /* First time initialize the per_dev info. */
per_dev->dev = dev;
if (dev == si->dev) {
WARN_ON(dev == si->par_dev);
@@ -688,13 +729,7 @@ static int _prepare_for_striping(struct ore_io_state *ios)
page_off = si->unit_off & ~PAGE_MASK;
BUG_ON(page_off && (page_off != ios->pgbase));
} else {
- if (si->cur_comp > dev_order)
- per_dev->offset =
- si->obj_offset - si->unit_off;
- else /* si->cur_comp < dev_order */
- per_dev->offset =
- si->obj_offset + stripe_unit -
- si->unit_off;
+ per_dev->offset = si->obj_offset - si->unit_off;
cur_len = stripe_unit;
}
} else {
@@ -721,20 +756,12 @@ static int _prepare_for_striping(struct ore_io_state *ios)
/* If last stripe operate on parity comp */
si->cur_comp = group_width - ios->layout->parity;
}
- per_dev = &ios->per_dev[dev - first_dev];
- if (!per_dev->length) {
- /* Only/always the parity unit of the first
- * stripe will be empty. So this is a chance to
- * initialize the per_dev info.
- */
- per_dev->dev = dev;
- per_dev->offset = si->obj_offset - si->unit_off;
- }
/* In writes cur_len just means if it's the
* last one. See _ore_add_parity_unit.
*/
- ret = _ore_add_parity_unit(ios, si, per_dev,
+ ret = _add_parity_units(ios, si, dev, first_dev,
+ mirrors_p1, devs_in_group,
ios->sp2d ? length : cur_len);
if (unlikely(ret))
goto out;
@@ -746,6 +773,8 @@ static int _prepare_for_striping(struct ore_io_state *ios)
/* Next stripe, start fresh */
si->cur_comp = 0;
si->cur_pg = 0;
+ si->obj_offset += cur_len;
+ si->unit_off = 0;
}
}
out:
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index d58a952..7f20f25 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -218,20 +218,28 @@ static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
{
unsigned p;
+ unsigned tx_flags = ASYNC_TX_ACK;
+
+ if (sp2d->parity == 1)
+ tx_flags |= ASYNC_TX_XOR_ZERO_DST;
+
for (p = 0; p < sp2d->pages_in_unit; p++) {
struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
if (!_1ps->write_count)
continue;
- init_async_submit(&_1ps->submit,
- ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK,
+ init_async_submit(&_1ps->submit, tx_flags,
NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble);
- /* TODO: raid6 */
- _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages,
- 0, sp2d->data_devs, PAGE_SIZE,
- &_1ps->submit);
+ if (sp2d->parity == 1)
+ _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs],
+ _1ps->pages, 0, sp2d->data_devs,
+ PAGE_SIZE, &_1ps->submit);
+ else /* parity == 2 */
+ _1ps->tx = async_gen_syndrome(_1ps->pages, 0,
+ sp2d->data_devs + sp2d->parity,
+ PAGE_SIZE, &_1ps->submit);
}
for (p = 0; p < sp2d->pages_in_unit; p++) {
@@ -616,7 +624,7 @@ static int _read_4_write_execute(struct ore_io_state *ios)
int _ore_add_parity_unit(struct ore_io_state *ios,
struct ore_striping_info *si,
struct ore_per_dev_state *per_dev,
- unsigned cur_len)
+ unsigned cur_len, bool do_xor)
{
if (ios->reading) {
if (per_dev->cur_sg >= ios->sgs_per_dev) {
@@ -641,9 +649,11 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
/* If first stripe, Read in all read4write pages
* (if needed) before we calculate the first parity.
*/
- _read_4_write_first_stripe(ios);
+ if (do_xor)
+ _read_4_write_first_stripe(ios);
}
- if (!cur_len) /* If last stripe r4w pages of last stripe */
+ if (!cur_len && do_xor)
+ /* If last stripe r4w pages of last stripe */
_read_4_write_last_stripe(ios);
_read_4_write_execute(ios);
@@ -655,7 +665,7 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
++(ios->cur_par_page);
}
- BUG_ON(si->cur_comp != sp2d->data_devs);
+ BUG_ON(si->cur_comp < sp2d->data_devs);
BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
@@ -663,9 +673,10 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
if (unlikely(ret))
return ret;
- /* TODO: raid6 if (last_parity_dev) */
- _gen_xor_unit(sp2d);
- _sp2d_reset(sp2d, ios->r4w, ios->private);
+ if (do_xor) {
+ _gen_xor_unit(sp2d);
+ _sp2d_reset(sp2d, ios->r4w, ios->private);
+ }
}
return 0;
}
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
index d365bda..cf6375d 100644
--- a/fs/exofs/ore_raid.h
+++ b/fs/exofs/ore_raid.h
@@ -38,7 +38,8 @@ void _ore_free_raid_stuff(struct ore_io_state *ios);
void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
bool not_last);
int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
- struct ore_per_dev_state *per_dev, unsigned cur_len);
+ struct ore_per_dev_state *per_dev, unsigned cur_len,
+ bool do_xor);
void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
struct ore_striping_info *si, struct page *page);
static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
--
1.9.0
^ permalink raw reply related [flat|nested] 5+ messages in thread