* [PATCH v2 01/12] raid5-cache: move declarations to separate header
2016-12-05 15:31 [PATCH v2 00/12] Partial Parity Log for MD RAID 5 Artur Paszkiewicz
@ 2016-12-05 15:31 ` Artur Paszkiewicz
2016-12-05 15:31 ` [PATCH v2 02/12] raid5-cache: add policy logic Artur Paszkiewicz
` (11 subsequent siblings)
12 siblings, 0 replies; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-05 15:31 UTC (permalink / raw)
To: shli; +Cc: linux-raid
Next patches will be reusing raid5-cache structures and functions, so
put them in their own header.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
drivers/md/raid5-cache.c | 127 +-----------------------------------------
drivers/md/raid5-cache.h | 140 +++++++++++++++++++++++++++++++++++++++++++++++
drivers/md/raid5.c | 1 +
drivers/md/raid5.h | 9 ---
4 files changed, 142 insertions(+), 135 deletions(-)
create mode 100644 drivers/md/raid5-cache.h
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index e786d4e..6a3f8e7 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -23,6 +23,7 @@
#include "md.h"
#include "raid5.h"
#include "bitmap.h"
+#include "raid5-cache.h"
/*
* metadata/data stored in disk with 4k size unit (a block) regardless
@@ -52,16 +53,6 @@
*/
#define R5L_POOL_SIZE 4
-/*
- * r5c journal modes of the array: write-back or write-through.
- * write-through mode has identical behavior as existing log only
- * implementation.
- */
-enum r5c_journal_mode {
- R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
- R5C_JOURNAL_MODE_WRITE_BACK = 1,
-};
-
static char *r5c_journal_mode_str[] = {"write-through",
"write-back"};
/*
@@ -95,122 +86,6 @@ static char *r5c_journal_mode_str[] = {"write-through",
* - return IO for pending writes
*/
-struct r5l_log {
- struct md_rdev *rdev;
-
- u32 uuid_checksum;
-
- sector_t device_size; /* log device size, round to
- * BLOCK_SECTORS */
- sector_t max_free_space; /* reclaim run if free space is at
- * this size */
-
- sector_t last_checkpoint; /* log tail. where recovery scan
- * starts from */
- u64 last_cp_seq; /* log tail sequence */
-
- sector_t log_start; /* log head. where new data appends */
- u64 seq; /* log head sequence */
-
- sector_t next_checkpoint;
-
- struct mutex io_mutex;
- struct r5l_io_unit *current_io; /* current io_unit accepting new data */
-
- spinlock_t io_list_lock;
- struct list_head running_ios; /* io_units which are still running,
- * and have not yet been completely
- * written to the log */
- struct list_head io_end_ios; /* io_units which have been completely
- * written to the log but not yet written
- * to the RAID */
- struct list_head flushing_ios; /* io_units which are waiting for log
- * cache flush */
- struct list_head finished_ios; /* io_units which settle down in log disk */
- struct bio flush_bio;
-
- struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */
-
- struct kmem_cache *io_kc;
- mempool_t *io_pool;
- struct bio_set *bs;
- mempool_t *meta_pool;
-
- struct md_thread *reclaim_thread;
- unsigned long reclaim_target; /* number of space that need to be
- * reclaimed. if it's 0, reclaim spaces
- * used by io_units which are in
- * IO_UNIT_STRIPE_END state (eg, reclaim
- * dones't wait for specific io_unit
- * switching to IO_UNIT_STRIPE_END
- * state) */
- wait_queue_head_t iounit_wait;
-
- struct list_head no_space_stripes; /* pending stripes, log has no space */
- spinlock_t no_space_stripes_lock;
-
- bool need_cache_flush;
-
- /* for r5c_cache */
- enum r5c_journal_mode r5c_journal_mode;
-
- /* all stripes in r5cache, in the order of seq at sh->log_start */
- struct list_head stripe_in_journal_list;
-
- spinlock_t stripe_in_journal_lock;
- atomic_t stripe_in_journal_count;
-
- /* to submit async io_units, to fulfill ordering of flush */
- struct work_struct deferred_io_work;
-};
-
-/*
- * an IO range starts from a meta data block and end at the next meta data
- * block. The io unit's the meta data block tracks data/parity followed it. io
- * unit is written to log disk with normal write, as we always flush log disk
- * first and then start move data to raid disks, there is no requirement to
- * write io unit with FLUSH/FUA
- */
-struct r5l_io_unit {
- struct r5l_log *log;
-
- struct page *meta_page; /* store meta block */
- int meta_offset; /* current offset in meta_page */
-
- struct bio *current_bio;/* current_bio accepting new data */
-
- atomic_t pending_stripe;/* how many stripes not flushed to raid */
- u64 seq; /* seq number of the metablock */
- sector_t log_start; /* where the io_unit starts */
- sector_t log_end; /* where the io_unit ends */
- struct list_head log_sibling; /* log->running_ios */
- struct list_head stripe_list; /* stripes added to the io_unit */
-
- int state;
- bool need_split_bio;
- struct bio *split_bio;
-
- unsigned int has_flush:1; /* include flush request */
- unsigned int has_fua:1; /* include fua request */
- unsigned int has_null_flush:1; /* include empty flush request */
- /*
- * io isn't sent yet, flush/fua request can only be submitted till it's
- * the first IO in running_ios list
- */
- unsigned int io_deferred:1;
-
- struct bio_list flush_barriers; /* size == 0 flush bios */
-};
-
-/* r5l_io_unit state */
-enum r5l_io_unit_state {
- IO_UNIT_RUNNING = 0, /* accepting new IO */
- IO_UNIT_IO_START = 1, /* io_unit bio start writing to log,
- * don't accepting new bio */
- IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */
- IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */
-};
-
bool r5c_is_writeback(struct r5l_log *log)
{
return (log != NULL &&
diff --git a/drivers/md/raid5-cache.h b/drivers/md/raid5-cache.h
new file mode 100644
index 0000000..1908d45
--- /dev/null
+++ b/drivers/md/raid5-cache.h
@@ -0,0 +1,140 @@
+#ifndef _RAID5_CACHE_H
+#define _RAID5_CACHE_H
+
+/*
+ * r5c journal modes of the array: write-back or write-through.
+ * write-through mode has identical behavior as existing log only
+ * implementation.
+ */
+enum r5c_journal_mode {
+ R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
+ R5C_JOURNAL_MODE_WRITE_BACK = 1,
+};
+
+struct r5l_log {
+ struct md_rdev *rdev;
+
+ u32 uuid_checksum;
+
+ sector_t device_size; /* log device size, round to
+ * BLOCK_SECTORS */
+ sector_t max_free_space; /* reclaim run if free space is at
+ * this size */
+
+ sector_t last_checkpoint; /* log tail. where recovery scan
+ * starts from */
+ u64 last_cp_seq; /* log tail sequence */
+
+ sector_t log_start; /* log head. where new data appends */
+ u64 seq; /* log head sequence */
+
+ sector_t next_checkpoint;
+
+ struct mutex io_mutex;
+ struct r5l_io_unit *current_io; /* current io_unit accepting new data */
+
+ spinlock_t io_list_lock;
+ struct list_head running_ios; /* io_units which are still running,
+ * and have not yet been completely
+ * written to the log */
+ struct list_head io_end_ios; /* io_units which have been completely
+ * written to the log but not yet written
+ * to the RAID */
+ struct list_head flushing_ios; /* io_units which are waiting for log
+ * cache flush */
+ struct list_head finished_ios; /* io_units which settle down in log disk */
+ struct bio flush_bio;
+
+ struct list_head no_mem_stripes; /* pending stripes, -ENOMEM */
+
+ struct kmem_cache *io_kc;
+ mempool_t *io_pool;
+ struct bio_set *bs;
+ mempool_t *meta_pool;
+
+ struct md_thread *reclaim_thread;
+ unsigned long reclaim_target; /* number of space that need to be
+ * reclaimed. if it's 0, reclaim spaces
+ * used by io_units which are in
+ * IO_UNIT_STRIPE_END state (eg, reclaim
+ * dones't wait for specific io_unit
+ * switching to IO_UNIT_STRIPE_END
+ * state) */
+ wait_queue_head_t iounit_wait;
+
+ struct list_head no_space_stripes; /* pending stripes, log has no space */
+ spinlock_t no_space_stripes_lock;
+
+ bool need_cache_flush;
+
+ /* for r5c_cache */
+ enum r5c_journal_mode r5c_journal_mode;
+
+ /* all stripes in r5cache, in the order of seq at sh->log_start */
+ struct list_head stripe_in_journal_list;
+
+ spinlock_t stripe_in_journal_lock;
+ atomic_t stripe_in_journal_count;
+
+ /* to submit async io_units, to fulfill ordering of flush */
+ struct work_struct deferred_io_work;
+};
+
+/*
+ * an IO range starts from a meta data block and end at the next meta data
+ * block. The io unit's the meta data block tracks data/parity followed it. io
+ * unit is written to log disk with normal write, as we always flush log disk
+ * first and then start move data to raid disks, there is no requirement to
+ * write io unit with FLUSH/FUA
+ */
+struct r5l_io_unit {
+ struct r5l_log *log;
+
+ struct page *meta_page; /* store meta block */
+ int meta_offset; /* current offset in meta_page */
+
+ struct bio *current_bio;/* current_bio accepting new data */
+
+ atomic_t pending_stripe;/* how many stripes not flushed to raid */
+ u64 seq; /* seq number of the metablock */
+ sector_t log_start; /* where the io_unit starts */
+ sector_t log_end; /* where the io_unit ends */
+ struct list_head log_sibling; /* log->running_ios */
+ struct list_head stripe_list; /* stripes added to the io_unit */
+
+ int state;
+ bool need_split_bio;
+ struct bio *split_bio;
+
+ unsigned int has_flush:1; /* include flush request */
+ unsigned int has_fua:1; /* include fua request */
+ unsigned int has_null_flush:1; /* include empty flush request */
+ /*
+ * io isn't sent yet, flush/fua request can only be submitted till it's
+ * the first IO in running_ios list
+ */
+ unsigned int io_deferred:1;
+
+ struct bio_list flush_barriers; /* size == 0 flush bios */
+};
+
+/* r5l_io_unit state */
+enum r5l_io_unit_state {
+ IO_UNIT_RUNNING = 0, /* accepting new IO */
+ IO_UNIT_IO_START = 1, /* io_unit bio start writing to log,
+ * don't accepting new bio */
+ IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */
+ IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */
+};
+
+extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
+extern void r5l_exit_log(struct r5l_log *log);
+extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh);
+extern void r5l_write_stripe_run(struct r5l_log *log);
+extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
+extern void r5l_stripe_write_finished(struct stripe_head *sh);
+extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
+extern void r5l_quiesce(struct r5l_log *log, int state);
+extern bool r5l_log_disk_error(struct r5conf *conf);
+
+#endif /* _RAID5_CACHE_H */
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6bf3c26..fe8c1a7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -61,6 +61,7 @@
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"
+#include "raid5-cache.h"
#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ed8e136..315d6ea 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -753,15 +753,6 @@ extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
extern struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce);
-extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
-extern void r5l_exit_log(struct r5l_log *log);
-extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
-extern void r5l_write_stripe_run(struct r5l_log *log);
-extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
-extern void r5l_stripe_write_finished(struct stripe_head *sh);
-extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
-extern void r5l_quiesce(struct r5l_log *log, int state);
-extern bool r5l_log_disk_error(struct r5conf *conf);
extern bool r5c_is_writeback(struct r5l_log *log);
extern int
r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
--
2.10.1
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH v2 02/12] raid5-cache: add policy logic
2016-12-05 15:31 [PATCH v2 00/12] Partial Parity Log for MD RAID 5 Artur Paszkiewicz
2016-12-05 15:31 ` [PATCH v2 01/12] raid5-cache: move declarations to separate header Artur Paszkiewicz
@ 2016-12-05 15:31 ` Artur Paszkiewicz
2016-12-05 15:31 ` [PATCH v2 03/12] raid5-cache: add a new policy Artur Paszkiewicz
` (10 subsequent siblings)
12 siblings, 0 replies; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-05 15:31 UTC (permalink / raw)
To: shli; +Cc: linux-raid
Add a struct r5l_policy and wrapper functions for non-static functions
from raid5-cache. This allows adding different policies for raid5
logging without changing the mechanism - calls from the raid5
personality stay the same.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
drivers/md/raid5-cache.c | 112 +++++++++++++++++++++++++++++++++++------------
drivers/md/raid5-cache.h | 13 ++++++
2 files changed, 98 insertions(+), 27 deletions(-)
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 6a3f8e7..74a0eda 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -727,7 +727,7 @@ static inline void r5l_add_no_space_stripe(struct r5l_log *log,
* running in raid5d, where reclaim could wait for raid5d too (when it flushes
* data from log to raid disks), so we shouldn't wait for reclaim here
*/
-int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
+static int __r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
int write_disks = 0;
@@ -737,8 +737,6 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
int ret = 0;
bool wake_reclaim = false;
- if (!log)
- return -EAGAIN;
/* Don't support stripe batch */
if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
test_bit(STRIPE_SYNCING, &sh->state)) {
@@ -825,19 +823,28 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
return 0;
}
-void r5l_write_stripe_run(struct r5l_log *log)
+int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
+{
+ if (log && log->policy->write_stripe)
+ return log->policy->write_stripe(log, sh);
+ return -EAGAIN;
+}
+
+static void __r5l_write_stripe_run(struct r5l_log *log)
{
- if (!log)
- return;
mutex_lock(&log->io_mutex);
r5l_submit_current_io(log);
mutex_unlock(&log->io_mutex);
}
-int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
+void r5l_write_stripe_run(struct r5l_log *log)
+{
+ if (log && log->policy->write_stripe_run)
+ log->policy->write_stripe_run(log);
+}
+
+static int __r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
- if (!log)
- return -ENODEV;
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
/*
@@ -869,6 +876,13 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
return -EAGAIN;
}
+int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
+{
+ if (log && log->policy->handle_flush_request)
+ return log->policy->handle_flush_request(log, bio);
+ return -ENODEV;
+}
+
/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
@@ -989,8 +1003,9 @@ void r5l_stripe_write_finished(struct stripe_head *sh)
io = sh->log_io;
sh->log_io = NULL;
- if (io && atomic_dec_and_test(&io->pending_stripe))
- __r5l_stripe_write_finished(io);
+ if (io && atomic_dec_and_test(&io->pending_stripe) &&
+ io->log->policy->stripe_write_finished)
+ io->log->policy->stripe_write_finished(io);
}
static void r5l_log_flush_endio(struct bio *bio)
@@ -1024,11 +1039,11 @@ static void r5l_log_flush_endio(struct bio *bio)
* only write stripes of an io_unit to raid disks till the io_unit is the first
* one whose data/parity is in log.
*/
-void r5l_flush_stripe_to_raid(struct r5l_log *log)
+static void __r5l_flush_stripe_to_raid(struct r5l_log *log)
{
bool do_flush;
- if (!log || !log->need_cache_flush)
+ if (!log->need_cache_flush)
return;
spin_lock_irq(&log->io_list_lock);
@@ -1050,6 +1065,12 @@ void r5l_flush_stripe_to_raid(struct r5l_log *log)
submit_bio(&log->flush_bio);
}
+void r5l_flush_stripe_to_raid(struct r5l_log *log)
+{
+ if (log && log->policy->flush_stripe_to_raid)
+ log->policy->flush_stripe_to_raid(log);
+}
+
static void r5l_write_super(struct r5l_log *log, sector_t cp);
static void r5l_write_super_and_discard_space(struct r5l_log *log,
sector_t end)
@@ -1304,10 +1325,10 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
md_wakeup_thread(log->reclaim_thread);
}
-void r5l_quiesce(struct r5l_log *log, int state)
+static void __r5l_quiesce(struct r5l_log *log, int state)
{
struct mddev *mddev;
- if (!log || state == 2)
+ if (state == 2)
return;
if (state == 0)
kthread_unpark(log->reclaim_thread->tsk);
@@ -1321,6 +1342,12 @@ void r5l_quiesce(struct r5l_log *log, int state)
}
}
+void r5l_quiesce(struct r5l_log *log, int state)
+{
+ if (log && log->policy->quiesce)
+ log->policy->quiesce(log, state);
+}
+
bool r5l_log_disk_error(struct r5conf *conf)
{
struct r5l_log *log;
@@ -2406,11 +2433,9 @@ static int r5l_load_log(struct r5l_log *log)
return ret;
}
-int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
+static int __r5l_init_log(struct r5l_log *log, struct r5conf *conf)
{
- struct request_queue *q = bdev_get_queue(rdev->bdev);
- struct r5l_log *log;
-
+ struct request_queue *q = bdev_get_queue(log->rdev->bdev);
if (PAGE_SIZE != 4096)
return -EINVAL;
@@ -2429,15 +2454,11 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
return -EINVAL;
}
- log = kzalloc(sizeof(*log), GFP_KERNEL);
- if (!log)
- return -ENOMEM;
- log->rdev = rdev;
log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
- log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
- sizeof(rdev->mddev->uuid));
+ log->uuid_checksum = crc32c_le(~0, log->rdev->mddev->uuid,
+ sizeof(log->rdev->mddev->uuid));
mutex_init(&log->io_mutex);
@@ -2502,16 +2523,53 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
io_pool:
kmem_cache_destroy(log->io_kc);
io_kc:
- kfree(log);
return -EINVAL;
}
-void r5l_exit_log(struct r5l_log *log)
+static void __r5l_exit_log(struct r5l_log *log)
{
md_unregister_thread(&log->reclaim_thread);
mempool_destroy(log->meta_pool);
bioset_free(log->bs);
mempool_destroy(log->io_pool);
kmem_cache_destroy(log->io_kc);
+}
+
+void r5l_exit_log(struct r5l_log *log)
+{
+ if (!log)
+ return;
+
+ if (log->policy->exit_log)
+ log->policy->exit_log(log);
+
kfree(log);
}
+
+struct r5l_policy r5l_journal = {
+ .init_log = __r5l_init_log,
+ .exit_log = __r5l_exit_log,
+ .write_stripe = __r5l_write_stripe,
+ .write_stripe_run = __r5l_write_stripe_run,
+ .flush_stripe_to_raid = __r5l_flush_stripe_to_raid,
+ .stripe_write_finished = __r5l_stripe_write_finished,
+ .handle_flush_request = __r5l_handle_flush_request,
+ .quiesce = __r5l_quiesce,
+};
+
+int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
+{
+ int ret;
+ struct r5l_log *log = kzalloc(sizeof(*log), GFP_KERNEL);
+ if (!log)
+ return -ENOMEM;
+
+ log->rdev = rdev;
+ log->policy = &r5l_journal;
+
+ ret = log->policy->init_log(log, conf);
+ if (ret)
+ kfree(log);
+
+ return ret;
+}
diff --git a/drivers/md/raid5-cache.h b/drivers/md/raid5-cache.h
index 1908d45..52cfef4 100644
--- a/drivers/md/raid5-cache.h
+++ b/drivers/md/raid5-cache.h
@@ -78,6 +78,8 @@ struct r5l_log {
/* to submit async io_units, to fulfill ordering of flush */
struct work_struct deferred_io_work;
+
+ struct r5l_policy *policy;
};
/*
@@ -127,6 +129,17 @@ enum r5l_io_unit_state {
IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */
};
+struct r5l_policy {
+ int (*init_log)(struct r5l_log *log, struct r5conf *conf);
+ void (*exit_log)(struct r5l_log *log);
+ int (*write_stripe)(struct r5l_log *log, struct stripe_head *sh);
+ void (*write_stripe_run)(struct r5l_log *log);
+ void (*flush_stripe_to_raid)(struct r5l_log *log);
+ void (*stripe_write_finished)(struct r5l_io_unit *io);
+ int (*handle_flush_request)(struct r5l_log *log, struct bio *bio);
+ void (*quiesce)(struct r5l_log *log, int state);
+};
+
extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
extern void r5l_exit_log(struct r5l_log *log);
extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh);
--
2.10.1
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH v2 03/12] raid5-cache: add a new policy
2016-12-05 15:31 [PATCH v2 00/12] Partial Parity Log for MD RAID 5 Artur Paszkiewicz
2016-12-05 15:31 ` [PATCH v2 01/12] raid5-cache: move declarations to separate header Artur Paszkiewicz
2016-12-05 15:31 ` [PATCH v2 02/12] raid5-cache: add policy logic Artur Paszkiewicz
@ 2016-12-05 15:31 ` Artur Paszkiewicz
2016-12-07 0:46 ` NeilBrown
2016-12-05 15:31 ` [PATCH v2 04/12] md: superblock changes for PPL Artur Paszkiewicz
` (9 subsequent siblings)
12 siblings, 1 reply; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-05 15:31 UTC (permalink / raw)
To: shli; +Cc: linux-raid
Add a source file for the new policy implementation and allow selecting
the policy based on the policy_type parameter in r5l_init_log().
Introduce a new flag for rdev state flags to allow enabling the new
policy from userspace.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
drivers/md/Makefile | 2 +-
drivers/md/md.c | 5 +++++
drivers/md/md.h | 3 +++
drivers/md/raid5-cache.c | 17 +++++++++++++++--
drivers/md/raid5-cache.h | 9 ++++++++-
drivers/md/raid5-ppl.c | 20 ++++++++++++++++++++
drivers/md/raid5.c | 42 ++++++++++++++++++++++++++++++++++++------
7 files changed, 88 insertions(+), 10 deletions(-)
create mode 100644 drivers/md/raid5-ppl.c
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 3cbda1a..4d48714 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -18,7 +18,7 @@ dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o
dm-verity-y += dm-verity-target.o
md-mod-y += md.o bitmap.o
-raid456-y += raid5.o raid5-cache.o
+raid456-y += raid5.o raid5-cache.o raid5-ppl.o
# Note: link order is important. All raid personalities
# and must come before md.o, as they each initialise
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c7894fb..4876687 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2575,6 +2575,8 @@ state_show(struct md_rdev *rdev, char *page)
len += sprintf(page+len, "journal%s", sep);
if (test_bit(WriteMostly, &flags))
len += sprintf(page+len, "write_mostly%s", sep);
+ if (test_bit(JournalPpl, &flags))
+ len += sprintf(page+len, "journal_ppl%s", sep);
if (test_bit(Blocked, &flags) ||
(rdev->badblocks.unacked_exist
&& !test_bit(Faulty, &flags)))
@@ -2753,6 +2755,9 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
} else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
clear_bit(ExternalBbl, &rdev->flags);
err = 0;
+ } else if (cmd_match(buf, "journal_ppl")) {
+ set_bit(JournalPpl, &rdev->flags);
+ err = 0;
}
if (!err)
sysfs_notify_dirent_safe(rdev->sysfs_state);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 5c08f84..2fc75ac 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -172,6 +172,9 @@ enum flag_bits {
* Usually, this device should be faster
* than other devices in the array
*/
+ JournalPpl, /* This device is used for raid5
+ * Partial Parity Log.
+ */
ClusterRemove,
RemoveSynchronized, /* synchronize_rcu() was called after
* this device was known to be faulty,
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 74a0eda..fa82b9a 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2556,16 +2556,29 @@ struct r5l_policy r5l_journal = {
.handle_flush_request = __r5l_handle_flush_request,
.quiesce = __r5l_quiesce,
};
+extern struct r5l_policy r5l_ppl;
-int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
+int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev, int policy_type)
{
int ret;
struct r5l_log *log = kzalloc(sizeof(*log), GFP_KERNEL);
if (!log)
return -ENOMEM;
+ switch (policy_type) {
+ case RWH_POLICY_JOURNAL:
+ log->policy = &r5l_journal;
+ break;
+ case RWH_POLICY_PPL:
+ log->policy = &r5l_ppl;
+ break;
+ default:
+ kfree(log);
+ return -EINVAL;
+ }
+
log->rdev = rdev;
- log->policy = &r5l_journal;
+ log->rwh_policy = policy_type;
ret = log->policy->init_log(log, conf);
if (ret)
diff --git a/drivers/md/raid5-cache.h b/drivers/md/raid5-cache.h
index 52cfef4..4ba11d3 100644
--- a/drivers/md/raid5-cache.h
+++ b/drivers/md/raid5-cache.h
@@ -80,6 +80,13 @@ struct r5l_log {
struct work_struct deferred_io_work;
struct r5l_policy *policy;
+ enum {
+ RWH_POLICY_OFF,
+ RWH_POLICY_JOURNAL,
+ RWH_POLICY_PPL,
+ } rwh_policy;
+
+ void *private;
};
/*
@@ -140,7 +147,7 @@ struct r5l_policy {
void (*quiesce)(struct r5l_log *log, int state);
};
-extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
+extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev, int policy_type);
extern void r5l_exit_log(struct r5l_log *log);
extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh);
extern void r5l_write_stripe_run(struct r5l_log *log);
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
new file mode 100644
index 0000000..263fad7
--- /dev/null
+++ b/drivers/md/raid5-ppl.c
@@ -0,0 +1,20 @@
+/*
+ * Partial Parity Log for closing the RAID5 write hole
+ * Copyright (c) 2016, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include "raid5.h"
+#include "raid5-cache.h"
+
+struct r5l_policy r5l_ppl;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index fe8c1a7..9f07769 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6833,8 +6833,10 @@ static int raid5_run(struct mddev *mddev)
struct r5conf *conf;
int working_disks = 0;
int dirty_parity_disks = 0;
+ int ppl_disks = 0;
struct md_rdev *rdev;
struct md_rdev *journal_dev = NULL;
+ int rwh_policy = RWH_POLICY_OFF;
sector_t reshape_offset = 0;
int i;
long long min_offset_diff = 0;
@@ -6847,6 +6849,10 @@ static int raid5_run(struct mddev *mddev)
rdev_for_each(rdev, mddev) {
long long diff;
+ if (test_bit(JournalPpl, &rdev->flags) &&
+ test_bit(In_sync, &rdev->flags))
+ ppl_disks++;
+
if (test_bit(Journal, &rdev->flags)) {
journal_dev = rdev;
continue;
@@ -7037,6 +7043,22 @@ static int raid5_run(struct mddev *mddev)
goto abort;
}
+ if (ppl_disks) {
+ if (ppl_disks != working_disks) {
+ pr_err("md/raid:%s: distributed PPL must be enabled on all member devices - aborting\n",
+ mdname(mddev));
+ goto abort;
+ }
+ rwh_policy = RWH_POLICY_PPL;
+ }
+
+ if (journal_dev) {
+ if (ppl_disks)
+ pr_warn("md/raid:%s: using journal device and PPL not allowed - ignoring PPL\n",
+ mdname(mddev));
+ rwh_policy = RWH_POLICY_JOURNAL;
+ }
+
/* device size must be a multiple of chunk size */
mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
mddev->resync_max_sectors = mddev->dev_sectors;
@@ -7171,12 +7193,17 @@ static int raid5_run(struct mddev *mddev)
blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
}
- if (journal_dev) {
- char b[BDEVNAME_SIZE];
+ if (rwh_policy) {
+ if (journal_dev) {
+ char b[BDEVNAME_SIZE];
- pr_debug("md/raid:%s: using device %s as journal\n",
- mdname(mddev), bdevname(journal_dev->bdev, b));
- if (r5l_init_log(conf, journal_dev))
+ pr_debug("md/raid:%s: using device %s as journal\n",
+ mdname(mddev), bdevname(journal_dev->bdev, b));
+ } else if (rwh_policy == RWH_POLICY_PPL) {
+ pr_debug("md/raid:%s: enabling distributed PPL journal\n",
+ mdname(mddev));
+ }
+ if (r5l_init_log(conf, journal_dev, rwh_policy))
goto abort;
}
@@ -7372,6 +7399,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (test_bit(Journal, &rdev->flags)) {
char b[BDEVNAME_SIZE];
+ int ret;
if (conf->log)
return -EBUSY;
@@ -7380,7 +7408,9 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
* The array is in readonly mode if journal is missing, so no
* write requests running. We should be safe
*/
- r5l_init_log(conf, rdev);
+ ret = r5l_init_log(conf, rdev, RWH_POLICY_JOURNAL);
+ if (ret)
+ return ret;
pr_debug("md/raid:%s: using device %s as journal\n",
mdname(mddev), bdevname(rdev->bdev, b));
return 0;
--
2.10.1
^ permalink raw reply related [flat|nested] 38+ messages in thread
* Re: [PATCH v2 03/12] raid5-cache: add a new policy
2016-12-05 15:31 ` [PATCH v2 03/12] raid5-cache: add a new policy Artur Paszkiewicz
@ 2016-12-07 0:46 ` NeilBrown
2016-12-07 14:36 ` Artur Paszkiewicz
0 siblings, 1 reply; 38+ messages in thread
From: NeilBrown @ 2016-12-07 0:46 UTC (permalink / raw)
To: Artur Paszkiewicz, shli; +Cc: linux-raid
[-- Attachment #1: Type: text/plain, Size: 1279 bytes --]
On Tue, Dec 06 2016, Artur Paszkiewicz wrote:
> Add a source file for the new policy implementation and allow selecting
> the policy based on the policy_type parameter in r5l_init_log().
>
> Introduce a new flag for rdev state flags to allow enabling the new
> policy from userspace.
This seems odd. Why is this a per-device flag?
It makes sense for "journal" to be a per-device flag, because only one
device is the journal device and it is obviously different from the
others.
But with the ppl, all devices serve as journal devices. So we would
need to set journal_ppl on all devices? What happens if you set it on
some, but not others? I see you get an error.
I think some sort of array-wide setting would make more sense, would it
not?
And what is an RWH??? A Really Weird Handle ??
I guess it is probably a Raid5 Write Hole ? At the very least there
should be a comment explaining this when you define the enum. (remember,
you are trying to make it easier for reviewers).
It might almost make sense for bitmap/metadata to be used here.
It can currently be "external" "internal" "clustered".
Allow also "journalled" or "partial-parity-log" ???
Maybe not ... but I'd definitely prefer a global setting, and one that
didn't use an obscure abbreviation.
NeilBrown
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH v2 03/12] raid5-cache: add a new policy
2016-12-07 0:46 ` NeilBrown
@ 2016-12-07 14:36 ` Artur Paszkiewicz
2016-12-07 23:24 ` NeilBrown
0 siblings, 1 reply; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-07 14:36 UTC (permalink / raw)
To: NeilBrown, shli; +Cc: linux-raid
On 12/07/2016 01:46 AM, NeilBrown wrote:
> On Tue, Dec 06 2016, Artur Paszkiewicz wrote:
>
>> Add a source file for the new policy implementation and allow selecting
>> the policy based on the policy_type parameter in r5l_init_log().
>>
>> Introduce a new flag for rdev state flags to allow enabling the new
>> policy from userspace.
>
> This seems odd. Why is this a per-device flag?
> It makes sense for "journal" to be a per-device flag, because only one
> device is the journal device and it is obviously different from the
> others.
>
> But with the ppl, all devices serve as journal devices. So we would
> need to set journal_ppl on all devices? What happens if you set it on
> some, but not others? I see you get an error.
>
> I think some sort of array-wide setting would make more sense, would it
> not?
Yes, it would. The problem exists only for external metadata, because
for native there is a feature flag in the superblock and a corresponding
flag in mddev->flags. Patch 12 adds a sysfs attribute to control the
policy at runtime but it would have to be moved out of raid5 personality
into the main md code. I didn't like the idea of adding something
specific to raid5 to generic code and visible in sysfs for unrelated
raid levels.
> And what is an RWH??? A Really Weird Handle ??
>
> I guess it is probably a Raid5 Write Hole ? At the very least there
> should be a comment explaining this when you define the enum. (remember,
> you are trying to make it easier for reviewers).
That's right, RWH stands for RAID Write Hole. I think I introduced it in
the cover letter, but I'll explain it also in the code.
> It might almost make sense for bitmap/metadata to be used here.
> It can currently be "external" "internal" "clustered".
> Allow also "journalled" or "partial-parity-log" ???
>
> Maybe not ... but I'd definitely prefer a global setting, and one that
> didn't use an obscure abbreviation.
So do you think the sysfs attribute from patch 12 ("rwh_policy") could
be made global? This would simplify things but it doesn't seem right.
And about the abbreviation, should it be called "write_hole_policy" or
"raid5_write_hole_policy"? Maybe using bitmap/metadata is not a bad
idea...
Artur
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH v2 03/12] raid5-cache: add a new policy
2016-12-07 14:36 ` Artur Paszkiewicz
@ 2016-12-07 23:24 ` NeilBrown
2016-12-08 10:28 ` Artur Paszkiewicz
0 siblings, 1 reply; 38+ messages in thread
From: NeilBrown @ 2016-12-07 23:24 UTC (permalink / raw)
To: Artur Paszkiewicz, shli; +Cc: linux-raid
[-- Attachment #1: Type: text/plain, Size: 3087 bytes --]
On Thu, Dec 08 2016, Artur Paszkiewicz wrote:
> On 12/07/2016 01:46 AM, NeilBrown wrote:
>> On Tue, Dec 06 2016, Artur Paszkiewicz wrote:
>>
>>> Add a source file for the new policy implementation and allow selecting
>>> the policy based on the policy_type parameter in r5l_init_log().
>>>
>>> Introduce a new flag for rdev state flags to allow enabling the new
>>> policy from userspace.
>>
>> This seems odd. Why is this a per-device flag?
>> It makes sense for "journal" to be a per-device flag, because only one
>> device is the journal device and it is obviously different from the
>> others.
>>
>> But with the ppl, all devices serve as journal devices. So we would
>> need to set journal_ppl on all devices? What happens if you set it on
>> some, but not others? I see you get an error.
>>
>> I think some sort of array-wide setting would make more sense, would it
>> not?
>
> Yes, it would. The problem exists only for external metadata, because
> for native there is a feature flag in the superblock and a corresponding
> flag in mddev->flags. Patch 12 adds a sysfs attribute to control the
> policy at runtime but it would have to be moved out of raid5 personality
> into the main md code. I didn't like the idea of adding something
> specific to raid5 to generic code and visible in sysfs for unrelated
> raid levels.
>
>> And what is an RWH??? A Really Weird Handle ??
>>
>> I guess it is probably a Raid5 Write Hole ? At the very least there
>> should be a comment explaining this when you define the enum. (remember,
>> you are trying to make it easier for reviewers).
>
> That's right, RWH stands for RAID Write Hole. I think I introduced it in
> the cover letter, but I'll explain it also in the code.
>
>> It might almost make sense for bitmap/metadata to be used here.
>> It can currently be "external" "internal" "clustered".
` Allow also "journalled" or "partial-parity-log" ???
>>
>> Maybe not ... but I'd definitely prefer a global setting, and one that
>> didn't use an obscure abbreviation.
>
> So do you think the sysfs attribute from patch 12 ("rwh_policy") could
> be made global? This would simplify things but it doesn't seem right.
> And about the abbreviation, should it be called "write_hole_policy" or
> "raid5_write_hole_policy"? Maybe using bitmap/metadata is not a bad
> idea...
How about we call it "resync_policy" which describes how to cope with
unexpected shutdown. Options include:
full regenerate all redundancy info after a crash
bitmap only regenerate redundancy info indicated by bitmap
(both these susceptible to write-hole on raid456)
(both these suseptible to write-hole on raid456)
journal raid456 only, though could theoretically be extended
to raid1, raid10 : log transactions and replay after crash
ppl raid456 only: log partial-parity before writes.
With external metadata, this must be set explicitly. With internal
metadata, it is best set based on flags etc.
Thoughts? I'm not really happy with "full", but I cannot think of a
better name.
Thanks,
NeilBrown
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH v2 03/12] raid5-cache: add a new policy
2016-12-07 23:24 ` NeilBrown
@ 2016-12-08 10:28 ` Artur Paszkiewicz
2016-12-08 21:22 ` NeilBrown
0 siblings, 1 reply; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-08 10:28 UTC (permalink / raw)
To: NeilBrown, shli; +Cc: linux-raid
On 12/08/2016 12:24 AM, NeilBrown wrote:
> On Thu, Dec 08 2016, Artur Paszkiewicz wrote:
>
>> On 12/07/2016 01:46 AM, NeilBrown wrote:
>>> On Tue, Dec 06 2016, Artur Paszkiewicz wrote:
>>>
>>>> Add a source file for the new policy implementation and allow selecting
>>>> the policy based on the policy_type parameter in r5l_init_log().
>>>>
>>>> Introduce a new flag for rdev state flags to allow enabling the new
>>>> policy from userspace.
>>>
>>> This seems odd. Why is this a per-device flag?
>>> It makes sense for "journal" to be a per-device flag, because only one
>>> device is the journal device and it is obviously different from the
>>> others.
>>>
>>> But with the ppl, all devices serve as journal devices. So we would
>>> need to set journal_ppl on all devices? What happens if you set it on
>>> some, but not others? I see you get an error.
>>>
>>> I think some sort of array-wide setting would make more sense, would it
>>> not?
>>
>> Yes, it would. The problem exists only for external metadata, because
>> for native there is a feature flag in the superblock and a corresponding
>> flag in mddev->flags. Patch 12 adds a sysfs attribute to control the
>> policy at runtime but it would have to be moved out of raid5 personality
>> into the main md code. I didn't like the idea of adding something
>> specific to raid5 to generic code and visible in sysfs for unrelated
>> raid levels.
>>
>>> And what is an RWH??? A Really Weird Handle ??
>>>
>>> I guess it is probably a Raid5 Write Hole ? At the very least there
>>> should be a comment explaining this when you define the enum. (remember,
>>> you are trying to make it easier for reviewers).
>>
>> That's right, RWH stands for RAID Write Hole. I think I introduced it in
>> the cover letter, but I'll explain it also in the code.
>>
>>> It might almost make sense for bitmap/metadata to be used here.
>>> It can currently be "external" "internal" "clustered".
>>> Allow also "journalled" or "partial-parity-log" ???
>>>
>>> Maybe not ... but I'd definitely prefer a global setting, and one that
>>> didn't use an obscure abbreviation.
>>
>> So do you think the sysfs attribute from patch 12 ("rwh_policy") could
>> be made global? This would simplify things but it doesn't seem right.
>> And about the abbreviation, should it be called "write_hole_policy" or
>> "raid5_write_hole_policy"? Maybe using bitmap/metadata is not a bad
>> idea...
>
> How about we call it "resync_policy" which describes how to cope with
> unexpected shutdown. Options include:
>
> full regenerate all redundancy info after a crash
> bitmap only regenerate redundancy info indicated by bitmap
> (both these susceptible to write-hole on raid456)
> journal raid456 only, though could theoretically be extended
> to raid1, raid10 : log transactions and replay after crash
> ppl raid456 only: log partial-parity before writes.
>
> With external metadata, this must be set explicitly. With internal
> metadata, it is best set based on flags etc.
>
> Thoughts? I'm not really happy with "full", but I cannot think of a
> better name.
I'm OK with this approach. A corresponding option will also be needed in
mdadm. But won't this name be a little misleading because this option is
not just about unexpected shutdown (where resync applies), but also
degraded array unexpected shutdown. So maybe something like
"consistency_policy" and "resync" option instead of "full"?
Thanks,
Artur
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH v2 03/12] raid5-cache: add a new policy
2016-12-08 10:28 ` Artur Paszkiewicz
@ 2016-12-08 21:22 ` NeilBrown
0 siblings, 0 replies; 38+ messages in thread
From: NeilBrown @ 2016-12-08 21:22 UTC (permalink / raw)
To: Artur Paszkiewicz, shli; +Cc: linux-raid
[-- Attachment #1: Type: text/plain, Size: 1209 bytes --]
On Thu, Dec 08 2016, Artur Paszkiewicz wrote:
>>
>> How about we call it "resync_policy" which describes how to cope with
>> unexpected shutdown. Options include:
>>
>> full regenerate all redundancy info after a crash
>> bitmap only regenerate redundancy info indicated by bitmap
>> (both these susceptible to write-hole on raid456)
>> journal raid456 only, though could theoretically be extended
>> to raid1, raid10 : log transactions and replay after crash
>> ppl raid456 only: log partial-parity before writes.
>>
>> With external metadata, this must be set explicitly. With internal
>> metadata, it is best set based on flags etc.
>>
>> Thoughts? I'm not really happy with "full", but I cannot think of a
>> better name.
>
> I'm OK with this approach. A corresponding option will also be needed in
> mdadm. But won't this name be a little misleading because this option is
> not just about unexpected shutdown (where resync applies), but also
> degraded array unexpected shutdown. So maybe something like
> "consistency_policy" and "resync" option instead of "full"?
"consistency_policy" and "resync" sounds good to me. Thanks.
NeilBrown
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]
^ permalink raw reply [flat|nested] 38+ messages in thread
* [PATCH v2 04/12] md: superblock changes for PPL
2016-12-05 15:31 [PATCH v2 00/12] Partial Parity Log for MD RAID 5 Artur Paszkiewicz
` (2 preceding siblings ...)
2016-12-05 15:31 ` [PATCH v2 03/12] raid5-cache: add a new policy Artur Paszkiewicz
@ 2016-12-05 15:31 ` Artur Paszkiewicz
2016-12-05 15:31 ` [PATCH v2 05/12] raid5-ppl: Partial Parity Log implementation Artur Paszkiewicz
` (8 subsequent siblings)
12 siblings, 0 replies; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-05 15:31 UTC (permalink / raw)
To: shli; +Cc: linux-raid
Include information about PPL location and size into mdp_superblock_1
and copy it to/from rdev. Because PPL is mutually exclusive with bitmap,
put it in place of 'bitmap_offset'. Add a new flag MD_FEATURE_PPL for
'feature_map', analogously to MD_FEATURE_BITMAP_OFFSET. Add MD_HAS_PPL
'feature_map', analogically to MD_FEATURE_BITMAP_OFFSET. Add MD_HAS_PPL
to mddev->flags to indicate that PPL is enabled on an array.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
drivers/md/md.c | 14 ++++++++++++++
drivers/md/md.h | 8 ++++++++
drivers/md/raid5.c | 7 +++++++
include/uapi/linux/raid/md_p.h | 18 ++++++++++++++----
4 files changed, 43 insertions(+), 4 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4876687..7028d54 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1524,6 +1524,11 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
} else if (sb->bblog_offset != 0)
rdev->badblocks.shift = 0;
+ if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) {
+ rdev->ppl.offset = le16_to_cpu(sb->ppl.offset);
+ rdev->ppl.size = le16_to_cpu(sb->ppl.size);
+ }
+
if (!refdev) {
ret = 1;
} else {
@@ -1636,6 +1641,9 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
set_bit(MD_HAS_JOURNAL, &mddev->flags);
+
+ if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL)
+ set_bit(MD_HAS_PPL, &mddev->flags);
} else if (mddev->pers == NULL) {
/* Insist of good event counter while assembling, except for
* spares (which don't need an event count) */
@@ -1849,6 +1857,12 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
+ if (test_bit(MD_HAS_PPL, &mddev->flags)) {
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
+ sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
+ sb->ppl.size = cpu_to_le16(rdev->ppl.size);
+ }
+
rdev_for_each(rdev2, mddev) {
i = rdev2->desc_nr;
if (test_bit(Faulty, &rdev2->flags))
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 2fc75ac..d1e56f8 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -122,6 +122,13 @@ struct md_rdev {
* sysfs entry */
struct badblocks badblocks;
+
+ struct {
+ unsigned int offset; /* Offset from superblock to start of PPL.
+ * Not used by external metadata. */
+ unsigned int size; /* Size in sectors of the PPL space */
+ sector_t sector; /* First sector of the PPL space */
+ } ppl;
};
enum flag_bits {
Faulty, /* device is known to have a fault */
@@ -235,6 +242,7 @@ enum mddev_flags {
* never cause the array to become failed.
*/
MD_NEED_REWRITE, /* metadata write needs to be repeated */
+ MD_HAS_PPL, /* The raid array has PPL feature set */
};
#define MD_UPDATE_SB_FLAGS (BIT(MD_CHANGE_DEVS) | \
BIT(MD_CHANGE_CLEAN) | \
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9f07769..a7e993a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6849,6 +6849,10 @@ static int raid5_run(struct mddev *mddev)
rdev_for_each(rdev, mddev) {
long long diff;
+ if (test_bit(MD_HAS_PPL, &mddev->flags) &&
+ test_bit(In_sync, &rdev->flags))
+ set_bit(JournalPpl, &rdev->flags);
+
if (test_bit(JournalPpl, &rdev->flags) &&
test_bit(In_sync, &rdev->flags))
ppl_disks++;
@@ -6871,6 +6875,9 @@ static int raid5_run(struct mddev *mddev)
min_offset_diff = diff;
}
+ if (ppl_disks)
+ set_bit(MD_HAS_PPL, &mddev->flags);
+
if (mddev->reshape_position != MaxSector) {
/* Check that we can continue the reshape.
* Difficulties arise if the stripe we would write to
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index 9930f3e..455caa8 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -242,10 +242,18 @@ struct mdp_superblock_1 {
__le32 chunksize; /* in 512byte sectors */
__le32 raid_disks;
- __le32 bitmap_offset; /* sectors after start of superblock that bitmap starts
- * NOTE: signed, so bitmap can be before superblock
- * only meaningful of feature_map[0] is set.
- */
+ union {
+ __le32 bitmap_offset; /* sectors after start of superblock that bitmap starts
+ * NOTE: signed, so bitmap can be before superblock
+ * only meaningful of feature_map[0] is set.
+ */
+
+ /* only meaningful when feature_map[MD_FEATURE_PPL] is set */
+ struct {
+ __le16 offset; /* sectors after start of superblock that ppl starts */
+ __le16 size; /* PPL size (including header) in sectors */
+ } ppl;
+ };
/* These are only valid with feature bit '4' */
__le32 new_level; /* new level we are reshaping to */
@@ -318,6 +326,7 @@ struct mdp_superblock_1 {
*/
#define MD_FEATURE_CLUSTERED 256 /* clustered MD */
#define MD_FEATURE_JOURNAL 512 /* support write cache */
+#define MD_FEATURE_PPL 1024 /* support PPL */
#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
|MD_FEATURE_RECOVERY_OFFSET \
|MD_FEATURE_RESHAPE_ACTIVE \
@@ -328,6 +337,7 @@ struct mdp_superblock_1 {
|MD_FEATURE_RECOVERY_BITMAP \
|MD_FEATURE_CLUSTERED \
|MD_FEATURE_JOURNAL \
+ |MD_FEATURE_PPL \
)
struct r5l_payload_header {
--
2.10.1
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH v2 05/12] raid5-ppl: Partial Parity Log implementation
2016-12-05 15:31 [PATCH v2 00/12] Partial Parity Log for MD RAID 5 Artur Paszkiewicz
` (3 preceding siblings ...)
2016-12-05 15:31 ` [PATCH v2 04/12] md: superblock changes for PPL Artur Paszkiewicz
@ 2016-12-05 15:31 ` Artur Paszkiewicz
2016-12-06 1:06 ` kbuild test robot
2016-12-07 1:17 ` NeilBrown
2016-12-05 15:31 ` [PATCH v2 06/12] raid5-ppl: calculate partial parity Artur Paszkiewicz
` (7 subsequent siblings)
12 siblings, 2 replies; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-05 15:31 UTC (permalink / raw)
To: shli; +Cc: linux-raid
This implements the write logging functionality, using the policy logic
introduced in previous patches.
PPL is a distributed log - data is stored on all RAID member drives in
the metadata area. PPL is written to the parity disk of a particular
stripe. Distributed log is implemented by using one r5l_log instance per
each array member. They are grouped in child_logs array in struct
ppl_conf, which is assigned to a common parent log. This parent log
serves as a proxy and is used in raid5 personality code - it is assigned
as _the_ log in r5conf->log. The child logs are where all the real work
is done.
The PPL consists of a 4KB header (struct ppl_header), and at least 128KB
for partial parity data. It is stored right after the array data (for
IMSM) or in the bitmap area (super 1.1 and 1.2) and can be overwritten
even at each array write request.
Attach a page for holding the partial parity data to each stripe_head.
Allocate it only if mddev has the MD_HAS_PPL flag set.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
drivers/md/raid5-cache.c | 12 +-
drivers/md/raid5-cache.h | 6 +
drivers/md/raid5-ppl.c | 594 ++++++++++++++++++++++++++++++++++++++++++++++-
drivers/md/raid5.c | 15 +-
drivers/md/raid5.h | 1 +
5 files changed, 620 insertions(+), 8 deletions(-)
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index fa82b9a..be534d8 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -119,8 +119,8 @@ static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
return log->device_size > used_size + size;
}
-static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
- enum r5l_io_unit_state state)
+void __r5l_set_io_unit_state(struct r5l_io_unit *io,
+ enum r5l_io_unit_state state)
{
if (WARN_ON(io->state >= state))
return;
@@ -340,7 +340,7 @@ static void r5c_finish_cache_stripe(struct stripe_head *sh)
}
}
-static void r5l_io_run_stripes(struct r5l_io_unit *io)
+void r5l_io_run_stripes(struct r5l_io_unit *io)
{
struct stripe_head *sh, *next;
@@ -935,7 +935,7 @@ static sector_t r5l_reclaimable_space(struct r5l_log *log)
r5c_calculate_new_cp(conf));
}
-static void r5l_run_no_mem_stripe(struct r5l_log *log)
+void r5l_run_no_mem_stripe(struct r5l_log *log)
{
struct stripe_head *sh;
@@ -1039,7 +1039,7 @@ static void r5l_log_flush_endio(struct bio *bio)
* only write stripes of an io_unit to raid disks till the io_unit is the first
* one whose data/parity is in log.
*/
-static void __r5l_flush_stripe_to_raid(struct r5l_log *log)
+void __r5l_flush_stripe_to_raid(struct r5l_log *log)
{
bool do_flush;
@@ -1359,7 +1359,7 @@ bool r5l_log_disk_error(struct r5conf *conf)
if (!log)
ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
else
- ret = test_bit(Faulty, &log->rdev->flags);
+ ret = log->rdev && test_bit(Faulty, &log->rdev->flags);
rcu_read_unlock();
return ret;
}
diff --git a/drivers/md/raid5-cache.h b/drivers/md/raid5-cache.h
index 4ba11d3..0446100 100644
--- a/drivers/md/raid5-cache.h
+++ b/drivers/md/raid5-cache.h
@@ -157,4 +157,10 @@ extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
extern void r5l_quiesce(struct r5l_log *log, int state);
extern bool r5l_log_disk_error(struct r5conf *conf);
+extern void __r5l_set_io_unit_state(struct r5l_io_unit *io,
+ enum r5l_io_unit_state state);
+extern void r5l_io_run_stripes(struct r5l_io_unit *io);
+extern void r5l_run_no_mem_stripe(struct r5l_log *log);
+extern void __r5l_flush_stripe_to_raid(struct r5l_log *log);
+
#endif /* _RAID5_CACHE_H */
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 263fad7..2d4c90f 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -14,7 +14,599 @@
#include <linux/kernel.h>
#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/crc32c.h>
+#include <linux/module.h>
+#include <linux/raid/md_p.h>
+#include "md.h"
#include "raid5.h"
#include "raid5-cache.h"
-struct r5l_policy r5l_ppl;
+static bool ppl_debug;
+module_param(ppl_debug, bool, 0644);
+MODULE_PARM_DESC(ppl_debug, "Debug mode for md raid5 PPL");
+
+#define dbg(format, args...) \
+do { \
+ if (ppl_debug) \
+ printk(KERN_DEBUG"[%d] %s() "format, \
+ current->pid, __func__, ##args); \
+} while (0)
+
+struct ppl_conf {
+ int count;
+ struct r5l_log **child_logs;
+};
+
+struct ppl_header_entry {
+ __le64 data_sector; /* Raid sector of the new data */
+ __le32 pp_size; /* Length of partial parity */
+ __le32 data_size; /* Length of data */
+ __u8 parity_disk; /* Member disk containing parity */
+ __le32 checksum; /* Checksum of this entry */
+} __packed;
+
+#define PPL_HEADER_SIZE PAGE_SIZE
+#define PPL_HDR_RESERVED 512
+#define PPL_HDR_ENTRY_SPACE \
+ (PPL_HEADER_SIZE - PPL_HDR_RESERVED - 3 * sizeof(u32) - sizeof(u64))
+#define PPL_HDR_MAX_ENTRIES \
+ (PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry))
+#define PPL_ENTRY_SPACE_IMSM (128 * 1024)
+
+struct ppl_header {
+ __u8 reserved[PPL_HDR_RESERVED];/* Reserved space */
+ __le32 signature; /* Signature (family number of volume) */
+ __le64 generation; /* Generation number of PP Header */
+ __le32 entries_count; /* Number of entries in entry array */
+ __le32 checksum; /* Checksum of PP Header */
+ struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES];
+} __packed;
+
+static void ppl_log_endio(struct bio *bio)
+{
+ struct r5l_io_unit *io = bio->bi_private;
+ struct r5l_log *log = io->log;
+ unsigned long flags;
+
+ dbg("io %p seq: %llu\n", io, io->seq);
+
+ if (bio->bi_error)
+ md_error(log->rdev->mddev, log->rdev);
+
+ bio_put(bio);
+ mempool_free(io->meta_page, log->meta_pool);
+
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ __r5l_set_io_unit_state(io, IO_UNIT_IO_END);
+ if (log->need_cache_flush) {
+ list_move_tail(&io->log_sibling, &log->io_end_ios);
+ } else {
+ list_move_tail(&io->log_sibling, &log->finished_ios);
+ r5l_io_run_stripes(io);
+ }
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+
+ if (log->need_cache_flush)
+ md_wakeup_thread(log->rdev->mddev->thread);
+}
+
+static struct r5l_io_unit *ppl_new_iounit(struct r5l_log *log,
+ struct stripe_head *sh)
+{
+ struct r5l_io_unit *io;
+ struct ppl_header *pplhdr;
+ struct r5conf *conf = log->rdev->mddev->private;
+ struct r5l_log *parent_log = conf->log;
+
+ io = mempool_alloc(log->io_pool, GFP_ATOMIC);
+ if (!io)
+ return NULL;
+
+ memset(io, 0, sizeof(*io));
+ io->log = log;
+ INIT_LIST_HEAD(&io->log_sibling);
+ INIT_LIST_HEAD(&io->stripe_list);
+ io->state = IO_UNIT_RUNNING;
+
+ io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
+ pplhdr = page_address(io->meta_page);
+ clear_page(pplhdr);
+ memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
+ pplhdr->signature = cpu_to_le32(log->uuid_checksum);
+
+ io->current_bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES, log->bs);
+ bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, 0);
+
+ io->current_bio->bi_bdev = log->rdev->bdev;
+ io->current_bio->bi_iter.bi_sector = log->rdev->ppl.sector;
+ io->current_bio->bi_end_io = ppl_log_endio;
+ io->current_bio->bi_private = io;
+ bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
+
+ spin_lock(&parent_log->io_list_lock);
+ io->seq = parent_log->seq++;
+ spin_unlock(&parent_log->io_list_lock);
+ pplhdr->generation = cpu_to_le64(io->seq);
+
+ return io;
+}
+
+static int ppl_log_stripe(struct r5l_log *log, struct stripe_head *sh)
+{
+ struct r5l_io_unit *io;
+ struct ppl_header *pplhdr;
+ struct ppl_header_entry *pplhdr_entry = NULL;
+ int i;
+ sector_t data_sector;
+ unsigned long flags;
+ int data_disks = 0;
+ unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE;
+ struct r5conf *conf = log->rdev->mddev->private;
+
+ dbg("<%llu>\n", (unsigned long long)sh->sector);
+
+ io = log->current_io;
+ if (!io) {
+ io = ppl_new_iounit(log, sh);
+ if (!io)
+ return -ENOMEM;
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ list_add_tail(&io->log_sibling, &log->running_ios);
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+ } else {
+ pplhdr = page_address(io->meta_page);
+ if (io->meta_offset >= entry_space ||
+ pplhdr->entries_count == PPL_HDR_MAX_ENTRIES) {
+ /*
+ * this io_unit is full - set meta_offset to -1 to
+ * indicate that other units are waiting for this one
+ */
+ io->meta_offset = -1;
+
+ dbg("add blocked io_unit by %p seq: %llu\n",
+ io, io->seq);
+
+ io = ppl_new_iounit(log, sh);
+ if (!io) {
+ log->current_io->meta_offset = entry_space;
+ return -ENOMEM;
+ }
+ /*
+ * reuse need_split_bio to mark that this io_unit is
+ * blocked by an other
+ */
+ io->need_split_bio = true;
+
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ list_add_tail(&io->log_sibling, &log->running_ios);
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+ }
+ }
+
+ log->current_io = io;
+ io->meta_offset += PAGE_SIZE;
+
+ for (i = 0; i < sh->disks; i++) {
+ struct r5dev *dev = &sh->dev[i];
+ if (i != sh->pd_idx && test_bit(R5_LOCKED, &dev->flags)) {
+ if (!data_disks)
+ data_sector = dev->sector;
+ data_disks++;
+ }
+ }
+ BUG_ON(!data_disks);
+
+ dbg("io: %p seq: %llu data_sector: %llu data_disks: %d\n",
+ io, io->seq, (unsigned long long)data_sector, data_disks);
+ pplhdr = page_address(io->meta_page);
+
+ if (pplhdr->entries_count > 0) {
+ /* check if we can merge with the previous entry */
+ struct ppl_header_entry *prev;
+ prev = &pplhdr->entries[pplhdr->entries_count-1];
+
+ if ((prev->data_sector + (prev->pp_size >> 9) == data_sector) &&
+ (prev->data_size == prev->pp_size * data_disks) &&
+ (data_sector >> ilog2(sh->raid_conf->chunk_sectors) ==
+ prev->data_sector >> ilog2(sh->raid_conf->chunk_sectors)))
+ pplhdr_entry = prev;
+ }
+
+ if (pplhdr_entry) {
+ pplhdr_entry->data_size += PAGE_SIZE * data_disks;
+ pplhdr_entry->pp_size += PAGE_SIZE;
+ } else {
+ pplhdr_entry = &pplhdr->entries[pplhdr->entries_count++];
+ pplhdr_entry->data_sector = data_sector;
+ pplhdr_entry->data_size = PAGE_SIZE * data_disks;
+ pplhdr_entry->pp_size = PAGE_SIZE;
+ pplhdr_entry->parity_disk = sh->pd_idx;
+ }
+
+ /* don't write any PP if full stripe write */
+ if (pplhdr_entry->pp_size >> 9 == conf->chunk_sectors &&
+ pplhdr_entry->data_size == pplhdr_entry->pp_size *
+ (conf->raid_disks - conf->max_degraded)) {
+ io->meta_offset -= pplhdr_entry->pp_size;
+ pplhdr_entry->pp_size = 0;
+ }
+
+ list_add_tail(&sh->log_list, &io->stripe_list);
+ atomic_inc(&io->pending_stripe);
+ sh->log_io = io;
+
+ return 0;
+}
+
+static int ppl_write_stripe(struct r5l_log *log, struct stripe_head *sh)
+{
+ struct r5l_io_unit *io = sh->log_io;
+
+ if (io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
+ test_bit(STRIPE_SYNCING, &sh->state) || !log || !log->rdev ||
+ test_bit(Faulty, &log->rdev->flags)) {
+ clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
+ return -EAGAIN;
+ }
+
+ set_bit(STRIPE_LOG_TRAPPED, &sh->state);
+ clear_bit(STRIPE_DELAYED, &sh->state);
+ atomic_inc(&sh->count);
+
+ mutex_lock(&log->io_mutex);
+ if (ppl_log_stripe(log, sh)) {
+ spin_lock_irq(&log->io_list_lock);
+ list_add_tail(&sh->log_list, &log->no_mem_stripes);
+ spin_unlock_irq(&log->io_list_lock);
+ }
+ mutex_unlock(&log->io_mutex);
+
+ return 0;
+}
+
+static void ppl_submit_iounit(struct r5l_io_unit *io)
+{
+ struct mddev *mddev = io->log->rdev->mddev;
+ struct r5conf *conf = mddev->private;
+ int block_size = queue_logical_block_size(mddev->queue);
+ struct ppl_header *pplhdr = page_address(io->meta_page);
+ struct bio *bio = io->current_bio;
+ struct stripe_head *sh;
+ int i;
+ struct bio_list bios = BIO_EMPTY_LIST;
+
+ dbg("io %p seq: %llu\n", io, io->seq);
+
+ sh = list_first_entry(&io->stripe_list, struct stripe_head, log_list);
+ bio_list_add(&bios, io->current_bio);
+
+ for (i = 0; i < pplhdr->entries_count; i++) {
+ struct ppl_header_entry *e = &pplhdr->entries[i];
+ u32 pp_size = e->pp_size;
+ u32 crc = ~0;
+
+ if (pp_size == 0) {
+ pp_size = conf->chunk_sectors << 9;
+ while (pp_size) {
+ pp_size -= PAGE_SIZE;
+ sh = list_next_entry(sh, log_list);
+ }
+ }
+
+ while (pp_size) {
+ struct page *pp_page;
+
+ if (test_bit(STRIPE_FULL_WRITE, &sh->state))
+ pp_page = ZERO_PAGE(0);
+ else
+ pp_page = sh->ppl_page;
+
+ crc = crc32c_le(crc, page_address(pp_page), PAGE_SIZE);
+
+ if (!bio_add_page(bio, pp_page, PAGE_SIZE, 0)) {
+ struct bio *bio2;
+ bio2 = bio_alloc_bioset(GFP_NOIO,
+ BIO_MAX_PAGES,
+ io->log->bs);
+ bio_set_op_attrs(bio2, REQ_OP_WRITE, 0);
+ bio2->bi_bdev = bio->bi_bdev;
+ bio2->bi_iter.bi_sector =
+ bio->bi_iter.bi_sector +
+ bio_sectors(bio);
+ bio_add_page(bio2, pp_page, PAGE_SIZE, 0);
+ bio_chain(bio2, io->current_bio);
+ bio_list_add(&bios, bio2);
+ bio = bio2;
+ }
+
+ pp_size -= PAGE_SIZE;
+ sh = list_next_entry(sh, log_list);
+ }
+
+ dbg("entry: %d, data sector: %llu, PPL size: %u, data size %u\n",
+ i, e->data_sector, e->pp_size, e->data_size);
+
+ e->data_sector = cpu_to_le64(e->data_sector >>
+ ilog2(block_size >> 9));
+ e->pp_size = cpu_to_le32(e->pp_size);
+ e->data_size = cpu_to_le32(e->data_size);
+ e->checksum = cpu_to_le32(~crc);
+ }
+ pplhdr->entries_count = cpu_to_le32(pplhdr->entries_count);
+ pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));
+
+ while ((bio = bio_list_pop(&bios))) {
+ dbg("submit_bio() size: %u sector: %llu dev: %s\n",
+ bio->bi_iter.bi_size,
+ (unsigned long long)bio->bi_iter.bi_sector,
+ bio->bi_bdev->bd_disk->disk_name);
+ submit_bio(bio);
+ }
+}
+
+static void ppl_submit_current_io(struct r5l_log *log)
+{
+ struct r5l_io_unit *io, *io_submit = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ list_for_each_entry(io, &log->running_ios, log_sibling) {
+ if (io->state >= IO_UNIT_IO_START)
+ break;
+
+ if (io->state == IO_UNIT_RUNNING && !io->need_split_bio) {
+ __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+
+ if (io == log->current_io) {
+ BUG_ON(io->meta_offset < 0);
+ log->current_io = NULL;
+ }
+
+ io_submit = io;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+
+ if (io_submit)
+ ppl_submit_iounit(io_submit);
+}
+
+static void ppl_write_stripe_run(struct r5l_log *log)
+{
+ mutex_lock(&log->io_mutex);
+ ppl_submit_current_io(log);
+ mutex_unlock(&log->io_mutex);
+}
+
+static void __ppl_stripe_write_finished(struct r5l_io_unit *io)
+{
+ struct r5l_log *log = io->log;
+ unsigned long flags;
+
+ dbg("io %p seq: %llu\n", io, io->seq);
+
+ spin_lock_irqsave(&log->io_list_lock, flags);
+
+ if (io->meta_offset < 0) {
+ struct r5l_io_unit *io_next = list_first_entry(&log->running_ios,
+ struct r5l_io_unit, log_sibling);
+ BUG_ON(!io_next->need_split_bio);
+ io_next->need_split_bio = false;
+ }
+
+ list_del(&io->log_sibling);
+ mempool_free(io, log->io_pool);
+ r5l_run_no_mem_stripe(log);
+
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+}
+
+static void ppl_exit_log_child(struct r5l_log *log)
+{
+ clear_bit(JournalPpl, &log->rdev->flags);
+ kfree(log);
+}
+
+static void __ppl_exit_log(struct r5l_log *log)
+{
+ struct ppl_conf *ppl_conf = log->private;
+
+ if (ppl_conf->child_logs) {
+ struct r5l_log *log_child;
+ int i;
+
+ for (i = 0; i < ppl_conf->count; i++) {
+ log_child = ppl_conf->child_logs[i];
+ if (!log_child)
+ continue;
+
+ clear_bit(MD_HAS_PPL, &log_child->rdev->mddev->flags);
+ ppl_exit_log_child(log_child);
+ }
+ kfree(ppl_conf->child_logs);
+ }
+ kfree(ppl_conf);
+
+ mempool_destroy(log->meta_pool);
+ if (log->bs)
+ bioset_free(log->bs);
+ mempool_destroy(log->io_pool);
+ kmem_cache_destroy(log->io_kc);
+}
+
+static int ppl_init_log_child(struct r5l_log *log_parent,
+ struct md_rdev *rdev, struct r5l_log **log_child)
+{
+ struct r5l_log *log;
+ struct request_queue *q;
+
+ log = kzalloc(sizeof(struct r5l_log), GFP_KERNEL);
+ if (!log)
+ return -ENOMEM;
+
+ *log_child = log;
+ log->rdev = rdev;
+
+ mutex_init(&log->io_mutex);
+ spin_lock_init(&log->io_list_lock);
+ INIT_LIST_HEAD(&log->running_ios);
+ INIT_LIST_HEAD(&log->io_end_ios);
+ INIT_LIST_HEAD(&log->flushing_ios);
+ INIT_LIST_HEAD(&log->finished_ios);
+ INIT_LIST_HEAD(&log->no_mem_stripes);
+ bio_init(&log->flush_bio);
+
+ log->io_kc = log_parent->io_kc;
+ log->io_pool = log_parent->io_pool;
+ log->bs = log_parent->bs;
+ log->meta_pool = log_parent->meta_pool;
+ log->uuid_checksum = log_parent->uuid_checksum;
+
+ if (rdev->mddev->external) {
+ log->rdev->ppl.sector = log->rdev->data_offset +
+ log->rdev->sectors;
+ log->rdev->ppl.size = (PPL_HEADER_SIZE +
+ PPL_ENTRY_SPACE_IMSM) >> 9;
+ } else {
+ log->rdev->ppl.sector = log->rdev->sb_start +
+ log->rdev->ppl.offset;
+ }
+ log->policy = log_parent->policy;
+ q = bdev_get_queue(log->rdev->bdev);
+ log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
+
+ set_bit(JournalPpl, &rdev->flags);
+
+ return 0;
+}
+
+static int __ppl_init_log(struct r5l_log *log, struct r5conf *conf)
+{
+ struct ppl_conf *ppl_conf;
+ struct mddev *mddev = conf->mddev;
+ int ret;
+ int i;
+
+ if (PAGE_SIZE != 4096)
+ return -EINVAL;
+
+ ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
+ if (!ppl_conf)
+ return -ENOMEM;
+ log->private = ppl_conf;
+
+ if (!mddev->external)
+ log->uuid_checksum = crc32c_le(~0, mddev->uuid,
+ sizeof(mddev->uuid));
+
+ if (mddev->bitmap) {
+ pr_err("PPL is not compatible with bitmap\n");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ spin_lock_init(&log->io_list_lock);
+
+ log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
+ if (!log->io_kc) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ log->io_pool = mempool_create_slab_pool(conf->raid_disks, log->io_kc);
+ if (!log->io_pool) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ log->bs = bioset_create(conf->raid_disks, 0);
+ if (!log->bs) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ log->meta_pool = mempool_create_page_pool(conf->raid_disks, 0);
+ if (!log->meta_pool) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ log->need_cache_flush = true;
+
+ ppl_conf->count = conf->raid_disks;
+ ppl_conf->child_logs = kzalloc(sizeof(struct r5l_log *) * ppl_conf->count,
+ GFP_KERNEL);
+ if (!ppl_conf->child_logs) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ for (i = 0; i < ppl_conf->count; i++) {
+ struct r5l_log *log_child;
+ struct md_rdev *rdev = conf->disks[i].rdev;
+
+ if (!rdev)
+ continue;
+
+ ret = ppl_init_log_child(log, rdev, &log_child);
+ if (ret)
+ goto err;
+
+ ppl_conf->child_logs[i] = log_child;
+ }
+
+ rcu_assign_pointer(conf->log, log);
+ set_bit(MD_HAS_PPL, &mddev->flags);
+
+ return 0;
+err:
+ __ppl_exit_log(log);
+ return ret;
+}
+
+static int __ppl_write_stripe(struct r5l_log *log, struct stripe_head *sh)
+{
+ struct ppl_conf *ppl_conf = log->private;
+ struct r5l_log *log_child = ppl_conf->child_logs[sh->pd_idx];
+
+ return ppl_write_stripe(log_child, sh);
+}
+
+static void __ppl_write_stripe_run(struct r5l_log *log)
+{
+ struct ppl_conf *ppl_conf = log->private;
+ struct r5l_log *log_child;
+ int i;
+
+ for (i = 0; i < ppl_conf->count; i++) {
+ log_child = ppl_conf->child_logs[i];
+ if (log_child)
+ ppl_write_stripe_run(log_child);
+ }
+}
+
+static void __ppl_flush_stripe_to_raid(struct r5l_log *log)
+{
+ struct ppl_conf *ppl_conf = log->private;
+ struct r5l_log *log_child;
+ int i;
+
+ for (i = 0; i < ppl_conf->count; i++) {
+ log_child = ppl_conf->child_logs[i];
+ if (log_child)
+ __r5l_flush_stripe_to_raid(log_child);
+ }
+}
+
+struct r5l_policy r5l_ppl = {
+ .init_log = __ppl_init_log,
+ .exit_log = __ppl_exit_log,
+ .write_stripe = __ppl_write_stripe,
+ .write_stripe_run = __ppl_write_stripe_run,
+ .flush_stripe_to_raid = __ppl_flush_stripe_to_raid,
+ .stripe_write_finished = __ppl_stripe_write_finished,
+ .handle_flush_request = NULL,
+ .quiesce = NULL,
+};
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a7e993a..77a503d3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -464,6 +464,11 @@ static void shrink_buffers(struct stripe_head *sh)
sh->dev[i].page = NULL;
put_page(p);
}
+
+ if (sh->ppl_page) {
+ put_page(sh->ppl_page);
+ sh->ppl_page = NULL;
+ }
}
static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
@@ -480,6 +485,13 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
sh->dev[i].page = page;
sh->dev[i].orig_page = page;
}
+
+ if (test_bit(MD_HAS_PPL, &sh->raid_conf->mddev->flags)) {
+ sh->ppl_page = alloc_page(gfp);
+ if (!sh->ppl_page)
+ return 1;
+ }
+
return 0;
}
@@ -875,7 +887,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
might_sleep();
- if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+ if (!test_bit(STRIPE_R5C_CACHING, &sh->state) ||
+ test_bit(MD_HAS_PPL, &conf->mddev->flags)) {
/* writing out phase */
if (s->waiting_extra_page)
return;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 315d6ea..4a0d7b3 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -228,6 +228,7 @@ struct stripe_head {
struct list_head log_list;
sector_t log_start; /* first meta block on the journal */
struct list_head r5c; /* for r5c_cache->stripe_in_journal */
+ struct page *ppl_page;
/**
* struct stripe_operations
* @target - STRIPE_OP_COMPUTE_BLK target
--
2.10.1
^ permalink raw reply related [flat|nested] 38+ messages in thread
* Re: [PATCH v2 05/12] raid5-ppl: Partial Parity Log implementation
2016-12-05 15:31 ` [PATCH v2 05/12] raid5-ppl: Partial Parity Log implementation Artur Paszkiewicz
@ 2016-12-06 1:06 ` kbuild test robot
2016-12-07 1:17 ` NeilBrown
1 sibling, 0 replies; 38+ messages in thread
From: kbuild test robot @ 2016-12-06 1:06 UTC (permalink / raw)
To: Artur Paszkiewicz; +Cc: kbuild-all, shli, linux-raid
[-- Attachment #1: Type: text/plain, Size: 1574 bytes --]
Hi Artur,
[auto build test ERROR on next-20161205]
[cannot apply to v4.9-rc8 v4.9-rc7 v4.9-rc6]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
url: https://github.com/0day-ci/linux/commits/Artur-Paszkiewicz/Partial-Parity-Log-for-MD-RAID-5/20161206-081540
config: i386-randconfig-r0-201649 (attached as .config)
compiler: gcc-5 (Debian 5.4.1-2) 5.4.1 20160904
reproduce:
# save the attached .config to linux build tree
make ARCH=i386
All errors (new ones prefixed by >>):
drivers/md/raid5-ppl.c: In function 'ppl_init_log_child':
>> drivers/md/raid5-ppl.c:459:2: error: too few arguments to function 'bio_init'
bio_init(&log->flush_bio);
^
In file included from include/linux/blkdev.h:19:0,
from drivers/md/raid5-ppl.c:16:
include/linux/bio.h:426:13: note: declared here
extern void bio_init(struct bio *bio, struct bio_vec *table,
^
vim +/bio_init +459 drivers/md/raid5-ppl.c
453 spin_lock_init(&log->io_list_lock);
454 INIT_LIST_HEAD(&log->running_ios);
455 INIT_LIST_HEAD(&log->io_end_ios);
456 INIT_LIST_HEAD(&log->flushing_ios);
457 INIT_LIST_HEAD(&log->finished_ios);
458 INIT_LIST_HEAD(&log->no_mem_stripes);
> 459 bio_init(&log->flush_bio);
460
461 log->io_kc = log_parent->io_kc;
462 log->io_pool = log_parent->io_pool;
---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 31114 bytes --]
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH v2 05/12] raid5-ppl: Partial Parity Log implementation
2016-12-05 15:31 ` [PATCH v2 05/12] raid5-ppl: Partial Parity Log implementation Artur Paszkiewicz
2016-12-06 1:06 ` kbuild test robot
@ 2016-12-07 1:17 ` NeilBrown
2016-12-07 14:37 ` Artur Paszkiewicz
1 sibling, 1 reply; 38+ messages in thread
From: NeilBrown @ 2016-12-07 1:17 UTC (permalink / raw)
To: Artur Paszkiewicz, shli; +Cc: linux-raid
[-- Attachment #1: Type: text/plain, Size: 6455 bytes --]
On Tue, Dec 06 2016, Artur Paszkiewicz wrote:
> This implements the write logging functionality, using the policy logic
> introduced in previous patches.
>
> PPL is a distributed log - data is stored on all RAID member drives in
> the metadata area. PPL is written to the parity disk of a particular
> stripe. Distributed log is implemented by using one r5l_log instance per
> each array member. They are grouped in child_logs array in struct
> ppl_conf, which is assigned to a common parent log. This parent log
> serves as a proxy and is used in raid5 personality code - it is assigned
> as _the_ log in r5conf->log. The child logs are where all the real work
> is done.
>
> The PPL consists of a 4KB header (struct ppl_header), and at least 128KB
> for partial parity data. It is stored right after the array data (for
> IMSM) or in the bitmap area (super 1.1 and 1.2) and can be overwritten
> even at each array write request.
>
> Attach a page for holding the partial parity data to each stripe_head.
> Allocate it only if mddev has the MD_HAS_PPL flag set.
>
> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
> ---
> drivers/md/raid5-cache.c | 12 +-
> drivers/md/raid5-cache.h | 6 +
> drivers/md/raid5-ppl.c | 594 ++++++++++++++++++++++++++++++++++++++++++++++-
> drivers/md/raid5.c | 15 +-
> drivers/md/raid5.h | 1 +
> 5 files changed, 620 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index fa82b9a..be534d8 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -119,8 +119,8 @@ static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
> return log->device_size > used_size + size;
> }
>
> -static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
> - enum r5l_io_unit_state state)
> +void __r5l_set_io_unit_state(struct r5l_io_unit *io,
> + enum r5l_io_unit_state state)
> {
> if (WARN_ON(io->state >= state))
> return;
> @@ -340,7 +340,7 @@ static void r5c_finish_cache_stripe(struct stripe_head *sh)
> }
> }
>
> -static void r5l_io_run_stripes(struct r5l_io_unit *io)
> +void r5l_io_run_stripes(struct r5l_io_unit *io)
> {
> struct stripe_head *sh, *next;
>
> @@ -935,7 +935,7 @@ static sector_t r5l_reclaimable_space(struct r5l_log *log)
> r5c_calculate_new_cp(conf));
> }
>
> -static void r5l_run_no_mem_stripe(struct r5l_log *log)
> +void r5l_run_no_mem_stripe(struct r5l_log *log)
> {
> struct stripe_head *sh;
>
> @@ -1039,7 +1039,7 @@ static void r5l_log_flush_endio(struct bio *bio)
> * only write stripes of an io_unit to raid disks till the io_unit is the first
> * one whose data/parity is in log.
> */
> -static void __r5l_flush_stripe_to_raid(struct r5l_log *log)
> +void __r5l_flush_stripe_to_raid(struct r5l_log *log)
> {
> bool do_flush;
>
> @@ -1359,7 +1359,7 @@ bool r5l_log_disk_error(struct r5conf *conf)
> if (!log)
> ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
> else
> - ret = test_bit(Faulty, &log->rdev->flags);
> + ret = log->rdev && test_bit(Faulty, &log->rdev->flags);
> rcu_read_unlock();
> return ret;
> }
> diff --git a/drivers/md/raid5-cache.h b/drivers/md/raid5-cache.h
> index 4ba11d3..0446100 100644
> --- a/drivers/md/raid5-cache.h
> +++ b/drivers/md/raid5-cache.h
> @@ -157,4 +157,10 @@ extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
> extern void r5l_quiesce(struct r5l_log *log, int state);
> extern bool r5l_log_disk_error(struct r5conf *conf);
>
> +extern void __r5l_set_io_unit_state(struct r5l_io_unit *io,
> + enum r5l_io_unit_state state);
> +extern void r5l_io_run_stripes(struct r5l_io_unit *io);
> +extern void r5l_run_no_mem_stripe(struct r5l_log *log);
> +extern void __r5l_flush_stripe_to_raid(struct r5l_log *log);
> +
> #endif /* _RAID5_CACHE_H */
> diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
> index 263fad7..2d4c90f 100644
> --- a/drivers/md/raid5-ppl.c
> +++ b/drivers/md/raid5-ppl.c
> @@ -14,7 +14,599 @@
>
> #include <linux/kernel.h>
> #include <linux/blkdev.h>
> +#include <linux/slab.h>
> +#include <linux/crc32c.h>
> +#include <linux/module.h>
> +#include <linux/raid/md_p.h>
> +#include "md.h"
> #include "raid5.h"
> #include "raid5-cache.h"
>
> -struct r5l_policy r5l_ppl;
> +static bool ppl_debug;
> +module_param(ppl_debug, bool, 0644);
> +MODULE_PARM_DESC(ppl_debug, "Debug mode for md raid5 PPL");
> +
> +#define dbg(format, args...) \
> +do { \
> + if (ppl_debug) \
> + printk(KERN_DEBUG"[%d] %s() "format, \
> + current->pid, __func__, ##args); \
> +} while (0)
Please don't do this. Just use pr_debug(), and use
/sys/kernel/debug/dynamic_debug/control
to turn them on and off.
> +
> +struct ppl_conf {
> + int count;
> + struct r5l_log **child_logs;
> +};
> +
> +struct ppl_header_entry {
> + __le64 data_sector; /* Raid sector of the new data */
> + __le32 pp_size; /* Length of partial parity */
> + __le32 data_size; /* Length of data */
> + __u8 parity_disk; /* Member disk containing parity */
> + __le32 checksum; /* Checksum of this entry */
> +} __packed;
Really? "checksum" is 32 bits but not aligned?
I *think* you should be using get_unaligned_le32() to access this
and put_unaligned_le32() to set it.
> +
> +#define PPL_HEADER_SIZE PAGE_SIZE
> +#define PPL_HDR_RESERVED 512
> +#define PPL_HDR_ENTRY_SPACE \
> + (PPL_HEADER_SIZE - PPL_HDR_RESERVED - 3 * sizeof(u32) - sizeof(u64))
> +#define PPL_HDR_MAX_ENTRIES \
> + (PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry))
> +#define PPL_ENTRY_SPACE_IMSM (128 * 1024)
> +
> +struct ppl_header {
> + __u8 reserved[PPL_HDR_RESERVED];/* Reserved space */
> + __le32 signature; /* Signature (family number of volume) */
> + __le64 generation; /* Generation number of PP Header */
This probably needs to use the 'unaligned' macros too.
> + __le32 entries_count; /* Number of entries in entry array */
> + __le32 checksum; /* Checksum of PP Header */
> + struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES];
> +} __packed;
ppl_header_entry doesn't seem to be a multiple of 4 bytes long.
This means all the fields in it could be unaligned...
Maybe we should make this code refuse to compile except on x86 ???
NeilBrown
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH v2 05/12] raid5-ppl: Partial Parity Log implementation
2016-12-07 1:17 ` NeilBrown
@ 2016-12-07 14:37 ` Artur Paszkiewicz
0 siblings, 0 replies; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-07 14:37 UTC (permalink / raw)
To: NeilBrown, shli; +Cc: linux-raid
On 12/07/2016 02:17 AM, NeilBrown wrote:
> On Tue, Dec 06 2016, Artur Paszkiewicz wrote:
>
>> This implements the write logging functionality, using the policy logic
>> introduced in previous patches.
>>
>> PPL is a distributed log - data is stored on all RAID member drives in
>> the metadata area. PPL is written to the parity disk of a particular
>> stripe. Distributed log is implemented by using one r5l_log instance per
>> each array member. They are grouped in child_logs array in struct
>> ppl_conf, which is assigned to a common parent log. This parent log
>> serves as a proxy and is used in raid5 personality code - it is assigned
>> as _the_ log in r5conf->log. The child logs are where all the real work
>> is done.
>>
>> The PPL consists of a 4KB header (struct ppl_header), and at least 128KB
>> for partial parity data. It is stored right after the array data (for
>> IMSM) or in the bitmap area (super 1.1 and 1.2) and can be overwritten
>> even at each array write request.
>>
>> Attach a page for holding the partial parity data to each stripe_head.
>> Allocate it only if mddev has the MD_HAS_PPL flag set.
>>
>> Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
>> ---
>> drivers/md/raid5-cache.c | 12 +-
>> drivers/md/raid5-cache.h | 6 +
>> drivers/md/raid5-ppl.c | 594 ++++++++++++++++++++++++++++++++++++++++++++++-
>> drivers/md/raid5.c | 15 +-
>> drivers/md/raid5.h | 1 +
>> 5 files changed, 620 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
>> index fa82b9a..be534d8 100644
>> --- a/drivers/md/raid5-cache.c
>> +++ b/drivers/md/raid5-cache.c
>> @@ -119,8 +119,8 @@ static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
>> return log->device_size > used_size + size;
>> }
>>
>> -static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
>> - enum r5l_io_unit_state state)
>> +void __r5l_set_io_unit_state(struct r5l_io_unit *io,
>> + enum r5l_io_unit_state state)
>> {
>> if (WARN_ON(io->state >= state))
>> return;
>> @@ -340,7 +340,7 @@ static void r5c_finish_cache_stripe(struct stripe_head *sh)
>> }
>> }
>>
>> -static void r5l_io_run_stripes(struct r5l_io_unit *io)
>> +void r5l_io_run_stripes(struct r5l_io_unit *io)
>> {
>> struct stripe_head *sh, *next;
>>
>> @@ -935,7 +935,7 @@ static sector_t r5l_reclaimable_space(struct r5l_log *log)
>> r5c_calculate_new_cp(conf));
>> }
>>
>> -static void r5l_run_no_mem_stripe(struct r5l_log *log)
>> +void r5l_run_no_mem_stripe(struct r5l_log *log)
>> {
>> struct stripe_head *sh;
>>
>> @@ -1039,7 +1039,7 @@ static void r5l_log_flush_endio(struct bio *bio)
>> * only write stripes of an io_unit to raid disks till the io_unit is the first
>> * one whose data/parity is in log.
>> */
>> -static void __r5l_flush_stripe_to_raid(struct r5l_log *log)
>> +void __r5l_flush_stripe_to_raid(struct r5l_log *log)
>> {
>> bool do_flush;
>>
>> @@ -1359,7 +1359,7 @@ bool r5l_log_disk_error(struct r5conf *conf)
>> if (!log)
>> ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
>> else
>> - ret = test_bit(Faulty, &log->rdev->flags);
>> + ret = log->rdev && test_bit(Faulty, &log->rdev->flags);
>> rcu_read_unlock();
>> return ret;
>> }
>> diff --git a/drivers/md/raid5-cache.h b/drivers/md/raid5-cache.h
>> index 4ba11d3..0446100 100644
>> --- a/drivers/md/raid5-cache.h
>> +++ b/drivers/md/raid5-cache.h
>> @@ -157,4 +157,10 @@ extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
>> extern void r5l_quiesce(struct r5l_log *log, int state);
>> extern bool r5l_log_disk_error(struct r5conf *conf);
>>
>> +extern void __r5l_set_io_unit_state(struct r5l_io_unit *io,
>> + enum r5l_io_unit_state state);
>> +extern void r5l_io_run_stripes(struct r5l_io_unit *io);
>> +extern void r5l_run_no_mem_stripe(struct r5l_log *log);
>> +extern void __r5l_flush_stripe_to_raid(struct r5l_log *log);
>> +
>> #endif /* _RAID5_CACHE_H */
>> diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
>> index 263fad7..2d4c90f 100644
>> --- a/drivers/md/raid5-ppl.c
>> +++ b/drivers/md/raid5-ppl.c
>> @@ -14,7 +14,599 @@
>>
>> #include <linux/kernel.h>
>> #include <linux/blkdev.h>
>> +#include <linux/slab.h>
>> +#include <linux/crc32c.h>
>> +#include <linux/module.h>
>> +#include <linux/raid/md_p.h>
>> +#include "md.h"
>> #include "raid5.h"
>> #include "raid5-cache.h"
>>
>> -struct r5l_policy r5l_ppl;
>> +static bool ppl_debug;
>> +module_param(ppl_debug, bool, 0644);
>> +MODULE_PARM_DESC(ppl_debug, "Debug mode for md raid5 PPL");
>> +
>> +#define dbg(format, args...) \
>> +do { \
>> + if (ppl_debug) \
>> + printk(KERN_DEBUG"[%d] %s() "format, \
>> + current->pid, __func__, ##args); \
>> +} while (0)
>
> Please don't do this. Just use pr_debug(), and use
> /sys/kernel/debug/dynamic_debug/control
> to turn them on and off.
>
>> +
>> +struct ppl_conf {
>> + int count;
>> + struct r5l_log **child_logs;
>> +};
>> +
>> +struct ppl_header_entry {
>> + __le64 data_sector; /* Raid sector of the new data */
>> + __le32 pp_size; /* Length of partial parity */
>> + __le32 data_size; /* Length of data */
>> + __u8 parity_disk; /* Member disk containing parity */
>> + __le32 checksum; /* Checksum of this entry */
>> +} __packed;
>
> Really? "checksum" is 32 bits but not aligned?
> I *think* you should be using get_unaligned_le32() to access this
> and put_unaligned_le32() to set it.
>
>> +
>> +#define PPL_HEADER_SIZE PAGE_SIZE
>> +#define PPL_HDR_RESERVED 512
>> +#define PPL_HDR_ENTRY_SPACE \
>> + (PPL_HEADER_SIZE - PPL_HDR_RESERVED - 3 * sizeof(u32) - sizeof(u64))
>> +#define PPL_HDR_MAX_ENTRIES \
>> + (PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry))
>> +#define PPL_ENTRY_SPACE_IMSM (128 * 1024)
>> +
>> +struct ppl_header {
>> + __u8 reserved[PPL_HDR_RESERVED];/* Reserved space */
>> + __le32 signature; /* Signature (family number of volume) */
>> + __le64 generation; /* Generation number of PP Header */
>
> This probably needs to use the 'unaligned' macros too.
>
>> + __le32 entries_count; /* Number of entries in entry array */
>> + __le32 checksum; /* Checksum of PP Header */
>> + struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES];
>> +} __packed;
>
> ppl_header_entry doesn't seem to be a multiple of 4 bytes long.
> This means all the fields in it could be unaligned...
>
> Maybe we should make this code refuse to compile except on x86 ???
Oh... you are right, this is bad. I hope I can still persuade some
people to fix those structures in the platform firmware, etc. Like
changing "parity_disk" to 4 bytes or inserting padding there.
Artur
^ permalink raw reply [flat|nested] 38+ messages in thread
* [PATCH v2 06/12] raid5-ppl: calculate partial parity
2016-12-05 15:31 [PATCH v2 00/12] Partial Parity Log for MD RAID 5 Artur Paszkiewicz
` (4 preceding siblings ...)
2016-12-05 15:31 ` [PATCH v2 05/12] raid5-ppl: Partial Parity Log implementation Artur Paszkiewicz
@ 2016-12-05 15:31 ` Artur Paszkiewicz
2016-12-05 15:31 ` [PATCH v2 07/12] md: mddev_find_container helper function Artur Paszkiewicz
` (6 subsequent siblings)
12 siblings, 0 replies; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-05 15:31 UTC (permalink / raw)
To: shli; +Cc: linux-raid
Partial parity is calculated as follows:
- reconstruct-write case:
xor data from all not updated disks in a stripe
- read-modify-write case:
xor old data and parity from all updated disks in a stripe
Implement it using the async_tx API and integrate into raid_run_ops().
It must be called when we still have access to old data, so do it when
STRIPE_OP_BIODRAIN is set, but before ops_run_prexor5(). The result is
stored into sh->ppl_page.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
drivers/md/raid5.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 75 insertions(+)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 77a503d3..2169de5 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1941,6 +1941,49 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
&sh->ops.zero_sum_result, percpu->spare_page, &submit);
}
+static struct dma_async_tx_descriptor *
+ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
+ struct dma_async_tx_descriptor *tx)
+{
+ int disks = sh->disks;
+ struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
+ int count = 0, pd_idx = sh->pd_idx, i;
+ struct async_submit_ctl submit;
+
+ if (test_bit(STRIPE_FULL_WRITE, &sh->state))
+ return tx;
+
+ if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+ for (i = disks; i--;) {
+ struct r5dev *dev = &sh->dev[i];
+ if (test_bit(R5_Wantdrain, &dev->flags)
+ || i == pd_idx)
+ xor_srcs[count++] = dev->page;
+ }
+ } else if (sh->reconstruct_state == reconstruct_state_drain_run) {
+ for (i = disks; i--;) {
+ struct r5dev *dev = &sh->dev[i];
+ if (test_bit(R5_UPTODATE, &dev->flags))
+ xor_srcs[count++] = dev->page;
+ }
+ } else {
+ return tx;
+ }
+
+ init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, tx,
+ NULL, sh, flex_array_get(percpu->scribble, 0)
+ + sizeof(struct page *) * (sh->disks + 2));
+
+ if (count == 1)
+ tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE,
+ &submit);
+ else
+ tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE,
+ &submit);
+
+ return tx;
+}
+
static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
int overlap_clear = 0, i, disks = sh->disks;
@@ -1971,6 +2014,11 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
async_tx_ack(tx);
}
+ if (test_bit(STRIPE_OP_BIODRAIN, &ops_request) &&
+ test_bit(MD_HAS_PPL, &conf->mddev->flags) &&
+ conf->disks[sh->pd_idx].rdev)
+ tx = ops_run_partial_parity(sh, percpu, tx);
+
if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
if (level < 6)
tx = ops_run_prexor5(sh, percpu, tx);
@@ -3042,6 +3090,33 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
goto overlap;
+ /*
+ * PPL does not allow writes to different disks in the same stripe when
+ * they are not next to each other. Not really an overlap, but
+ * wait_for_overlap can be used to handle this.
+ */
+ if (forwrite && test_bit(MD_HAS_PPL, &conf->mddev->flags)) {
+ int i;
+ int di = 0;
+ int first = -1, last = -1;
+ int count = 0;
+ for (i = 0; i < sh->disks; i++) {
+ if (i == sh->pd_idx)
+ continue;
+
+ if (sh->dev[i].towrite || i == dd_idx) {
+ count++;
+ if (first < 0)
+ first = di;
+ last = di;
+ }
+ di++;
+ }
+
+ if (last > first && (last - first > count - 1))
+ goto overlap;
+ }
+
if (!forwrite || previous)
clear_bit(STRIPE_BATCH_READY, &sh->state);
--
2.10.1
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH v2 07/12] md: mddev_find_container helper function
2016-12-05 15:31 [PATCH v2 00/12] Partial Parity Log for MD RAID 5 Artur Paszkiewicz
` (5 preceding siblings ...)
2016-12-05 15:31 ` [PATCH v2 06/12] raid5-ppl: calculate partial parity Artur Paszkiewicz
@ 2016-12-05 15:31 ` Artur Paszkiewicz
2016-12-07 1:23 ` NeilBrown
2016-12-05 15:31 ` [PATCH v2 08/12] md: expose rdev->sb_start as sysfs attribute Artur Paszkiewicz
` (5 subsequent siblings)
12 siblings, 1 reply; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-05 15:31 UTC (permalink / raw)
To: shli; +Cc: linux-raid
This allows finding the parent container of a subarray for external
metadata arrays.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
drivers/md/md.c | 37 +++++++++++++++++++++++++++++++++++++
drivers/md/md.h | 3 +++
2 files changed, 40 insertions(+)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7028d54..5a14f8e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -595,6 +595,43 @@ static struct mddev *mddev_find(dev_t unit)
goto retry;
}
+struct mddev *mddev_find_container(struct mddev *subarray)
+{
+ struct mddev *mddev, *ret = NULL;
+ char *container_name;
+ int len = 0;
+ int i;
+
+ if (!subarray->external)
+ return NULL;
+
+ container_name = subarray->metadata_type + 1;
+ i = 0;
+ while (container_name[i]) {
+ if (container_name[i] == '/') {
+ len = i;
+ break;
+ }
+ i++;
+ }
+ if (len == 0)
+ return NULL;
+
+ spin_lock(&all_mddevs_lock);
+ list_for_each_entry(mddev, &all_mddevs, all_mddevs) {
+ char *name = mdname(mddev);
+ if (strlen(name) == len &&
+ strncmp(name, container_name, len) == 0) {
+ ret = mddev;
+ break;
+ }
+ }
+ spin_unlock(&all_mddevs_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mddev_find_container);
+
static struct attribute_group md_redundancy_group;
void mddev_unlock(struct mddev *mddev)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d1e56f8..ca0b68f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -711,4 +711,7 @@ static inline int mddev_is_clustered(struct mddev *mddev)
{
return mddev->cluster_info && mddev->bitmap_info.nodes > 1;
}
+
+extern struct mddev *mddev_find_container(struct mddev *subarray);
+
#endif /* _MD_MD_H */
--
2.10.1
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH v2 08/12] md: expose rdev->sb_start as sysfs attribute
2016-12-05 15:31 [PATCH v2 00/12] Partial Parity Log for MD RAID 5 Artur Paszkiewicz
` (6 preceding siblings ...)
2016-12-05 15:31 ` [PATCH v2 07/12] md: mddev_find_container helper function Artur Paszkiewicz
@ 2016-12-05 15:31 ` Artur Paszkiewicz
2016-12-07 1:25 ` NeilBrown
2016-12-05 15:31 ` [PATCH v2 09/12] raid5-ppl: read PPL signature from IMSM metadata Artur Paszkiewicz
` (4 subsequent siblings)
12 siblings, 1 reply; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-05 15:31 UTC (permalink / raw)
To: shli; +Cc: linux-raid
This is meant for external metadata arrays, to pass the location of the
superblock to the kernel.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
drivers/md/md.c | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5a14f8e..7049833 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3204,6 +3204,27 @@ static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
static struct rdev_sysfs_entry rdev_unack_bad_blocks =
__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
+static ssize_t
+sb_start_show(struct md_rdev *rdev, char *page)
+{
+ return sprintf(page, "%llu\n", (unsigned long long)rdev->sb_start);
+}
+
+static ssize_t
+sb_start_store(struct md_rdev *rdev, const char *buf, size_t len)
+{
+ unsigned long long sb_start;
+ if (kstrtoull(buf, 10, &sb_start) < 0)
+ return -EINVAL;
+ if (!rdev->mddev->external)
+ return -EBUSY;
+ rdev->sb_start = sb_start;
+ return len;
+}
+
+static struct rdev_sysfs_entry rdev_sb_start =
+__ATTR(sb_start, S_IRUGO|S_IWUSR, sb_start_show, sb_start_store);
+
static struct attribute *rdev_default_attrs[] = {
&rdev_state.attr,
&rdev_errors.attr,
@@ -3214,6 +3235,7 @@ static struct attribute *rdev_default_attrs[] = {
&rdev_recovery_start.attr,
&rdev_bad_blocks.attr,
&rdev_unack_bad_blocks.attr,
+ &rdev_sb_start.attr,
NULL,
};
static ssize_t
--
2.10.1
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH v2 09/12] raid5-ppl: read PPL signature from IMSM metadata
2016-12-05 15:31 [PATCH v2 00/12] Partial Parity Log for MD RAID 5 Artur Paszkiewicz
` (7 preceding siblings ...)
2016-12-05 15:31 ` [PATCH v2 08/12] md: expose rdev->sb_start as sysfs attribute Artur Paszkiewicz
@ 2016-12-05 15:31 ` Artur Paszkiewicz
2016-12-07 1:25 ` NeilBrown
2016-12-05 15:31 ` [PATCH v2 10/12] raid5-ppl: recovery from dirty shutdown using PPL Artur Paszkiewicz
` (3 subsequent siblings)
12 siblings, 1 reply; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-05 15:31 UTC (permalink / raw)
To: shli; +Cc: linux-raid
The PPL signature is used to determine if the stored PPL is valid for a
given array. With IMSM, the PPL signature should match the
orig_family_num field of the superblock. To avoid passing this value
from userspace, it can be read from the IMSM MPB when initializing the
log.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
drivers/md/raid5-ppl.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 78 insertions(+), 1 deletion(-)
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 2d4c90f..9e46497 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -403,6 +403,75 @@ static void __ppl_stripe_write_finished(struct r5l_io_unit *io)
spin_unlock_irqrestore(&log->io_list_lock, flags);
}
+#define IMSM_MPB_SIG "Intel Raid ISM Cfg Sig. "
+#define IMSM_MPB_ORIG_FAMILY_NUM_OFFSET 64
+
+static int ppl_find_signature_imsm(struct mddev *mddev, u32 *signature)
+{
+ struct md_rdev *rdev;
+ char *buf;
+ int ret = 0;
+ u32 orig_family_num = 0;
+ struct page *page;
+ struct mddev *container;
+
+ container = mddev_find_container(mddev);
+ if (!container || strncmp(container->metadata_type, "imsm", 4)) {
+ pr_err("Container metadata type is not imsm\n");
+ return -EINVAL;
+ }
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ buf = page_address(page);
+
+ rdev_for_each(rdev, container) {
+ u32 tmp;
+ struct md_rdev *rdev2;
+ bool found = false;
+
+ /* only use rdevs that are both in container and mddev */
+ rdev_for_each(rdev2, mddev)
+ if (rdev2->bdev == rdev->bdev) {
+ found = true;
+ break;
+ }
+
+ if (!found)
+ continue;
+
+ if (!sync_page_io(rdev, 0,
+ queue_logical_block_size(rdev->bdev->bd_queue),
+ page, REQ_OP_READ, 0, true)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ if (strncmp(buf, IMSM_MPB_SIG, strlen(IMSM_MPB_SIG)) != 0) {
+ dbg("imsm mpb signature does not match\n");
+ ret = 1;
+ goto out;
+ }
+
+ tmp = le32_to_cpu(*(u32 *)(buf + IMSM_MPB_ORIG_FAMILY_NUM_OFFSET));
+
+ if (orig_family_num && orig_family_num != tmp) {
+ dbg("orig_family_num is not the same on all disks\n");
+ ret = 1;
+ goto out;
+ }
+
+ orig_family_num = tmp;
+ }
+
+ *signature = orig_family_num;
+out:
+ __free_page(page);
+ return ret;
+}
+
static void ppl_exit_log_child(struct r5l_log *log)
{
clear_bit(JournalPpl, &log->rdev->flags);
@@ -497,9 +566,17 @@ static int __ppl_init_log(struct r5l_log *log, struct r5conf *conf)
return -ENOMEM;
log->private = ppl_conf;
- if (!mddev->external)
+ if (mddev->external) {
+ ret = ppl_find_signature_imsm(mddev, &log->uuid_checksum);
+ if (ret) {
+ pr_err("Failed to read imsm signature\n");
+ ret = -EINVAL;
+ goto err;
+ }
+ } else {
log->uuid_checksum = crc32c_le(~0, mddev->uuid,
sizeof(mddev->uuid));
+ }
if (mddev->bitmap) {
pr_err("PPL is not compatible with bitmap\n");
--
2.10.1
^ permalink raw reply related [flat|nested] 38+ messages in thread
* Re: [PATCH v2 09/12] raid5-ppl: read PPL signature from IMSM metadata
2016-12-05 15:31 ` [PATCH v2 09/12] raid5-ppl: read PPL signature from IMSM metadata Artur Paszkiewicz
@ 2016-12-07 1:25 ` NeilBrown
2016-12-07 14:38 ` Artur Paszkiewicz
0 siblings, 1 reply; 38+ messages in thread
From: NeilBrown @ 2016-12-07 1:25 UTC (permalink / raw)
To: Artur Paszkiewicz, shli; +Cc: linux-raid
[-- Attachment #1: Type: text/plain, Size: 464 bytes --]
On Tue, Dec 06 2016, Artur Paszkiewicz wrote:
> The PPL signature is used to determine if the stored PPL is valid for a
> given array. With IMSM, the PPL signature should match the
> orig_family_num field of the superblock. To avoid passing this value
> from userspace, it can be read from the IMSM MPB when initializing the
> log.
It is up to mdadm to determine if the PPL is valid. It would only tell
the kernel that a PPL exists if it is valid...
NeilBrown
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH v2 09/12] raid5-ppl: read PPL signature from IMSM metadata
2016-12-07 1:25 ` NeilBrown
@ 2016-12-07 14:38 ` Artur Paszkiewicz
2016-12-07 23:27 ` NeilBrown
0 siblings, 1 reply; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-07 14:38 UTC (permalink / raw)
To: NeilBrown, shli; +Cc: linux-raid
On 12/07/2016 02:25 AM, NeilBrown wrote:
> On Tue, Dec 06 2016, Artur Paszkiewicz wrote:
>
>> The PPL signature is used to determine if the stored PPL is valid for a
>> given array. With IMSM, the PPL signature should match the
>> orig_family_num field of the superblock. To avoid passing this value
>> from userspace, it can be read from the IMSM MPB when initializing the
>> log.
>
> It is up to mdadm to determine if the PPL is valid. It would only tell
> the kernel that a PPL exists if it is valid...
The kernel also has to know this value because it writes it to the PPL
header. So yet another sysfs attribute just for this value? How about
adding a directory similar to "bitmap" to hold all the PPL related
settings?
Artur
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH v2 09/12] raid5-ppl: read PPL signature from IMSM metadata
2016-12-07 14:38 ` Artur Paszkiewicz
@ 2016-12-07 23:27 ` NeilBrown
2016-12-08 10:36 ` Artur Paszkiewicz
0 siblings, 1 reply; 38+ messages in thread
From: NeilBrown @ 2016-12-07 23:27 UTC (permalink / raw)
To: Artur Paszkiewicz, shli; +Cc: linux-raid
[-- Attachment #1: Type: text/plain, Size: 1128 bytes --]
On Thu, Dec 08 2016, Artur Paszkiewicz wrote:
> On 12/07/2016 02:25 AM, NeilBrown wrote:
>> On Tue, Dec 06 2016, Artur Paszkiewicz wrote:
>>
>>> The PPL signature is used to determine if the stored PPL is valid for a
>>> given array. With IMSM, the PPL signature should match the
>>> orig_family_num field of the superblock. To avoid passing this value
>>> from userspace, it can be read from the IMSM MPB when initializing the
>>> log.
>>
>> It is up to mdadm to determine if the PPL is valid. It would only tell
>> the kernel that a PPL exists if it is valid...
>
> The kernel also has to know this value because it writes it to the PPL
> header. So yet another sysfs attribute just for this value? How about
> adding a directory similar to "bitmap" to hold all the PPL related
> settings?
There is only one PPL header (per device) - correct?
md can read that header, change the few fields that it knows about, and
write it back out again. Does it ever need to change the PPL signature?
When a PPL is created, mdadm can write the initial header.
Am I missing something?
Thanks,
NeilBrown
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH v2 09/12] raid5-ppl: read PPL signature from IMSM metadata
2016-12-07 23:27 ` NeilBrown
@ 2016-12-08 10:36 ` Artur Paszkiewicz
0 siblings, 0 replies; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-08 10:36 UTC (permalink / raw)
To: NeilBrown, shli; +Cc: linux-raid
On 12/08/2016 12:27 AM, NeilBrown wrote:
> On Thu, Dec 08 2016, Artur Paszkiewicz wrote:
>
>> On 12/07/2016 02:25 AM, NeilBrown wrote:
>>> On Tue, Dec 06 2016, Artur Paszkiewicz wrote:
>>>
>>>> The PPL signature is used to determine if the stored PPL is valid for a
>>>> given array. With IMSM, the PPL signature should match the
>>>> orig_family_num field of the superblock. To avoid passing this value
>>>> from userspace, it can be read from the IMSM MPB when initializing the
>>>> log.
>>>
>>> It is up to mdadm to determine if the PPL is valid. It would only tell
>>> the kernel that a PPL exists if it is valid...
>>
>> The kernel also has to know this value because it writes it to the PPL
>> header. So yet another sysfs attribute just for this value? How about
>> adding a directory similar to "bitmap" to hold all the PPL related
>> settings?
>
> There is only one PPL header (per device) - correct?
> md can read that header, change the few fields that it knows about, and
> write it back out again. Does it ever need to change the PPL signature?
>
> When a PPL is created, mdadm can write the initial header.
>
> Am I missing something?
Well, you are right. If mdadm takes care of validating PPL and writes
the initial header, md can use the signature read from the header. Much
simpler that way :) Thanks for the suggestion.
Artur
^ permalink raw reply [flat|nested] 38+ messages in thread
* [PATCH v2 10/12] raid5-ppl: recovery from dirty shutdown using PPL
2016-12-05 15:31 [PATCH v2 00/12] Partial Parity Log for MD RAID 5 Artur Paszkiewicz
` (8 preceding siblings ...)
2016-12-05 15:31 ` [PATCH v2 09/12] raid5-ppl: read PPL signature from IMSM metadata Artur Paszkiewicz
@ 2016-12-05 15:31 ` Artur Paszkiewicz
2016-12-05 15:31 ` [PATCH v2 11/12] raid5-ppl: support disk add/remove with distributed PPL Artur Paszkiewicz
` (2 subsequent siblings)
12 siblings, 0 replies; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-05 15:31 UTC (permalink / raw)
To: shli; +Cc: linux-raid
The recovery algorithm recalculates parity for every dirty stripe by
xor-ing the partial parity and data from each updated data member disk.
To verify PPL correctness a CRC is used for the PPL header. Each header
entry also contains a CRC for its partial parity data. If the header is
valid, recovery is performed for each entry until an invalid entry is
found. If the array is not degraded and recovery using PPL fully
succeeds, there is no need to resync the array because data and parity
will be consistent, so in this case resync will be disabled.
Due to compatibility with IMSM implementations on other systems, we
can't assume that the block size is always 4K. Writes generated by MD
raid5 don't have this issue, but in other environments it is possible to
have writes with the size of even a single 512-byte sector. The recovery
code takes this into account and also the logical sector size of the
underlying drives.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
drivers/md/raid5-ppl.c | 347 +++++++++++++++++++++++++++++++++++++++++++++++++
drivers/md/raid5.c | 5 +-
2 files changed, 351 insertions(+), 1 deletion(-)
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 9e46497..17e9803 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -16,6 +16,7 @@
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/crc32c.h>
+#include <linux/async_tx.h>
#include <linux/module.h>
#include <linux/raid/md_p.h>
#include "md.h"
@@ -403,6 +404,346 @@ static void __ppl_stripe_write_finished(struct r5l_io_unit *io)
spin_unlock_irqrestore(&log->io_list_lock, flags);
}
+static void ppl_xor(int size, struct page *page1, struct page *page2,
+ struct page *page_result)
+{
+ struct async_submit_ctl submit;
+ struct dma_async_tx_descriptor *tx;
+ struct page *xor_srcs[] = { page1, page2 };
+
+ init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST,
+ NULL, NULL, NULL, NULL);
+ tx = async_xor(page_result, xor_srcs, 0, 2, size, &submit);
+
+ async_tx_quiesce(&tx);
+}
+
+static int ppl_recover_entry(struct r5l_log *log, struct ppl_header_entry *e,
+ sector_t ppl_sector)
+{
+ struct mddev *mddev = log->rdev->mddev;
+ struct r5conf *conf = mddev->private;
+
+ int block_size = queue_logical_block_size(mddev->queue);
+ struct page *pages;
+ struct page *page1;
+ struct page *page2;
+ sector_t r_sector_first = e->data_sector * (block_size >> 9);
+ sector_t r_sector_last = r_sector_first + (e->data_size >> 9) - 1;
+ int strip_sectors = conf->chunk_sectors;
+ int i;
+ int ret = 0;
+
+ if (e->pp_size > 0 && (e->pp_size >> 9) < strip_sectors) {
+ if (e->data_size > e->pp_size)
+ r_sector_last = r_sector_first +
+ (e->data_size / e->pp_size) * strip_sectors - 1;
+ strip_sectors = e->pp_size >> 9;
+ }
+
+ pages = alloc_pages(GFP_KERNEL, 1);
+ if (!pages)
+ return -ENOMEM;
+ page1 = pages;
+ page2 = pages + 1;
+
+ dbg("array sector first %llu, last %llu\n",
+ (unsigned long long)r_sector_first,
+ (unsigned long long)r_sector_last);
+
+ /* if start and end is 4k aligned, use a 4k block */
+ if (block_size == 512 &&
+ r_sector_first % (PAGE_SIZE >> 9) == 0 &&
+ (r_sector_last + 1) % (PAGE_SIZE >> 9) == 0)
+ block_size = PAGE_SIZE;
+
+ /* iterate through blocks in strip */
+ for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
+ bool update_parity = false;
+ sector_t parity_sector;
+ struct md_rdev *parity_rdev;
+ struct stripe_head sh;
+ int disk;
+
+ dbg(" iter %d start\n", i);
+ memset(page_address(page1), 0, PAGE_SIZE);
+
+ /* iterate through data member disks */
+ for (disk = 0; disk < (conf->raid_disks - conf->max_degraded);
+ disk++) {
+ int dd_idx;
+ struct md_rdev *rdev;
+ sector_t sector;
+ sector_t r_sector = r_sector_first + i +
+ (disk * conf->chunk_sectors);
+
+ dbg(" data member disk %d start\n", disk);
+ if (r_sector > r_sector_last) {
+ dbg(" array sector %llu doesn't need parity update\n",
+ (unsigned long long)r_sector);
+ continue;
+ }
+
+ update_parity = true;
+
+ /* map raid sector to member disk */
+ sector = raid5_compute_sector(conf, r_sector, 0, &dd_idx, NULL);
+ dbg(" processing array sector %llu => data mem disk %d, sector %llu\n",
+ (unsigned long long)r_sector, dd_idx,
+ (unsigned long long)sector);
+
+ rdev = conf->disks[dd_idx].rdev;
+ if (!rdev) {
+ dbg(" data member disk %d missing\n", dd_idx);
+ update_parity = false;
+ break;
+ }
+
+ dbg(" reading data member disk %s sector %llu\n",
+ rdev->bdev->bd_disk->disk_name,
+ (unsigned long long)sector);
+ if (!sync_page_io(rdev, sector, block_size, page2,
+ REQ_OP_READ, 0, false)) {
+ md_error(mddev, rdev);
+ dbg(" read failed!\n");
+ ret = -EIO;
+ goto out;
+ }
+
+ ppl_xor(block_size, page1, page2, page1);
+ }
+
+ if (!update_parity)
+ continue;
+
+ if (e->pp_size > 0) {
+ dbg(" reading pp disk sector %llu\n",
+ (unsigned long long)(ppl_sector + i));
+ if (!sync_page_io(log->rdev,
+ ppl_sector - log->rdev->data_offset + i,
+ block_size, page2, REQ_OP_READ, 0,
+ false)) {
+ dbg(" read failed!\n");
+ md_error(mddev, log->rdev);
+ ret = -EIO;
+ goto out;
+ }
+
+ ppl_xor(block_size, page1, page2, page1);
+ }
+
+ /* map raid sector to parity disk */
+ parity_sector = raid5_compute_sector(conf, r_sector_first + i,
+ 0, &disk, &sh);
+ BUG_ON(sh.pd_idx != e->parity_disk);
+ parity_rdev = conf->disks[sh.pd_idx].rdev;
+
+ BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
+ dbg(" write parity at sector %llu, parity disk %s\n",
+ (unsigned long long)parity_sector,
+ parity_rdev->bdev->bd_disk->disk_name);
+ if (!sync_page_io(parity_rdev, parity_sector, block_size,
+ page1, REQ_OP_WRITE, 0, false)) {
+ dbg(" parity write error!\n");
+ md_error(mddev, parity_rdev);
+ ret = -EIO;
+ goto out;
+ }
+ }
+
+out:
+ __free_pages(pages, 1);
+ return ret;
+}
+
+static int ppl_recover(struct r5l_log *log, struct ppl_header *pplhdr)
+{
+ struct mddev *mddev = log->rdev->mddev;
+ sector_t ppl_sector = log->rdev->ppl.sector + (PPL_HEADER_SIZE >> 9);
+ struct page *page;
+ int i;
+ int ret = 0;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ /* iterate through all PPL entries saved */
+ for (i = 0; i < pplhdr->entries_count; i++) {
+ struct ppl_header_entry *e = &pplhdr->entries[i];
+ u32 size = le32_to_cpu(e->pp_size);
+ sector_t sector = ppl_sector;
+ int ppl_entry_sectors = size >> 9;
+ u32 crc, crc_stored;
+
+ dbg("disk: %d, entry: %d, ppl_sector: %llu ppl_size: %u\n",
+ log->rdev->raid_disk, i, (unsigned long long)ppl_sector,
+ size);
+
+ crc = ~0;
+ crc_stored = le32_to_cpu(e->checksum);
+
+ while (size) {
+ int s = size > PAGE_SIZE ? PAGE_SIZE : size;
+
+ if (!sync_page_io(log->rdev,
+ sector - log->rdev->data_offset,
+ s, page, REQ_OP_READ, 0, false)) {
+ md_error(mddev, log->rdev);
+ ret = -EIO;
+ goto out;
+ }
+
+ crc = crc32c_le(crc, page_address(page), s);
+
+ size -= s;
+ sector += s >> 9;
+ }
+
+ crc = ~crc;
+
+ if (crc != crc_stored) {
+ dbg("ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n",
+ crc_stored, crc);
+ ret++;
+ } else {
+ int ret2;
+ e->data_sector = le64_to_cpu(e->data_sector);
+ e->pp_size = le32_to_cpu(e->pp_size);
+ e->data_size = le32_to_cpu(e->data_size);
+
+ ret2 = ppl_recover_entry(log, e, ppl_sector);
+ if (ret2) {
+ ret = ret2;
+ goto out;
+ }
+ }
+
+ ppl_sector += ppl_entry_sectors;
+ }
+out:
+ __free_page(page);
+ return ret;
+}
+
+static int ppl_write_empty_header(struct r5l_log *log)
+{
+ struct page *page;
+ struct ppl_header *pplhdr;
+ int ret = 0;
+
+ dbg("disk: %d ppl_sector: %llu\n",
+ log->rdev->raid_disk, (unsigned long long)log->rdev->ppl.sector);
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ return -ENOMEM;
+
+ pplhdr = page_address(page);
+ memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
+ pplhdr->signature = cpu_to_le32(log->uuid_checksum);
+ pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));
+
+ if (!sync_page_io(log->rdev, log->rdev->ppl.sector -
+ log->rdev->data_offset, PPL_HEADER_SIZE, page,
+ REQ_OP_WRITE, 0, false)) {
+ md_error(log->rdev->mddev, log->rdev);
+ ret = -EIO;
+ }
+
+ __free_page(page);
+ return ret;
+}
+
+static int ppl_load_distributed(struct r5l_log *log)
+{
+ struct mddev *mddev = log->rdev->mddev;
+ struct page *page;
+ struct ppl_header *pplhdr;
+ u32 crc, crc_stored;
+ int ret = 0;
+
+ dbg("disk: %d\n", log->rdev->raid_disk);
+
+ /* read PPL header */
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ if (!sync_page_io(log->rdev,
+ log->rdev->ppl.sector - log->rdev->data_offset,
+ PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
+ md_error(mddev, log->rdev);
+ ret = -EIO;
+ goto out;
+ }
+ pplhdr = page_address(page);
+
+ /* check header validity */
+ crc_stored = le32_to_cpu(pplhdr->checksum);
+ pplhdr->checksum = 0;
+ crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE);
+
+ if (crc_stored != crc) {
+ dbg("ppl header crc does not match: stored: 0x%x calculated: 0x%x\n",
+ crc_stored, crc);
+ ret = 1;
+ goto out;
+ }
+
+ pplhdr->signature = le32_to_cpu(pplhdr->signature);
+ pplhdr->generation = le64_to_cpu(pplhdr->generation);
+ pplhdr->entries_count = le32_to_cpu(pplhdr->entries_count);
+
+ if (pplhdr->signature != log->uuid_checksum) {
+ dbg("ppl header signature does not match: stored: 0x%x configured: 0x%x\n",
+ pplhdr->signature, log->uuid_checksum);
+ ret = 1;
+ goto out;
+ }
+
+ if (mddev->recovery_cp != MaxSector)
+ ret = ppl_recover(log, pplhdr);
+out:
+ __free_page(page);
+
+ if (ret >= 0) {
+ int ret2 = ppl_write_empty_header(log);
+ if (ret2)
+ ret = ret2;
+ }
+
+ dbg("return: %d\n", ret);
+ return ret;
+}
+
+static int ppl_load(struct r5l_log *log)
+{
+ struct ppl_conf *ppl_conf = log->private;
+ int ret = 0;
+ int i;
+
+ for (i = 0; i < ppl_conf->count; i++) {
+ struct r5l_log *log_child = ppl_conf->child_logs[i];
+ int ret2;
+
+ /* Missing drive */
+ if (!log_child)
+ continue;
+
+ ret2 = ppl_load_distributed(log_child);
+ if (ret2 < 0) {
+ ret = ret2;
+ break;
+ }
+
+ ret += ret2;
+ }
+
+ dbg("return: %d\n", ret);
+ return ret;
+}
+
#define IMSM_MPB_SIG "Intel Raid ISM Cfg Sig. "
#define IMSM_MPB_ORIG_FAMILY_NUM_OFFSET 64
@@ -634,6 +975,12 @@ static int __ppl_init_log(struct r5l_log *log, struct r5conf *conf)
ppl_conf->child_logs[i] = log_child;
}
+ ret = ppl_load(log);
+ if (!ret && mddev->recovery_cp == 0 && !mddev->degraded)
+ mddev->recovery_cp = MaxSector;
+ else if (ret < 0)
+ goto err;
+
rcu_assign_pointer(conf->log, log);
set_bit(MD_HAS_PPL, &mddev->flags);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2169de5..ed340c3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7160,7 +7160,10 @@ static int raid5_run(struct mddev *mddev)
if (mddev->degraded > dirty_parity_disks &&
mddev->recovery_cp != MaxSector) {
- if (mddev->ok_start_degraded)
+ if (rwh_policy)
+ pr_warn("md/raid:%s: starting dirty degraded array with journal.\n",
+ mdname(mddev));
+ else if (mddev->ok_start_degraded)
pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
mdname(mddev));
else {
--
2.10.1
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH v2 11/12] raid5-ppl: support disk add/remove with distributed PPL
2016-12-05 15:31 [PATCH v2 00/12] Partial Parity Log for MD RAID 5 Artur Paszkiewicz
` (9 preceding siblings ...)
2016-12-05 15:31 ` [PATCH v2 10/12] raid5-ppl: recovery from dirty shutdown using PPL Artur Paszkiewicz
@ 2016-12-05 15:31 ` Artur Paszkiewicz
2016-12-07 1:29 ` NeilBrown
2016-12-05 15:31 ` [PATCH v2 12/12] raid5-ppl: runtime PPL enabling or disabling Artur Paszkiewicz
2016-12-07 0:32 ` [PATCH v2 00/12] Partial Parity Log for MD RAID 5 NeilBrown
12 siblings, 1 reply; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-05 15:31 UTC (permalink / raw)
To: shli; +Cc: linux-raid
Add a function to modify the log by adding or removing an rdev when a
drive fails or is added as a spare.
Adding a drive to the log is as simple as initializing and adding a new
child log, removing a drive is more complicated because it requires
stopping the child log and freeing all of its resources. In order to do
that, we busy wait for any submitted log bios to complete and then
manually finish and free the io_units. No new log requests will happen
at this point. A new list is added to struct r5l_io_unit to have access
to stripes that have been written to the log but are not completely
processed yet.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
drivers/md/md.c | 3 +-
drivers/md/raid5-cache.c | 13 ++++++-
drivers/md/raid5-cache.h | 3 ++
drivers/md/raid5-ppl.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++
drivers/md/raid5.c | 20 +++++++++++
5 files changed, 126 insertions(+), 2 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7049833..279e303 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8334,7 +8334,8 @@ static int remove_and_add_spares(struct mddev *mddev,
!test_bit(Blocked, &rdev->flags) &&
((test_bit(RemoveSynchronized, &rdev->flags) ||
(!test_bit(In_sync, &rdev->flags) &&
- !test_bit(Journal, &rdev->flags))) &&
+ !test_bit(Journal, &rdev->flags) &&
+ !test_bit(JournalPpl, &rdev->flags))) &&
atomic_read(&rdev->nr_pending)==0)) {
if (mddev->pers->hot_remove_disk(
mddev, rdev) == 0) {
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index be534d8..b69a289 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -345,7 +345,7 @@ void r5l_io_run_stripes(struct r5l_io_unit *io)
struct stripe_head *sh, *next;
list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
- list_del_init(&sh->log_list);
+ list_move_tail(&sh->log_list, &io->stripe_finished_list);
r5c_finish_cache_stripe(sh);
@@ -553,6 +553,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
io->log = log;
INIT_LIST_HEAD(&io->log_sibling);
INIT_LIST_HEAD(&io->stripe_list);
+ INIT_LIST_HEAD(&io->stripe_finished_list);
bio_list_init(&io->flush_barriers);
io->state = IO_UNIT_RUNNING;
@@ -2546,6 +2547,16 @@ void r5l_exit_log(struct r5l_log *log)
kfree(log);
}
+/*
+ * operation: 0 - remove rdev from log, 1 - add rdev to log
+ */
+int r5l_modify_log(struct r5l_log *log, struct md_rdev *rdev, int operation)
+{
+ if (log && log->policy->modify_log)
+ return log->policy->modify_log(log, rdev, operation);
+ return 0;
+}
+
struct r5l_policy r5l_journal = {
.init_log = __r5l_init_log,
.exit_log = __r5l_exit_log,
diff --git a/drivers/md/raid5-cache.h b/drivers/md/raid5-cache.h
index 0446100..9d5fa0df 100644
--- a/drivers/md/raid5-cache.h
+++ b/drivers/md/raid5-cache.h
@@ -110,6 +110,7 @@ struct r5l_io_unit {
sector_t log_end; /* where the io_unit ends */
struct list_head log_sibling; /* log->running_ios */
struct list_head stripe_list; /* stripes added to the io_unit */
+ struct list_head stripe_finished_list; /* stripes written to log */
int state;
bool need_split_bio;
@@ -139,6 +140,7 @@ enum r5l_io_unit_state {
struct r5l_policy {
int (*init_log)(struct r5l_log *log, struct r5conf *conf);
void (*exit_log)(struct r5l_log *log);
+ int (*modify_log)(struct r5l_log *log, struct md_rdev *rdev, int op);
int (*write_stripe)(struct r5l_log *log, struct stripe_head *sh);
void (*write_stripe_run)(struct r5l_log *log);
void (*flush_stripe_to_raid)(struct r5l_log *log);
@@ -149,6 +151,7 @@ struct r5l_policy {
extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev, int policy_type);
extern void r5l_exit_log(struct r5l_log *log);
+extern int r5l_modify_log(struct r5l_log *log, struct md_rdev *rdev, int operation);
extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh);
extern void r5l_write_stripe_run(struct r5l_log *log);
extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 17e9803..1a9581c 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -108,6 +108,7 @@ static struct r5l_io_unit *ppl_new_iounit(struct r5l_log *log,
io->log = log;
INIT_LIST_HEAD(&io->log_sibling);
INIT_LIST_HEAD(&io->stripe_list);
+ INIT_LIST_HEAD(&io->stripe_finished_list);
io->state = IO_UNIT_RUNNING;
io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
@@ -990,6 +991,93 @@ static int __ppl_init_log(struct r5l_log *log, struct r5conf *conf)
return ret;
}
+static void ppl_log_stop(struct r5l_log *log)
+{
+ struct r5l_io_unit *io, *next;
+ unsigned long flags;
+ bool wait;
+
+ /* wait for in flight ios to complete */
+ do {
+ wait = false;
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ list_for_each_entry(io, &log->running_ios, log_sibling) {
+ if (io->state == IO_UNIT_IO_START) {
+ wait = true;
+ break;
+ }
+ }
+ if (!wait)
+ wait = !list_empty(&log->flushing_ios);
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+ } while (wait);
+
+ /* clean up iounits */
+ spin_lock_irqsave(&log->io_list_lock, flags);
+
+ list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
+ list_move_tail(&io->log_sibling, &log->finished_ios);
+ bio_put(io->current_bio);
+ mempool_free(io->meta_page, log->meta_pool);
+ }
+ list_splice_tail_init(&log->io_end_ios, &log->finished_ios);
+
+ list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
+ struct stripe_head *sh;
+ list_for_each_entry(sh, &io->stripe_list, log_list) {
+ clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
+ sh->log_io = NULL;
+ }
+ r5l_io_run_stripes(io);
+ list_for_each_entry(sh, &io->stripe_finished_list, log_list) {
+ sh->log_io = NULL;
+ }
+ list_del(&io->log_sibling);
+ mempool_free(io, log->io_pool);
+ }
+ r5l_run_no_mem_stripe(log);
+
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+}
+
+static int __ppl_modify_log(struct r5l_log *log, struct md_rdev *rdev, int op)
+{
+ struct r5l_log *log_child;
+ struct ppl_conf *ppl_conf = log->private;
+
+ if (!rdev)
+ return -EINVAL;
+
+ dbg("rdev->raid_disk: %d op: %d\n", rdev->raid_disk, op);
+
+ if (rdev->raid_disk < 0)
+ return 0;
+
+ if (rdev->raid_disk >= ppl_conf->count)
+ return -ENODEV;
+
+ if (op == 0) {
+ log_child = ppl_conf->child_logs[rdev->raid_disk];
+ if (!log_child)
+ return 0;
+ ppl_conf->child_logs[rdev->raid_disk] = NULL;
+ ppl_log_stop(log_child);
+ ppl_exit_log_child(log_child);
+ } else if (op == 1) {
+ int ret = ppl_init_log_child(log, rdev, &log_child);
+ if (ret)
+ return ret;
+ ret = ppl_write_empty_header(log_child);
+ if (ret)
+ return ret;
+ ppl_conf->child_logs[rdev->raid_disk] = log_child;
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int __ppl_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
struct ppl_conf *ppl_conf = log->private;
@@ -1027,6 +1115,7 @@ static void __ppl_flush_stripe_to_raid(struct r5l_log *log)
struct r5l_policy r5l_ppl = {
.init_log = __ppl_init_log,
.exit_log = __ppl_exit_log,
+ .modify_log = __ppl_modify_log,
.write_stripe = __ppl_write_stripe,
.write_stripe_run = __ppl_write_stripe_run,
.flush_stripe_to_raid = __ppl_flush_stripe_to_raid,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ed340c3..67c8dce 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7466,6 +7466,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
*rdevp = rdev;
}
}
+ if (test_bit(JournalPpl, &rdev->flags) && conf->log) {
+ int ret;
+ if (conf->log->rwh_policy != RWH_POLICY_PPL)
+ return -EINVAL;
+ ret = r5l_modify_log(conf->log, rdev, 0);
+ if (ret)
+ return ret;
+ if (p->replacement) {
+ ret = r5l_modify_log(conf->log, p->replacement, 1);
+ if (ret)
+ return ret;
+ }
+ }
if (p->replacement) {
/* We must have just cleared 'rdev' */
p->rdev = p->replacement;
@@ -7558,6 +7571,13 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
}
}
out:
+ if (conf->log && !test_bit(Replacement, &rdev->flags) &&
+ conf->log->rwh_policy == RWH_POLICY_PPL) {
+ int ret = r5l_modify_log(conf->log, rdev, 1);
+ if (ret)
+ return ret;
+ }
+
print_raid5_conf(conf);
return err;
}
--
2.10.1
^ permalink raw reply related [flat|nested] 38+ messages in thread
* [PATCH v2 12/12] raid5-ppl: runtime PPL enabling or disabling
2016-12-05 15:31 [PATCH v2 00/12] Partial Parity Log for MD RAID 5 Artur Paszkiewicz
` (10 preceding siblings ...)
2016-12-05 15:31 ` [PATCH v2 11/12] raid5-ppl: support disk add/remove with distributed PPL Artur Paszkiewicz
@ 2016-12-05 15:31 ` Artur Paszkiewicz
2016-12-07 0:32 ` [PATCH v2 00/12] Partial Parity Log for MD RAID 5 NeilBrown
12 siblings, 0 replies; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-05 15:31 UTC (permalink / raw)
To: shli; +Cc: linux-raid
Introduce a sysfs attribute to get or set the current RWH policy. The
raid5_reset_cache function is used to free the stripe cache and allocate
it again. This is needed to allocate or free the ppl_pages for the
stripes in the stripe cache.
When enabling the log at runtime it is necessary to overwrite the PPL
header to avoid recovering from stale PPL data if the log had been used
previously with this array.
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
---
drivers/md/raid5-ppl.c | 38 ++++++++++++++---
drivers/md/raid5.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 142 insertions(+), 5 deletions(-)
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 1a9581c..d0a25da 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -745,6 +745,27 @@ static int ppl_load(struct r5l_log *log)
return ret;
}
+static int ppl_invalidate(struct r5l_log *log)
+{
+ struct ppl_conf *ppl_conf = log->private;
+ int i;
+
+ for (i = 0; i < ppl_conf->count; i++) {
+ struct r5l_log *log_child = ppl_conf->child_logs[i];
+ int ret;
+
+ /* Missing drive */
+ if (!log_child)
+ continue;
+
+ ret = ppl_write_empty_header(log_child);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
#define IMSM_MPB_SIG "Intel Raid ISM Cfg Sig. "
#define IMSM_MPB_ORIG_FAMILY_NUM_OFFSET 64
@@ -976,11 +997,18 @@ static int __ppl_init_log(struct r5l_log *log, struct r5conf *conf)
ppl_conf->child_logs[i] = log_child;
}
- ret = ppl_load(log);
- if (!ret && mddev->recovery_cp == 0 && !mddev->degraded)
- mddev->recovery_cp = MaxSector;
- else if (ret < 0)
- goto err;
+ if (mddev->pers) {
+ dbg("Array running - invalidate PPL\n");
+ ret = ppl_invalidate(log);
+ if (ret)
+ goto err;
+ } else {
+ ret = ppl_load(log);
+ if (!ret && mddev->recovery_cp == 0 && !mddev->degraded)
+ mddev->recovery_cp = MaxSector;
+ else if (ret < 0)
+ goto err;
+ }
rcu_assign_pointer(conf->log, log);
set_bit(MD_HAS_PPL, &mddev->flags);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 67c8dce..d829a28 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6455,6 +6455,114 @@ raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
raid5_show_group_thread_cnt,
raid5_store_group_thread_cnt);
+static ssize_t
+raid5_show_rwh_policy(struct mddev *mddev, char *page)
+{
+ struct r5conf *conf;
+ int ret = 0;
+ spin_lock(&mddev->lock);
+ conf = mddev->private;
+ if (conf) {
+ const char *policy = NULL;
+ if (!conf->log)
+ policy = "off";
+ else if (conf->log->rwh_policy == RWH_POLICY_JOURNAL)
+ policy = "journal";
+ else if (conf->log->rwh_policy == RWH_POLICY_PPL)
+ policy = "ppl";
+ if (policy)
+ ret = sprintf(page, "%s\n", policy);
+ }
+ spin_unlock(&mddev->lock);
+ return ret;
+}
+
+static void raid5_reset_cache(struct mddev *mddev)
+{
+ struct r5conf *conf = mddev->private;
+
+ mutex_lock(&conf->cache_size_mutex);
+ while (conf->max_nr_stripes &&
+ drop_one_stripe(conf))
+ ;
+
+ while (conf->min_nr_stripes > conf->max_nr_stripes)
+ if (!grow_one_stripe(conf, GFP_KERNEL))
+ break;
+ mutex_unlock(&conf->cache_size_mutex);
+}
+
+static ssize_t
+raid5_store_rwh_policy(struct mddev *mddev, const char *page, size_t len)
+{
+ struct r5conf *conf;
+ int err;
+ int new_policy, current_policy;
+
+ if (len >= PAGE_SIZE)
+ return -EINVAL;
+
+ err = mddev_lock(mddev);
+ if (err)
+ return err;
+ conf = mddev->private;
+ if (!conf) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ if (conf->log)
+ current_policy = conf->log->rwh_policy;
+ else
+ current_policy = RWH_POLICY_OFF;
+
+ if (strncmp(page, "off", 3) == 0) {
+ new_policy = RWH_POLICY_OFF;
+ } else if (strncmp(page, "journal", 7) == 0) {
+ new_policy = RWH_POLICY_JOURNAL;
+ } else if (strncmp(page, "ppl", 3) == 0) {
+ new_policy = RWH_POLICY_PPL;
+ } else {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (new_policy == current_policy)
+ goto out;
+
+ if (current_policy == RWH_POLICY_PPL && new_policy == RWH_POLICY_OFF) {
+ struct r5l_log *log;
+ mddev_suspend(mddev);
+ log = conf->log;
+ conf->log = NULL;
+ synchronize_rcu();
+ r5l_exit_log(log);
+ raid5_reset_cache(mddev);
+ mddev_resume(mddev);
+ } else if (current_policy == RWH_POLICY_OFF &&
+ new_policy == RWH_POLICY_PPL) {
+ mddev_suspend(mddev);
+ err = r5l_init_log(conf, NULL, new_policy);
+ if (!err)
+ raid5_reset_cache(mddev);
+ mddev_resume(mddev);
+ } else {
+ err = -EINVAL;
+ goto out;
+ }
+
+ md_update_sb(mddev, 1);
+out:
+ mddev_unlock(mddev);
+
+ return err ?: len;
+}
+
+static struct md_sysfs_entry
+raid5_rwh_policy = __ATTR(rwh_policy, S_IRUGO | S_IWUSR,
+ raid5_show_rwh_policy,
+ raid5_store_rwh_policy);
+
static struct attribute *raid5_attrs[] = {
&raid5_stripecache_size.attr,
&raid5_stripecache_active.attr,
@@ -6463,6 +6571,7 @@ static struct attribute *raid5_attrs[] = {
&raid5_skip_copy.attr,
&raid5_rmw_level.attr,
&r5c_journal_mode.attr,
+ &raid5_rwh_policy.attr,
NULL,
};
static struct attribute_group raid5_attrs_group = {
--
2.10.1
^ permalink raw reply related [flat|nested] 38+ messages in thread
* Re: [PATCH v2 00/12] Partial Parity Log for MD RAID 5
2016-12-05 15:31 [PATCH v2 00/12] Partial Parity Log for MD RAID 5 Artur Paszkiewicz
` (11 preceding siblings ...)
2016-12-05 15:31 ` [PATCH v2 12/12] raid5-ppl: runtime PPL enabling or disabling Artur Paszkiewicz
@ 2016-12-07 0:32 ` NeilBrown
2016-12-07 14:36 ` Artur Paszkiewicz
12 siblings, 1 reply; 38+ messages in thread
From: NeilBrown @ 2016-12-07 0:32 UTC (permalink / raw)
To: Artur Paszkiewicz, shli; +Cc: linux-raid
[-- Attachment #1: Type: text/plain, Size: 1768 bytes --]
On Tue, Dec 06 2016, Artur Paszkiewicz wrote:
> This series of patches implements the Partial Parity Log for RAID5 arrays. The
> purpose of this feature is closing the RAID Write Hole. It is a solution
> alternative to the existing raid5-cache, but the implementation is based on it
> and reuses some of the code by introducing support for interchangeable
> policies. This allows decoupling policy from mechanism and not adding more
> boilerplate code in raid5.c.
>
> The issue addressed by PPL is, that on a dirty shutdown, parity for a
> particular stripe may be inconsistent with data on other member disks. In
> degraded state, there is no way to recalculate parity, because one of the disks
> is missing. PPL addresses this issue and allows recalculating the parity. It
> stores only enough data needed for recovering from RWH and is not a true
> journal, like the raid5-cache implementation. It does not protect from losing
> in-flight data.
>
> PPL is a distributed log - data is stored on all RAID member drives in the
> metadata area. It does not need a dedicated journaling drive. Performance is
> reduced by up to 30%-40% but it scales with the number of drives in the array
> and the journaling drive does not become a bottleneck.
I would expect to see a description of what a PPL actually is and how
it works here... but there is none.
The change-log for patch 06 has a tiny bit more information which is
just enough to be able to start trying to understand the code, but it
isn't much.
And none of this description gets into the code, or into the
Documentation/. This makes it hard to review and hard to maintain.
Remember: if you want people to review your code, it is in your interest
to make it easy. That means give lots of details.
NeilBrown
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH v2 00/12] Partial Parity Log for MD RAID 5
2016-12-07 0:32 ` [PATCH v2 00/12] Partial Parity Log for MD RAID 5 NeilBrown
@ 2016-12-07 14:36 ` Artur Paszkiewicz
2016-12-07 17:09 ` Shaohua Li
0 siblings, 1 reply; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-07 14:36 UTC (permalink / raw)
To: NeilBrown, shli; +Cc: linux-raid
On 12/07/2016 01:32 AM, NeilBrown wrote:
>
> I would expect to see as description of what a PPL actually is and how
> it works here... but there is none.
>
> The change-log for patch 06 has a tiny bit more information which is
> just enough to be able to start trying to understand the code, but it
> isn't much.
> And none of this description gets into the code, or into the
> Documentation/. This makes it hard to review and hard to maintain.
>
> Remember: if you want people to review you code, it is in your interest
> to make it easy. That means give lots of details.
Hi Neil,
Thank you for taking the time to look at this and for your feedback. I
didn't try to make it hard to review... Sometimes it's easy to forget
how non-obvious things are after looking at them for too long :) I will
improve the descriptions and address the issues that you found in the
next version of the patches.
Thanks,
Artur
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH v2 00/12] Partial Parity Log for MD RAID 5
2016-12-07 14:36 ` Artur Paszkiewicz
@ 2016-12-07 17:09 ` Shaohua Li
2016-12-13 15:25 ` Jes Sorensen
0 siblings, 1 reply; 38+ messages in thread
From: Shaohua Li @ 2016-12-07 17:09 UTC (permalink / raw)
To: Artur Paszkiewicz; +Cc: NeilBrown, linux-raid
On Wed, Dec 07, 2016 at 03:36:01PM +0100, Artur Paszkiewicz wrote:
> On 12/07/2016 01:32 AM, NeilBrown wrote:
> >
> > I would expect to see as description of what a PPL actually is and how
> > it works here... but there is none.
> >
> > The change-log for patch 06 has a tiny bit more information which is
> > just enough to be able to start trying to understand the code, but it
> > isn't much.
> > And none of this description gets into the code, or into the
> > Documentation/. This makes it hard to review and hard to maintain.
> >
> > Remember: if you want people to review you code, it is in your interest
> > to make it easy. That means give lots of details.
>
> Hi Neil,
>
> Thank you for taking the time to look at this and for your feedback. I
> didn't try to make it hard to review... Sometimes it's easy to forget
> how non-obvious things are after looking at them for too long :) I will
> improve the descriptions and address the issues that you found in the
> next version of the patches.
Haven't looked at the patches yet, been busy recently, sorry! When you repost
these, I'd like to know why we need another log for raid5, considering we
already have one to fix a similar issue. What are the good/bad sides of this new
log? The fact that such a feature exists in Intel RSTe doesn't sound like a
technical reason we should support this.
Thanks,
Shaohua
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH v2 00/12] Partial Parity Log for MD RAID 5
2016-12-07 17:09 ` Shaohua Li
@ 2016-12-13 15:25 ` Jes Sorensen
2016-12-14 19:47 ` Shaohua Li
0 siblings, 1 reply; 38+ messages in thread
From: Jes Sorensen @ 2016-12-13 15:25 UTC (permalink / raw)
To: Shaohua Li; +Cc: Artur Paszkiewicz, NeilBrown, linux-raid
Shaohua Li <shli@kernel.org> writes:
> On Wed, Dec 07, 2016 at 03:36:01PM +0100, Artur Paszkiewicz wrote:
>> On 12/07/2016 01:32 AM, NeilBrown wrote:
>> >
>> > I would expect to see as description of what a PPL actually is and how
>> > it works here... but there is none.
>> >
>> > The change-log for patch 06 has a tiny bit more information which is
>> > just enough to be able to start trying to understand the code, but it
>> > isn't much.
>> > And none of this description gets into the code, or into the
>> > Documentation/. This makes it hard to review and hard to maintain.
>> >
>> > Remember: if you want people to review you code, it is in your interest
>> > to make it easy. That means give lots of details.
>>
>> Hi Neil,
>>
>> Thank you for taking the time to look at this and for your feedback. I
>> didn't try to make it hard to review... Sometimes it's easy to forget
>> how non-obvious things are after looking at them for too long :) I will
>> improve the descriptions and address the issues that you found in the
>> next version of the patches.
>
> Havn't looked at the patches yet, being busy recently, sorry! When you repost
> these, I'd like to know why we need another log for raid5 considering we
> already had one to fix similar issue. What's the good/bad side of this new log?
> There is such feature in Intel RSTe doesn't sound like a technical reason we
> should support this.
Shaohua,
Any further thoughts on these patches? I am considering doing a release
of mdadm early in the new year. It would be nice to include these
patches if the feature is going in.
As for supporting it, if IMSM supports it and it is used in the field,
then it seems legitimate for Linux to support it too. Just like we
support so many other obscure pieces of hardware :)
Cheers,
Jes
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH v2 00/12] Partial Parity Log for MD RAID 5
2016-12-13 15:25 ` Jes Sorensen
@ 2016-12-14 19:47 ` Shaohua Li
2016-12-15 11:44 ` Artur Paszkiewicz
0 siblings, 1 reply; 38+ messages in thread
From: Shaohua Li @ 2016-12-14 19:47 UTC (permalink / raw)
To: Jes Sorensen; +Cc: Artur Paszkiewicz, NeilBrown, linux-raid
On Tue, Dec 13, 2016 at 10:25:04AM -0500, Jes Sorensen wrote:
> Shaohua Li <shli@kernel.org> writes:
> > On Wed, Dec 07, 2016 at 03:36:01PM +0100, Artur Paszkiewicz wrote:
> >> On 12/07/2016 01:32 AM, NeilBrown wrote:
> >> >
> >> > I would expect to see as description of what a PPL actually is and how
> >> > it works here... but there is none.
> >> >
> >> > The change-log for patch 06 has a tiny bit more information which is
> >> > just enough to be able to start trying to understand the code, but it
> >> > isn't much.
> >> > And none of this description gets into the code, or into the
> >> > Documentation/. This makes it hard to review and hard to maintain.
> >> >
> >> > Remember: if you want people to review you code, it is in your interest
> >> > to make it easy. That means give lots of details.
> >>
> >> Hi Neil,
> >>
> >> Thank you for taking the time to look at this and for your feedback. I
> >> didn't try to make it hard to review... Sometimes it's easy to forget
> >> how non-obvious things are after looking at them for too long :) I will
> >> improve the descriptions and address the issues that you found in the
> >> next version of the patches.
> >
> > Havn't looked at the patches yet, being busy recently, sorry! When you repost
> > these, I'd like to know why we need another log for raid5 considering we
> > already had one to fix similar issue. What's the good/bad side of this new log?
> > There is such feature in Intel RSTe doesn't sound like a technical reason we
> > should support this.
>
> Shaohua,
>
> Any further thought on these patches? I am considering doing a release
> of mdadm early in the new year. it would be nice to include these
> patches if the feature is going in.
>
> As for supporting it, if IMSM supports it and it is used in the field,
> then it seems legitimate for Linux to support it too. Just like we
> support so many other obscure pieces of hardware :)
Sure, I don't object to support it. Just need to understand how it works. Had a
brief review. The ondisk format looks good. That probably is related to mdadm
mostly. The disk format has alignment issue as Neil noted, which would be
unfriendly for non-x86 arch. Will we stick to this disk format or change it?
We'd make a decision.
For the implementation, I don't understand how the ppl works much, there aren't
many details there. Two things I noted:
- The code skips the log for full stripe write. This isn't good. It would mean
that after an unclean shutdown/recovery, one disk has arbitrary data, neither the
old data nor the new data. This breaks an assumption in the filesystem: after a
failed write to a sector, the sector has either the old or the new data. Think
about a write to the superblock. The data could be the old or the new superblock,
but it's still a superblock, not something random.
- From patches 6 & 10, it looks like PPL only helps recover unwritten disks. If
one disk of a stripe is dirty (e.g. it was written before an unclean shutdown),
and it's lost in recovery, what will happen? It seems the data of the lost disk
will be read as 0? That would break the assumption above too. If I understand the
code correctly (maybe not — need clarification), this is a design flaw.
Thanks,
Shaohua
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH v2 00/12] Partial Parity Log for MD RAID 5
2016-12-14 19:47 ` Shaohua Li
@ 2016-12-15 11:44 ` Artur Paszkiewicz
2016-12-16 23:24 ` Shaohua Li
0 siblings, 1 reply; 38+ messages in thread
From: Artur Paszkiewicz @ 2016-12-15 11:44 UTC (permalink / raw)
To: Shaohua Li, Jes Sorensen; +Cc: NeilBrown, linux-raid
On 12/14/2016 08:47 PM, Shaohua Li wrote:
> On Tue, Dec 13, 2016 at 10:25:04AM -0500, Jes Sorensen wrote:
>> Shaohua Li <shli@kernel.org> writes:
>>> On Wed, Dec 07, 2016 at 03:36:01PM +0100, Artur Paszkiewicz wrote:
>>>> On 12/07/2016 01:32 AM, NeilBrown wrote:
>>>>>
>>>>> I would expect to see as description of what a PPL actually is and how
>>>>> it works here... but there is none.
>>>>>
>>>>> The change-log for patch 06 has a tiny bit more information which is
>>>>> just enough to be able to start trying to understand the code, but it
>>>>> isn't much.
>>>>> And none of this description gets into the code, or into the
>>>>> Documentation/. This makes it hard to review and hard to maintain.
>>>>>
>>>>> Remember: if you want people to review you code, it is in your interest
>>>>> to make it easy. That means give lots of details.
>>>>
>>>> Hi Neil,
>>>>
>>>> Thank you for taking the time to look at this and for your feedback. I
>>>> didn't try to make it hard to review... Sometimes it's easy to forget
>>>> how non-obvious things are after looking at them for too long :) I will
>>>> improve the descriptions and address the issues that you found in the
>>>> next version of the patches.
>>>
>>> Havn't looked at the patches yet, being busy recently, sorry! When you repost
>>> these, I'd like to know why we need another log for raid5 considering we
>>> already had one to fix similar issue. What's the good/bad side of this new log?
>>> There is such feature in Intel RSTe doesn't sound like a technical reason we
>>> should support this.
>>
>> Shaohua,
>>
>> Any further thought on these patches? I am considering doing a release
>> of mdadm early in the new year. it would be nice to include these
>> patches if the feature is going in.
>>
>> As for supporting it, if IMSM supports it and it is used in the field,
>> then it seems legitimate for Linux to support it too. Just like we
>> support so many other obscure pieces of hardware :)
>
> Sure, I don't object to support it. Just need to understand how it works. Had a
> brief review. The ondisk format looks good. That probably is related to mdadm
> mostly. The disk format has alignment issue as Neil noted, which would be
> unfriendly for non-x86 arch. Will we stick to this disk format or change it?
> We'd make a decision.
This alignment issue will be fixed by extending the 'parity_disk' field
to 4 bytes. The 'checksum' field will then be properly aligned and the
size of the structure will be 24 bytes, also fixing the array alignment.
> For the implementation, I don't understand how the ppl works much, there aren't
> many details there. Two things I noted:
>
> - The code skips the log for full stripe write. This isn't good. It would means
> after a unclean shutdown/recovery, one disk has arbitrary data, not the old
> data and new data. This breaks an assumption in filesystem, after a failed
> write to a sector, the sector has either old or new data. Thinking about a
> write to superblock. The data could be old or new superblock, but it's still a
> superblock, not something random.
>
> - From the patch 6 & 10, looks PPL only help recover unwritten disks. If one
> disk of a stripe is dirty (eg it's written before unclean shutdown), and it's
> lost in recovery, what will happen? Seems the data of lost disk will be read as
> 0? It will break the assumption above too. If I understand the code clearly
> (maybe not, need clarification), this is a design flaw.
PPL is only used to update the parity for a stripe, data chunks are not
modified at all during PPL recovery. The assumption was that it would
protect only from silent data corruption, to eliminate the cases when
data that was not touched by a write request could change. So if a dirty
disk is lost, no recovery is performed for this stripe (parity is not
updated). For full stripe write we only recalculate the parity after a
dirty shutdown if all disks are available (like resync). So you are
right that it is still possible to have arbitrary data in the written
part of a stripe if that disk is lost. In such case the behavior is the
same as in plain raid5.
Thanks,
Artur
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH v2 00/12] Partial Parity Log for MD RAID 5
2016-12-15 11:44 ` Artur Paszkiewicz
@ 2016-12-16 23:24 ` Shaohua Li
2017-01-03 15:42 ` Jes Sorensen
0 siblings, 1 reply; 38+ messages in thread
From: Shaohua Li @ 2016-12-16 23:24 UTC (permalink / raw)
To: Artur Paszkiewicz; +Cc: Jes Sorensen, NeilBrown, linux-raid
On Thu, Dec 15, 2016 at 12:44:57PM +0100, Artur Paszkiewicz wrote:
> On 12/14/2016 08:47 PM, Shaohua Li wrote:
> > On Tue, Dec 13, 2016 at 10:25:04AM -0500, Jes Sorensen wrote:
> >> Shaohua Li <shli@kernel.org> writes:
> >>> On Wed, Dec 07, 2016 at 03:36:01PM +0100, Artur Paszkiewicz wrote:
> >>>> On 12/07/2016 01:32 AM, NeilBrown wrote:
> >>>>>
> >>>>> I would expect to see as description of what a PPL actually is and how
> >>>>> it works here... but there is none.
> >>>>>
> >>>>> The change-log for patch 06 has a tiny bit more information which is
> >>>>> just enough to be able to start trying to understand the code, but it
> >>>>> isn't much.
> >>>>> And none of this description gets into the code, or into the
> >>>>> Documentation/. This makes it hard to review and hard to maintain.
> >>>>>
> >>>>> Remember: if you want people to review you code, it is in your interest
> >>>>> to make it easy. That means give lots of details.
> >>>>
> >>>> Hi Neil,
> >>>>
> >>>> Thank you for taking the time to look at this and for your feedback. I
> >>>> didn't try to make it hard to review... Sometimes it's easy to forget
> >>>> how non-obvious things are after looking at them for too long :) I will
> >>>> improve the descriptions and address the issues that you found in the
> >>>> next version of the patches.
> >>>
> >>> Havn't looked at the patches yet, being busy recently, sorry! When you repost
> >>> these, I'd like to know why we need another log for raid5 considering we
> >>> already had one to fix similar issue. What's the good/bad side of this new log?
> >>> There is such feature in Intel RSTe doesn't sound like a technical reason we
> >>> should support this.
> >>
> >> Shaohua,
> >>
> >> Any further thought on these patches? I am considering doing a release
> >> of mdadm early in the new year. it would be nice to include these
> >> patches if the feature is going in.
> >>
> >> As for supporting it, if IMSM supports it and it is used in the field,
> >> then it seems legitimate for Linux to support it too. Just like we
> >> support so many other obscure pieces of hardware :)
> >
> > Sure, I don't object to support it. Just need to understand how it works. Had a
> > brief review. The ondisk format looks good. That probably is related to mdadm
> > mostly. The disk format has alignment issue as Neil noted, which would be
> > unfriendly for non-x86 arch. Will we stick to this disk format or change it?
> > We'd make a decision.
>
> This alignment issue will be fixed by extending the 'parity_disk' field
> to 4 bytes. The 'checksum' field will then be properly aligned and the
> size of the structure will be 24 bytes, also fixing the array alignment.
>
> > For the implementation, I don't understand how the ppl works much, there aren't
> > many details there. Two things I noted:
> >
> > - The code skips the log for full stripe write. This isn't good. It would means
> > after a unclean shutdown/recovery, one disk has arbitrary data, not the old
> > data and new data. This breaks an assumption in filesystem, after a failed
> > write to a sector, the sector has either old or new data. Thinking about a
> > write to superblock. The data could be old or new superblock, but it's still a
> > superblock, not something random.
> >
> > - From the patch 6 & 10, looks PPL only help recover unwritten disks. If one
> > disk of a stripe is dirty (eg it's written before unclean shutdown), and it's
> > lost in recovery, what will happen? Seems the data of lost disk will be read as
> > 0? It will break the assumption above too. If I understand the code clearly
> > (maybe not, need clarification), this is a design flaw.
>
> PPL is only used to update the parity for a stripe, data chunks are not
> modified at all during PPL recovery. The assumption was that it would
> protect only from silent data corruption, to eliminate the cases when
> data that was not touched by a write request could change. So if a dirty
> disk is lost, no recovery is performed for this stripe (parity is not
> updated). For full stripe write we only recalculate the parity after a
> dirty shutdown if all disks are available (like resync). So you are
> right that it is still possible to have arbitrary data in the written
> part of a stripe if that disk is lost. In such case the behavior is the
> same as in plain raid5.
Ok, this matches my understanding. This isn't a completed solution but does
help a lot. If users want to use this, there is no reason to not support it.
After you fix the alignment issue and describe the solution in details, I'll
look at it again.
Thanks,
Shaohua
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH v2 00/12] Partial Parity Log for MD RAID 5
2016-12-16 23:24 ` Shaohua Li
@ 2017-01-03 15:42 ` Jes Sorensen
2017-01-04 8:01 ` Artur Paszkiewicz
0 siblings, 1 reply; 38+ messages in thread
From: Jes Sorensen @ 2017-01-03 15:42 UTC (permalink / raw)
To: Shaohua Li; +Cc: Artur Paszkiewicz, NeilBrown, linux-raid
Shaohua Li <shli@kernel.org> writes:
> On Thu, Dec 15, 2016 at 12:44:57PM +0100, Artur Paszkiewicz wrote:
>> On 12/14/2016 08:47 PM, Shaohua Li wrote:
>> > For the implementation, I don't understand how the ppl works much,
>> > there aren't
>> > many details there. Two things I noted:
>> >
>> > - The code skips the log for full stripe write. This isn't
>> > good. It would means
>> > after a unclean shutdown/recovery, one disk has arbitrary data,
>> > not the old
>> > data and new data. This breaks an assumption in filesystem, after a failed
>> > write to a sector, the sector has either old or new data. Thinking about a
>> > write to superblock. The data could be old or new superblock,
>> > but it's still a
>> > superblock, not something random.
>> >
>> > - From the patch 6 & 10, looks PPL only help recover unwritten disks. If one
>> > disk of a stripe is dirty (eg it's written before unclean
>> > shutdown), and it's
>> > lost in recovery, what will happen? Seems the data of lost disk
>> > will be read as
>> > 0? It will break the assumption above too. If I understand the
>> > code clearly
>> > (maybe not, need clarification), this is a design flaw.
>>
>> PPL is only used to update the parity for a stripe, data chunks are not
>> modified at all during PPL recovery. The assumption was that it would
>> protect only from silent data corruption, to eliminate the cases when
>> data that was not touched by a write request could change. So if a dirty
>> disk is lost, no recovery is performed for this stripe (parity is not
>> updated). For full stripe write we only recalculate the parity after a
>> dirty shutdown if all disks are available (like resync). So you are
>> right that it is still possible to have arbitrary data in the written
>> part of a stripe if that disk is lost. In such case the behavior is the
>> same as in plain raid5.
>
> Ok, this matches my understanding. This isn't a completed solution but does
> help a lot. If users want to use this, there is no reason to not support it.
> After you fix the alignment issue and describe the solution in details, I'll
> look at it again.
Artur,
Did you make any progress getting the alignment issue resolved?
I'd really like to get an mdadm release out the door this week, so
getting this resolved would be awesome. Hint hint ;)
Cheers,
Jes
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH v2 00/12] Partial Parity Log for MD RAID 5
2017-01-03 15:42 ` Jes Sorensen
@ 2017-01-04 8:01 ` Artur Paszkiewicz
2017-01-04 13:29 ` Jes Sorensen
0 siblings, 1 reply; 38+ messages in thread
From: Artur Paszkiewicz @ 2017-01-04 8:01 UTC (permalink / raw)
To: Jes Sorensen, Shaohua Li; +Cc: NeilBrown, linux-raid
On 01/03/2017 04:42 PM, Jes Sorensen wrote:
> Artur,
>
> Did you make any progress getting the alignment issue resolved?
>
> I'd really like to get an mdadm release out the door this week, so
> getting this resolved would be awesome. Hint hint ;)
Jes,
The alignment issue is fixed but I'm currently making changes in the
kernel part after comments from Neil. This also requires modifications
in mdadm, maybe even more than in the kernel. I need a few more days to
finish this, I'll be sending the next version of the patches probably
next week. So don't let me keep you if you want to make a release this
week.
Thanks,
Artur
^ permalink raw reply [flat|nested] 38+ messages in thread
* Re: [PATCH v2 00/12] Partial Parity Log for MD RAID 5
2017-01-04 8:01 ` Artur Paszkiewicz
@ 2017-01-04 13:29 ` Jes Sorensen
0 siblings, 0 replies; 38+ messages in thread
From: Jes Sorensen @ 2017-01-04 13:29 UTC (permalink / raw)
To: Artur Paszkiewicz; +Cc: Shaohua Li, NeilBrown, linux-raid
Artur Paszkiewicz <artur.paszkiewicz@intel.com> writes:
> On 01/03/2017 04:42 PM, Jes Sorensen wrote:
>> Artur,
>>
>> Did you make any progress getting the alignment issue resolved?
>>
>> I'd really like to get an mdadm release out the door this week, so
>> getting this resolved would be awesome. Hint hint ;)
>
> Jes,
>
> The alignment issue is fixed but I'm currently making changes in the
> kernel part after comments from Neil. This also requires modifications
> in mdadm, maybe even more than in the kernel. I need a few more days to
> finish this, I'll be sending the next version of the patches probably
> next week. So don't let me keep you if you want to make a release this
> week.
Artur,
Thanks for the update - it sounds like it is probably wise to let this
stew a bit and let all the dust settle before we put it into an official
release.
Cheers,
Jes
^ permalink raw reply [flat|nested] 38+ messages in thread