* [PATCH 1/4] raid5-cache: use rcu api to access r5conf->log
2016-01-05 21:26 [PATCH 0/4]raid5-cache: fix journal hotadd Shaohua Li
@ 2016-01-05 21:26 ` Shaohua Li
2016-01-06 1:04 ` NeilBrown
2016-01-05 21:26 ` [PATCH 2/4] raid5-cache: avoid write failure for journal hotadd Shaohua Li
` (2 subsequent siblings)
3 siblings, 1 reply; 10+ messages in thread
From: Shaohua Li @ 2016-01-05 21:26 UTC (permalink / raw)
To: linux-raid; +Cc: Kernel-team, songliubraving, neilb
If we add journal disk to an array which isn't created with journal, IO
might be running in the array. We must be careful about the log checks
to make sure log is fully initialized, which is the job of
rcu_dereference(). Don't need rcu read lock protection here, as
hotremove only happens when there is no write requests.
Signed-off-by: Shaohua Li <shli@fb.com>
---
drivers/md/raid5-cache.c | 19 +++++++++++++------
drivers/md/raid5.c | 20 ++++++++++----------
drivers/md/raid5.h | 10 +++++-----
3 files changed, 28 insertions(+), 21 deletions(-)
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index d242a36..31e0fad 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -438,7 +438,7 @@ static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
* running in raid5d, where reclaim could wait for raid5d too (when it flushes
* data from log to raid disks), so we shouldn't wait for reclaim here
*/
-int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
+int r5l_write_stripe(struct r5conf *conf, struct stripe_head *sh)
{
int write_disks = 0;
int data_pages, parity_pages;
@@ -446,6 +446,7 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
int reserve;
int i;
int ret = 0;
+ struct r5l_log *log = rcu_dereference(conf->log);
if (!log)
return -EAGAIN;
@@ -513,8 +514,9 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
return 0;
}
-void r5l_write_stripe_run(struct r5l_log *log)
+void r5l_write_stripe_run(struct r5conf *conf)
{
+ struct r5l_log *log = rcu_dereference(conf->log);
if (!log)
return;
mutex_lock(&log->io_mutex);
@@ -522,8 +524,10 @@ void r5l_write_stripe_run(struct r5l_log *log)
mutex_unlock(&log->io_mutex);
}
-int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
+int r5l_handle_flush_request(struct r5conf *conf, struct bio *bio)
{
+ struct r5l_log *log = ACCESS_ONCE(conf->log);
+
if (!log)
return -ENODEV;
/*
@@ -664,9 +668,10 @@ static void r5l_log_flush_endio(struct bio *bio)
* only write stripes of an io_unit to raid disks till the io_unit is the first
* one whose data/parity is in log.
*/
-void r5l_flush_stripe_to_raid(struct r5l_log *log)
+void r5l_flush_stripe_to_raid(struct r5conf *conf)
{
bool do_flush;
+ struct r5l_log *log = rcu_dereference(conf->log);
if (!log || !log->need_cache_flush)
return;
@@ -800,7 +805,7 @@ static void r5l_reclaim_thread(struct md_thread *thread)
{
struct mddev *mddev = thread->mddev;
struct r5conf *conf = mddev->private;
- struct r5l_log *log = conf->log;
+ struct r5l_log *log = rcu_dereference(conf->log);
if (!log)
return;
@@ -820,9 +825,11 @@ static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
md_wakeup_thread(log->reclaim_thread);
}
-void r5l_quiesce(struct r5l_log *log, int state)
+void r5l_quiesce(struct r5conf *conf, int state)
{
struct mddev *mddev;
+ struct r5l_log *log = rcu_dereference(conf->log);
+
if (!log || state == 2)
return;
if (state == 0) {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a086014..e74ead1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -757,7 +757,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
- if (conf->log)
+ if (ACCESS_ONCE(conf->log))
return false;
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
@@ -897,7 +897,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
might_sleep();
- if (r5l_write_stripe(conf->log, sh) == 0)
+ if (r5l_write_stripe(conf, sh) == 0)
return;
for (i = disks; i--; ) {
int rw;
@@ -5148,7 +5148,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
bool do_prepare;
if (unlikely(bi->bi_rw & REQ_FLUSH)) {
- int ret = r5l_handle_flush_request(conf->log, bi);
+ int ret = r5l_handle_flush_request(conf, bi);
if (ret == 0)
return;
@@ -5751,7 +5751,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
break;
if (i == NR_STRIPE_HASH_LOCKS) {
spin_unlock_irq(&conf->device_lock);
- r5l_flush_stripe_to_raid(conf->log);
+ r5l_flush_stripe_to_raid(conf);
spin_lock_irq(&conf->device_lock);
return batch_size;
}
@@ -5762,7 +5762,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
release_inactive_stripe_list(conf, temp_inactive_list,
NR_STRIPE_HASH_LOCKS);
- r5l_flush_stripe_to_raid(conf->log);
+ r5l_flush_stripe_to_raid(conf);
if (release_inactive) {
spin_lock_irq(&conf->device_lock);
return 0;
@@ -5770,7 +5770,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
for (i = 0; i < batch_size; i++)
handle_stripe(batch[i]);
- r5l_write_stripe_run(conf->log);
+ r5l_write_stripe_run(conf);
cond_resched();
@@ -5904,7 +5904,7 @@ static void raid5d(struct md_thread *thread)
mutex_unlock(&conf->cache_size_mutex);
}
- r5l_flush_stripe_to_raid(conf->log);
+ r5l_flush_stripe_to_raid(conf);
async_tx_issue_pending_all();
blk_finish_plug(&plug);
@@ -7291,7 +7291,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
sector_t newsize;
struct r5conf *conf = mddev->private;
- if (conf->log)
+ if (ACCESS_ONCE(conf->log))
return -EINVAL;
sectors &= ~((sector_t)conf->chunk_sectors - 1);
newsize = raid5_size(mddev, sectors, mddev->raid_disks);
@@ -7344,7 +7344,7 @@ static int check_reshape(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
- if (conf->log)
+ if (ACCESS_ONCE(conf->log))
return -EINVAL;
if (mddev->delta_disks == 0 &&
mddev->new_layout == mddev->layout &&
@@ -7622,7 +7622,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
unlock_all_device_hash_locks_irq(conf);
break;
}
- r5l_quiesce(conf->log, state);
+ r5l_quiesce(conf, state);
}
static void *raid45_takeover_raid0(struct mddev *mddev, int level)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index a415e1c..96969fb 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -626,11 +626,11 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce);
extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
extern void r5l_exit_log(struct r5l_log *log);
-extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
-extern void r5l_write_stripe_run(struct r5l_log *log);
-extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
+extern int r5l_write_stripe(struct r5conf *conf, struct stripe_head *head_sh);
+extern void r5l_write_stripe_run(struct r5conf *conf);
+extern void r5l_flush_stripe_to_raid(struct r5conf *conf);
extern void r5l_stripe_write_finished(struct stripe_head *sh);
-extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
-extern void r5l_quiesce(struct r5l_log *log, int state);
+extern int r5l_handle_flush_request(struct r5conf *conf, struct bio *bio);
+extern void r5l_quiesce(struct r5conf *conf, int state);
extern bool r5l_log_disk_error(struct r5conf *conf);
#endif
--
2.4.6
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 2/4] raid5-cache: avoid write failure for journal hotadd
2016-01-05 21:26 [PATCH 0/4]raid5-cache: fix journal hotadd Shaohua Li
2016-01-05 21:26 ` [PATCH 1/4] raid5-cache: use rcu api to access r5conf->log Shaohua Li
@ 2016-01-05 21:26 ` Shaohua Li
2016-01-06 1:03 ` NeilBrown
2016-01-05 21:26 ` [PATCH 3/4] raid5-cache: handle flush request " Shaohua Li
2016-01-05 21:26 ` [PATCH 4/4] raid5-cache: handle batch stripe " Shaohua Li
3 siblings, 1 reply; 10+ messages in thread
From: Shaohua Li @ 2016-01-05 21:26 UTC (permalink / raw)
To: linux-raid; +Cc: Kernel-team, songliubraving, neilb
r5l_log_disk_error() will make write request fail if there is no log,
but MD_HAS_JOURNAL is set. When we hotadd journal, MD_HAS_JOURNAL is set
but the r5l_log isn't fully initialized yet. Adding a new flag to
indicate the situation and let r5l_log_disk_error() handle the
situation.
Based on Song Liu's original patch
Signed-off-by: Shaohua Li <shli@fb.com>
---
drivers/md/md.c | 10 +++++++++-
drivers/md/md.h | 2 ++
drivers/md/raid5-cache.c | 16 +++++++++++++---
3 files changed, 24 insertions(+), 4 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c0c3e6d..9f1d609 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6899,8 +6899,16 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
else if (!(info.state & (1<<MD_DISK_SYNC)))
/* Need to clear read-only for this */
break;
- else
+ else {
+ if ((info.state & (1<<MD_DISK_JOURNAL)) &&
+ !test_bit(MD_HAS_JOURNAL, &mddev->flags))
+ set_bit(MD_JOURNAL_NOT_INITIALIZED,
+ &mddev->flags);
err = add_new_disk(mddev, &info);
+ if (err)
+ clear_bit(MD_JOURNAL_NOT_INITIALIZED,
+ &mddev->flags);
+ }
goto unlock;
}
break;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index e16a17c..d18e0d4 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -238,6 +238,8 @@ struct mddev {
#define MD_RELOAD_SB 7 /* Reload the superblock because another node
* updated it.
*/
+#define MD_JOURNAL_NOT_INITIALIZED 8 /* hot add a journal, and journal isn't
+ * ready to use yet */
int suspended;
atomic_t active_io;
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 31e0fad..5a0f680 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -859,9 +859,17 @@ bool r5l_log_disk_error(struct r5conf *conf)
rcu_read_lock();
log = rcu_dereference(conf->log);
- if (!log)
- ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
- else
+ if (!log) {
+ ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
+ !test_bit(MD_JOURNAL_NOT_INITIALIZED, &conf->mddev->flags);
+ smp_mb__after_atomic();
+ /*
+ * r5l_init_log sets ->log first and then clear INITIALIZED, we
+ * check in reverse order to avoid race condition.
+ */
+ log = rcu_dereference(conf->log);
+ }
+ if (log)
ret = test_bit(Faulty, &log->rdev->flags);
rcu_read_unlock();
return ret;
@@ -1242,6 +1250,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
goto error;
rcu_assign_pointer(conf->log, log);
+ smp_mb__before_atomic();
+ clear_bit(MD_JOURNAL_NOT_INITIALIZED, &conf->mddev->flags);
return 0;
error:
--
2.4.6
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH 3/4] raid5-cache: handle flush request for journal hotadd
2016-01-05 21:26 [PATCH 0/4]raid5-cache: fix journal hotadd Shaohua Li
2016-01-05 21:26 ` [PATCH 1/4] raid5-cache: use rcu api to access r5conf->log Shaohua Li
2016-01-05 21:26 ` [PATCH 2/4] raid5-cache: avoid write failure for journal hotadd Shaohua Li
@ 2016-01-05 21:26 ` Shaohua Li
2016-01-06 1:12 ` NeilBrown
2016-01-05 21:26 ` [PATCH 4/4] raid5-cache: handle batch stripe " Shaohua Li
3 siblings, 1 reply; 10+ messages in thread
From: Shaohua Li @ 2016-01-05 21:26 UTC (permalink / raw)
To: linux-raid; +Cc: Kernel-team, songliubraving, neilb
When we hotadd journal for array which isn't created with journal, the
array might be running write requests. Such writes aren't protected by
journal yet, so we can't skip disk flush. There is no easy way to know
when all such writes are finished, but the time should be enough after
reclaim runs once.
Signed-off-by: Shaohua Li <shli@fb.com>
---
drivers/md/raid5-cache.c | 26 ++++++++++++++++++++++++--
1 file changed, 24 insertions(+), 2 deletions(-)
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 5a0f680..fcda2d2 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -97,6 +97,7 @@ struct r5l_log {
bool need_cache_flush;
bool in_teardown;
+ bool force_flush_all_disks;
};
/*
@@ -526,10 +527,27 @@ void r5l_write_stripe_run(struct r5conf *conf)
int r5l_handle_flush_request(struct r5conf *conf, struct bio *bio)
{
- struct r5l_log *log = ACCESS_ONCE(conf->log);
+ struct r5l_log *log;
- if (!log)
+ /* raid5_remove_disk writes_pending check isn't helpful for flush */
+ rcu_read_lock();
+ log = rcu_dereference(conf->log);
+ if (!log) {
+ rcu_read_unlock();
return -ENODEV;
+ }
+
+ /*
+ * If we add journal to array without journal before, the array might
+ * run write requests before journal runs. such requests are not
+ * protected by journal, so we can't skip flush.
+ */
+ if (log->force_flush_all_disks) {
+ rcu_read_unlock();
+ md_flush_request(log->rdev->mddev, bio);
+ return 0;
+ }
+ rcu_read_unlock();
/*
* we flush log disk cache first, then write stripe data to raid disks.
* So if bio is finished, the log disk cache is flushed already. The
@@ -798,6 +816,8 @@ static void r5l_do_reclaim(struct r5l_log *log)
log->last_cp_seq = next_cp_seq;
mutex_unlock(&log->io_mutex);
+ log->force_flush_all_disks = false;
+
r5l_run_no_space_stripes(log);
}
@@ -1246,6 +1266,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
INIT_LIST_HEAD(&log->no_space_stripes);
spin_lock_init(&log->no_space_stripes_lock);
+ if (test_bit(MD_JOURNAL_NOT_INITIALIZED, &conf->mddev->flags))
+ log->force_flush_all_disks = true;
if (r5l_load_log(log))
goto error;
--
2.4.6
^ permalink raw reply related [flat|nested] 10+ messages in thread